In [1]:
import pandas as pd
import xgboost
from numpy.ma.core import minimum
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, OrdinalEncoder, StandardScaler

train_df_copy = pd.read_csv("cleaned_train_only_dropped_y.csv")

In [2]:
train_df_copy

Unnamed: 0,app_name,app_category,size_unknown_unit,downloads_unstandardized,reviews_count,free_paid,price_if_paid,age_rating,app_tags,release_date,app_version,compatible_os_version,Y
0,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,4.7
1,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,4.3
2,Paper flowers instructions,ART_AND_DESIGN,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up,4.4
3,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up,3.8
4,Infinite Painter,ART_AND_DESIGN,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6704,TED,EDUCATION,181927,18M,"10,000,000+",Free,0,Everyone 10+,Education,"July 27, 2018",3.2.5,4.1 and up,4.6
6705,Choices: Stories You Play,GAME,807246,93M,"10,000,000+",Free,0,Teen,Simulation,"July 30, 2018",2.3.5,4.0.3 and up,4.6
6706,Toca Life: City,EDUCATION,31092,24M,"500,000+",Paid,$3.99,Everyone,Education;Pretend Play,"July 6, 2018",1.5-play,4.4 and up,4.7
6707,UNICORN - Color By Number & Pixel Art Coloring,ART_AND_DESIGN,8204,24M,"500,000+",Free,0,Everyone,Art & Design;Creativity,"August 2, 2018",1.0.9,4.4 and up,4.7


* Maybe try stratified sampling on review groups

In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(train_df_copy, test_size=0.1, random_state=42)

In [4]:
train_set.shape

(6038, 13)

In [5]:
test_set.shape

(671, 13)

* For imputer try knn and iterative
* Our missing data is MNAR

In [6]:
from preprocessing import *

column_transform = ColumnTransformer([
    ("categories", category_pipeline(), ["app_category", "free_paid"]),
    ("boxcox", box_cox_pipeline(), ["size_unknown_unit"]), # maybe change to mb
    ("downloads", downloads_pipeline(), ["downloads_unstandardized"]),
    ("reviews", reviews_pipeline(), ["reviews_count"]),
    ("price", price_pipeline(), ["price_if_paid"]),
    ("age_rating", age_rating_pipeline(), ["age_rating"]),
    ("dates", release_date_pipeline(), ["release_date"]),
    ("os", os_version_pipeline(), ["compatible_os_version"]),
],
    remainder='drop')

In [7]:
column_transform

* Currently there's still errors in mapping and numpy in the pipeline

In [8]:
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, SelectFromModel, RFE
from sklearn import set_config


df = train_set.copy()
X = df.drop(columns=['Y'])
y = df['Y']

In [9]:
try:
    column_transform.fit(X)
    X_prepared = column_transform.transform(X)
except Exception as e:
    print(f"Error during transformation: {e}")
    raise


# feature_names = column_transform.get_feature_names_out()
X_prepared_df = pd.DataFrame(X_prepared, index=X.index)


In [10]:
X_prepared_df

Unnamed: 0,0
1335,<Compressed Sparse Row sparse matrix of dtype ...
1406,<Compressed Sparse Row sparse matrix of dtype ...
2318,<Compressed Sparse Row sparse matrix of dtype ...
1807,<Compressed Sparse Row sparse matrix of dtype ...
4689,<Compressed Sparse Row sparse matrix of dtype ...
...,...
3772,<Compressed Sparse Row sparse matrix of dtype ...
5191,<Compressed Sparse Row sparse matrix of dtype ...
5226,<Compressed Sparse Row sparse matrix of dtype ...
5390,<Compressed Sparse Row sparse matrix of dtype ...


In [24]:
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline

# Define pipeline
xgb_pipeline = Pipeline([
    ("preprocessing", column_transform),
    ("regression", XGBRegressor(random_state=42, n_jobs=-1))
])

# Define parameter grid
param_grid = {
    "regression__n_estimators": [100, 200],
    "regression__max_depth": [3, 4, 5],
    "regression__learning_rate": [0.05, 0.1],
    "regression__subsample": [0.8, 1.0],
    "regression__colsample_bytree": [0.8, 1.0]
}

# Use MSE as scoring
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring=mse_scorer,
    n_jobs=-1,
    verbose=1
)

# Run the search
grid_search.fit(X, y)

# Output best result
print("Best parameters:", grid_search.best_params_)
print("Best MSE:", -grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'regression__colsample_bytree': 1.0, 'regression__learning_rate': 0.05, 'regression__max_depth': 4, 'regression__n_estimators': 200, 'regression__subsample': 0.8}
Best MSE: 0.24427213626458077


In [25]:
df_test = test_set.copy()
X_test = df.drop(columns=['Y'])
y_test = df['Y']

In [26]:
y_test

1335    4.6
1406    3.8
2318    4.2
1807    4.3
4689    4.7
       ... 
3772    3.5
5191    5.0
5226    3.4
5390    4.0
860     3.8
Name: Y, Length: 6038, dtype: float64

In [27]:
from sklearn.metrics import mean_absolute_error, r2_score

best_pipeline = grid_search.best_estimator_

# # Fit on training set
# best_pipeline.fit(X, y)

# Predict on test set
y_pred = best_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output
print(f"Test MSE: {mse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R²: {r2:.4f}")



Test MSE: 0.1869
Test MAE: 0.3043
Test R²: 0.3528


In [28]:
import pandas as pd

# Load test data (input features only)
test_df = pd.read_csv("app-rating-competition/test.csv")
sample_submission = pd.read_csv("app-rating-competition/SampleSubmission.csv")


In [29]:
test_df

Unnamed: 0,app_name,app_category,size_unknown_unit,downloads_unstandardized,reviews_count,free_paid,price_if_paid,age_rating,app_tags,release_date,app_version,compatible_os_version
0,Girls hairstyles 2018,BEAUTY,62,3.1M,"10,000+",Free,0,Everyone,Beauty,"May 28, 2018",1.9.2,4.0.3 and up
1,Dairy Queen,FOOD_AND_DRINK,742,43M,"100,000+",Free,0,Everyone,Food & Drink,"July 25, 2018",2.1.0,4.1 and up
2,Remote Control For All AC - Universal Remote,PRODUCTIVITY,166,6.1M,"10,000+",Free,0,Everyone,Productivity,"July 28, 2018",1.1,4.0.3 and up
3,Ultimate Chest Tracker,PRODUCTIVITY,40328,23M,"1,000,000+",Free,0,Everyone,Productivity,"October 17, 2016",1.9,2.3 and up
4,Mobilight-BM,FINANCE,6,6.2M,500+,Free,0,Everyone,Finance,"August 3, 2018",2.0.10,4.0.3 and up
...,...,...,...,...,...,...,...,...,...,...,...,...
1868,FREE Stock Market Trading Tips,FINANCE,714,3.6M,"50,000+",Free,0,Everyone,Finance,"June 7, 2017",2.10.3,2.3.3 and up
1869,Next Portuguese(BR) Langpack,TOOLS,1320,778k,"100,000+",Free,0,Everyone,Tools,"September 13, 2013",1.0,2.0 and up
1870,850 Sports News Digest,SPORTS,539,Varies with device,"10,000+",Free,0,Everyone,Sports,"August 23, 2016",Varies with device,Varies with device
1871,Lite for Facebook Messenger,COMMUNICATION,76498,4.3M,"1,000,000+",Free,0,Teen,Communication,"June 20, 2018",6.3.2,Varies with device


In [30]:
preprocessed_train = xgb_pipeline.named_steps["preprocessing"].transform(X)
print("Preprocessed train shape:", preprocessed_train.shape)


Preprocessed train shape: (6038, 61)


In [33]:

# 2. Predict using trained pipeline
y_pred_submission = best_pipeline.predict(test_df)

# 4. Create final submission by replacing Y column
sample_submission["Y"] = y_pred_submission

# 5. Save to CSV
sample_submission.to_csv("submission25.csv", index=False)

print("✅ submission.csv created successfully with row_id and rounded Y.")


✅ submission.csv created successfully with row_id and rounded Y.


In [34]:
# 2. Predict using trained pipeline
y_pred_submission = best_pipeline.predict(test_df)

# 3. Round predictions to 1 decimal place
y_pred_submission = np.round(y_pred_submission, 1)

# 4. Create final submission by replacing Y column
sample_submission["Y"] = y_pred_submission

# 5. Save to CSV
sample_submission.to_csv("submission26.csv", index=False)

print("✅ submission.csv created successfully with row_id and rounded Y.")


✅ submission.csv created successfully with row_id and rounded Y.
