In [2]:
!pip install --quiet optuna

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna as opt
import xgboost as xgb
import catboost as cat
import os
import sys
import warnings
import gc

from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

# Load the Dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df_train = pd.read_parquet("/content/drive/MyDrive/CS760/train.parquet")
df_val = pd.read_parquet('/content/drive/MyDrive/CS760/val.parquet')
df_test = pd.read_parquet("/content/drive/MyDrive/CS760/test.parquet")

print(f"Shape of the training data : {df_train.shape}")
print(f"Shape of the validation data : {df_val.shape}")
print(f"Shape of the test data : {df_test.shape}")

Shape of the training data : (2060626, 13)
Shape of the validation data : (257578, 13)
Shape of the test data : (257579, 13)


In [6]:
df_train.head()

Unnamed: 0,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_id,r_useful
0,-1.693936,-1.494513,0.961938,-0.331709,-0.323091,-0.609501,-0.934654,-0.238997,-1.403165,1.124223,-0.79669,4847617,0.118048
1,-0.394383,-0.662991,-0.836997,0.248959,0.116364,-0.277802,0.953066,-0.475961,-1.15298,0.742918,1.431968,5577152,0.011136
2,-0.394383,-0.662991,-0.515448,-0.271701,-0.258584,0.664981,-0.305414,-0.309486,0.345055,0.438695,1.339823,1113002,0.030032
3,-1.693936,-1.494513,1.970036,-0.250521,-0.371472,-1.232336,-1.563893,-0.35448,-2.13503,-0.014948,-0.677443,6353340,0.01799
4,0.255394,0.06459,0.118959,-0.331709,-0.379535,-1.027147,-0.934654,-0.080021,0.011747,0.312105,-0.634984,5836201,0.006991


In [7]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
r_stars,2060626.0,-1.671941e-16,1.0,-1.693936,-1.044159,0.255394,0.9051706,0.9051706
r_stars_square,2060626.0,-1.01894e-17,1.0,-1.494513,-1.182692,0.06458972,1.000051,1.000051
r_length,2060626.0,-2.3882160000000002e-17,1.0,-1.158545,-0.6631864,-0.2808043,0.3275309,8.122912
u_friends_count,2060626.0,-1.731163e-17,1.0,-0.331709,-0.329944,-0.2628759,-0.03696244,26.13194
u_review_count,2060626.0,3.25578e-17,1.0,-0.385583,-0.3634084,-0.2968854,-0.03079341,34.83733
u_month_age,2060626.0,3.771456e-16,1.0,-1.237169,-0.8424,-0.1823749,0.6486289,4.604853
b_stars,2060626.0,-8.181173e-17,1.0,-3.451613,-0.3054136,0.3238264,0.9530663,1.582306
b_review_count,2060626.0,2.5826930000000003e-17,1.0,-0.498458,-0.4459658,-0.318485,0.006965998,10.84433
r_sen,2060626.0,5.128801e-16,1.0,-6.135095,-0.634691,-0.007891442,0.6128473,4.243026
r_sub,2060626.0,-4.767259e-16,1.0,-3.029519,-0.6557234,-0.02057011,0.6333986,3.785211


In [8]:
X_train, y_train = df_train.drop(['r_useful', 'r_id'], axis=1).values, df_train['r_useful'].values
X_val, y_val = df_val.drop(['r_useful', 'r_id'], axis=1).values, df_val['r_useful'].values
X_test, y_test = df_test.drop(['r_useful', 'r_id'], axis=1).values, df_test['r_useful'].values

In [9]:
print(X_train.shape[0] / (X_train.shape[0] + X_val.shape[0] + X_test.shape[0]))
print(X_val.shape[0] / (X_train.shape[0] + X_val.shape[0] + X_test.shape[0]))
print(X_test.shape[0] / (X_train.shape[0] + X_val.shape[0] + X_test.shape[0]))

0.7999998447074151
0.09999988353056138
0.10000027176202343


In [10]:
def objective(trial):
  """ Function to tune parameters """
  gc.collect()
  params = {
      "n_estimators":trial.suggest_categorical('n_estimators', [100, 500, 1000, 2500, 5000, 10000]),
      "max_depth" : trial.suggest_int("max_depth", 3, 20),
      "learning_rate" : trial.suggest_categorical('lr', [0.001, 0.005, 0.01, 0.1, 1.]),
      "reg_alpha": trial.suggest_categorical("reg_alpha", [1e-3, 1e-2, 1e-1, 1, 10, 100]),
      "reg_lambda": trial.suggest_categorical("reg_lambda", [1e-3, 1e-2, 1e-1, 1, 10, 100])
  }


  model = xgb.XGBRegressor(objective="reg:squarederror",
                            n_jobs=-1,
                            grow_policy='lossguide',
                            tree_method="gpu_hist",
                            predictor="gpu_predictor",
                            booster='gbtree',
                            sampling_method='gradient_based',
                            use_label_encoder=False,
                            eval_metrics=['rmse'], 
                            random_state=42,
                            enable_categorical=False,
                            **params)
  
  std = StandardScaler()
  yt = std.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
  yv = std.transform(y_val.reshape(-1, 1)).reshape(-1)
  model.fit(X_train, yt, eval_set=[(X_val, yv)], early_stopping_rounds=300, verbose=100)
  y_pred = model.predict(X_val)

  return np.sqrt(mean_squared_error(yv, y_pred))

In [11]:
study = opt.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

[32m[I 2022-08-22 18:36:08,839][0m A new study created in memory with name: no-name-2e9e58a0-9aab-496a-b468-841e59fec858[0m


[0]	validation_0-rmse:1.49646
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.66955
[200]	validation_0-rmse:1.6703
[300]	validation_0-rmse:1.6705
Stopping. Best iteration:
[0]	validation_0-rmse:1.49646



[32m[I 2022-08-22 18:36:40,286][0m Trial 0 finished with value: 1.4964612835756272 and parameters: {'n_estimators': 1000, 'max_depth': 11, 'lr': 1.0, 'reg_alpha': 1, 'reg_lambda': 10}. Best is trial 0 with value: 1.4964612835756272.[0m


[0]	validation_0-rmse:1.56848
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.55853
[200]	validation_0-rmse:1.55047
[300]	validation_0-rmse:1.54568
[400]	validation_0-rmse:1.54413
[500]	validation_0-rmse:1.54497
[600]	validation_0-rmse:1.54752
[700]	validation_0-rmse:1.55145
Stopping. Best iteration:
[410]	validation_0-rmse:1.54412



[32m[I 2022-08-22 18:37:09,375][0m Trial 1 finished with value: 1.5441168868746622 and parameters: {'n_estimators': 5000, 'max_depth': 10, 'lr': 0.001, 'reg_alpha': 0.01, 'reg_lambda': 0.1}. Best is trial 0 with value: 1.4964612835756272.[0m


[0]	validation_0-rmse:1.56783
Will train until validation_0-rmse hasn't improved in 300 rounds.
[99]	validation_0-rmse:1.51789


[32m[I 2022-08-22 18:37:19,087][0m Trial 2 finished with value: 1.517893740765174 and parameters: {'n_estimators': 100, 'max_depth': 15, 'lr': 0.005, 'reg_alpha': 0.1, 'reg_lambda': 10}. Best is trial 0 with value: 1.4964612835756272.[0m


[0]	validation_0-rmse:1.56784
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.51835
[200]	validation_0-rmse:1.50048
[300]	validation_0-rmse:1.49466
[400]	validation_0-rmse:1.49442
[500]	validation_0-rmse:1.49376
[600]	validation_0-rmse:1.49339
[700]	validation_0-rmse:1.4932
[800]	validation_0-rmse:1.49309
[900]	validation_0-rmse:1.49285
[1000]	validation_0-rmse:1.49505
[1100]	validation_0-rmse:1.49956
Stopping. Best iteration:
[894]	validation_0-rmse:1.49278



[32m[I 2022-08-22 18:37:40,818][0m Trial 3 finished with value: 1.4927830534779138 and parameters: {'n_estimators': 5000, 'max_depth': 6, 'lr': 0.005, 'reg_alpha': 0.001, 'reg_lambda': 0.01}. Best is trial 3 with value: 1.4927830534779138.[0m


[0]	validation_0-rmse:1.56705
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.53669
[200]	validation_0-rmse:1.5776
[300]	validation_0-rmse:1.61485
Stopping. Best iteration:
[59]	validation_0-rmse:1.52918



[32m[I 2022-08-22 18:39:20,708][0m Trial 4 finished with value: 1.529177366735057 and parameters: {'n_estimators': 2500, 'max_depth': 16, 'lr': 0.01, 'reg_alpha': 0.001, 'reg_lambda': 1}. Best is trial 3 with value: 1.4927830534779138.[0m


[0]	validation_0-rmse:2.01942
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:2.06113
[200]	validation_0-rmse:2.06112
[300]	validation_0-rmse:2.06116
Stopping. Best iteration:
[0]	validation_0-rmse:2.01942



[32m[I 2022-08-22 18:39:48,285][0m Trial 5 finished with value: 2.0194222370381967 and parameters: {'n_estimators': 10000, 'max_depth': 11, 'lr': 1.0, 'reg_alpha': 0.1, 'reg_lambda': 0.001}. Best is trial 3 with value: 1.4927830534779138.[0m


[0]	validation_0-rmse:1.55899
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.59757
[200]	validation_0-rmse:1.60001
[300]	validation_0-rmse:1.60054
Stopping. Best iteration:
[0]	validation_0-rmse:1.55899



[32m[I 2022-08-22 18:39:59,714][0m Trial 6 finished with value: 1.5589873215800585 and parameters: {'n_estimators': 10000, 'max_depth': 10, 'lr': 0.1, 'reg_alpha': 10, 'reg_lambda': 0.001}. Best is trial 3 with value: 1.4927830534779138.[0m


[0]	validation_0-rmse:1.56705
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.49892
[200]	validation_0-rmse:1.49026
[300]	validation_0-rmse:1.48997
[400]	validation_0-rmse:1.4898
[499]	validation_0-rmse:1.48978


[32m[I 2022-08-22 18:40:07,168][0m Trial 7 finished with value: 1.4897510230297106 and parameters: {'n_estimators': 500, 'max_depth': 4, 'lr': 0.01, 'reg_alpha': 0.01, 'reg_lambda': 0.1}. Best is trial 7 with value: 1.4897510230297106.[0m


[0]	validation_0-rmse:1.56711
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.58139
[200]	validation_0-rmse:1.60739
[300]	validation_0-rmse:1.61583
Stopping. Best iteration:
[31]	validation_0-rmse:1.54981



[32m[I 2022-08-22 18:40:55,894][0m Trial 8 finished with value: 1.5498143176551287 and parameters: {'n_estimators': 1000, 'max_depth': 19, 'lr': 0.01, 'reg_alpha': 10, 'reg_lambda': 0.1}. Best is trial 7 with value: 1.4897510230297106.[0m


[0]	validation_0-rmse:1.5671
Will train until validation_0-rmse hasn't improved in 300 rounds.
[100]	validation_0-rmse:1.61969
[200]	validation_0-rmse:1.62922
[300]	validation_0-rmse:1.64347
Stopping. Best iteration:
[12]	validation_0-rmse:1.55903



[32m[I 2022-08-22 18:41:14,028][0m Trial 9 finished with value: 1.5590252886178413 and parameters: {'n_estimators': 5000, 'max_depth': 11, 'lr': 0.01, 'reg_alpha': 0.001, 'reg_lambda': 0.001}. Best is trial 7 with value: 1.4897510230297106.[0m


In [12]:
study.best_params

{'n_estimators': 500,
 'max_depth': 4,
 'lr': 0.01,
 'reg_alpha': 0.01,
 'reg_lambda': 0.1}

In [13]:
model = xgb.XGBRegressor(objective="reg:squarederror", 
                        n_jobs=-1, 
                        grow_policy='lossguide',
                        tree_method="gpu_hist", 
                        predictor="gpu_predictor",
                        booster='gbtree',
                        sampling_method='gradient_based',
                        use_label_encoder=False,
                        random_state=42, 
                        enable_categorical=False,
                        **study.best_params)

std = StandardScaler()
y_trn = std.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
y_tst = std.transform(y_test.reshape(-1, 1)).reshape(-1)
model.fit(X_train, y_trn)

## Score on test set 
mae = mean_absolute_error(y_tst, model.predict(X_test))
rmse = np.sqrt(mean_squared_error(y_tst, model.predict(X_test)))
print(f"MAE on test set : {mae}")
print(f"RMSE on test set : {rmse}")

MAE on test set : 0.031178268387622877
RMSE on test set : 1.042864155606246


# CatBoost

In [18]:
cat_model = cat.CatBoostRegressor()
y_v = std.transform(y_val.reshape(-1, 1)).reshape(-1)
cat_model.fit(X_train, y_trn, eval_set=(X_val, y_v), early_stopping_rounds=100, verbose=10)

mae = mean_absolute_error(y_tst, cat_model.predict(X_test))
rmse = np.sqrt(mean_squared_error(y_tst, cat_model.predict(X_test)))
print(f"MAE on test set : {mae}")
print(f"RMSE on test set : {rmse}")

Learning rate set to 0.168731
0:	learn: 0.9958396	test: 1.4870462	best: 1.4870462 (0)	total: 209ms	remaining: 3m 29s
10:	learn: 0.9745091	test: 1.4917212	best: 1.4870462 (0)	total: 2.31s	remaining: 3m 27s
20:	learn: 0.9602364	test: 1.4958209	best: 1.4870462 (0)	total: 4.48s	remaining: 3m 28s
30:	learn: 0.9362577	test: 1.4966666	best: 1.4870462 (0)	total: 6.69s	remaining: 3m 29s
40:	learn: 0.9183570	test: 1.5022424	best: 1.4870462 (0)	total: 8.83s	remaining: 3m 26s
50:	learn: 0.8934002	test: 1.5054488	best: 1.4870462 (0)	total: 11s	remaining: 3m 24s
60:	learn: 0.8749582	test: 1.5113139	best: 1.4870462 (0)	total: 13.2s	remaining: 3m 22s
70:	learn: 0.8574366	test: 1.5171076	best: 1.4870462 (0)	total: 15.4s	remaining: 3m 21s
80:	learn: 0.8461288	test: 1.5193089	best: 1.4870462 (0)	total: 17.5s	remaining: 3m 18s
90:	learn: 0.8372951	test: 1.5202276	best: 1.4870462 (0)	total: 19.6s	remaining: 3m 15s
100:	learn: 0.8138182	test: 1.5210498	best: 1.4870462 (0)	total: 21.8s	remaining: 3m 13s
Stop

In [19]:
xgb_tst_pred = model.predict(X_test)
cat_tst_pred = cat_model.predict(X_test)
avg_pred = (xgb_tst_pred + cat_tst_pred) / 2

mae = mean_absolute_error(y_tst, avg_pred)
rmse = np.sqrt(mean_squared_error(y_tst, avg_pred))

print(f"MAE on test set : {mae}")
print(f"RMSE on test set : {rmse}")

MAE on test set : 0.0264262518862571
RMSE on test set : 0.6181186975597794
