In [57]:
!pip install --quiet optuna

In [58]:
import numpy as np
import pandas as pd
import optuna as opt
import xgboost as xgb
from pathlib import Path
import gc

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
RANDOM_STATE = 2
DATA_DIR = Path("/content/drive/MyDrive/CS760/transformed")

In [61]:
df_train = pd.read_parquet(DATA_DIR/"train.parquet")
df_val = pd.read_parquet(DATA_DIR/'val.parquet')
df_test = pd.read_parquet(DATA_DIR/"test.parquet")

print(f"Shape of the training data : {df_train.shape}")
print(f"Shape of the validation data : {df_val.shape}")
print(f"Shape of the test data : {df_test.shape}")

Shape of the training data : (2060626, 13)
Shape of the validation data : (257578, 13)
Shape of the test data : (257579, 13)


In [62]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
r_stars,2060626.0,-1.671941e-16,1.0,-1.693936,-1.044159,0.255394,0.9051706,0.9051706
r_stars_square,2060626.0,-1.01894e-17,1.0,-1.494513,-1.182692,0.06458972,1.000051,1.000051
r_length,2060626.0,-2.3882160000000002e-17,1.0,-1.158545,-0.6631864,-0.2808043,0.3275309,8.122912
u_friends_count,2060626.0,-1.731163e-17,1.0,-0.331709,-0.329944,-0.2628759,-0.03696244,26.13194
u_review_count,2060626.0,3.25578e-17,1.0,-0.385583,-0.3634084,-0.2968854,-0.03079341,34.83733
u_month_age,2060626.0,3.771456e-16,1.0,-1.237169,-0.8424,-0.1823749,0.6486289,4.604853
b_stars,2060626.0,-8.181173e-17,1.0,-3.451613,-0.3054136,0.3238264,0.9530663,1.582306
b_review_count,2060626.0,2.5826930000000003e-17,1.0,-0.498458,-0.4459658,-0.318485,0.006965998,10.84433
r_sen,2060626.0,5.128801e-16,1.0,-6.135095,-0.634691,-0.007891442,0.6128473,4.243026
r_sub,2060626.0,-4.767259e-16,1.0,-3.029519,-0.6557234,-0.02057011,0.6333986,3.785211


In [63]:
X_train, y_train = df_train.drop(['r_useful', 'r_id'], axis=1).values, df_train['r_useful'].values
X_val, y_val = df_val.drop(['r_useful', 'r_id'], axis=1).values, df_val['r_useful'].values
X_test, y_test = df_test.drop(['r_useful', 'r_id'], axis=1).values, df_test['r_useful'].values

In [66]:
def objective(trial):
  """ Function to tune parameters """
  gc.collect()
  params = {
      "n_estimators":trial.suggest_categorical('n_estimators', [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]),
      "max_depth" : trial.suggest_int("max_depth", 3, 20),
      "learning_rate" : trial.suggest_categorical('lr', [0.01, 0.1, 0.2, 0.3, 0.4]),
      "reg_alpha": trial.suggest_categorical("reg_alpha", [1e-3, 1e-2, 1e-1, 1, 10, 100]),
      "reg_lambda": trial.suggest_categorical("reg_lambda", [1e-3, 1e-2, 1e-1, 1, 10, 100])
  }


  model = xgb.XGBRegressor(objective="reg:squarederror",
                            n_jobs=-1,
                            grow_policy='lossguide',
                            tree_method="gpu_hist",
                            predictor="gpu_predictor",
                            booster='gbtree',
                            sampling_method='gradient_based',
                            eval_metrics=['rmse'], 
                            random_state=RANDOM_STATE,
                            enable_categorical=False,
                            early_stopping_rounds=300,
                            **params)
  
  model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
  y_pred = model.predict(X_val)

  return mean_squared_error(y_val, y_pred, squared=False)

study = opt.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study.best_params

[32m[I 2022-08-23 03:36:04,225][0m A new study created in memory with name: no-name-b5950168-e4f7-4f16-b38c-3d43efafd226[0m
[32m[I 2022-08-23 03:36:07,461][0m Trial 0 finished with value: 12.985403877728588 and parameters: {'n_estimators': 16, 'max_depth': 14, 'lr': 0.3, 'reg_alpha': 1, 'reg_lambda': 1}. Best is trial 0 with value: 12.985403877728588.[0m
[32m[I 2022-08-23 03:36:08,124][0m Trial 1 finished with value: 11.733884004944384 and parameters: {'n_estimators': 2, 'max_depth': 13, 'lr': 0.01, 'reg_alpha': 10, 'reg_lambda': 10}. Best is trial 1 with value: 11.733884004944384.[0m
[32m[I 2022-08-23 03:36:54,664][0m Trial 2 finished with value: 14.36026823566918 and parameters: {'n_estimators': 64, 'max_depth': 18, 'lr': 0.3, 'reg_alpha': 1, 'reg_lambda': 0.01}. Best is trial 1 with value: 11.733884004944384.[0m
[32m[I 2022-08-23 03:37:18,080][0m Trial 3 finished with value: 13.42419845560389 and parameters: {'n_estimators': 512, 'max_depth': 10, 'lr': 0.2, 'reg_alpha'

{'n_estimators': 128,
 'max_depth': 17,
 'lr': 0.01,
 'reg_alpha': 10,
 'reg_lambda': 100}

In [69]:
  model = xgb.XGBRegressor(objective="reg:squarederror",
                            n_jobs=-1,
                            grow_policy='lossguide',
                            tree_method="gpu_hist",
                            predictor="gpu_predictor",
                            booster='gbtree',
                            sampling_method='gradient_based',
                            eval_metrics=['rmse'], 
                            random_state=RANDOM_STATE,
                            enable_categorical=False,
                            early_stopping_rounds=300,
                            **study.best_params)
  model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

XGBRegressor(early_stopping_rounds=300, enable_categorical=False,
             eval_metrics=['rmse'], grow_policy='lossguide', lr=0.01,
             max_depth=17, n_estimators=128, n_jobs=-1,
             objective='reg:squarederror', predictor='gpu_predictor',
             random_state=2, reg_alpha=10, reg_lambda=100,
             sampling_method='gradient_based', tree_method='gpu_hist')

In [71]:
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print(f"train results - RMSE: {mean_squared_error(y_train, train_pred, squared=False)}, MAE: {mean_absolute_error(y_train, train_pred)}")
print(f"test results - RMSE: {mean_squared_error(y_test, test_pred, squared=False)}, MAE: {mean_absolute_error(y_test, test_pred)}")

train results - RMSE: 6.646199771165186, MAE: 0.1459062233659509
test results - RMSE: 3.036702853178775, MAE: 0.15059195799390807
