<a href="https://colab.research.google.com/github/yashveersinghsohi/machine_hack_competitions/blob/optimization/Data_Science_Student_Championship/Optimal_Model/optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Git Clone

In [2]:
# !git clone -b optimization https://github.com/yashveersinghsohi/machine_hack_competitions.git

# Imports

In [4]:
# !pip install shap
# !pip install pyod
# !pip install optuna

In [27]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from pyod.models.hbos import HBOS
import optuna
from functools import partial
import pickle

# Dataset

In [6]:
root_dir = '/content/machine_hack_competitions/Data_Science_Student_Championship/Features/'

X_train = pd.read_csv(root_dir+'X_train.csv')
y_train = pd.read_csv(root_dir+'y_train.csv')

X_val = pd.read_csv(root_dir+'X_val.csv')
y_val = pd.read_csv(root_dir+'y_val.csv')

X_test = pd.read_csv(root_dir+'X_test.csv')
y_test = pd.read_csv(root_dir+'y_test.csv')

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((167738, 20), (167738, 1), (41935, 20), (41935, 1), (89861, 20), (89861, 1))

# Helper Functions

In [7]:
def calculate_metrics(datasets, model):
  X, y = datasets
  mae = mean_absolute_error(y_true=y, y_pred=model.predict(X)).round(4)
  mse = mean_squared_error(y_true=y, y_pred=model.predict(X)).round(4)
  rmse = np.sqrt(mse).round(4)
  msle = mean_squared_log_error(y_true=y, y_pred=np.where(model.predict(X) < 0, 0, model.predict(X))).round(4)
  rmsle = np.sqrt(msle).round(4)
  return {'mae': mae, 'rmse': rmse, 'rmsle': rmsle}

In [8]:
def create_submission(
    model, X, 
    submission_path='/content/machine_hack_competitions/Data_Science_Student_Championship/Competition_Files/submission.csv',
    features = None, submission_name = ''
  ):
  X_copy = X.copy()
  X_copy['fare_pred'] = model.predict(X[features])
  submission_df = pd.read_csv(submission_path)
  submission_df['total_fare'] = X_copy['tip'] + X_copy['miscellaneous_fees'] + X_copy['fare_pred']
  submission_df['total_fare'] = np.where(submission_df['total_fare'] < 0, 0, submission_df['total_fare'])
  print(submission_df.head())
  submission_df.to_csv(submission_name, index=False)

# Model Fitting Functions

In [9]:
def model(
    features, datasets, 
    od_params={'contamination':0.1}, xgb_params={'random_state':42}
  ):
  X_train, X_val, y_train, y_val = datasets
  od = HBOS(**od_params)
  od.fit(X_train[features])
  X_train['od_proba'] = od.predict_proba(X_train[features])[:, 1]
  X_val['od_proba'] = od.predict_proba(X_val[features])[:, 1]
  xgb = XGBRegressor(**xgb_params)
  xgb.fit(X_train[features+['od_proba']], y_train.to_numpy().reshape(-1,))
  train_metrics = calculate_metrics(datasets=(X_train[features+['od_proba']], y_train), model=xgb)
  val_metrics = calculate_metrics(datasets=(X_val[features+['od_proba']], y_val), model=xgb)
  return {'od': od, 'xgb': xgb, 'train_metrics': train_metrics, 'val_metrics': val_metrics}

## Small Baseline Model

In [10]:
small_baseline_cache = model(
  features=[
    'trip_duration', 'distance_traveled', 'num_of_passengers', 
    'tip', 'miscellaneous_fees', 'surge_applied'
  ],
  datasets = [X_train, X_val, y_train, y_val],
  od_params = {'contamination':0.1},
  xgb_params = {'random_state':42}
)

In [11]:
small_baseline_cache['train_metrics'], small_baseline_cache['val_metrics']

({'mae': 5.2079, 'rmse': 21.8594, 'rmsle': 0.1568},
 {'mae': 6.6321, 'rmse': 38.4963, 'rmsle': 0.1855})

## Large Baseline Model

In [12]:
large_baseline_cache = model(
  features=[
    'trip_duration', 'distance_traveled', 'num_of_passengers', 
    'tip', 'miscellaneous_fees', 'surge_applied', 
    'is_miscellaneous_fees_negative', 'is_miscellaneous_fees_0', 
    'is_tip_0', 'is_trip_duration_0'
  ],
  datasets = [X_train, X_val, y_train, y_val],
  od_params = {'contamination':0.1},
  xgb_params = {'random_state':42}
)

In [13]:
large_baseline_cache['train_metrics'], large_baseline_cache['val_metrics']

({'mae': 5.1497, 'rmse': 21.1785, 'rmsle': 0.1523},
 {'mae': 6.7004, 'rmse': 39.6068, 'rmsle': 0.1873})

# Bayesian Optimization

In [14]:
def optimize_model(n_trials=15):
  optimization_function = partial(optimize)
  study = optuna.create_study(direction="minimize")
  study.optimize(optimization_function, n_trials=n_trials)
  return study.best_params

In [31]:
def optimize(trial):    
  # Pass 1
  # n_estimators = trial.suggest_int("n_estimators", 100, 1000)
  # max_depth = trial.suggest_int("max_depth", 3, 9)
  # colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
  # subsample = trial.suggest_float("subsample", 0.01, 1.0)
  # learning_rate = trial.suggest_float("learning_rate", 0.001, 0.01)
  # gamma = trial.suggest_float("gamma", 0.001, 0.02)
  # contamination = trial.suggest_float('contamination', 0.01, 0.3)
  
  # Pass 2
  objective = trial.suggest_categorical("objective", ['reg:squarederror', 'reg:squaredlogerror'])
  n_estimators = trial.suggest_int("n_estimators", 700, 1500)
  max_depth = trial.suggest_int("max_depth", 7, 15)
  colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
  subsample = trial.suggest_float("subsample", 0.6, 1.0)
  learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
  gamma = trial.suggest_float("gamma", 0.001, 0.02)
  contamination = trial.suggest_float('contamination', 0.01, 0.4)

  trial_model_cache = model(
    features=[
      'trip_duration', 'distance_traveled', 'num_of_passengers', 
      'tip', 'miscellaneous_fees', 'surge_applied', 
      'is_miscellaneous_fees_negative', 'is_miscellaneous_fees_0', 
      'is_tip_0', 'is_trip_duration_0'
    ],
    datasets = [X_train, X_val, y_train, y_val],
    od_params = {'contamination': contamination},
    xgb_params = {
      'random_state':42, 
      'n_estimators': n_estimators, 
      'max_depth': max_depth, 
      'colsample_bytree': colsample_bytree, 
      'subsample': subsample, 
      'learning_rate': learning_rate,
      'gamma': gamma,
      'tree_method': 'gpu_hist'
    }
  )

  val_rmsle = trial_model_cache['val_metrics']['rmsle']
  return val_rmsle

## Optimizing Large Model

In [32]:
large_best_params = optimize_model(n_trials=500)

[I 2023-06-10 02:37:29,773] A new study created in memory with name: no-name-9b16726e-6b35-43c6-b940-a03eeed1d1d8
[I 2023-06-10 02:37:36,313] Trial 0 finished with value: 0.1921 and parameters: {'objective': 'reg:squaredlogerror', 'n_estimators': 890, 'max_depth': 8, 'colsample_bytree': 0.7465273758718326, 'subsample': 0.6839944793432091, 'learning_rate': 0.07963708029337117, 'gamma': 0.011838383394775002, 'contamination': 0.17786276496239065}. Best is trial 0 with value: 0.1921.
[I 2023-06-10 02:37:56,016] Trial 1 finished with value: 0.1806 and parameters: {'objective': 'reg:squaredlogerror', 'n_estimators': 872, 'max_depth': 13, 'colsample_bytree': 0.9119964518767998, 'subsample': 0.9238138456885476, 'learning_rate': 0.06629911685902011, 'gamma': 0.011119648196287791, 'contamination': 0.14893233967143518}. Best is trial 1 with value: 0.1806.
[I 2023-06-10 02:38:17,258] Trial 2 finished with value: 0.1833 and parameters: {'objective': 'reg:squaredlogerror', 'n_estimators': 712, 'max_

In [33]:
large_best_params

{'objective': 'reg:squarederror',
 'n_estimators': 758,
 'max_depth': 11,
 'colsample_bytree': 0.9609664397472213,
 'subsample': 0.8533504015048231,
 'learning_rate': 0.008281622333649642,
 'gamma': 0.010677879030123294,
 'contamination': 0.19421243844072444}

In [34]:
optimal_large_model_cache = model(
    features=[
      'trip_duration', 'distance_traveled', 'num_of_passengers', 
      'tip', 'miscellaneous_fees', 'surge_applied', 
      'is_miscellaneous_fees_negative', 'is_miscellaneous_fees_0', 
      'is_tip_0', 'is_trip_duration_0'
    ],
    datasets = [X_train, X_val, y_train, y_val],
    od_params = {'contamination': large_best_params['contamination']},
    xgb_params = {
      'random_state':42, 
      'n_estimators': large_best_params['n_estimators'], 
      'max_depth': large_best_params['max_depth'], 
      'colsample_bytree': large_best_params['colsample_bytree'], 
      'subsample': large_best_params['subsample'], 
      'learning_rate': large_best_params['learning_rate'],
      'gamma': large_best_params['gamma'],
      'tree_method': 'gpu_hist'
    }
  )

In [35]:
optimal_large_model_cache['train_metrics'], optimal_large_model_cache['val_metrics']

({'mae': 4.617, 'rmse': 26.1015, 'rmsle': 0.1342},
 {'mae': 6.2636, 'rmse': 38.6493, 'rmsle': 0.1718})

In [36]:
large_features = [
  'trip_duration', 'distance_traveled', 'num_of_passengers', 
  'tip', 'miscellaneous_fees', 'surge_applied', 
  'is_miscellaneous_fees_negative', 'is_miscellaneous_fees_0', 
  'is_tip_0', 'is_trip_duration_0'
]
X_test_submission = X_test.copy()
X_test_submission['od_proba'] = optimal_large_model_cache['od'].predict_proba(X_test[large_features])[:, 1]
create_submission(
  model=optimal_large_model_cache['xgb'], 
  X=X_test_submission, features=large_features+['od_proba'], 
  submission_name = 'OPT_LARGE_MODEL_PASS_2.csv'
)

   total_fare
0  111.418518
1   60.939213
2  118.928406
3   89.259521
4  108.651665


In [37]:
with open('large_model_cache_2.pickle', 'wb') as handle:
  pickle.dump(optimal_large_model_cache, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('large_model_cache_2.pickle', 'rb') as handle:
  loaded = pickle.load(handle)

In [38]:
optimal_large_model_cache

{'od': HBOS(alpha=0.1, contamination=0.19421243844072444, n_bins=10, tol=0.5),
 'xgb': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9609664397472213, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.010677879030123294, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.008281622333649642, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=11, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=758, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...),
 'train_metrics': {'mae': 4.617, 'rmse': 26.1015, 'rmsle': 0.1342},
 'val_metrics': {'mae': 6.2636, 'rmse

In [39]:
loaded

{'od': HBOS(alpha=0.1, contamination=0.19421243844072444, n_bins=10, tol=0.5),
 'xgb': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9609664397472213, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.010677879030123294, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.008281622333649642, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=11, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=758, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...),
 'train_metrics': {'mae': 4.617, 'rmse': 26.1015, 'rmsle': 0.1342},
 'val_metrics': {'mae': 6.2636, 'rmse