# Select the best model based on validation RMSE

1. **Best full feature XGB validation RMSE: 6.2885**
2. Best limited feature XGB validation RMSE: 6.5263
3. Best full feature Ridge validation RMSE: 6.7215
4. Best full feature SGD elastic net validation RMSE: 7.4534
5. Best limited feature Ridge validation RMSE: 9.9607
6. Best limited feature SGD elastic net validation RMSE: 10.2980

Winner is full feature XGB with {'subsample': 1.0, 'min_child_weight': 100, 'max_depth': 8, 'learning_rate': 0.05, 'colsample_bytree': 0.6}!



# Train winning model on full train data

In [1]:
# Define the final estimator: the winning hyperparameters from model selection
# combined with the settings used for the final training run.

from xgboost import XGBRegressor

# hyperparameters of the winning full feature XGB configuration
_winning_params = {
    "subsample": 1.0,
    "min_child_weight": 100,
    "max_depth": 8,
    "learning_rate": 0.05,
    "colsample_bytree": 0.6,
}

# final training specs (not part of the hyperparameter search result)
_training_specs = {
    "tree_method": "hist",
    "enable_categorical": True,  # if using pandas categorical dtypes
    "n_estimators": 2000,  # large, rely on early stopping
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "early_stopping_rounds": 50,
    "n_jobs": -1,
}

mdl = XGBRegressor(**_training_specs, **_winning_params)



In [2]:
# Train the winning model on the full training data (everything except the
# held-out test set), using a small split of it for early stopping, then
# persist both the trained model and the vectorizer it was trained with.

from sklearn.model_selection import train_test_split
import pickle
import gc

with open("df_full_train.pkl", "rb") as f:
    df_full_train = pickle.load(f)

with open("dv_full.pkl", "rb") as f:
    dv_full = pickle.load(f)

y_full_train = df_full_train.base_passenger_fare.values
# the vectorizer is re-fitted here, so its feature mapping may differ from the
# one stored in dv_full.pkl
X_full_train = dv_full.fit_transform(
    df_full_train.drop(columns='base_passenger_fare').to_dict(orient='records')
)

# Persist the re-fitted vectorizer: anything that later loads dv_full.pkl
# (e.g. the test evaluation) must use exactly the feature mapping the model
# was trained on, otherwise columns can be misaligned or mismatched in count.
with open("dv_full.pkl", "wb") as f:
    pickle.dump(dv_full, f)

# the dataframe is no longer needed; free it before the memory-heavy training
del df_full_train
gc.collect()

# split some data for early stopping
X_train, X_stop_xgb, y_train, y_stop_xgb = train_test_split(
    X_full_train, y_full_train, test_size=0.1, random_state=0)

# mdl is the XGBRegressor configured earlier in this notebook; the eval_set
# drives its early stopping
mdl.fit(X_train, y_train, eval_set=[(X_stop_xgb, y_stop_xgb)])

with open("final_model_trained.pkl", "wb") as f:
    pickle.dump(mdl, f)

[0]	validation_0-rmse:15.46704
[1]	validation_0-rmse:14.92640
[2]	validation_0-rmse:14.36018
[3]	validation_0-rmse:13.79631
[4]	validation_0-rmse:13.62037
[5]	validation_0-rmse:13.09654
[6]	validation_0-rmse:12.60615
[7]	validation_0-rmse:12.20670
[8]	validation_0-rmse:11.80391
[9]	validation_0-rmse:11.47631
[10]	validation_0-rmse:11.08617
[11]	validation_0-rmse:10.72050
[12]	validation_0-rmse:10.38650
[13]	validation_0-rmse:10.11571
[14]	validation_0-rmse:9.82366
[15]	validation_0-rmse:9.55945
[16]	validation_0-rmse:9.32869
[17]	validation_0-rmse:9.12190
[18]	validation_0-rmse:8.92879
[19]	validation_0-rmse:8.76155
[20]	validation_0-rmse:8.59675
[21]	validation_0-rmse:8.40603
[22]	validation_0-rmse:8.23009
[23]	validation_0-rmse:8.06672
[24]	validation_0-rmse:7.92130
[25]	validation_0-rmse:7.81405
[26]	validation_0-rmse:7.69983
[27]	validation_0-rmse:7.60423
[28]	validation_0-rmse:7.49266
[29]	validation_0-rmse:7.38955
[30]	validation_0-rmse:7.31513
[31]	validation_0-rmse:7.24489
[32]

# Check winner's performance on test data 
Its performance on test data will approximate its performance in production (deployed) on other unseen data.

In [12]:
# Evaluate the final model on the held-out test data and report its RMSE.

import pickle
import gc
from sklearn.metrics import root_mean_squared_error


def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# load the trained model, the test dataframe and the feature vectorizer
mdl = _read_pickle("final_model_trained.pkl")
df_test = _read_pickle("df_test.pkl")
dv_full = _read_pickle("dv_full.pkl")

# target vector, and feature matrix built from every column except the target
y_test = df_test["base_passenger_fare"].values
X_test = dv_full.transform(
    df_test.drop(columns='base_passenger_fare').to_dict(orient='records')
)

# the dataframe is no longer needed once the feature matrix exists
del df_test
gc.collect()

y_pred = mdl.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_pred)

print(f"Final model testing RMSE: {test_rmse:.4f}")

Final model testing RMSE: 6.1446


## Final model testing RMSE: 6.1446