### Imports

In [25]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Load Best Model

In [187]:
# Load the pickled best model and sanity-check the metadata stored on it.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load model files that come from a trusted source.
expected_model_version = '1.0'
model_path = '../models/IS1_model_Sat_hgb.pkl'

# Fail fast: every later cell needs `model`, so stop here with a clear error
# instead of printing a message and hitting a NameError further down.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Expected model not found: {model_path}")

with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Version mismatches are reported but deliberately not fatal.
if model.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

### Load and Filter Data

In [188]:
# Read the processed data set that was used when generating the best GMS model
data = pd.read_csv(filepath_or_buffer='../data/df_for_modeling.csv')

In [189]:
# Build the working frame without the columns that are not wanted/needed for
# modeling. The trailing copy() keeps df_filter fully independent of `data`,
# which itself is left untouched.
df_filter = data.drop(
    columns=[
        'P_Date',
        'P_Date_str',
        'DOW',
        'GMS_Completed',
        'GMST_Diff%_from_GMS_pds_l10_dw',
        'GMS_per_constr_past_diff_from_RPB_ct',
        'Constructors',
        'overall_day_mean_GMST(m)',
        'Uniclue',
        'Duplicate_Answers',
        'Unchecked_Sq',
        'Quantum',
    ]
).copy()

In [190]:
# Pull the target column out of the frame and re-insert it at position 0
# so that it becomes the first column.
col = df_filter.pop('GMST(m)')
df_filter.insert(loc=0, column=col.name, value=col)

In [191]:
# Restrict the frame to a single puzzle day.
# DOW_num codes: 1=Sun, 2=Mon, 3=Tue, 4=Wed, 5=Thu, 6=Fri, 7=Sat
# NOTE(review): the model loaded earlier has 'Sat' in its file name - presumably
# the day code here has to match the model's day; confirm before changing it.
df_filter = df_filter.loc[df_filter['DOW_num'].eq(7)]  # Sat

In [192]:
# Remove the day-of-week code column now that the day filter has been applied
df_filter = df_filter.drop(columns=['DOW_num'])

In [193]:
# Snapshot df_filter - by now pared down to the predictive features plus the
# target - as an independent deep copy.
# Author's note: all features are derived from data available prior to any given
# puzzle being predicted on, so there is no data leakage.
df_model1 = df_filter.copy(deep=True)

### Refit Model on All Available Data 

In [194]:
# Target series first, then the feature matrix restricted to the column list
# stored on the loaded model (model.X_columns).
y = df_model1['GMST(m)']
X = df_model1[model.X_columns]

In [195]:
# Row counts of the feature matrix and the target; the two should match
X.shape[0], y.shape[0]

(266, 266)

In [196]:
# Refit the loaded model on every row of X and y (no train/test split here);
# the fitted estimator's repr is shown as the cell output.
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', None),
                ('histgradientboostingregressor',
                 HistGradientBoostingRegressor(learning_rate=0.02, max_depth=2,
                                               max_iter=115,
                                               random_state=47))])

In [198]:
# Get full sample mean for the day being fit.
# It is in the same units as the target, which gives context for the
# MAE / RMSE values reported further down.
y.mean()

26.54147869674188

For all available `scoring` options, see the scikit-learn documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

#### R-squared (Coefficient of Determination, COD)

In [199]:
# 5-fold cross-validation of the model, scored with R-squared
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5, scoring='r2')

In [200]:
# Per-fold R-squared scores (one value per cross-validation fold)
cv_results['test_score']

array([-0.00333583,  0.17921751,  0.04481136, -0.00967384,  0.00477697])

In [201]:
# Mean and standard deviation of the fold R-squared scores.
# 'r2' is not a negated scorer, so no sign flip is applied: the previous
# `1 *` / `-1 *` multipliers were copy-paste residue and did not change either
# number (the standard deviation is unaffected by sign).
rs_mean = np.mean(cv_results['test_score'])
rs_std = np.std(cv_results['test_score'])
rs_mean, rs_std

(0.04315923263411157, 0.07062705883023017)

#### Mean Absolute Error (MAE)

In [202]:
# 5-fold cross-validation scored with negated mean absolute error
cv_results1 = cross_validate(estimator=model, X=X, y=y, cv=5, scoring='neg_mean_absolute_error')

In [203]:
# Per-fold scores; values are negative because the scorer is the negated MAE
cv_results1['test_score']

array([-4.27938853, -5.23446968, -4.06648338, -5.09453299, -5.33394784])

In [204]:
# Flip the sign of the negated MAE scores, then summarise across the folds
mae_mean = np.mean(-cv_results1['test_score'])
mae_std = np.std(-cv_results1['test_score'])
mae_mean, mae_std

(4.801764484549025, 0.5233891960555943)

#### Mean Squared Error (MSE)

In [205]:
# 5-fold cross-validation scored with negated mean squared error
cv_results2 = cross_validate(estimator=model, X=X, y=y, cv=5, scoring='neg_mean_squared_error')

In [206]:
# Per-fold scores; values are negative because the scorer is the negated MSE
cv_results2['test_score']

array([-28.84093622, -41.95361001, -22.17095022, -42.21224706,
       -45.52013244])

In [207]:
# Flip the sign of the negated MSE scores, then summarise across the folds
mse_mean = np.mean(-cv_results2['test_score'])
mse_std = np.std(-cv_results2['test_score'])
mse_mean, mse_std

(36.139575189124415, 9.022946448355688)

#### Root Mean Squared Error (RMSE)

In [208]:
# 5-fold cross-validation scored with negated root mean squared error
cv_results3 = cross_validate(estimator=model, X=X, y=y, cv=5, scoring='neg_root_mean_squared_error')

In [209]:
# Per-fold scores; values are negative because the scorer is the negated RMSE
cv_results3['test_score']

array([-5.3703758 , -6.47716064, -4.70860385, -6.49709528, -6.74686093])

In [210]:
# Flip the sign of the negated RMSE scores, then summarise across the folds
rmse_mean = np.mean(-cv_results3['test_score'])
rmse_std = np.std(-cv_results3['test_score'])
rmse_mean, rmse_std

(5.960019300803742, 0.7859676349388006)