### Imports

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Load Best Model

In [3]:
# Load the pickled best model and sanity-check the metadata stored on it.
expected_model_version = '1.0'
model_path = '../models/best model/IS1_full_best_model_linear.pkl'

if not os.path.exists(model_path):
    # NOTE: `model` is left undefined in this branch, so any later cell that
    # uses it will fail; this cell only reports the problem.
    print("Expected model not found")
else:
    # NOTE(review): pickle.load can execute arbitrary code -- only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Both checks are advisory: a mismatch is printed, not raised.
    if model.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    if model.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")

### Load and Filter Data

In [51]:
# Processed data that was used when generating the best GMS model
data_path = '../data/df_for_modeling.csv'
data = pd.read_csv(data_path)

In [52]:
# Columns that are not wanted/needed for modeling.
columns_not_modeled = [
    'P_Date',
    'P_Date_str',
    'GMS_Completed',
    'DOW',
    'GMST_Diff%_from_GMS_pds_l40_dw',
    'GMS_per_constr_past_diff_from_RPB_ct',
    'Constructors',
    'overall_day_mean_GMST(m)',
    'Uniclue',
    'Duplicate_Answers',
    'Unchecked_Sq',
    'Quantum',
]
# `drop` returns a new frame, so `data` itself is left untouched.
df_filter = data.drop(columns=columns_not_modeled)

In [53]:
# Exclude Sunday puzzles, i.e. rows whose DOW_num equals 1
is_sunday = df_filter["DOW_num"] == 1
df_filter = df_filter[~is_sunday]

In [54]:
# Put the target feature in the first column position.
# `pop` removes the column in place; `insert` puts it back at index 0.
target_name = 'GMST(m)'
target_values = df_filter.pop(target_name)
df_filter.insert(loc=0, column=target_name, value=target_values)

In [55]:
# Drop the short- and intermediate-term GMS past-performance features
# (the l10 and l25 `_dw` / `_stdev` columns). The original author notes this
# step is imperative for final model performance.
#
# Alternatives tried, with the "linear RMSE" pair recorded for each
# (NOTE(review): presumably (mean, std) over CV folds -- confirm).
# Every alternative drops both the `_dw` and `_stdev` column of the listed set:
#   l10 + l25        -> (3.953721267825486, 0.231741035464759)    <- in use
#   l10              -> (3.9576275009691413, 0.22676168953792353)
#   l10 + l50        -> (3.9670459176013635, 0.21705983140955096)
#   l25 + l50        -> (3.9840272126935297, 0.237305851677329)
#   l10 + l25 + l50  -> (4.043534832823217, 0.23857038246642934)
gms_recent_form_columns = [
    'GMS_pds_l10_dw',
    'GMS_pds_l25_dw',
    'GMS_pds_l10_stdev',
    'GMS_pds_l25_stdev',
]
df_filter.drop(columns=gms_recent_form_columns, inplace=True)

In [56]:
# Independent (deep) copy of df_filter, which at this point holds only the
# predictive features plus the target feature.
# Per the original author: every feature is derived from data available before
# the puzzle being predicted on, so there is no data leakage.
df_model1 = df_filter.copy(deep=True)

In [57]:
# Summarize the modeling frame: row count, column dtypes and non-null counts
df_model1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1601 entries, 0 to 1866
Data columns (total 32 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   GMST(m)                                1601 non-null   float64
 1   DOW_num                                1601 non-null   float64
 2   GMS_pds_l40_dw                         1601 non-null   float64
 3   GMS_pds_l40_stdev                      1601 non-null   float64
 4   GMS_per_constr_avg_past_diff_from_RPB  1105 non-null   float64
 5   GMS_pds_prior_solves_ct                1601 non-null   float64
 6   GMS_npds_prior_solves_ct               1601 non-null   float64
 7   Words                                  1601 non-null   int64  
 8   Blocks                                 1601 non-null   int64  
 9   Unused_Letters                         1601 non-null   int64  
 10  Stacks                                 1601 non-null   int64  
 11  Uniq

### Refit Model on All Available Data 

In [58]:
# Features: the columns the loaded model recorded in `model.X_columns`.
# Target: the 'GMST(m)' column.
target_column = 'GMST(m)'
X = df_model1[model.X_columns]
y = df_model1[target_column]

In [59]:
# Row counts of features and target; the two numbers should match
X.shape[0], y.shape[0]

(1601, 1601)

In [60]:
# Refit the loaded model on every row of X / y (no train/test split here)
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', None),
                ('selectkbest',
                 SelectKBest(k=23,
                             score_func=<function f_regression at 0x00000207878BAAF0>)),
                ('linearregression', LinearRegression())])

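For all scoring options, see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter. Note that scorers whose names start with `neg_` return negated errors (so that higher is always better); the error-metric cells below multiply by -1 to report positive values.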

#### R-squared (COD)

In [61]:
# Cross-validate the model with R-squared as the score (cv=5 folds)
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
)

In [62]:
# Per-fold R-squared scores on the held-out folds
cv_results['test_score']

array([0.78826756, 0.77817795, 0.78503979, 0.78082194, 0.72167707])

In [63]:
# Mean and standard deviation of the per-fold R-squared scores.
# 'r2' is not a negated scorer, so no sign flip is applied (and the standard
# deviation does not depend on sign in any case).
r2_folds = cv_results['test_score']
rs_mean, rs_std = np.mean(r2_folds), np.std(r2_folds)
rs_mean, rs_std

(0.7707968628609084, 0.02480251249101489)

#### Mean Absolute Error (MAE)

In [64]:
# Cross-validate the model with negated mean absolute error as the score (cv=5 folds)
cv_results1 = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [65]:
# Per-fold scores; values are negative because the scorer returns negated MAE
cv_results1['test_score']

array([-2.27616214, -2.51629889, -2.60598337, -2.85086445, -3.40507438])

In [66]:
# Flip the sign of the negated scores once, then summarize across folds
mae_folds = -cv_results1['test_score']
mae_mean = mae_folds.mean()
mae_std = mae_folds.std()
mae_mean, mae_std

(2.730876644796478, 0.38401741765069475)

#### Mean Squared Error (MSE)

In [67]:
# Cross-validate the model with negated mean squared error as the score (cv=5 folds)
cv_results2 = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [68]:
# Per-fold scores; values are negative because the scorer returns negated MSE
cv_results2['test_score']

array([-11.15826642, -13.41252109, -12.63917928, -16.40087456,
       -21.6948274 ])

In [69]:
# Flip the sign of the negated scores once, then summarize across folds
mse_folds = -cv_results2['test_score']
mse_mean = mse_folds.mean()
mse_std = mse_folds.std()
mse_mean, mse_std

(15.061133750421671, 3.731406100208003)

#### Root Mean Squared Error (RMSE)

In [70]:
# Cross-validate the model with negated root mean squared error as the score (cv=5 folds)
cv_results3 = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

In [71]:
# Per-fold scores; values are negative because the scorer returns negated RMSE
cv_results3['test_score']

array([-3.34039914, -3.66231089, -3.55516234, -4.04979932, -4.65777065])

In [72]:
# Flip the sign of the negated scores once, then summarize across folds
rmse_folds = -cv_results3['test_score']
rmse_mean = rmse_folds.mean()
rmse_std = rmse_folds.std()
rmse_mean, rmse_std

(3.853088469723212, 0.4635115909097747)