### Imports

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Load Best Model

In [3]:
# Load the pickled best model and sanity-check the metadata attached to it.
expected_model_version = '1.0'
model_path = '../models/full best model/IS2_full_best_model_gb.pkl'

# Fail fast: every later cell needs `model`, so continuing without it would
# only surface as a confusing NameError further down the notebook.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Expected model not found: {model_path}")

# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that come from a trusted source.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Version mismatches are reported as warnings only; the model is still used.
if model.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

### Load and Filter Data

In [4]:
# Read the processed modeling table that was used to generate the best IS2 model
data = pd.read_csv(filepath_or_buffer='../data/df_for_modeling_no_decay_weighting.csv')

In [5]:
# Remove columns that are not wanted/needed for modeling
# (a few of them would cause subtle forms of data leakage).
# `drop` returns a new frame, so `data` itself is left untouched.
df_filter = data.drop(
    columns=[
        'P_Date',
        'P_Date_str',
        'Comp_Date',
        'Comp_Date_str',
        'IS2_Completed',
        'DOW',
        'IS_pds_l10_ndw',
        'GMST(m)',
        'Constructors',
        'overall_day_mean_IST(m)',
        'Uniclue',
        'Duplicate_Answers',
        'Unchecked_Sq',
        'Quantum',
    ]
)

In [6]:
# 'Outside_Grid' has no impact on prediction quality, but it raised a warning in
# the Linear Modeling for some reason (probably all zeroes in the training set).
df_filter = df_filter.drop(columns=['Outside_Grid'])
# Optional further drops, currently disabled:
#df_filter.drop(['Duplicate_Clues', 'Circle_Count', 'Shade_Count', 'Unusual_Sym', 'Black_Square_Fill'], inplace=True, axis=1)

In [7]:
# Pull the target column out of the frame and re-insert it at position 0
col = df_filter.pop(item='IS2_ST(m)')
df_filter.insert(loc=0, column=col.name, value=col)

In [8]:
# Exclude Sunday puzzles: keep only rows whose DOW_num is not 1
df_filter = df_filter.loc[df_filter["DOW_num"].ne(1)]

In [9]:
# Independent (deep) copy of df_filter, which by now holds only the predictive
# features plus the target feature itself.
# All features are derived from data available prior to any given puzzle being
# predicted on. No data leakage!
df_model1 = df_filter.copy(deep=True)

In [10]:
# Summarize the modeling frame: row count, column dtypes and non-null counts
df_model1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 979 entries, 0 to 1131
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   IS2_ST(m)                             979 non-null    float64
 1   Comp_Hr                               979 non-null    int64  
 2   Solve_day_phase                       979 non-null    float64
 3   IS_per_sdp_avg_past_diff_from_RPB     979 non-null    float64
 4   DOW_num                               979 non-null    float64
 5   IS_pds_l10_stdev                      979 non-null    float64
 6   IS_pds_l10_ndw_SOS_adj                979 non-null    float64
 7   IS_per_constr_avg_past_diff_from_RPB  551 non-null    float64
 8   IS2_pds_prior_solves_ct               979 non-null    float64
 9   IS2_npds_prior_solves_ct              979 non-null    float64
 10  IS2_solves_l7                         979 non-null    float64
 11  Words             

### Refit Model on All Available Data 

In [11]:
# Feature matrix: exactly the columns recorded on the loaded model object
X = df_model1.loc[:, model.X_columns]
# Target vector
y = df_model1.loc[:, 'IS2_ST(m)']

In [12]:
# Quick check that features and target have the same number of rows
len(X), len(y)

(979, 979)

In [13]:
# Refit the loaded pipeline on the full feature matrix and target
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', None),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(learning_rate=0.035, max_depth=7,
                                           max_features=14, n_estimators=115,
                                           random_state=43, subsample=0.6))])

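For all scoring options, see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter. Note that error metrics are exposed as negated scores (`neg_*`, higher is better), so the cells below multiply them by -1 to report positive errors.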

#### R-squared (COD)

In [14]:
# 5-fold cross-validation of the pipeline, scored with R-squared
cv_results = cross_validate(estimator=model, X=X, y=y, scoring='r2', cv=5)

In [15]:
# Per-fold R-squared scores on the held-out folds
cv_results['test_score']

array([0.39962505, 0.46197675, 0.49013565, 0.45628802, 0.19568999])

In [16]:
# Mean and standard deviation of the per-fold R-squared scores.
# 'r2' is not a negated scorer, so no sign flip is applied here (the previous
# `1 *` / `-1 *` multipliers were leftovers from the neg_* error-metric cells
# and did not change the numbers).
rs_mean, rs_std = np.mean(cv_results['test_score']), np.std(cv_results['test_score'])
rs_mean, rs_std

(0.40074309392264135, 0.10665257000920417)

#### Mean Absolute Error (MAE)

In [17]:
# 5-fold cross-validation of the pipeline, scored with negated mean absolute error
cv_results1 = cross_validate(estimator=model, X=X, y=y, scoring='neg_mean_absolute_error', cv=5)

In [18]:
# Per-fold scores from the 'neg_mean_absolute_error' scorer (values are negative)
cv_results1['test_score']

array([ -4.72659144,  -4.70610196,  -7.59000191,  -5.68642136,
       -10.38888492])

In [19]:
# Flip the sign of the negated scores to get positive MAE per fold,
# then summarize across folds.
mae_per_fold = -cv_results1['test_score']
mae_mean, mae_std = mae_per_fold.mean(), mae_per_fold.std()
mae_mean, mae_std

(6.619600318609278, 2.157079840641811)

#### Mean Squared Error (MSE)

In [20]:
# 5-fold cross-validation of the pipeline, scored with negated mean squared error
cv_results2 = cross_validate(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=5)

In [21]:
# Per-fold scores from the 'neg_mean_squared_error' scorer (values are negative)
cv_results2['test_score']

array([ -45.78502209,  -46.05833178, -157.06301113,  -93.04788437,
       -199.63135064])

In [22]:
# Flip the sign of the negated scores to get positive MSE per fold,
# then summarize across folds.
mse_per_fold = -cv_results2['test_score']
mse_mean, mse_std = mse_per_fold.mean(), mse_per_fold.std()
mse_mean, mse_std

(108.31712000167265, 61.21114303038956)

#### Root Mean Squared Error (RMSE)

In [23]:
# 5-fold cross-validation of the pipeline, scored with negated root mean squared error
cv_results3 = cross_validate(estimator=model, X=X, y=y, scoring='neg_root_mean_squared_error', cv=5)

In [24]:
# Per-fold scores from the 'neg_root_mean_squared_error' scorer (values are negative)
cv_results3['test_score']

array([ -6.76646304,  -6.7866289 , -12.53247825,  -9.64613313,
       -14.12909589])

In [25]:
# Flip the sign of the negated scores to get positive RMSE per fold,
# then summarize across folds.
rmse_per_fold = -cv_results3['test_score']
rmse_mean, rmse_std = rmse_per_fold.mean(), rmse_per_fold.std()
rmse_mean, rmse_std

(9.97215984180669, 2.9787829882572243)