### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Load Best Model

In [2]:
# Load the pickled best model and sanity-check the metadata stored on it.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
expected_model_version = '1.0'
model_path = '../models/full best model/IS2_full_best_model_gb.pkl'
if not os.path.exists(model_path):
    # Fail fast: every later cell needs `model`, so merely printing a message
    # here would only resurface as a confusing NameError further down.
    raise FileNotFoundError(f"Expected model not found: {model_path}")
with open(model_path, 'rb') as f:
    model = pickle.load(f)
# Version mismatches are reported but not fatal.
if model.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

### Load and Filter Data

In [3]:
# Read the processed modeling dataset from CSV.
# NOTE(review): the earlier comment here referred to the "IS1" model, while this
# notebook loads an IS2 model and predicts 'IS2_ST(m)' — confirm this is the intended file.
data = pd.read_csv(
    '../data/df_for_modeling_no_decay_weighting.csv'
)

In [4]:
# Columns excluded from modeling: not wanted/needed, and (per the author's note)
# a few of them would cause subtle forms of data leakage.
cols_to_drop = [
    'P_Date', 'P_Date_str', 'Comp_Date', 'Comp_Date_str', 'IS2_Completed',
    'DOW', 'IS_pds_l10_ndw', 'GMST(m)', 'Constructors',
    'overall_day_mean_IST(m)', 'Uniclue', 'Duplicate_Answers',
    'Unchecked_Sq', 'Quantum',
]
# drop() returns a new frame, so `data` itself is left untouched.
df_filter = data.drop(columns=cols_to_drop)

In [5]:
# 'Outside_Grid' has no impact on prediction quality, but it raised a warning during
# linear modeling (author's guess: it is all zeroes in the training set), so remove it.
df_filter = df_filter.drop(columns=['Outside_Grid'])
# Further candidate drops, currently disabled:
#df_filter.drop(['Duplicate_Clues', 'Circle_Count', 'Shade_Count', 'Unusual_Sym', 'Black_Square_Fill'], inplace=True, axis=1)

In [6]:
# Relocate the target column to position 0 so it leads the frame.
target_series = df_filter.pop('IS2_ST(m)')
df_filter.insert(loc=0, column=target_series.name, value=target_series)

In [7]:
# Remove Sunday puzzles: keep only rows whose DOW_num differs from 1.
not_sunday = df_filter["DOW_num"] != 1
df_filter = df_filter[not_sunday]

In [8]:
# Independent (deep) copy of the filtered frame, now reduced to the predictive features
# plus the target column. Per the author's note, every feature is derived from data
# available before the puzzle being predicted, i.e. no data leakage.
df_model1 = df_filter.copy(deep=True)

In [9]:
# Print a summary of the modeling frame: row count, columns, non-null counts and dtypes.
df_model1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 956 entries, 0 to 1107
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   IS2_ST(m)                             956 non-null    float64
 1   Comp_Hr                               956 non-null    int64  
 2   Solve_day_phase                       956 non-null    float64
 3   IS_per_sdp_avg_past_diff_from_RPB     956 non-null    float64
 4   DOW_num                               956 non-null    float64
 5   IS_pds_l10_stdev                      956 non-null    float64
 6   IS_pds_l10_ndw_SOS_adj                956 non-null    float64
 7   IS_per_constr_avg_past_diff_from_RPB  532 non-null    float64
 8   IS2_pds_prior_solves_ct               956 non-null    float64
 9   IS2_npds_prior_solves_ct              956 non-null    float64
 10  IS2_solves_l7                         956 non-null    float64
 11  Words             

### Refit Model on All Available Data 

In [10]:
# Features: the columns listed on the loaded model's X_columns attribute.
# Target: the 'IS2_ST(m)' column.
target_col = 'IS2_ST(m)'
feature_cols = model.X_columns
X = df_model1[feature_cols]
y = df_model1[target_col]

In [11]:
# Row counts of features and target; the two should agree.
X.shape[0], y.shape[0]

(956, 956)

In [12]:
# Refit the loaded pipeline on the full filtered dataset; the fitted estimator
# returned by fit() is displayed as the cell output.
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', None),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(learning_rate=0.03, max_depth=5,
                                           max_features=10, n_estimators=130,
                                           random_state=47, subsample=0.5))])

For all available scoring options, see the scikit-learn documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

#### R-squared (Coefficient of Determination)

In [13]:
# 5-fold cross-validation of the pipeline, scored with R².
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5, scoring='r2')

In [14]:
# Per-fold R² scores on the held-out folds (one value per CV split).
cv_results['test_score']

array([0.41238054, 0.535412  , 0.43588017, 0.44453534, 0.34474959])

In [15]:
# Mean and standard deviation of the per-fold R² scores.
# R² is not a negated metric, so no sign flip is needed; the standard deviation
# is unaffected by sign in any case.
r2_scores = cv_results['test_score']
rs_mean, rs_std = r2_scores.mean(), r2_scores.std()
rs_mean, rs_std

(0.4345915292603707, 0.06136808458547803)

#### Mean Absolute Error (MAE)

In [16]:
# 5-fold cross-validation scored with (negated) mean absolute error.
cv_results1 = cross_validate(
    estimator=model, X=X, y=y, cv=5, scoring='neg_mean_absolute_error'
)

In [17]:
# Per-fold scores for the 'neg_mean_absolute_error' scorer; values are negative
# because the scorer reports the negated error.
cv_results1['test_score']

array([-4.24725507, -4.61909321, -7.72423524, -5.84393011, -9.35490018])

In [18]:
# Flip the sign of the negated scores to recover per-fold MAE, then summarize.
mae_scores = -cv_results1['test_score']
mae_mean, mae_std = mae_scores.mean(), mae_scores.std()
mae_mean, mae_std

(6.357882761554082, 1.9282799672891437)

#### Mean Squared Error (MSE)

In [19]:
# 5-fold cross-validation scored with (negated) mean squared error.
cv_results2 = cross_validate(
    estimator=model, X=X, y=y, cv=5, scoring='neg_mean_squared_error'
)

In [20]:
# Per-fold scores for the 'neg_mean_squared_error' scorer; values are negative
# because the scorer reports the negated error.
cv_results2['test_score']

array([ -35.25055117,  -43.36378064, -174.85221399,  -96.40069267,
       -163.95843333])

In [21]:
# Flip the sign of the negated scores to recover per-fold MSE, then summarize.
mse_scores = -cv_results2['test_score']
mse_mean, mse_std = mse_scores.mean(), mse_scores.std()
mse_mean, mse_std

(102.76513436205838, 58.426728083431165)

#### Root Mean Squared Error (RMSE)

In [22]:
# 5-fold cross-validation scored with (negated) root mean squared error.
cv_results3 = cross_validate(
    estimator=model, X=X, y=y, cv=5, scoring='neg_root_mean_squared_error'
)

In [23]:
# Per-fold scores for the 'neg_root_mean_squared_error' scorer; values are negative
# because the scorer reports the negated error.
cv_results3['test_score']

array([ -5.93721746,  -6.58511812, -13.22316959,  -9.81838544,
       -12.80462547])

In [24]:
# Flip the sign of the negated scores to recover per-fold RMSE, then summarize.
rmse_scores = -cv_results3['test_score']
rmse_mean, rmse_std = rmse_scores.mean(), rmse_scores.std()
rmse_mean, rmse_std

(9.673703215314124, 3.0306105761182116)