### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Load Best Model

In [12]:
# Load the pickled best model and sanity-check the metadata stored on it.
expected_model_version = '1.0'
model_path = '../models/full best model/IS2_full_best_model_gb.pkl'

# Fail fast: every later cell needs `model`, so a missing file should stop the
# notebook here instead of surfacing later as a confusing NameError.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Expected model not found: {model_path}")

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load model files that were produced by this project.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Version mismatches are reported but are not fatal.
if model.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

### Load and Filter Data

In [4]:
# Load the processed modeling data from CSV into `data`.
# NOTE(review): the original comment said this data was used in generating the best "IS1" model,
# but the target column used later in this notebook is 'IS2_ST(m)' — confirm which is intended.
data = pd.read_csv('../data/df_for_modeling_no_decay_weighting.csv')

In [5]:
# Work on a copy of the raw frame and remove columns that are not wanted/needed for modeling
# (a few of them would cause subtle forms of data leakage).
df_filter = data.copy().drop(
    columns=[
        'P_Date',
        'P_Date_str',
        'Comp_Date',
        'Comp_Date_str',
        'IS2_Completed',
        'DOW',
        'IS_pds_l10_ndw',
        'GMST(m)',
        'Constructors',
        'overall_day_mean_IST(m)',
        'Uniclue',
        'Duplicate_Answers',
        'Unchecked_Sq',
        'Quantum',
    ]
)

In [6]:
# 'Outside_Grid' has no impact on prediction quality, but it was raising a warning in the linear
# modeling (probably because it is all zeroes in the training set), so it is removed as well.
df_filter = df_filter.drop(columns=['Outside_Grid'])
# Optional further pruning, currently disabled:
#df_filter.drop(['Duplicate_Clues', 'Circle_Count', 'Shade_Count', 'Unusual_Sym', 'Black_Square_Fill'], inplace=True, axis=1)

In [7]:
# Relocate the target column to position 0: pop() removes it from the frame and
# insert() puts it back as the first column under its original name.
col = df_filter.pop('IS2_ST(m)')
df_filter.insert(loc=0, column=col.name, value=col)

In [8]:
# Exclude Sunday puzzles: keep only the rows whose DOW_num is not 1.
df_filter = df_filter.loc[df_filter["DOW_num"] != 1]

In [9]:
# Copy df_filter into df_model1, the frame used for modeling below.
# Per the original author's note: df_filter is now pared down to just the predictive features and the
# target feature itself, and all features are derived from data available prior to any given puzzle
# being predicted on (no data leakage).
df_model1 = df_filter.copy()

In [10]:
# Print a summary of df_model1 (index, columns, non-null counts, dtypes).
df_model1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 956 entries, 0 to 1107
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   IS2_ST(m)                             956 non-null    float64
 1   Comp_Hr                               956 non-null    int64  
 2   Solve_day_phase                       956 non-null    float64
 3   IS_per_sdp_avg_past_diff_from_RPB     956 non-null    float64
 4   DOW_num                               956 non-null    float64
 5   IS_pds_l10_stdev                      956 non-null    float64
 6   IS_pds_l10_ndw_SOS_adj                956 non-null    float64
 7   IS_per_constr_avg_past_diff_from_RPB  532 non-null    float64
 8   IS2_pds_prior_solves_ct               956 non-null    float64
 9   IS2_npds_prior_solves_ct              956 non-null    float64
 10  IS2_solves_l7                         956 non-null    float64
 11  Words             

### Refit Model on All Available Data 

In [13]:
# Feature matrix: the columns named by the loaded model's `X_columns` attribute
# (presumably the feature names stored with the model when it was trained — confirm).
# NOTE(review): the saved output of this cell is a KeyError for 'IS1_solves_l7', while the
# df_model1.info() listing shows 'IS2_solves_l7'; verify the model file matches this data.
X = df_model1[model.X_columns]
# Target vector: the 'IS2_ST(m)' column.
y = df_model1['IS2_ST(m)']

KeyError: "['IS1_solves_l7'] not in index"

In [14]:
# Sanity check: display the row counts of X and y as a tuple.
len(X), len(y)

(828, 828)

In [15]:
# Refit the loaded model on the full X / y built above.
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(k=7,
                             score_func=<function f_regression at 0x0000015416840AF0>)),
                ('linearregression', LinearRegression())])

For all available scoring options, see the scikit-learn documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

#### R-squared (Coefficient of Determination)

In [16]:
# Cross-validate the model on X / y with cv=5, scored by R-squared ('r2').
# The per-fold scores are read from cv_results['test_score'] in the following cells.
cv_results = cross_validate(model, X, y, scoring='r2', cv=5)

In [17]:
# Display the per-fold R-squared scores.
cv_results['test_score']

array([0.40263598, 0.52752213, 0.47542477, 0.58623311, 0.65279598])

In [18]:
# Mean and standard deviation of the per-fold R-squared scores.
# 'r2' is not a negated scorer, so no sign flip is applied (the standard deviation
# is unaffected by sign in any case).
rs_mean = np.mean(cv_results['test_score'])
rs_std = np.std(cv_results['test_score'])
rs_mean, rs_std

(0.5289223951493731, 0.0865348446305319)

#### Mean Absolute Error (MAE)

In [19]:
# Cross-validate the model on X / y with cv=5, scored by 'neg_mean_absolute_error'.
# This scorer reports MAE with its sign flipped, so the per-fold scores are negative.
cv_results1 = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=5)

In [20]:
# Display the per-fold (negated) MAE scores.
cv_results1['test_score']

array([-2.1259755 , -2.99230495, -2.63641758, -2.29336943, -2.52506997])

In [21]:
# Flip the sign of the negated scores to report MAE as a positive error.
# The standard deviation does not depend on sign, so it is taken on the raw scores.
mae_mean = np.mean(-cv_results1['test_score'])
mae_std = np.std(cv_results1['test_score'])
mae_mean, mae_std

(2.514627484844376, 0.29769888152335344)

#### Mean Squared Error (MSE)

In [22]:
# Cross-validate the model on X / y with cv=5, scored by 'neg_mean_squared_error'.
# This scorer reports MSE with its sign flipped, so the per-fold scores are negative.
cv_results2 = cross_validate(model, X, y, scoring='neg_mean_squared_error', cv=5)

In [23]:
# Display the per-fold (negated) MSE scores.
cv_results2['test_score']

array([ -9.23995268, -22.92690288, -16.84686265, -10.77805498,
       -14.12510351])

In [24]:
# Flip the sign of the negated scores to report MSE as a positive error.
# The standard deviation does not depend on sign, so it is taken on the raw scores.
mse_mean = np.mean(-cv_results2['test_score'])
mse_std = np.std(cv_results2['test_score'])
mse_mean, mse_std

(14.783375338632116, 4.853462032919279)

#### Root Mean Squared Error (RMSE)

In [25]:
# Cross-validate the model on X / y with cv=5, scored by 'neg_root_mean_squared_error'.
# This scorer reports RMSE with its sign flipped, so the per-fold scores are negative.
cv_results3 = cross_validate(model, X, y, scoring='neg_root_mean_squared_error', cv=5)

In [26]:
# Display the per-fold (negated) RMSE scores.
cv_results3['test_score']

array([-3.03972905, -4.78820456, -4.10449298, -3.28299482, -3.75833787])

In [27]:
# Flip the sign of the negated scores to report RMSE as a positive error.
# The standard deviation does not depend on sign, so it is taken on the raw scores.
rmse_mean = np.mean(-cv_results3['test_score'])
rmse_std = np.std(cv_results3['test_score'])
rmse_mean, rmse_std

(3.794751854099323, 0.6190587245503208)