The goal of this notebook is to determine the best time-decay weighting for past performance for IS1. The evaluation metric is overall correlation across puzzle days between performance over the previous ten matches at different time decay weightings, and the performance in the next match.

Conclusion is that, for the ranges exlored, using the previous 10 puzzles with a'standard' decay weighting yields the best correlation to the 'next' puzzle to be predicted. Caveats: the range of decay curves an # ofpast puzzles explored is very limited, and correlations are NOT the same as predictive value (which will be looked at in the modeling phase vis a vis different decay curves). 

In [2]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import datetime
warnings.filterwarnings('ignore')
#print(sns.__version__)

In [3]:
#Read file into a DataFrame and print head.
xword_data = pd.ExcelFile('../data/NYT_XWord_data.xlsx', engine='openpyxl')
print(xword_data.sheet_names)

['Sheet1']


In [4]:
df1 = xword_data.parse('Sheet1')

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2217 entries, 0 to 2216
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Puzzle_Date                   2217 non-null   datetime64[ns]
 1   Completed_Date (IS1)          1173 non-null   datetime64[ns]
 2   Completed_Date (IS2)          1068 non-null   datetime64[ns]
 3   Day_of_Week                   2217 non-null   object        
 4   Time (s) (IS1)                1173 non-null   float64       
 5   Time (s) (IS2)                1068 non-null   float64       
 6   Global_Median_Solver_Time(s)  2217 non-null   int64         
 7   Difficulty                    2217 non-null   object        
 8   Median_Solver%_AVG_PM         2217 non-null   int64         
 9   Grid Size                     2217 non-null   int64         
 10  No_Constructors               2217 non-null   int64         
 11  Constructors(by seniority)    

In [6]:
df1 = df1[["Puzzle_Date", "Completed_Date (IS1)", "Day_of_Week", "Time (s) (IS1)"]]

In [17]:
# Rename some columns for brevity and clarity
df1.rename(columns={'Day_of_Week': 'DOW', 'Puzzle_Date': 'P_Date', 'Time (s) (IS1)':'IST(m)', 'Completed_Date (IS1)':'Comp_Date'}, inplace=True)

In [18]:
# Keep only puzzles for which IS1 has a solve time
df_IS1 = df1.dropna(subset=['IST(m)'])

In [19]:
df_IS1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1173 entries, 0 to 1172
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   P_Date     1173 non-null   datetime64[ns]
 1   Comp_Date  1173 non-null   datetime64[ns]
 2   DOW        1173 non-null   object        
 3   IST(m)     1173 non-null   float64       
dtypes: datetime64[ns](2), float64(1), object(1)
memory usage: 45.8+ KB


first version has no decay weighting (all weights are 1)

In [20]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l10_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l10_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l10_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l10_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l10_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l10_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l10_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l10_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l10_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l10_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS1["IS_pds_l10_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l10_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l10_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l10_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l10_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l10_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l10_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l10_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l10_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l10_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS1["IS_pds_l10_ws"] = df_IS1[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS1["IS_pds_l10_ws_ct"] = df_IS1[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS1["IS_pds_l10_dw"] = df_IS1["IS_pds_l10_ws"]/df_IS1["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [21]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l10_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l10_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l10_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l10_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l10_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l10_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l10_dw']))

0.7298144123823953
0.4369244648995447
0.48404623198559427
0.4089516559619141
0.2609697424518978
0.1830683285134033
0.28917690813022684
0.3534599608567863


now a straight decay version (10, 9,8,7,6....). This is superior to no weighting version, both overall and for almost all of the individual puzzle days.  

In [22]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l10_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l10_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l10_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l10_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l10_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l10_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l10_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l10_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l10_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l10_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS1["IS_pds_l10_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l10_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l10_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l10_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l10_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l10_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l10_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l10_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l10_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l10_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS1["IS_pds_l10_ws"] = df_IS1[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS1["IS_pds_l10_ws_ct"] = df_IS1[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS1["IS_pds_l10_dw"] = df_IS1["IS_pds_l10_ws"]/df_IS1["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [23]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l10_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l10_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l10_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l10_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l10_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l10_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l10_dw']))

0.7365399948610674
0.43786031767557404
0.48277248669809963
0.4261762717217038
0.31181513487890117
0.17319168087379946
0.2943346221444319
0.4025226561581087


now a decay version with muted decay (20, 19, 18, 17, 16....). This is superior to no weighting version, both overall and for almost all of the individual puzzle days. This is slightly worse than the 10,9,8,...version for the all puzzle days.

In [45]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,9,8,7,6,5,5,4,3,3] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l10_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l10_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l10_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l10_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l10_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l10_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l10_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l10_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l10_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l10_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS1["IS_pds_l10_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l10_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l10_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l10_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l10_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l10_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l10_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l10_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l10_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l10_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS1["IS_pds_l10_ws"] = df_IS1[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS1["IS_pds_l10_ws_ct"] = df_IS1[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS1["IS_pds_l10_dw"] = df_IS1["IS_pds_l10_ws"]/df_IS1["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [46]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l10_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l10_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l10_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l10_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l10_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l10_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l10_dw']))

0.7368207096576194
0.44226224961821214
0.4825216640225096
0.4234245584744643
0.30147967414626803
0.17836384886938803
0.29534334256628836
0.3965322979625461


now a version with accelerated decay. Not as good for all 15x15 as the straight 10,9,8,7... decay version. 

In [27]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,8,6,4,2,2,2,1,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l10_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l10_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l10_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l10_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l10_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l10_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l10_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l10_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l10_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l10_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS1["IS_pds_l10_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l10_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l10_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l10_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l10_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l10_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l10_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l10_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l10_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l10_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS1["IS_pds_l10_ws"] = df_IS1[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS1["IS_pds_l10_ws_ct"] = df_IS1[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS1["IS_pds_l10_dw"] = df_IS1["IS_pds_l10_ws"]/df_IS1["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [28]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l10_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l10_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l10_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l10_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l10_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l10_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l10_dw']))

0.734118903029271
0.435403207803947
0.4654284001304108
0.4113907977447582
0.3145378653154631
0.16403641266770488
0.2923735193890566
0.41189763899937837


now a different accelerated decay version. still not as good as 10,9,8,7...

In [29]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l10_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l10_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l10_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l10_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l10_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l10_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l10_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l10_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l10_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l10_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS1["IS_pds_l10_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l10_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l10_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l10_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l10_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l10_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l10_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l10_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l10_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l10_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS1["IS_pds_l10_ws"] = df_IS1[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS1["IS_pds_l10_ws_ct"] = df_IS1[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS1["IS_pds_l10_dw"] = df_IS1["IS_pds_l10_ws"]/df_IS1["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [30]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l10_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l10_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l10_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l10_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l10_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l10_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l10_dw']))

0.7322446812406366
0.43381402527723734
0.46045602900539256
0.4098604863040065
0.320740359440232
0.1544225608582156
0.29328134948091467
0.413185049528606


now steady decay, but including the last 20 puzzles instead of the last 10. Including the last 20 instead of the last 10 seems to yield inferior correlations. 

In [35]:
#IS_pds_l20_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 20 puzzles
# Note that, unlike the 20-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS1["IS_pds_l20_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l20_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l20_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l20_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l20_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l20_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l20_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l20_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l20_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l20_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]
df_IS1["IS_pds_l20_dw_11"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-11)*w[10]
df_IS1["IS_pds_l20_dw_12"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-12)*w[11]
df_IS1["IS_pds_l20_dw_13"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-13)*w[12]
df_IS1["IS_pds_l20_dw_14"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-14)*w[13]
df_IS1["IS_pds_l20_dw_15"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-15)*w[14]
df_IS1["IS_pds_l20_dw_16"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-16)*w[15]
df_IS1["IS_pds_l20_dw_17"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-17)*w[16]
df_IS1["IS_pds_l20_dw_18"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-18)*w[17]
df_IS1["IS_pds_l20_dw_19"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-19)*w[18]
df_IS1["IS_pds_l20_dw_20"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-20)*w[19]

df_IS1["IS_pds_l20_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l20_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l20_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l20_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l20_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l20_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l20_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l20_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l20_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l20_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]
df_IS1["IS_pds_l20_dw_11_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-11)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-11))*w[10]
df_IS1["IS_pds_l20_dw_12_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-12)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-12))*w[11]
df_IS1["IS_pds_l20_dw_13_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-13)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-13))*w[12]
df_IS1["IS_pds_l20_dw_14_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-14)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-14))*w[13]
df_IS1["IS_pds_l20_dw_15_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-15)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-15))*w[14]
df_IS1["IS_pds_l20_dw_16_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-16)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-16))*w[15]
df_IS1["IS_pds_l20_dw_17_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-17)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-17))*w[16]
df_IS1["IS_pds_l20_dw_18_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-18)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-18))*w[17]
df_IS1["IS_pds_l20_dw_19_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-19)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-19))*w[18]
df_IS1["IS_pds_l20_dw_20_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-20)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-20))*w[19]

df_IS1["IS_pds_l20_ws"] = df_IS1[["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20"]].sum(axis=1)
df_IS1["IS_pds_l20_ws_ct"] = df_IS1[["IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct"]].sum(axis=1)
df_IS1["IS_pds_l20_dw"] = df_IS1["IS_pds_l20_ws"]/df_IS1["IS_pds_l20_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20", "IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct", "IS_pds_l20_ws", "IS_pds_l20_ws_ct"], axis = 1)

In [36]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l20_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l20_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l20_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l20_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l20_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l20_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l20_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l20_dw']))

0.7325850577816002
0.41083322208358414
0.4763163276314988
0.4300607205578315
0.2650270771008689
0.19059003363097266
0.2858479356858116
0.334938132634092


correlations for l10 with same decay weighting as the l20 above 

0.7365399948610674
0.43786031767557404
0.48277248669809963
0.4261762717217038
0.31181513487890117
0.17319168087379946
0.2943346221444319
0.4025226561581087

now what about last 20, but with no decay weighting at all? (1,1,1,1...). Clearly not as good as the 'standard' decay weighting version for either l10 or 120. Surprisingly, slightly better than l20 with standard decay weighting, but not as good as l10 with standard decay weighting. 

In [39]:
#IS_pds_l20_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS1) over the previous 20 puzzles
# Note that, unlike the 20-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS1 = df_IS1.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle

df_IS1["IS_pds_l20_dw_1"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS1["IS_pds_l20_dw_2"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS1["IS_pds_l20_dw_3"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS1["IS_pds_l20_dw_4"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS1["IS_pds_l20_dw_5"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS1["IS_pds_l20_dw_6"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS1["IS_pds_l20_dw_7"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS1["IS_pds_l20_dw_8"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS1["IS_pds_l20_dw_9"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS1["IS_pds_l20_dw_10"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]
df_IS1["IS_pds_l20_dw_11"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-11)*w[10]
df_IS1["IS_pds_l20_dw_12"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-12)*w[11]
df_IS1["IS_pds_l20_dw_13"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-13)*w[12]
df_IS1["IS_pds_l20_dw_14"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-14)*w[13]
df_IS1["IS_pds_l20_dw_15"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-15)*w[14]
df_IS1["IS_pds_l20_dw_16"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-16)*w[15]
df_IS1["IS_pds_l20_dw_17"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-17)*w[16]
df_IS1["IS_pds_l20_dw_18"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-18)*w[17]
df_IS1["IS_pds_l20_dw_19"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-19)*w[18]
df_IS1["IS_pds_l20_dw_20"] = df_IS1.groupby(['DOW'])['IST(m)'].shift(-20)*w[19]

df_IS1["IS_pds_l20_dw_1_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS1["IS_pds_l20_dw_2_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS1["IS_pds_l20_dw_3_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS1["IS_pds_l20_dw_4_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS1["IS_pds_l20_dw_5_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS1["IS_pds_l20_dw_6_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS1["IS_pds_l20_dw_7_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS1["IS_pds_l20_dw_8_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS1["IS_pds_l20_dw_9_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS1["IS_pds_l20_dw_10_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]
df_IS1["IS_pds_l20_dw_11_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-11)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-1))*w[10]
df_IS1["IS_pds_l20_dw_12_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-12)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-2))*w[11]
df_IS1["IS_pds_l20_dw_13_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-13)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-3))*w[12]
df_IS1["IS_pds_l20_dw_14_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-14)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-4))*w[13]
df_IS1["IS_pds_l20_dw_15_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-15)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-5))*w[14]
df_IS1["IS_pds_l20_dw_16_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-16)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-6))*w[15]
df_IS1["IS_pds_l20_dw_17_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-17)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-7))*w[16]
df_IS1["IS_pds_l20_dw_18_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-18)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-8))*w[17]
df_IS1["IS_pds_l20_dw_19_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-19)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-9))*w[18]
df_IS1["IS_pds_l20_dw_20_ct"] = (df_IS1.groupby(['DOW'])['IST(m)'].shift(-20)/df_IS1.groupby(['DOW'])['IST(m)'].shift(-10))*w[19]

df_IS1["IS_pds_l20_ws"] = df_IS1[["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20"]].sum(axis=1)
df_IS1["IS_pds_l20_ws_ct"] = df_IS1[["IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct"]].sum(axis=1)
df_IS1["IS_pds_l20_dw"] = df_IS1["IS_pds_l20_ws"]/df_IS1["IS_pds_l20_ws_ct"]

# Deleting transient columns
df_IS1 = df_IS1.drop(["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20", "IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct", "IS_pds_l20_ws", "IS_pds_l20_ws_ct"], axis = 1)

In [40]:
# Calculate Pearson R for 15x15 grids for IS1 for this feature
IS1_15x15 = df_IS1.loc[df_IS1["DOW"]!=("Sunday")]

print(IS1_15x15['IST(m)'].corr(IS1_15x15['IS_pds_l20_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS1_Sun = df_IS1.loc[df_IS1["DOW"]==("Sunday")]
print(IS1_Sun['IST(m)'].corr(IS1_Sun['IS_pds_l20_dw']))

IS1_Mon = df_IS1.loc[df_IS1["DOW"]==("Monday")]
print(IS1_Mon['IST(m)'].corr(IS1_Mon['IS_pds_l20_dw']))

IS1_Tue = df_IS1.loc[df_IS1["DOW"]==("Tuesday")]
print(IS1_Tue['IST(m)'].corr(IS1_Tue['IS_pds_l20_dw']))

IS1_Wed = df_IS1.loc[df_IS1["DOW"]==("Wednesday")]
print(IS1_Wed['IST(m)'].corr(IS1_Wed['IS_pds_l20_dw']))

IS1_Thu = df_IS1.loc[df_IS1["DOW"]==("Thursday")]
print(IS1_Thu['IST(m)'].corr(IS1_Thu['IS_pds_l20_dw']))

IS1_Fri = df_IS1.loc[df_IS1["DOW"]==("Friday")]
print(IS1_Fri['IST(m)'].corr(IS1_Fri['IS_pds_l20_dw']))

IS1_Sat = df_IS1.loc[df_IS1["DOW"]==("Saturday")]
print(IS1_Sat['IST(m)'].corr(IS1_Sat['IS_pds_l20_dw']))

0.7327275631309264
0.4276911513832265
0.4806563471315929
0.4222889263348428
0.30019955629481915
0.18712775081865662
0.2961726468075646
0.3756280497671812


compare to l20 with 'standard' decay weighting

0.7325850577816002
0.41083322208358414
0.4763163276314988
0.4300607205578315
0.2650270771008689
0.19059003363097266
0.2858479356858116
0.334938132634092

and compare to l10 with 'standard' decay weighting

0.7365399948610674
0.43786031767557404
0.48277248669809963
0.4261762717217038
0.31181513487890117
0.17319168087379946
0.2943346221444319
0.4025226561581087

and compare to l10 with no decay weighting

0.7298144123823953
0.4369244648995447
0.48404623198559427
0.4089516559619141
0.2609697424518978
0.1830683285134033
0.28917690813022684
0.3534599608567863