The goal of this notebook is to determine the best time-decay weighting for past performance for IS2. The evaluation metric is overall correlation across puzzle days between performance over the previous ten matches at different time decay weightings, and the performance in the next match.

Conclusion is that, for the ranges exlored, using the previous 10 puzzles with a'standard' decay weighting yields the best correlation to the 'next' puzzle to be predicted. Caveats: the range of decay curves an # ofpast puzzles explored is very limited, and correlations are NOT the same as predictive value (which will be looked at in the modeling phase vis a vis different decay curves). 

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import datetime
warnings.filterwarnings('ignore')
#print(sns.__version__)

In [2]:
#Read file into a DataFrame and print head.
xword_data = pd.ExcelFile('../data/NYT_XWord_data.xlsx', engine='openpyxl')
print(xword_data.sheet_names)

['Sheet1']


In [3]:
df1 = xword_data.parse('Sheet1')

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2217 entries, 0 to 2216
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Puzzle_Date                   2217 non-null   datetime64[ns]
 1   Completed_Date (IS1)          1173 non-null   datetime64[ns]
 2   Completed_Date (IS2)          1068 non-null   datetime64[ns]
 3   Day_of_Week                   2217 non-null   object        
 4   Time (s) (IS1)                1173 non-null   float64       
 5   Time (s) (IS2)                1068 non-null   float64       
 6   Global_Median_Solver_Time(s)  2217 non-null   int64         
 7   Difficulty                    2217 non-null   object        
 8   Median_Solver%_AVG_PM         2217 non-null   int64         
 9   Grid Size                     2217 non-null   int64         
 10  No_Constructors               2217 non-null   int64         
 11  Constructors(by seniority)    

In [6]:
df1 = df1[["Puzzle_Date", "Completed_Date (IS2)", "Day_of_Week", "Time (s) (IS2)"]]

In [5]:
# Rename some columns for brevity and clarity
df1.rename(columns={'Day_of_Week': 'DOW', 'Puzzle_Date': 'P_Date', 'Time (s) (IS2)':'IST(m)', 'Completed_Date (IS2)':'Comp_Date'}, inplace=True)

In [6]:
# Keep only puzzles for which IS1 has a solve time
df_IS2 = df1.dropna(subset=['IST(m)'])

In [7]:
df_IS2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1068 entries, 1 to 2136
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   P_Date                        1068 non-null   datetime64[ns]
 1   Completed_Date (IS1)          651 non-null    datetime64[ns]
 2   Comp_Date                     1068 non-null   datetime64[ns]
 3   DOW                           1068 non-null   object        
 4   Time (s) (IS1)                651 non-null    float64       
 5   IST(m)                        1068 non-null   float64       
 6   Global_Median_Solver_Time(s)  1068 non-null   int64         
 7   Difficulty                    1068 non-null   object        
 8   Median_Solver%_AVG_PM         1068 non-null   int64         
 9   Grid Size                     1068 non-null   int64         
 10  No_Constructors               1068 non-null   int64         
 11  Constructors(by seniority)    

first version has no decay weighting (all weights are 1)

In [9]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l10_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l10_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l10_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l10_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l10_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l10_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l10_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l10_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l10_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l10_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS2["IS_pds_l10_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l10_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l10_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l10_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l10_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l10_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l10_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l10_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l10_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l10_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS2["IS_pds_l10_ws"] = df_IS2[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS2["IS_pds_l10_ws_ct"] = df_IS2[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS2["IS_pds_l10_dw"] = df_IS2["IS_pds_l10_ws"]/df_IS2["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [10]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l10_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l10_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l10_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l10_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l10_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l10_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l10_dw']))

0.7399212939253964
0.614730673867633
0.6131100395755362
0.6187556066722694
0.5807616550284673
0.513208031645239
0.38041838288651614
0.39067931967328706


now a straight decay version (10, 9,8,7,6....). This is superior to no weighting version, both overall and for almost all of the individual puzzle days.  

In [13]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l10_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l10_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l10_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l10_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l10_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l10_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l10_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l10_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l10_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l10_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS2["IS_pds_l10_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l10_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l10_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l10_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l10_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l10_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l10_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l10_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l10_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l10_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS2["IS_pds_l10_ws"] = df_IS2[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS2["IS_pds_l10_ws_ct"] = df_IS2[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS2["IS_pds_l10_dw"] = df_IS2["IS_pds_l10_ws"]/df_IS2["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [14]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l10_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l10_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l10_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l10_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l10_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l10_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l10_dw']))

0.7390288071618135
0.5962098163188504
0.6100053209287373
0.6039735775314865
0.593650326786817
0.5093382521046165
0.3778934303949527
0.39445868771098525


now a decay version with muted decay (20, 19, 18, 17, 16....). 

In [37]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,9,8,7,6,5,5,5,5,5] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l10_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l10_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l10_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l10_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l10_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l10_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l10_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l10_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l10_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l10_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS2["IS_pds_l10_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l10_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l10_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l10_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l10_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l10_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l10_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l10_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l10_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l10_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS2["IS_pds_l10_ws"] = df_IS2[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS2["IS_pds_l10_ws_ct"] = df_IS2[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS2["IS_pds_l10_dw"] = df_IS2["IS_pds_l10_ws"]/df_IS2["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [38]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l10_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l10_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l10_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l10_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l10_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l10_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l10_dw']))

0.743352921442702
0.6072318221331192
0.6107834249823985
0.6149923911934436
0.5870921489067161
0.5203705671884663
0.3836191184216351
0.40382174289009476


now a version with accelerated decay. Not as good for all 15x15 as the straight 10,9,8,7... decay version. 

In [17]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [10,8,6,4,2,2,2,1,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l10_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l10_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l10_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l10_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l10_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l10_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l10_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l10_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l10_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l10_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS2["IS_pds_l10_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l10_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l10_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l10_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l10_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l10_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l10_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l10_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l10_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l10_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS2["IS_pds_l10_ws"] = df_IS2[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS2["IS_pds_l10_ws_ct"] = df_IS2[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS2["IS_pds_l10_dw"] = df_IS2["IS_pds_l10_ws"]/df_IS2["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [18]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l10_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l10_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l10_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l10_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l10_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l10_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l10_dw']))

0.7374504113236329
0.5831933493354268
0.5967285462808041
0.5903856321214993
0.5858867897702658
0.5094534409602129
0.36669444211999347
0.4036925133407536


now a different accelerated decay version. still not as good as 10,9,8,7...

In [20]:
#IS_pds_l10_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 10 puzzles
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l10_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l10_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l10_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l10_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l10_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l10_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l10_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l10_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l10_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l10_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]

df_IS2["IS_pds_l10_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l10_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l10_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l10_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l10_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l10_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l10_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l10_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l10_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l10_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]

df_IS2["IS_pds_l10_ws"] = df_IS2[["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10"]].sum(axis=1)
df_IS2["IS_pds_l10_ws_ct"] = df_IS2[["IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct"]].sum(axis=1)
df_IS2["IS_pds_l10_dw"] = df_IS2["IS_pds_l10_ws"]/df_IS2["IS_pds_l10_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l10_dw_1", "IS_pds_l10_dw_2", "IS_pds_l10_dw_3", "IS_pds_l10_dw_4", "IS_pds_l10_dw_5", "IS_pds_l10_dw_6", "IS_pds_l10_dw_7", "IS_pds_l10_dw_8", "IS_pds_l10_dw_9", "IS_pds_l10_dw_10", "IS_pds_l10_dw_1_ct", "IS_pds_l10_dw_2_ct", "IS_pds_l10_dw_3_ct", "IS_pds_l10_dw_4_ct", "IS_pds_l10_dw_5_ct", "IS_pds_l10_dw_6_ct", "IS_pds_l10_dw_7_ct", "IS_pds_l10_dw_8_ct", "IS_pds_l10_dw_9_ct", "IS_pds_l10_dw_10_ct", "IS_pds_l10_ws", "IS_pds_l10_ws_ct"], axis = 1)

In [21]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l10_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l10_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l10_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l10_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l10_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l10_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l10_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l10_dw']))

0.7352582695063057
0.5780216675416522
0.5899086588990501
0.5859699155795907
0.5864156959326547
0.5070810513009779
0.35896491214799003
0.403606693448494


now steady decay, but including the last 20 puzzles instead of the last 10. Including the last 20 instead of the last 10 seems to yield inferior correlations. 

In [22]:
#IS_pds_l20_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 20 puzzles
# Note that, unlike the 20-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

df_IS2["IS_pds_l20_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l20_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l20_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l20_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l20_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l20_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l20_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l20_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l20_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l20_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]
df_IS2["IS_pds_l20_dw_11"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-11)*w[10]
df_IS2["IS_pds_l20_dw_12"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-12)*w[11]
df_IS2["IS_pds_l20_dw_13"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-13)*w[12]
df_IS2["IS_pds_l20_dw_14"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-14)*w[13]
df_IS2["IS_pds_l20_dw_15"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-15)*w[14]
df_IS2["IS_pds_l20_dw_16"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-16)*w[15]
df_IS2["IS_pds_l20_dw_17"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-17)*w[16]
df_IS2["IS_pds_l20_dw_18"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-18)*w[17]
df_IS2["IS_pds_l20_dw_19"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-19)*w[18]
df_IS2["IS_pds_l20_dw_20"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-20)*w[19]

df_IS2["IS_pds_l20_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l20_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l20_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l20_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l20_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l20_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l20_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l20_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l20_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l20_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]
df_IS2["IS_pds_l20_dw_11_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-11)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-11))*w[10]
df_IS2["IS_pds_l20_dw_12_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-12)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-12))*w[11]
df_IS2["IS_pds_l20_dw_13_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-13)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-13))*w[12]
df_IS2["IS_pds_l20_dw_14_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-14)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-14))*w[13]
df_IS2["IS_pds_l20_dw_15_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-15)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-15))*w[14]
df_IS2["IS_pds_l20_dw_16_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-16)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-16))*w[15]
df_IS2["IS_pds_l20_dw_17_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-17)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-17))*w[16]
df_IS2["IS_pds_l20_dw_18_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-18)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-18))*w[17]
df_IS2["IS_pds_l20_dw_19_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-19)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-19))*w[18]
df_IS2["IS_pds_l20_dw_20_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-20)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-20))*w[19]

df_IS2["IS_pds_l20_ws"] = df_IS2[["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20"]].sum(axis=1)
df_IS2["IS_pds_l20_ws_ct"] = df_IS2[["IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct"]].sum(axis=1)
df_IS2["IS_pds_l20_dw"] = df_IS2["IS_pds_l20_ws"]/df_IS2["IS_pds_l20_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20", "IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct", "IS_pds_l20_ws", "IS_pds_l20_ws_ct"], axis = 1)

In [24]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l20_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l20_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l20_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l20_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l20_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l20_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l20_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l20_dw']))

0.7421678195181202
0.6060260087551166
0.6321682996857748
0.6169606175037124
0.5837706379353648
0.5243035030881197
0.365145381601401
0.3900435551088289


correlations for l10 with same decay weighting as the l20 above 



now what about last 20, but with no decay weighting at all? (1,1,1,1...). Clearly not as good as the 'standard' decay weighting version for either l10 or 120. Surprisingly, slightly better than l20 with standard decay weighting, but not as good as l10 with standard decay weighting. 

In [25]:
#IS_pds_l20_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for the individual solver (IS2) over the previous 20 puzzles
# Note that, unlike the 20-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df_IS2 = df_IS2.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

w = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle

df_IS2["IS_pds_l20_dw_1"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)*w[0]
df_IS2["IS_pds_l20_dw_2"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)*w[1]
df_IS2["IS_pds_l20_dw_3"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)*w[2]
df_IS2["IS_pds_l20_dw_4"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)*w[3]
df_IS2["IS_pds_l20_dw_5"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)*w[4]
df_IS2["IS_pds_l20_dw_6"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)*w[5]
df_IS2["IS_pds_l20_dw_7"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)*w[6]
df_IS2["IS_pds_l20_dw_8"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)*w[7]
df_IS2["IS_pds_l20_dw_9"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)*w[8]
df_IS2["IS_pds_l20_dw_10"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)*w[9]
df_IS2["IS_pds_l20_dw_11"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-11)*w[10]
df_IS2["IS_pds_l20_dw_12"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-12)*w[11]
df_IS2["IS_pds_l20_dw_13"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-13)*w[12]
df_IS2["IS_pds_l20_dw_14"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-14)*w[13]
df_IS2["IS_pds_l20_dw_15"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-15)*w[14]
df_IS2["IS_pds_l20_dw_16"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-16)*w[15]
df_IS2["IS_pds_l20_dw_17"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-17)*w[16]
df_IS2["IS_pds_l20_dw_18"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-18)*w[17]
df_IS2["IS_pds_l20_dw_19"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-19)*w[18]
df_IS2["IS_pds_l20_dw_20"] = df_IS2.groupby(['DOW'])['IST(m)'].shift(-20)*w[19]

df_IS2["IS_pds_l20_dw_1_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-1)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[0]
df_IS2["IS_pds_l20_dw_2_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-2)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[1]
df_IS2["IS_pds_l20_dw_3_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-3)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[2]
df_IS2["IS_pds_l20_dw_4_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-4)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[3]
df_IS2["IS_pds_l20_dw_5_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-5)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[4]
df_IS2["IS_pds_l20_dw_6_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-6)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[5]
df_IS2["IS_pds_l20_dw_7_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-7)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[6]
df_IS2["IS_pds_l20_dw_8_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-8)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[7]
df_IS2["IS_pds_l20_dw_9_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-9)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[8]
df_IS2["IS_pds_l20_dw_10_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-10)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[9]
df_IS2["IS_pds_l20_dw_11_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-11)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-1))*w[10]
df_IS2["IS_pds_l20_dw_12_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-12)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-2))*w[11]
df_IS2["IS_pds_l20_dw_13_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-13)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-3))*w[12]
df_IS2["IS_pds_l20_dw_14_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-14)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-4))*w[13]
df_IS2["IS_pds_l20_dw_15_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-15)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-5))*w[14]
df_IS2["IS_pds_l20_dw_16_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-16)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-6))*w[15]
df_IS2["IS_pds_l20_dw_17_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-17)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-7))*w[16]
df_IS2["IS_pds_l20_dw_18_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-18)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-8))*w[17]
df_IS2["IS_pds_l20_dw_19_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-19)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-9))*w[18]
df_IS2["IS_pds_l20_dw_20_ct"] = (df_IS2.groupby(['DOW'])['IST(m)'].shift(-20)/df_IS2.groupby(['DOW'])['IST(m)'].shift(-10))*w[19]

df_IS2["IS_pds_l20_ws"] = df_IS2[["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20"]].sum(axis=1)
df_IS2["IS_pds_l20_ws_ct"] = df_IS2[["IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct"]].sum(axis=1)
df_IS2["IS_pds_l20_dw"] = df_IS2["IS_pds_l20_ws"]/df_IS2["IS_pds_l20_ws_ct"]

# Deleting transient columns
df_IS2 = df_IS2.drop(["IS_pds_l20_dw_1", "IS_pds_l20_dw_2", "IS_pds_l20_dw_3", "IS_pds_l20_dw_4", "IS_pds_l20_dw_5", "IS_pds_l20_dw_6", "IS_pds_l20_dw_7", "IS_pds_l20_dw_8", "IS_pds_l20_dw_9", "IS_pds_l20_dw_10", "IS_pds_l20_dw_11", "IS_pds_l20_dw_12", "IS_pds_l20_dw_13", "IS_pds_l20_dw_14", "IS_pds_l20_dw_15", "IS_pds_l20_dw_16", "IS_pds_l20_dw_17", "IS_pds_l20_dw_18", "IS_pds_l20_dw_19", "IS_pds_l20_dw_20", "IS_pds_l20_dw_1_ct", "IS_pds_l20_dw_2_ct", "IS_pds_l20_dw_3_ct", "IS_pds_l20_dw_4_ct", "IS_pds_l20_dw_5_ct", "IS_pds_l20_dw_6_ct", "IS_pds_l20_dw_7_ct", "IS_pds_l20_dw_8_ct", "IS_pds_l20_dw_9_ct", "IS_pds_l20_dw_10_ct", "IS_pds_l20_dw_11_ct", "IS_pds_l20_dw_12_ct", "IS_pds_l20_dw_13_ct", "IS_pds_l20_dw_14_ct", "IS_pds_l20_dw_15_ct", "IS_pds_l20_dw_16_ct", "IS_pds_l20_dw_17_ct", "IS_pds_l20_dw_18_ct", "IS_pds_l20_dw_19_ct", "IS_pds_l20_dw_20_ct", "IS_pds_l20_ws", "IS_pds_l20_ws_ct"], axis = 1)

In [26]:
# Calculate Pearson R for 15x15 grids for IS2 for this feature
IS2_15x15 = df_IS2.loc[df_IS2["DOW"]!=("Sunday")]

print(IS2_15x15['IST(m)'].corr(IS2_15x15['IS_pds_l20_dw']))

# Calculate Pearson R for each puzzle day for this feature
IS2_Sun = df_IS2.loc[df_IS2["DOW"]==("Sunday")]
print(IS2_Sun['IST(m)'].corr(IS2_Sun['IS_pds_l20_dw']))

IS2_Mon = df_IS2.loc[df_IS2["DOW"]==("Monday")]
print(IS2_Mon['IST(m)'].corr(IS2_Mon['IS_pds_l20_dw']))

IS2_Tue = df_IS2.loc[df_IS2["DOW"]==("Tuesday")]
print(IS2_Tue['IST(m)'].corr(IS2_Tue['IS_pds_l20_dw']))

IS2_Wed = df_IS2.loc[df_IS2["DOW"]==("Wednesday")]
print(IS2_Wed['IST(m)'].corr(IS2_Wed['IS_pds_l20_dw']))

IS2_Thu = df_IS2.loc[df_IS2["DOW"]==("Thursday")]
print(IS2_Thu['IST(m)'].corr(IS2_Thu['IS_pds_l20_dw']))

IS2_Fri = df_IS2.loc[df_IS2["DOW"]==("Friday")]
print(IS2_Fri['IST(m)'].corr(IS2_Fri['IS_pds_l20_dw']))

IS2_Sat = df_IS2.loc[df_IS2["DOW"]==("Saturday")]
print(IS2_Sat['IST(m)'].corr(IS2_Sat['IS_pds_l20_dw']))

0.738626648393393
0.6199415120167843
0.6167008428228382
0.6183490833513486
0.5893817770928548
0.5160695519580055
0.40383770052500617
0.39076563298384526


compare to l20 with 'standard' decay weighting



and compare to l10 with 'standard' decay weighting



and compare to l10 with no decay weighting

