The goal of this notebook is to determine the best time-decay weighting of past performance for the Global Median Solver (GMS). The evaluation metric is the correlation, both pooled across puzzle days and for each puzzle day separately, between performance over the previous ten puzzles at different time-decay weightings and the performance on the next puzzle.

Conclusion is that, for the ranges explored, using the previous 10 puzzles with a 'standard' decay weighting yields the best correlation to the 'next' puzzle to be predicted. Caveats: the range of decay curves and # of past puzzles explored is very limited, and correlations are NOT the same as predictive value (which will be looked at in the modeling phase vis-a-vis different decay curves).

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import datetime
warnings.filterwarnings('ignore')
#print(sns.__version__)

In [2]:
# Open the Excel workbook (without parsing any sheet yet) and list the sheets it contains.
workbook_path = '../data/NYT_XWord_data.xlsx'
xword_data = pd.ExcelFile(workbook_path, engine='openpyxl')
print(xword_data.sheet_names)

['Sheet1']


In [10]:
# Parse the workbook's only sheet into a DataFrame
df1 = xword_data.parse(sheet_name='Sheet1')

In [11]:
# Summarize the parsed sheet: row count, column names, non-null counts and dtypes
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2217 entries, 0 to 2216
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Puzzle_Date                   2217 non-null   datetime64[ns]
 1   Completed_Date (IS1)          1173 non-null   datetime64[ns]
 2   Completed_Date (IS2)          1068 non-null   datetime64[ns]
 3   Day_of_Week                   2217 non-null   object        
 4   Time (s) (IS1)                1173 non-null   float64       
 5   Time (s) (IS2)                1068 non-null   float64       
 6   Global_Median_Solver_Time(s)  2217 non-null   int64         
 7   Difficulty                    2217 non-null   object        
 8   Median_Solver%_AVG_PM         2217 non-null   int64         
 9   Grid Size                     2217 non-null   int64         
 10  No_Constructors               2217 non-null   int64         
 11  Constructors(by seniority)    

In [12]:
# Keep only the columns this analysis uses: puzzle date, day of week and the GMS solve time
gms_columns = ["Puzzle_Date", "Day_of_Week", "Global_Median_Solver_Time(s)"]
df1 = df1[gms_columns]

In [13]:
# Rename some columns for brevity and clarity.
# Rebind the renamed copy rather than using inplace=True: df1 was produced by a column
# selection from the parsed sheet, and in-place changes on such a derived frame are the
# pattern pandas flags with SettingWithCopyWarning (silenced globally in this notebook).
df1 = df1.rename(columns={'Day_of_Week': 'DOW', 'Puzzle_Date': 'P_Date', 'Global_Median_Solver_Time(s)':'GMST(s)'})

In [14]:
# Discard any puzzle (row) whose GMS solve time is missing
df_GMS = df1.dropna(axis=0, how='any', subset=['GMST(s)'])

In [16]:
# Confirm the trimmed frame: row count, the three retained columns, non-null counts and dtypes
df_GMS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2217 entries, 0 to 2216
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   P_Date   2217 non-null   datetime64[ns]
 1   DOW      2217 non-null   object        
 2   GMST(s)  2217 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 69.3+ KB


The first version has no decay weighting (all weights are 1).

In [21]:
#GMS_pds_l10_dw
# Weighted, puzzle day-specific (pds) mean of the Global Median Solver (GMS) solve time over the
# previous 10 puzzles of the same day of week. This version uses equal weights (no decay).
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) inside a DOW group yields the k-th previous puzzle
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle (w[0] = most recent previous puzzle)
assert len(w) == 10, "the l10 feature needs exactly 10 weights"

gmst_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']
weighted_sum = pd.Series(0.0, index=df_GMS.index)  # sum of weight * prior solve time
weight_total = pd.Series(0.0, index=df_GMS.index)  # sum of the weights of the prior puzzles that exist
for lag, weight in enumerate(w, start=1):
    prior_time = gmst_by_dow.shift(-lag)
    # A missing prior puzzle (start of a day's history) contributes to neither sum.
    # notna() is used as the presence indicator so that a genuine value of 0 still
    # counts toward the weight total (a value/value indicator would turn it into NaN).
    weighted_sum = weighted_sum + prior_time.fillna(0) * weight
    weight_total = weight_total + prior_time.notna() * weight

# Rows with no prior puzzle at all get 0/0 = NaN
df_GMS["GMS_pds_l10_dw"] = weighted_sum / weight_total

In [22]:
# Pearson R between the actual GMS solve time and the GMS_pds_l10_dw feature
feature_col = 'GMS_pds_l10_dw'

# All 15x15 grids pooled (every puzzle day except Sunday)
GMS_15x15 = df_GMS[df_GMS["DOW"] != "Sunday"]

# One subset per puzzle day
GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat = (
    df_GMS[df_GMS["DOW"] == day_name]
    for day_name in ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Printed in the order: 15x15 pooled, Sun, Mon, Tue, Wed, Thu, Fri, Sat
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset['GMST(s)'].corr(day_subset[feature_col]))

0.8418627774024388
0.6346715718676434
0.5782258044301885
0.5439264401350923
0.4577624197387856
0.4144478623288913
0.41524360446506436
0.3956719368806027


In [23]:
#GMS_pds_l10_dw
# Decay-weighted (dw), puzzle day-specific (pds) mean of the Global Median Solver (GMS) solve time
# over the previous 10 puzzles of the same day of week. This version uses a linear decay (10 down to 1).
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) inside a DOW group yields the k-th previous puzzle
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle (w[0] = most recent previous puzzle)
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle
assert len(w) == 10, "the l10 feature needs exactly 10 weights"

gmst_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']
weighted_sum = pd.Series(0.0, index=df_GMS.index)  # sum of weight * prior solve time
weight_total = pd.Series(0.0, index=df_GMS.index)  # sum of the weights of the prior puzzles that exist
for lag, weight in enumerate(w, start=1):
    prior_time = gmst_by_dow.shift(-lag)
    # A missing prior puzzle (start of a day's history) contributes to neither sum.
    # notna() is used as the presence indicator so that a genuine value of 0 still
    # counts toward the weight total (a value/value indicator would turn it into NaN).
    weighted_sum = weighted_sum + prior_time.fillna(0) * weight
    weight_total = weight_total + prior_time.notna() * weight

# Rows with no prior puzzle at all get 0/0 = NaN
df_GMS["GMS_pds_l10_dw"] = weighted_sum / weight_total

In [24]:
# Pearson R between the actual GMS solve time and the GMS_pds_l10_dw feature
feature_col = 'GMS_pds_l10_dw'

# All 15x15 grids pooled (every puzzle day except Sunday)
GMS_15x15 = df_GMS[df_GMS["DOW"] != "Sunday"]

# One subset per puzzle day
GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat = (
    df_GMS[df_GMS["DOW"] == day_name]
    for day_name in ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Printed in the order: 15x15 pooled, Sun, Mon, Tue, Wed, Thu, Fri, Sat
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset['GMST(s)'].corr(day_subset[feature_col]))

0.8377893660414176
0.6263346913720875
0.5477464961461604
0.5155292501609692
0.4493108812765185
0.37336779343324766
0.40772727227598005
0.39584696005974485


No decay weighting, l10 (for comparison):
0.8418627774024388
0.6346715718676434
0.5782258044301885
0.5439264401350923
0.4577624197387856
0.4144478623288913
0.41524360446506436
0.3956719368806027

In [27]:
#GMS_pds_l10_dw
# Decay-weighted (dw), puzzle day-specific (pds) mean of the Global Median Solver (GMS) solve time
# over the previous 10 puzzles of the same day of week. This version uses a decay that flattens
# out for the older puzzles (see w below).
# Note that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) inside a DOW group yields the k-th previous puzzle
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

w = [10,9,8,7,6,5,5,5,3,3] #weight assigned to puzzle (w[0] = most recent previous puzzle)
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle
assert len(w) == 10, "the l10 feature needs exactly 10 weights"

gmst_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']
weighted_sum = pd.Series(0.0, index=df_GMS.index)  # sum of weight * prior solve time
weight_total = pd.Series(0.0, index=df_GMS.index)  # sum of the weights of the prior puzzles that exist
for lag, weight in enumerate(w, start=1):
    prior_time = gmst_by_dow.shift(-lag)
    # A missing prior puzzle (start of a day's history) contributes to neither sum.
    # notna() is used as the presence indicator so that a genuine value of 0 still
    # counts toward the weight total (a value/value indicator would turn it into NaN).
    weighted_sum = weighted_sum + prior_time.fillna(0) * weight
    weight_total = weight_total + prior_time.notna() * weight

# Rows with no prior puzzle at all get 0/0 = NaN
df_GMS["GMS_pds_l10_dw"] = weighted_sum / weight_total

In [28]:
# Pearson R between the actual GMS solve time and the GMS_pds_l10_dw feature
feature_col = 'GMS_pds_l10_dw'

# All 15x15 grids pooled (every puzzle day except Sunday)
GMS_15x15 = df_GMS[df_GMS["DOW"] != "Sunday"]

# One subset per puzzle day
GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat = (
    df_GMS[df_GMS["DOW"] == day_name]
    for day_name in ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Printed in the order: 15x15 pooled, Sun, Mon, Tue, Wed, Thu, Fri, Sat
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset['GMST(s)'].corr(day_subset[feature_col]))

0.8397651181172807
0.6296195562396468
0.5597228353489881
0.5247106392363798
0.45674182021615184
0.3870246077137881
0.40897133894412496
0.4006394406650335


In [35]:
#GMS_pds_l20_dw
# Decay-weighted (dw), puzzle day-specific (pds) mean of the Global Median Solver (GMS) solve time
# over the previous 20 puzzles of the same day of week. This version uses a linear decay (20 down to 1).
# Note that, unlike the 20-puzzle moving average, the GMS weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) inside a DOW group yields the k-th previous puzzle
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle (w[0] = most recent previous puzzle)
#w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle
assert len(w) == 20, "the l20 feature needs exactly 20 weights"

gmst_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']
weighted_sum = pd.Series(0.0, index=df_GMS.index)  # sum of weight * prior solve time
weight_total = pd.Series(0.0, index=df_GMS.index)  # sum of the weights of the prior puzzles that exist
for lag, weight in enumerate(w, start=1):
    prior_time = gmst_by_dow.shift(-lag)
    # A missing prior puzzle (start of a day's history) contributes to neither sum.
    # notna() is used as the presence indicator so that a genuine value of 0 still
    # counts toward the weight total (a value/value indicator would turn it into NaN).
    weighted_sum = weighted_sum + prior_time.fillna(0) * weight
    weight_total = weight_total + prior_time.notna() * weight

# Rows with no prior puzzle at all get 0/0 = NaN
df_GMS["GMS_pds_l20_dw"] = weighted_sum / weight_total

In [36]:
# Pearson R between the actual GMS solve time and the GMS_pds_l20_dw feature
feature_col = 'GMS_pds_l20_dw'

# All 15x15 grids pooled (every puzzle day except Sunday)
GMS_15x15 = df_GMS[df_GMS["DOW"] != "Sunday"]

# One subset per puzzle day
GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat = (
    df_GMS[df_GMS["DOW"] == day_name]
    for day_name in ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Printed in the order: 15x15 pooled, Sun, Mon, Tue, Wed, Thu, Fri, Sat
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset['GMST(s)'].corr(day_subset[feature_col]))

0.8451902813422775
0.6446272452518622
0.5731007225677922
0.5446819555447372
0.4815082262032795
0.41478220064211974
0.4272289111769171
0.4125990737853344


In [37]:
#GMS_pds_l20_dw
# Weighted, puzzle day-specific (pds) mean of the Global Median Solver (GMS) solve time over the
# previous 20 puzzles of the same day of week. This version uses equal weights (no decay).
# Note that, unlike the 20-puzzle moving average, the GMS weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) inside a DOW group yields the k-th previous puzzle
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

#w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
w = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle (w[0] = most recent previous puzzle)
assert len(w) == 20, "the l20 feature needs exactly 20 weights"

gmst_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']
weighted_sum = pd.Series(0.0, index=df_GMS.index)  # sum of weight * prior solve time
weight_total = pd.Series(0.0, index=df_GMS.index)  # sum of the weights of the prior puzzles that exist
for lag, weight in enumerate(w, start=1):
    prior_time = gmst_by_dow.shift(-lag)
    # A missing prior puzzle (start of a day's history) contributes to neither sum.
    # notna() is used as the presence indicator so that a genuine value of 0 still
    # counts toward the weight total (a value/value indicator would turn it into NaN).
    weighted_sum = weighted_sum + prior_time.fillna(0) * weight
    weight_total = weight_total + prior_time.notna() * weight

# Rows with no prior puzzle at all get 0/0 = NaN
df_GMS["GMS_pds_l20_dw"] = weighted_sum / weight_total

In [38]:
# Pearson R between the actual GMS solve time and the GMS_pds_l20_dw feature
feature_col = 'GMS_pds_l20_dw'

# All 15x15 grids pooled (every puzzle day except Sunday)
GMS_15x15 = df_GMS[df_GMS["DOW"] != "Sunday"]

# One subset per puzzle day
GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat = (
    df_GMS[df_GMS["DOW"] == day_name]
    for day_name in ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Printed in the order: 15x15 pooled, Sun, Mon, Tue, Wed, Thu, Fri, Sat
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset['GMST(s)'].corr(day_subset[feature_col]))

0.8468784671796991
0.650055756249313
0.5780868807325392
0.5474971211883398
0.48803382472820056
0.4348192776007167
0.4335460276450358
0.41744663126335735


In [39]:
#GMS_pds_l20_dw
# Decay-weighted (dw), puzzle day-specific (pds) mean solve time of the Global Median Solver (GMS)
# over the previous len(w) puzzles of the same day of week (20 with the active weight list).
# Note that, unlike the 20-puzzle moving average, the GMS weighted average does NOT include the "puzzle at hand" itself

# Newest first within each day of week, so shift(-k) below reaches the k-th PREVIOUS puzzle of that day.
df_GMS = df_GMS.sort_values(by=['DOW', 'P_Date'], ascending = False)

# w[k-1] is the weight assigned to the k-th most recent previous puzzle.
# The window length is taken from len(w), so the 10-entry alternatives below can be
# swapped in directly (the output column keeps the "l20" name regardless of len(w)).
w = [20,19,18,17,16,15,14,13,12,11,10,10,10,10,10,10,10,10,10,10] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [20,18,14,8,4,4,2,2,1,1] #weight assigned to puzzle
#w = [20,19,18,17,16,15,14,13,12,11] #weight assigned to puzzle
#w = [10,9,8,7,6,5,4,3,2,1] #weight assigned to puzzle
#w = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] #weight assigned to puzzle

gms_by_dow = df_GMS.groupby(['DOW'])['GMST(s)']

# One column per lag: column k holds the solve time of the k-th previous same-day puzzle
# (NaN where fewer than k previous puzzles exist for that day of week).
prior_times = pd.concat(
    {lag: gms_by_dow.shift(-lag) for lag in range(1, len(w) + 1)},
    axis=1,
)
lag_weights = pd.Series(w, index=prior_times.columns)

# Numerator: weighted sum of the available prior solve times (missing lags are skipped by sum()).
weighted_sum = prior_times.mul(lag_weights, axis=1).sum(axis=1)
# Denominator: sum of the weights of only those lags that actually have a prior puzzle,
# so rows with a short history are averaged over the weights they really used.
weight_total = prior_times.notna().mul(lag_weights, axis=1).sum(axis=1)

# Rows with no prior same-day puzzle evaluate to 0/0 and are left as NaN.
df_GMS["GMS_pds_l20_dw"] = weighted_sum / weight_total

In [40]:
# Pearson R between the actual GMS solve time and the decay-weighted feature.
# Subsets are built first, then the correlations are printed in one pass.
gmst_col = 'GMST(s)'
dw_col = 'GMS_pds_l20_dw'
dow_values = df_GMS["DOW"]

# 15x15 grids: every puzzle day except Sunday
GMS_15x15 = df_GMS[dow_values != "Sunday"]

# One subset per puzzle day
GMS_Sun = df_GMS[dow_values == "Sunday"]
GMS_Mon = df_GMS[dow_values == "Monday"]
GMS_Tue = df_GMS[dow_values == "Tuesday"]
GMS_Wed = df_GMS[dow_values == "Wednesday"]
GMS_Thu = df_GMS[dow_values == "Thursday"]
GMS_Fri = df_GMS[dow_values == "Friday"]
GMS_Sat = df_GMS[dow_values == "Saturday"]

# Output order: pooled 15x15 first, then Sunday through Saturday
for day_subset in (GMS_15x15, GMS_Sun, GMS_Mon, GMS_Tue, GMS_Wed, GMS_Thu, GMS_Fri, GMS_Sat):
    print(day_subset[gmst_col].corr(day_subset[dw_col]))

0.8466634549954647
0.6468365630088562
0.5720183687951161
0.544059484419955
0.4857823764171517
0.42482230989153186
0.4319538450398065
0.4212035541367218
