## Imports

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import datetime
warnings.filterwarnings('ignore')
#print(sns.__version__)

## Load Data

Note: Data obtained from various sources (via scraping, manual entry) and compiled into .xlsx format prior to this workstream

In [2]:
#Read file into a DataFrame and print head.
xword_data = pd.ExcelFile('../../NYT_XWord_data_master.xlsx', engine='openpyxl')
print(xword_data.sheet_names)

['Sheet1']


In [3]:
df1 = xword_data.parse('Sheet1')

In [4]:
df1.head(35)

Unnamed: 0,Puzzle_Date,GMS_Completed,IS1_Completed,IS2_Completed,Completed_Date (IS1),Completed_Date (IS2),Day_of_Week,Time (s) (IS1),Time (s) (IS2),Global_Median_Solver_Time(s),...,90_180_Rot_Sym,LR_UD_Sym,Diag_Sym,Black_Square_Fill,Outside_Grid,Unchecked_Sq,Uniclue,Duplicate_Answers,Quantum,Wordplay
0,2024-02-25,1,1.0,1.0,2024-02-24 18:42:10,2024-02-25 10:03:32,Sunday,865.0,1054.0,1337,...,0,0,0,0,0,0,0,0,0,12.0
1,2024-02-24,1,1.0,1.0,2024-02-24 07:23:10,2024-02-24 14:54:10,Saturday,604.0,871.0,1044,...,0,0,0,0,0,0,0,0,0,6.0
2,2024-02-23,1,1.0,1.0,2024-02-23 06:49:34,2024-02-24 09:51:55,Friday,518.0,1255.0,1022,...,1,0,0,0,0,0,0,0,0,3.0
3,2024-02-22,1,1.0,1.0,2024-02-22 08:13:32,2024-02-22 18:05:02,Thursday,531.0,920.0,1137,...,0,0,0,0,0,0,0,0,0,3.0
4,2024-02-21,1,1.0,1.0,2024-02-21 06:49:48,2024-02-21 10:33:24,Wednesday,317.0,453.0,549,...,0,0,0,0,0,0,0,0,0,2.0
5,2024-02-20,1,1.0,1.0,2024-02-20 07:26:53,2024-02-20 17:41:05,Tuesday,347.0,587.0,562,...,0,0,0,0,0,0,0,0,0,1.0
6,2024-02-19,1,1.0,1.0,2024-02-19 07:27:02,2024-02-19 13:51:53,Monday,332.0,421.0,371,...,0,0,0,0,0,0,0,0,0,2.0
7,2024-02-18,1,1.0,1.0,2024-02-17 18:29:19,2024-02-17 21:51:25,Sunday,1019.0,1238.0,1541,...,0,0,0,0,0,0,0,0,0,6.0
8,2024-02-17,1,1.0,1.0,2024-02-17 07:49:40,2024-02-17 12:27:42,Saturday,635.0,1443.0,1143,...,0,0,0,0,0,0,0,0,0,4.0
9,2024-02-16,1,1.0,1.0,2024-02-16 07:09:58,2024-02-16 13:28:37,Friday,509.0,700.0,822,...,0,0,0,0,0,0,0,0,0,8.0


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 72 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Puzzle_Date                   2247 non-null   datetime64[ns]
 1   GMS_Completed                 2247 non-null   int64         
 2   IS1_Completed                 1203 non-null   float64       
 3   IS2_Completed                 1202 non-null   float64       
 4   Completed_Date (IS1)          1203 non-null   datetime64[ns]
 5   Completed_Date (IS2)          1202 non-null   datetime64[ns]
 6   Day_of_Week                   2247 non-null   object        
 7   Time (s) (IS1)                1203 non-null   float64       
 8   Time (s) (IS2)                1202 non-null   float64       
 9   Global_Median_Solver_Time(s)  2247 non-null   int64         
 10  Difficulty                    2247 non-null   object        
 11  Median_Solver%_AVG_PM         

## Data Cleaning 

In [6]:
# Rename some columns for brevity and clarity
df1.rename(columns={'Constructors(by seniority)': 'Constructors', 'Day_of_Week': 'DOW', 'Puzzle_Date': 'P_Date', 'Global_Median_Solver_Time(s)': 'GMST(s)', 'Completed_Date (IS2)': 'Comp_Date', 'Time (s) (IS2)': 'IS2_ST(s)'}, inplace=True)

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 72 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   P_Date                        2247 non-null   datetime64[ns]
 1   GMS_Completed                 2247 non-null   int64         
 2   IS1_Completed                 1203 non-null   float64       
 3   IS2_Completed                 1202 non-null   float64       
 4   Completed_Date (IS1)          1203 non-null   datetime64[ns]
 5   Comp_Date                     1202 non-null   datetime64[ns]
 6   DOW                           2247 non-null   object        
 7   Time (s) (IS1)                1203 non-null   float64       
 8   IS2_ST(s)                     1202 non-null   float64       
 9   GMST(s)                       2247 non-null   int64         
 10  Difficulty                    2247 non-null   object        
 11  Median_Solver%_AVG_PM         

In [8]:
# Drop columns that we don't need
# Note: 'Unusual_Sym captures/collapses all of the other symmetry columns'
df2 = df1.drop(['GMS_Completed', 'IS1_Completed', 'Completed_Date (IS1)', 'Time (s) (IS1)', 'Difficulty', 'Median_Solver%_AVG_PM', 'No_Constructors', '1st_Constructor', '1st_Constuctor_Gender', '1st_Constructor_Puzzle_N', '1st_Constructor_Debut', '1st_Constructor_Scrabble_Avg', '1st_Constructor_Fresh%_Avg', '2nd_Constructor', '2nd_Constuctor_Gender', '2nd_Constructor_Puzzle_N', '2nd_Constructor_Debut', '2nd_Constructor_Scrabble_Avg', '2nd_Constructor_Fresh%_Avg', '3rd_Constructor', '3rd_Constuctor_Gender', '3rd_Constructor_Puzzle_N', '3rd_Constructor_Debut', '3rd_Constructor_Scrabble_Avg', '3rd_Constructor_Fresh%_Avg', 'Spans', '2_Stacks', '3_4_Stacks', 'Mirror_Sym', 'UD_Sym', 'Asym', 'Supersym', '90_180_Rot_Sym', 'LR_UD_Sym', 'Diag_Sym'], axis=1)

In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   P_Date              2247 non-null   datetime64[ns]
 1   IS2_Completed       1202 non-null   float64       
 2   Comp_Date           1202 non-null   datetime64[ns]
 3   DOW                 2247 non-null   object        
 4   IS2_ST(s)           1202 non-null   float64       
 5   GMST(s)             2247 non-null   int64         
 6   Grid Size           2247 non-null   int64         
 7   Constructors        2247 non-null   object        
 8   Words               2247 non-null   int64         
 9   Blocks              2247 non-null   int64         
 10  Unused_Letters      2247 non-null   int64         
 11  Stacks              2247 non-null   int64         
 12  Unique_Answers      2247 non-null   int64         
 13  Rebus_Count         2247 non-null   int64       

In [10]:
# Rename some columns for brevity and clarity
df2.rename(columns={'Constructors(by seniority)': 'Constructors', 'Day_of_Week': 'DOW', 'Puzzle_Date': 'P_Date', 'Global_Median_Solver_Time(s)': 'GMST(s)', 'Completed_Date (IS2)': 'Comp_Date', 'Time (s) (IS2)': 'IS1_ST(s)'}, inplace=True)

In [11]:
# A numerical column for puzzle day that starts with Sunday and ends with Saturday
df2.loc[(df2["DOW"] == "Sunday"), "DOW_num"] = 1 
df2.loc[(df2["DOW"] == "Monday"), "DOW_num"] = 2
df2.loc[(df2["DOW"] == "Tuesday"), "DOW_num"] = 3
df2.loc[(df2["DOW"] == "Wednesday"), "DOW_num"] = 4
df2.loc[(df2["DOW"] == "Thursday"), "DOW_num"] = 5
df2.loc[(df2["DOW"] == "Friday"), "DOW_num"] = 6
df2.loc[(df2["DOW"] == "Saturday"), "DOW_num"] = 7

In [12]:
# It will be useful generally to have puzzle issue dates and completion dates as strings in other columns along with the datetimes 
df2['P_Date_str'] = df2['P_Date'].dt.strftime('%Y-%m-%d') #we want datetime as a string to index into here
df2['Comp_Date_str'] = df2['Comp_Date'].dt.strftime('%Y-%m-%d') #we want datetime as a string to index into here

col = df2.pop('P_Date_str')
df2.insert(1, col.name, col)

col = df2.pop('Comp_Date_str')
df2.insert(4, col.name, col)

col = df2.pop('DOW_num')
df2.insert(6, col.name, col)

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 40 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   P_Date              2247 non-null   datetime64[ns]
 1   P_Date_str          2247 non-null   object        
 2   IS2_Completed       1202 non-null   float64       
 3   Comp_Date           1202 non-null   datetime64[ns]
 4   Comp_Date_str       1202 non-null   object        
 5   DOW                 2247 non-null   object        
 6   DOW_num             2247 non-null   float64       
 7   IS2_ST(s)           1202 non-null   float64       
 8   GMST(s)             2247 non-null   int64         
 9   Grid Size           2247 non-null   int64         
 10  Constructors        2247 non-null   object        
 11  Words               2247 non-null   int64         
 12  Blocks              2247 non-null   int64         
 13  Unused_Letters      2247 non-null   int64       

## Feature Creation

Before dropping puzzles not completed by IS2, we need to make a few calculations on the Global Median Solver times that will be used to make 'Strength of Schedule' adjustments on IS2 recent history features.

In [14]:
# Convert global median solve times (GMTs) per puzzle to minutes and drop times in seconds
df2["GMST(m)"] = df2["GMST(s)"]/60  #GMST = Global Median solve times
df2.drop(['GMST(s)'], axis=1, inplace=True)

# Individual Solver 1 (IS1) solve times per puzzle to minutes and drop times in seconds
df2["IS2_ST(m)"] = df2["IS2_ST(s)"]/60  #GMST = Global Median solve times
df2.drop(['IS2_ST(s)'], axis=1, inplace=True)

col = df2.pop('IS2_ST(m)')
df2.insert(8, col.name, col)

col = df2.pop('GMST(m)')
df2.insert(9, col.name, col)

In [15]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 40 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   P_Date              2247 non-null   datetime64[ns]
 1   P_Date_str          2247 non-null   object        
 2   IS2_Completed       1202 non-null   float64       
 3   Comp_Date           1202 non-null   datetime64[ns]
 4   Comp_Date_str       1202 non-null   object        
 5   DOW                 2247 non-null   object        
 6   DOW_num             2247 non-null   float64       
 7   Grid Size           2247 non-null   int64         
 8   IS2_ST(m)           1202 non-null   float64       
 9   GMST(m)             2247 non-null   float64       
 10  Constructors        2247 non-null   object        
 11  Words               2247 non-null   int64         
 12  Blocks              2247 non-null   int64         
 13  Unused_Letters      2247 non-null   int64       

In [16]:
# Calculates NON-decay-time weighted GMS performance on the previous 40 day-specific puzzles. This feeds into IS2 'Strength of Schedule' calculation below, but is not itself included in the modeling feature set.
# This time integration window was shown to be optimal for modeling GMS performance in the GMS predictive modeling project preceding the IS2 version
# First iteration of full model performed slightly better with no decay time weighting, so it was removed (gradual decay-weighting WAS used in GMS model, as it did show an ever-so-slight advantage over no weighting in that system)

df2 = df2.sort_values(by=['DOW', 'P_Date'], ascending = False)

# Gradual decay 
# w = np.arange(1,41)
# w = list(w)

# No decay
w = np.ones(40)
w = list(w)

df2["GMS_pds_l40_ndw_1"] = df2.groupby(['DOW'])['GMST(m)'].shift(-1)*w[0]
df2["GMS_pds_l40_ndw_2"] = df2.groupby(['DOW'])['GMST(m)'].shift(-2)*w[1]
df2["GMS_pds_l40_ndw_3"] = df2.groupby(['DOW'])['GMST(m)'].shift(-3)*w[2]
df2["GMS_pds_l40_ndw_4"] = df2.groupby(['DOW'])['GMST(m)'].shift(-4)*w[3]
df2["GMS_pds_l40_ndw_5"] = df2.groupby(['DOW'])['GMST(m)'].shift(-5)*w[4]
df2["GMS_pds_l40_ndw_6"] = df2.groupby(['DOW'])['GMST(m)'].shift(-6)*w[5]
df2["GMS_pds_l40_ndw_7"] = df2.groupby(['DOW'])['GMST(m)'].shift(-7)*w[6]
df2["GMS_pds_l40_ndw_8"] = df2.groupby(['DOW'])['GMST(m)'].shift(-8)*w[7]
df2["GMS_pds_l40_ndw_9"] = df2.groupby(['DOW'])['GMST(m)'].shift(-9)*w[8]
df2["GMS_pds_l40_ndw_10"] = df2.groupby(['DOW'])['GMST(m)'].shift(-10)*w[9]
df2["GMS_pds_l40_ndw_11"] = df2.groupby(['DOW'])['GMST(m)'].shift(-11)*w[10]
df2["GMS_pds_l40_ndw_12"] = df2.groupby(['DOW'])['GMST(m)'].shift(-12)*w[11]
df2["GMS_pds_l40_ndw_13"] = df2.groupby(['DOW'])['GMST(m)'].shift(-13)*w[12]
df2["GMS_pds_l40_ndw_14"] = df2.groupby(['DOW'])['GMST(m)'].shift(-14)*w[13]
df2["GMS_pds_l40_ndw_15"] = df2.groupby(['DOW'])['GMST(m)'].shift(-15)*w[14]
df2["GMS_pds_l40_ndw_16"] = df2.groupby(['DOW'])['GMST(m)'].shift(-16)*w[15]
df2["GMS_pds_l40_ndw_17"] = df2.groupby(['DOW'])['GMST(m)'].shift(-17)*w[16]
df2["GMS_pds_l40_ndw_18"] = df2.groupby(['DOW'])['GMST(m)'].shift(-18)*w[17]
df2["GMS_pds_l40_ndw_19"] = df2.groupby(['DOW'])['GMST(m)'].shift(-19)*w[18]
df2["GMS_pds_l40_ndw_20"] = df2.groupby(['DOW'])['GMST(m)'].shift(-20)*w[19]
df2["GMS_pds_l40_ndw_21"] = df2.groupby(['DOW'])['GMST(m)'].shift(-21)*w[20]
df2["GMS_pds_l40_ndw_22"] = df2.groupby(['DOW'])['GMST(m)'].shift(-22)*w[21]
df2["GMS_pds_l40_ndw_23"] = df2.groupby(['DOW'])['GMST(m)'].shift(-23)*w[22]
df2["GMS_pds_l40_ndw_24"] = df2.groupby(['DOW'])['GMST(m)'].shift(-24)*w[23]
df2["GMS_pds_l40_ndw_25"] = df2.groupby(['DOW'])['GMST(m)'].shift(-25)*w[24]
df2["GMS_pds_l40_ndw_26"] = df2.groupby(['DOW'])['GMST(m)'].shift(-26)*w[25]
df2["GMS_pds_l40_ndw_27"] = df2.groupby(['DOW'])['GMST(m)'].shift(-27)*w[26]
df2["GMS_pds_l40_ndw_28"] = df2.groupby(['DOW'])['GMST(m)'].shift(-28)*w[27]
df2["GMS_pds_l40_ndw_29"] = df2.groupby(['DOW'])['GMST(m)'].shift(-29)*w[28]
df2["GMS_pds_l40_ndw_30"] = df2.groupby(['DOW'])['GMST(m)'].shift(-30)*w[29]
df2["GMS_pds_l40_ndw_31"] = df2.groupby(['DOW'])['GMST(m)'].shift(-31)*w[30]
df2["GMS_pds_l40_ndw_32"] = df2.groupby(['DOW'])['GMST(m)'].shift(-32)*w[31]
df2["GMS_pds_l40_ndw_33"] = df2.groupby(['DOW'])['GMST(m)'].shift(-33)*w[32]
df2["GMS_pds_l40_ndw_34"] = df2.groupby(['DOW'])['GMST(m)'].shift(-34)*w[33]
df2["GMS_pds_l40_ndw_35"] = df2.groupby(['DOW'])['GMST(m)'].shift(-35)*w[34]
df2["GMS_pds_l40_ndw_36"] = df2.groupby(['DOW'])['GMST(m)'].shift(-36)*w[35]
df2["GMS_pds_l40_ndw_37"] = df2.groupby(['DOW'])['GMST(m)'].shift(-37)*w[36]
df2["GMS_pds_l40_ndw_38"] = df2.groupby(['DOW'])['GMST(m)'].shift(-38)*w[37]
df2["GMS_pds_l40_ndw_39"] = df2.groupby(['DOW'])['GMST(m)'].shift(-39)*w[38]
df2["GMS_pds_l40_ndw_40"] = df2.groupby(['DOW'])['GMST(m)'].shift(-40)*w[39]


df2["GMS_pds_l40_ndw_1_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-1)/df2.groupby(['DOW'])['GMST(m)'].shift(-1))*w[0]
df2["GMS_pds_l40_ndw_2_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-2)/df2.groupby(['DOW'])['GMST(m)'].shift(-2))*w[1]
df2["GMS_pds_l40_ndw_3_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-3)/df2.groupby(['DOW'])['GMST(m)'].shift(-3))*w[2]
df2["GMS_pds_l40_ndw_4_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-4)/df2.groupby(['DOW'])['GMST(m)'].shift(-4))*w[3]
df2["GMS_pds_l40_ndw_5_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-5)/df2.groupby(['DOW'])['GMST(m)'].shift(-5))*w[4]
df2["GMS_pds_l40_ndw_6_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-6)/df2.groupby(['DOW'])['GMST(m)'].shift(-6))*w[5]
df2["GMS_pds_l40_ndw_7_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-7)/df2.groupby(['DOW'])['GMST(m)'].shift(-7))*w[6]
df2["GMS_pds_l40_ndw_8_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-8)/df2.groupby(['DOW'])['GMST(m)'].shift(-8))*w[7]
df2["GMS_pds_l40_ndw_9_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-9)/df2.groupby(['DOW'])['GMST(m)'].shift(-9))*w[8]
df2["GMS_pds_l40_ndw_10_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-10)/df2.groupby(['DOW'])['GMST(m)'].shift(-10))*w[9]
df2["GMS_pds_l40_ndw_11_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-11)/df2.groupby(['DOW'])['GMST(m)'].shift(-11))*w[10]
df2["GMS_pds_l40_ndw_12_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-12)/df2.groupby(['DOW'])['GMST(m)'].shift(-12))*w[11]
df2["GMS_pds_l40_ndw_13_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-13)/df2.groupby(['DOW'])['GMST(m)'].shift(-13))*w[12]
df2["GMS_pds_l40_ndw_14_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-14)/df2.groupby(['DOW'])['GMST(m)'].shift(-14))*w[13]
df2["GMS_pds_l40_ndw_15_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-15)/df2.groupby(['DOW'])['GMST(m)'].shift(-15))*w[14]
df2["GMS_pds_l40_ndw_16_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-16)/df2.groupby(['DOW'])['GMST(m)'].shift(-16))*w[15]
df2["GMS_pds_l40_ndw_17_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-17)/df2.groupby(['DOW'])['GMST(m)'].shift(-17))*w[16]
df2["GMS_pds_l40_ndw_18_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-18)/df2.groupby(['DOW'])['GMST(m)'].shift(-18))*w[17]
df2["GMS_pds_l40_ndw_19_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-19)/df2.groupby(['DOW'])['GMST(m)'].shift(-19))*w[18]
df2["GMS_pds_l40_ndw_20_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-20)/df2.groupby(['DOW'])['GMST(m)'].shift(-20))*w[19]
df2["GMS_pds_l40_ndw_21_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-21)/df2.groupby(['DOW'])['GMST(m)'].shift(-21))*w[20]
df2["GMS_pds_l40_ndw_22_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-22)/df2.groupby(['DOW'])['GMST(m)'].shift(-22))*w[21]
df2["GMS_pds_l40_ndw_23_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-23)/df2.groupby(['DOW'])['GMST(m)'].shift(-23))*w[22]
df2["GMS_pds_l40_ndw_24_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-24)/df2.groupby(['DOW'])['GMST(m)'].shift(-24))*w[23]
df2["GMS_pds_l40_ndw_25_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-25)/df2.groupby(['DOW'])['GMST(m)'].shift(-25))*w[24]
df2["GMS_pds_l40_ndw_26_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-26)/df2.groupby(['DOW'])['GMST(m)'].shift(-26))*w[25]
df2["GMS_pds_l40_ndw_27_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-27)/df2.groupby(['DOW'])['GMST(m)'].shift(-27))*w[26]
df2["GMS_pds_l40_ndw_28_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-28)/df2.groupby(['DOW'])['GMST(m)'].shift(-28))*w[27]
df2["GMS_pds_l40_ndw_29_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-29)/df2.groupby(['DOW'])['GMST(m)'].shift(-29))*w[28]
df2["GMS_pds_l40_ndw_30_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-30)/df2.groupby(['DOW'])['GMST(m)'].shift(-30))*w[29]
df2["GMS_pds_l40_ndw_31_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-31)/df2.groupby(['DOW'])['GMST(m)'].shift(-31))*w[30]
df2["GMS_pds_l40_ndw_32_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-32)/df2.groupby(['DOW'])['GMST(m)'].shift(-32))*w[31]
df2["GMS_pds_l40_ndw_33_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-33)/df2.groupby(['DOW'])['GMST(m)'].shift(-33))*w[32]
df2["GMS_pds_l40_ndw_34_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-34)/df2.groupby(['DOW'])['GMST(m)'].shift(-34))*w[33]
df2["GMS_pds_l40_ndw_35_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-35)/df2.groupby(['DOW'])['GMST(m)'].shift(-35))*w[34]
df2["GMS_pds_l40_ndw_36_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-36)/df2.groupby(['DOW'])['GMST(m)'].shift(-36))*w[35]
df2["GMS_pds_l40_ndw_37_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-37)/df2.groupby(['DOW'])['GMST(m)'].shift(-37))*w[36]
df2["GMS_pds_l40_ndw_38_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-38)/df2.groupby(['DOW'])['GMST(m)'].shift(-38))*w[37]
df2["GMS_pds_l40_ndw_39_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-39)/df2.groupby(['DOW'])['GMST(m)'].shift(-39))*w[38]
df2["GMS_pds_l40_ndw_40_ct"] = (df2.groupby(['DOW'])['GMST(m)'].shift(-40)/df2.groupby(['DOW'])['GMST(m)'].shift(-40))*w[39]


df2["GMS_pds_l40_ws"] = df2[["GMS_pds_l40_ndw_1", "GMS_pds_l40_ndw_2", "GMS_pds_l40_ndw_3", "GMS_pds_l40_ndw_4", "GMS_pds_l40_ndw_5", "GMS_pds_l40_ndw_6", "GMS_pds_l40_ndw_7", "GMS_pds_l40_ndw_8", "GMS_pds_l40_ndw_9", "GMS_pds_l40_ndw_10", "GMS_pds_l40_ndw_11", "GMS_pds_l40_ndw_12", "GMS_pds_l40_ndw_13", "GMS_pds_l40_ndw_14", "GMS_pds_l40_ndw_15", "GMS_pds_l40_ndw_16", "GMS_pds_l40_ndw_17", "GMS_pds_l40_ndw_18", "GMS_pds_l40_ndw_19", "GMS_pds_l40_ndw_20", "GMS_pds_l40_ndw_21", "GMS_pds_l40_ndw_22", "GMS_pds_l40_ndw_23", "GMS_pds_l40_ndw_24", "GMS_pds_l40_ndw_25",
                            "GMS_pds_l40_ndw_26", "GMS_pds_l40_ndw_27", "GMS_pds_l40_ndw_28", "GMS_pds_l40_ndw_29", "GMS_pds_l40_ndw_30", "GMS_pds_l40_ndw_31", "GMS_pds_l40_ndw_32", "GMS_pds_l40_ndw_33", "GMS_pds_l40_ndw_34", "GMS_pds_l40_ndw_35", "GMS_pds_l40_ndw_36", "GMS_pds_l40_ndw_37", "GMS_pds_l40_ndw_38", "GMS_pds_l40_ndw_39", "GMS_pds_l40_ndw_40"]].sum(axis=1)
df2["GMS_pds_l40_ws_ct"] = df2[["GMS_pds_l40_ndw_1_ct", "GMS_pds_l40_ndw_2_ct", "GMS_pds_l40_ndw_3_ct", "GMS_pds_l40_ndw_4_ct", "GMS_pds_l40_ndw_5_ct", "GMS_pds_l40_ndw_6_ct", "GMS_pds_l40_ndw_7_ct", "GMS_pds_l40_ndw_8_ct", "GMS_pds_l40_ndw_9_ct", "GMS_pds_l40_ndw_10_ct", "GMS_pds_l40_ndw_11_ct", "GMS_pds_l40_ndw_12_ct", "GMS_pds_l40_ndw_13_ct", "GMS_pds_l40_ndw_14_ct", "GMS_pds_l40_ndw_15_ct", "GMS_pds_l40_ndw_16_ct", "GMS_pds_l40_ndw_17_ct", "GMS_pds_l40_ndw_18_ct", "GMS_pds_l40_ndw_19_ct", "GMS_pds_l40_ndw_20_ct", "GMS_pds_l40_ndw_21_ct", "GMS_pds_l40_ndw_22_ct", "GMS_pds_l40_ndw_23_ct", "GMS_pds_l40_ndw_24_ct", "GMS_pds_l40_ndw_25_ct",
                               "GMS_pds_l40_ndw_26_ct", "GMS_pds_l40_ndw_27_ct", "GMS_pds_l40_ndw_28_ct", "GMS_pds_l40_ndw_29_ct", "GMS_pds_l40_ndw_30_ct", "GMS_pds_l40_ndw_31_ct", "GMS_pds_l40_ndw_32_ct", "GMS_pds_l40_ndw_33_ct", "GMS_pds_l40_ndw_34_ct", "GMS_pds_l40_ndw_35_ct", "GMS_pds_l40_ndw_36_ct", "GMS_pds_l40_ndw_37_ct", "GMS_pds_l40_ndw_38_ct", "GMS_pds_l40_ndw_39_ct", "GMS_pds_l40_ndw_40_ct"]].sum(axis=1)
df2["GMS_pds_l40_ndw"] = df2["GMS_pds_l40_ws"]/df2["GMS_pds_l40_ws_ct"]

# Deleting transient columns
df2 = df2.drop(["GMS_pds_l40_ndw_1", "GMS_pds_l40_ndw_2", "GMS_pds_l40_ndw_3", "GMS_pds_l40_ndw_4", "GMS_pds_l40_ndw_5", "GMS_pds_l40_ndw_6", "GMS_pds_l40_ndw_7", "GMS_pds_l40_ndw_8", "GMS_pds_l40_ndw_9", "GMS_pds_l40_ndw_10", "GMS_pds_l40_ndw_11", "GMS_pds_l40_ndw_12", "GMS_pds_l40_ndw_13", "GMS_pds_l40_ndw_14", "GMS_pds_l40_ndw_15", "GMS_pds_l40_ndw_16", "GMS_pds_l40_ndw_17", "GMS_pds_l40_ndw_18", "GMS_pds_l40_ndw_19", "GMS_pds_l40_ndw_20", "GMS_pds_l40_ndw_21", "GMS_pds_l40_ndw_22", "GMS_pds_l40_ndw_23", "GMS_pds_l40_ndw_24", "GMS_pds_l40_ndw_25", "GMS_pds_l40_ndw_26", "GMS_pds_l40_ndw_27", "GMS_pds_l40_ndw_28", "GMS_pds_l40_ndw_29", "GMS_pds_l40_ndw_30", "GMS_pds_l40_ndw_31", "GMS_pds_l40_ndw_32", "GMS_pds_l40_ndw_33", "GMS_pds_l40_ndw_34", "GMS_pds_l40_ndw_35", "GMS_pds_l40_ndw_36", "GMS_pds_l40_ndw_37", "GMS_pds_l40_ndw_38", "GMS_pds_l40_ndw_39", "GMS_pds_l40_ndw_40",  
                "GMS_pds_l40_ndw_1_ct", "GMS_pds_l40_ndw_2_ct", "GMS_pds_l40_ndw_3_ct", "GMS_pds_l40_ndw_4_ct", "GMS_pds_l40_ndw_5_ct", "GMS_pds_l40_ndw_6_ct", "GMS_pds_l40_ndw_7_ct", "GMS_pds_l40_ndw_8_ct", "GMS_pds_l40_ndw_9_ct", "GMS_pds_l40_ndw_10_ct", "GMS_pds_l40_ndw_11_ct", "GMS_pds_l40_ndw_12_ct", "GMS_pds_l40_ndw_13_ct", "GMS_pds_l40_ndw_14_ct", "GMS_pds_l40_ndw_15_ct", "GMS_pds_l40_ndw_16_ct", "GMS_pds_l40_ndw_17_ct", "GMS_pds_l40_ndw_18_ct", "GMS_pds_l40_ndw_19_ct", "GMS_pds_l40_ndw_20_ct", "GMS_pds_l40_ndw_21_ct", "GMS_pds_l40_ndw_22_ct", "GMS_pds_l40_ndw_23_ct", "GMS_pds_l40_ndw_24_ct", "GMS_pds_l40_ndw_25_ct", "GMS_pds_l40_ndw_26_ct", "GMS_pds_l40_ndw_27_ct", "GMS_pds_l40_ndw_28_ct", "GMS_pds_l40_ndw_29_ct", "GMS_pds_l40_ndw_30_ct", "GMS_pds_l40_ndw_31_ct", "GMS_pds_l40_ndw_32_ct", "GMS_pds_l40_ndw_33_ct", "GMS_pds_l40_ndw_34_ct", "GMS_pds_l40_ndw_35_ct", "GMS_pds_l40_ndw_36_ct", "GMS_pds_l40_ndw_37_ct", "GMS_pds_l40_ndw_38_ct", "GMS_pds_l40_ndw_39_ct", "GMS_pds_l40_ndw_40_ct",
                "GMS_pds_l40_ws", "GMS_pds_l40_ws_ct"], axis = 1)

In [17]:
# GMST_to_GMS_pds_l40_ndw_ratio
# Now we want the ratio of a given GMS solve time to their recent form on that puzzle day. 
# This ratio will be used as the 'strength of schedule' adjustor for IS2's recent puzzle day-specific performance
# A ratio higher than 1 indicates a puzzle that was relatively tough for the GMS compared to their recent form, and vice versa.

df2['GMST_to_GMS_pds_l40_ndw_ratio'] = df2['GMST(m)']/df2['GMS_pds_l40_ndw']

In [18]:
col = df2.pop('GMS_pds_l40_ndw')
df2.insert(10, col.name, col)

col = df2.pop('GMST_to_GMS_pds_l40_ndw_ratio')
df2.insert(11, col.name, col)

In [19]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2247 entries, 4 to 2242
Data columns (total 42 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   P_Date                         2247 non-null   datetime64[ns]
 1   P_Date_str                     2247 non-null   object        
 2   IS2_Completed                  1202 non-null   float64       
 3   Comp_Date                      1202 non-null   datetime64[ns]
 4   Comp_Date_str                  1202 non-null   object        
 5   DOW                            2247 non-null   object        
 6   DOW_num                        2247 non-null   float64       
 7   Grid Size                      2247 non-null   int64         
 8   IS2_ST(m)                      1202 non-null   float64       
 9   GMST(m)                        2247 non-null   float64       
 10  GMS_pds_l40_ndw                2240 non-null   float64       
 11  GMST_to_GMS_pds_l

In [20]:
# Checkpoint
#df2.to_csv('../data/df2.csv', index=False)

In [21]:
# Now we can pare down to only puzzles that IS1 has solved 
df2 = df2.dropna(subset=['IS2_ST(m)'])

In [22]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 4 to 2137
Data columns (total 42 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   P_Date                         1202 non-null   datetime64[ns]
 1   P_Date_str                     1202 non-null   object        
 2   IS2_Completed                  1202 non-null   float64       
 3   Comp_Date                      1202 non-null   datetime64[ns]
 4   Comp_Date_str                  1202 non-null   object        
 5   DOW                            1202 non-null   object        
 6   DOW_num                        1202 non-null   float64       
 7   Grid Size                      1202 non-null   int64         
 8   IS2_ST(m)                      1202 non-null   float64       
 9   GMST(m)                        1202 non-null   float64       
 10  GMS_pds_l40_ndw                1202 non-null   float64       
 11  GMST_to_GMS_pds_l

In [23]:
# Checkpoint
#df2.to_csv('../data/df2.csv', index=False)

## Compute Decay-Time Weighted Past Moving Averages for IS2

In [24]:
df3 = df2.copy()

In [25]:
#IS_pds_l10_ndw
#Determined in DTW testing preprocessing step to be the optimal temporal integration window for this solver

#Provides NON decay-weighted(ndw), puzzle day-specific (pds) mean solve time performance for IS2 over the previous 10 puzzles relative to a given puzzle
# Note that the sort is by completion date for IS2, as completion date was available 
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# Gradual decay
# w = np.arange(1,11)
# w = list(w)

# No decay
w = np.ones(10)
w = list(w)

df3["IS_pds_l10_ndw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
df3["IS_pds_l10_ndw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
df3["IS_pds_l10_ndw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
df3["IS_pds_l10_ndw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
df3["IS_pds_l10_ndw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
df3["IS_pds_l10_ndw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
df3["IS_pds_l10_ndw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
df3["IS_pds_l10_ndw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
df3["IS_pds_l10_ndw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
df3["IS_pds_l10_ndw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]


df3["IS_pds_l10_ndw_1_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1))*w[0]
df3["IS_pds_l10_ndw_2_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2))*w[1]
df3["IS_pds_l10_ndw_3_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3))*w[2]
df3["IS_pds_l10_ndw_4_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4))*w[3]
df3["IS_pds_l10_ndw_5_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5))*w[4]
df3["IS_pds_l10_ndw_6_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6))*w[5]
df3["IS_pds_l10_ndw_7_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7))*w[6]
df3["IS_pds_l10_ndw_8_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8))*w[7]
df3["IS_pds_l10_ndw_9_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9))*w[8]
df3["IS_pds_l10_ndw_10_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10))*w[9]

df3["IS_pds_l10_ws"] = df3[["IS_pds_l10_ndw_1", "IS_pds_l10_ndw_2", "IS_pds_l10_ndw_3", "IS_pds_l10_ndw_4", "IS_pds_l10_ndw_5", "IS_pds_l10_ndw_6", "IS_pds_l10_ndw_7", "IS_pds_l10_ndw_8", "IS_pds_l10_ndw_9", "IS_pds_l10_ndw_10"]].sum(axis=1)
df3["IS_pds_l10_ws_ct"] = df3[["IS_pds_l10_ndw_1_ct", "IS_pds_l10_ndw_2_ct", "IS_pds_l10_ndw_3_ct", "IS_pds_l10_ndw_4_ct", "IS_pds_l10_ndw_5_ct", "IS_pds_l10_ndw_6_ct", "IS_pds_l10_ndw_7_ct", "IS_pds_l10_ndw_8_ct", "IS_pds_l10_ndw_9_ct", "IS_pds_l10_ndw_10_ct"]].sum(axis=1)
df3["IS_pds_l10_ndw"] = df3["IS_pds_l10_ws"]/df3["IS_pds_l10_ws_ct"]

# Deleting transient columns
df3 = df3.drop(["IS_pds_l10_ndw_1", "IS_pds_l10_ndw_2", "IS_pds_l10_ndw_3", "IS_pds_l10_ndw_4", "IS_pds_l10_ndw_5", "IS_pds_l10_ndw_6", "IS_pds_l10_ndw_7", "IS_pds_l10_ndw_8", "IS_pds_l10_ndw_9", "IS_pds_l10_ndw_10", "IS_pds_l10_ws", "IS_pds_l10_ndw_1_ct", "IS_pds_l10_ndw_2_ct", "IS_pds_l10_ndw_3_ct", "IS_pds_l10_ndw_4_ct", "IS_pds_l10_ndw_5_ct", "IS_pds_l10_ndw_6_ct", "IS_pds_l10_ndw_7_ct", "IS_pds_l10_ndw_8_ct", "IS_pds_l10_ndw_9_ct", "IS_pds_l10_ndw_10_ct", "IS_pds_l10_ws_ct"], axis = 1)

In [26]:
col = df3.pop('IS_pds_l10_ndw')
df3.insert(9, col.name, col)

In [27]:
#IS_pds_l25_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for IS2 over the previous 25 puzzles relative to a given puzzle
# Note that the sort is by completion date for IS2, as completion date was avaiable 
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# # Gradual decay
# w = np.arange(1,26)
# w = list(w)

# # No decay
# # w = np.ones(25)
# # w = list(w)

# df3["IS_pds_l25_dw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
# df3["IS_pds_l25_dw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
# df3["IS_pds_l25_dw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
# df3["IS_pds_l25_dw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
# df3["IS_pds_l25_dw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
# df3["IS_pds_l25_dw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
# df3["IS_pds_l25_dw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
# df3["IS_pds_l25_dw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
# df3["IS_pds_l25_dw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
# df3["IS_pds_l25_dw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]
# df3["IS_pds_l25_dw_11"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)*w[10]
# df3["IS_pds_l25_dw_12"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)*w[11]
# df3["IS_pds_l25_dw_13"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)*w[12]
# df3["IS_pds_l25_dw_14"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)*w[13]
# df3["IS_pds_l25_dw_15"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)*w[14]
# df3["IS_pds_l25_dw_16"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)*w[15]
# df3["IS_pds_l25_dw_17"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)*w[16]
# df3["IS_pds_l25_dw_18"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)*w[17]
# df3["IS_pds_l25_dw_19"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)*w[18]
# df3["IS_pds_l25_dw_20"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)*w[19]
# df3["IS_pds_l25_dw_21"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)*w[20]
# df3["IS_pds_l25_dw_22"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)*w[21]
# df3["IS_pds_l25_dw_23"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)*w[22]
# df3["IS_pds_l25_dw_24"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)*w[23]
# df3["IS_pds_l25_dw_25"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)*w[24]

# df3["IS_pds_l25_dw_1_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1))*w[0]
# df3["IS_pds_l25_dw_2_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2))*w[1]
# df3["IS_pds_l25_dw_3_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3))*w[2]
# df3["IS_pds_l25_dw_4_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4))*w[3]
# df3["IS_pds_l25_dw_5_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5))*w[4]
# df3["IS_pds_l25_dw_6_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6))*w[5]
# df3["IS_pds_l25_dw_7_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7))*w[6]
# df3["IS_pds_l25_dw_8_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8))*w[7]
# df3["IS_pds_l25_dw_9_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9))*w[8]
# df3["IS_pds_l25_dw_10_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10))*w[9]
# df3["IS_pds_l25_dw_11_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11))*w[10]
# df3["IS_pds_l25_dw_12_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12))*w[11]
# df3["IS_pds_l25_dw_13_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13))*w[12]
# df3["IS_pds_l25_dw_14_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14))*w[13]
# df3["IS_pds_l25_dw_15_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15))*w[14]
# df3["IS_pds_l25_dw_16_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16))*w[15]
# df3["IS_pds_l25_dw_17_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17))*w[16]
# df3["IS_pds_l25_dw_18_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18))*w[17]
# df3["IS_pds_l25_dw_19_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19))*w[18]
# df3["IS_pds_l25_dw_20_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20))*w[19]
# df3["IS_pds_l25_dw_21_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21))*w[20]
# df3["IS_pds_l25_dw_22_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22))*w[21]
# df3["IS_pds_l25_dw_23_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23))*w[22]
# df3["IS_pds_l25_dw_24_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24))*w[23]
# df3["IS_pds_l25_dw_25_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25))*w[24]

# df3["IS_pds_l25_ws"] = df3[["IS_pds_l25_dw_1", "IS_pds_l25_dw_2", "IS_pds_l25_dw_3", "IS_pds_l25_dw_4", "IS_pds_l25_dw_5", "IS_pds_l25_dw_6", "IS_pds_l25_dw_7", "IS_pds_l25_dw_8", "IS_pds_l25_dw_9", "IS_pds_l25_dw_10", "IS_pds_l25_dw_11", "IS_pds_l25_dw_12", "IS_pds_l25_dw_13", "IS_pds_l25_dw_14", "IS_pds_l25_dw_15", "IS_pds_l25_dw_16", "IS_pds_l25_dw_17", "IS_pds_l25_dw_18", "IS_pds_l25_dw_19", "IS_pds_l25_dw_20", "IS_pds_l25_dw_21", "IS_pds_l25_dw_22", "IS_pds_l25_dw_23", "IS_pds_l25_dw_24", "IS_pds_l25_dw_25"]].sum(axis=1)
# df3["IS_pds_l25_ws_ct"] = df3[["IS_pds_l25_dw_1_ct", "IS_pds_l25_dw_2_ct", "IS_pds_l25_dw_3_ct", "IS_pds_l25_dw_4_ct", "IS_pds_l25_dw_5_ct", "IS_pds_l25_dw_6_ct", "IS_pds_l25_dw_7_ct", "IS_pds_l25_dw_8_ct", "IS_pds_l25_dw_9_ct", "IS_pds_l25_dw_10_ct", "IS_pds_l25_dw_11_ct", "IS_pds_l25_dw_12_ct", "IS_pds_l25_dw_13_ct", "IS_pds_l25_dw_14_ct", "IS_pds_l25_dw_15_ct", "IS_pds_l25_dw_16_ct", "IS_pds_l25_dw_17_ct", "IS_pds_l25_dw_18_ct", "IS_pds_l25_dw_19_ct", "IS_pds_l25_dw_20_ct", "IS_pds_l25_dw_21_ct", "IS_pds_l25_dw_22_ct", "IS_pds_l25_dw_23_ct", "IS_pds_l25_dw_24_ct", "IS_pds_l25_dw_25_ct"]].sum(axis=1)
# df3["IS_pds_l25_dw"] = df3["IS_pds_l25_ws"]/df3["IS_pds_l25_ws_ct"]

# # Deleting transient columns
# df3 = df3.drop(["IS_pds_l25_dw_1", "IS_pds_l25_dw_2", "IS_pds_l25_dw_3", "IS_pds_l25_dw_4", "IS_pds_l25_dw_5", "IS_pds_l25_dw_6", "IS_pds_l25_dw_7", "IS_pds_l25_dw_8", "IS_pds_l25_dw_9", "IS_pds_l25_dw_10", "IS_pds_l25_dw_11", "IS_pds_l25_dw_12", "IS_pds_l25_dw_13", "IS_pds_l25_dw_14", "IS_pds_l25_dw_15", "IS_pds_l25_dw_16", "IS_pds_l25_dw_17", "IS_pds_l25_dw_18", "IS_pds_l25_dw_19", "IS_pds_l25_dw_20", "IS_pds_l25_dw_21", "IS_pds_l25_dw_22", "IS_pds_l25_dw_23", "IS_pds_l25_dw_24", "IS_pds_l25_dw_25", "IS_pds_l25_dw_1_ct", "IS_pds_l25_dw_2_ct", "IS_pds_l25_dw_3_ct", "IS_pds_l25_dw_4_ct", "IS_pds_l25_dw_5_ct", "IS_pds_l25_dw_6_ct", "IS_pds_l25_dw_7_ct", "IS_pds_l25_dw_8_ct", "IS_pds_l25_dw_9_ct", "IS_pds_l25_dw_10_ct", "IS_pds_l25_dw_11_ct", "IS_pds_l25_dw_12_ct", "IS_pds_l25_dw_13_ct", "IS_pds_l25_dw_14_ct", "IS_pds_l25_dw_15_ct", "IS_pds_l25_dw_16_ct", "IS_pds_l25_dw_17_ct", "IS_pds_l25_dw_18_ct", "IS_pds_l25_dw_19_ct", "IS_pds_l25_dw_20_ct", "IS_pds_l25_dw_21_ct", "IS_pds_l25_dw_22_ct", "IS_pds_l25_dw_23_ct", "IS_pds_l25_dw_24_ct", "IS_pds_l25_dw_25_ct", "IS_pds_l25_ws", "IS_pds_l25_ws_ct"], axis = 1)

In [28]:
#IS_pds_l40_dw
#Provides decay-weighted(dw), puzzle day-specific (pds) mean solve time performance for IS2 over the previous 40 puzzles relative to a given puzzle
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# # Gradual decay
# w = np.arange(1,41)
# w = list(w)

# # No decay
# # w = np.ones(40)
# # w = list(w)

# df3["IS_pds_l40_dw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
# df3["IS_pds_l40_dw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
# df3["IS_pds_l40_dw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
# df3["IS_pds_l40_dw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
# df3["IS_pds_l40_dw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
# df3["IS_pds_l40_dw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
# df3["IS_pds_l40_dw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
# df3["IS_pds_l40_dw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
# df3["IS_pds_l40_dw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
# df3["IS_pds_l40_dw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]
# df3["IS_pds_l40_dw_11"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)*w[10]
# df3["IS_pds_l40_dw_12"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)*w[11]
# df3["IS_pds_l40_dw_13"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)*w[12]
# df3["IS_pds_l40_dw_14"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)*w[13]
# df3["IS_pds_l40_dw_15"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)*w[14]
# df3["IS_pds_l40_dw_16"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)*w[15]
# df3["IS_pds_l40_dw_17"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)*w[16]
# df3["IS_pds_l40_dw_18"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)*w[17]
# df3["IS_pds_l40_dw_19"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)*w[18]
# df3["IS_pds_l40_dw_20"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)*w[19]
# df3["IS_pds_l40_dw_21"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)*w[20]
# df3["IS_pds_l40_dw_22"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)*w[21]
# df3["IS_pds_l40_dw_23"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)*w[22]
# df3["IS_pds_l40_dw_24"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)*w[23]
# df3["IS_pds_l40_dw_25"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)*w[24]
# df3["IS_pds_l40_dw_26"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-26)*w[25]
# df3["IS_pds_l40_dw_27"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-27)*w[26]
# df3["IS_pds_l40_dw_28"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-28)*w[27]
# df3["IS_pds_l40_dw_29"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-29)*w[28]
# df3["IS_pds_l40_dw_30"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-30)*w[29]
# df3["IS_pds_l40_dw_31"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-31)*w[30]
# df3["IS_pds_l40_dw_32"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-32)*w[31]
# df3["IS_pds_l40_dw_33"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-33)*w[32]
# df3["IS_pds_l40_dw_34"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-34)*w[33]
# df3["IS_pds_l40_dw_35"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-35)*w[34]
# df3["IS_pds_l40_dw_36"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-36)*w[35]
# df3["IS_pds_l40_dw_37"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-37)*w[36]
# df3["IS_pds_l40_dw_38"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-38)*w[37]
# df3["IS_pds_l40_dw_39"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-39)*w[38]
# df3["IS_pds_l40_dw_40"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-40)*w[39]


# df3["IS_pds_l40_dw_1_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1))*w[0]
# df3["IS_pds_l40_dw_2_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2))*w[1]
# df3["IS_pds_l40_dw_3_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3))*w[2]
# df3["IS_pds_l40_dw_4_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4))*w[3]
# df3["IS_pds_l40_dw_5_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5))*w[4]
# df3["IS_pds_l40_dw_6_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6))*w[5]
# df3["IS_pds_l40_dw_7_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7))*w[6]
# df3["IS_pds_l40_dw_8_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8))*w[7]
# df3["IS_pds_l40_dw_9_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9))*w[8]
# df3["IS_pds_l40_dw_10_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10))*w[9]
# df3["IS_pds_l40_dw_11_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11))*w[10]
# df3["IS_pds_l40_dw_12_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12))*w[11]
# df3["IS_pds_l40_dw_13_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13))*w[12]
# df3["IS_pds_l40_dw_14_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14))*w[13]
# df3["IS_pds_l40_dw_15_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15))*w[14]
# df3["IS_pds_l40_dw_16_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16))*w[15]
# df3["IS_pds_l40_dw_17_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17))*w[16]
# df3["IS_pds_l40_dw_18_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18))*w[17]
# df3["IS_pds_l40_dw_19_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19))*w[18]
# df3["IS_pds_l40_dw_20_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20))*w[19]
# df3["IS_pds_l40_dw_21_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21))*w[20]
# df3["IS_pds_l40_dw_22_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22))*w[21]
# df3["IS_pds_l40_dw_23_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23))*w[22]
# df3["IS_pds_l40_dw_24_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24))*w[23]
# df3["IS_pds_l40_dw_25_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25))*w[24]
# df3["IS_pds_l40_dw_26_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-26)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-26))*w[25]
# df3["IS_pds_l40_dw_27_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-27)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-27))*w[26]
# df3["IS_pds_l40_dw_28_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-28)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-28))*w[27]
# df3["IS_pds_l40_dw_29_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-29)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-29))*w[28]
# df3["IS_pds_l40_dw_30_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-30)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-30))*w[29]
# df3["IS_pds_l40_dw_31_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-31)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-31))*w[30]
# df3["IS_pds_l40_dw_32_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-32)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-32))*w[31]
# df3["IS_pds_l40_dw_33_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-33)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-33))*w[32]
# df3["IS_pds_l40_dw_34_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-34)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-34))*w[33]
# df3["IS_pds_l40_dw_35_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-35)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-35))*w[34]
# df3["IS_pds_l40_dw_36_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-36)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-36))*w[35]
# df3["IS_pds_l40_dw_37_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-37)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-37))*w[36]
# df3["IS_pds_l40_dw_38_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-38)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-38))*w[37]
# df3["IS_pds_l40_dw_39_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-39)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-39))*w[38]
# df3["IS_pds_l40_dw_40_ct"] = (df3.groupby(['DOW'])['IS2_ST(m)'].shift(-40)/df3.groupby(['DOW'])['IS2_ST(m)'].shift(-40))*w[39]


# df3["IS_pds_l40_ws"] = df3[["IS_pds_l40_dw_1", "IS_pds_l40_dw_2", "IS_pds_l40_dw_3", "IS_pds_l40_dw_4", "IS_pds_l40_dw_5", "IS_pds_l40_dw_6", "IS_pds_l40_dw_7", "IS_pds_l40_dw_8", "IS_pds_l40_dw_9", "IS_pds_l40_dw_10", "IS_pds_l40_dw_11", "IS_pds_l40_dw_12", "IS_pds_l40_dw_13", "IS_pds_l40_dw_14", "IS_pds_l40_dw_15", "IS_pds_l40_dw_16", "IS_pds_l40_dw_17", "IS_pds_l40_dw_18", "IS_pds_l40_dw_19", "IS_pds_l40_dw_20", "IS_pds_l40_dw_21", "IS_pds_l40_dw_22", "IS_pds_l40_dw_23", "IS_pds_l40_dw_24", "IS_pds_l40_dw_25",
#                             "IS_pds_l40_dw_26", "IS_pds_l40_dw_27", "IS_pds_l40_dw_28", "IS_pds_l40_dw_29", "IS_pds_l40_dw_30", "IS_pds_l40_dw_31", "IS_pds_l40_dw_32", "IS_pds_l40_dw_33", "IS_pds_l40_dw_34", "IS_pds_l40_dw_35", "IS_pds_l40_dw_36", "IS_pds_l40_dw_37", "IS_pds_l40_dw_38", "IS_pds_l40_dw_39", "IS_pds_l40_dw_40"]].sum(axis=1)
# df3["IS_pds_l40_ws_ct"] = df3[["IS_pds_l40_dw_1_ct", "IS_pds_l40_dw_2_ct", "IS_pds_l40_dw_3_ct", "IS_pds_l40_dw_4_ct", "IS_pds_l40_dw_5_ct", "IS_pds_l40_dw_6_ct", "IS_pds_l40_dw_7_ct", "IS_pds_l40_dw_8_ct", "IS_pds_l40_dw_9_ct", "IS_pds_l40_dw_10_ct", "IS_pds_l40_dw_11_ct", "IS_pds_l40_dw_12_ct", "IS_pds_l40_dw_13_ct", "IS_pds_l40_dw_14_ct", "IS_pds_l40_dw_15_ct", "IS_pds_l40_dw_16_ct", "IS_pds_l40_dw_17_ct", "IS_pds_l40_dw_18_ct", "IS_pds_l40_dw_19_ct", "IS_pds_l40_dw_20_ct", "IS_pds_l40_dw_21_ct", "IS_pds_l40_dw_22_ct", "IS_pds_l40_dw_23_ct", "IS_pds_l40_dw_24_ct", "IS_pds_l40_dw_25_ct",
#                                "IS_pds_l40_dw_26_ct", "IS_pds_l40_dw_27_ct", "IS_pds_l40_dw_28_ct", "IS_pds_l40_dw_29_ct", "IS_pds_l40_dw_30_ct", "IS_pds_l40_dw_31_ct", "IS_pds_l40_dw_32_ct", "IS_pds_l40_dw_33_ct", "IS_pds_l40_dw_34_ct", "IS_pds_l40_dw_35_ct", "IS_pds_l40_dw_36_ct", "IS_pds_l40_dw_37_ct", "IS_pds_l40_dw_38_ct", "IS_pds_l40_dw_39_ct", "IS_pds_l40_dw_40_ct"]].sum(axis=1)
# df3["IS_pds_l40_dw"] = df3["IS_pds_l40_ws"]/df3["IS_pds_l40_ws_ct"]

# # Deleting transient columns
# df3 = df3.drop(["IS_pds_l40_dw_1", "IS_pds_l40_dw_2", "IS_pds_l40_dw_3", "IS_pds_l40_dw_4", "IS_pds_l40_dw_5", "IS_pds_l40_dw_6", "IS_pds_l40_dw_7", "IS_pds_l40_dw_8", "IS_pds_l40_dw_9", "IS_pds_l40_dw_10", "IS_pds_l40_dw_11", "IS_pds_l40_dw_12", "IS_pds_l40_dw_13", "IS_pds_l40_dw_14", "IS_pds_l40_dw_15", "IS_pds_l40_dw_16", "IS_pds_l40_dw_17", "IS_pds_l40_dw_18", "IS_pds_l40_dw_19", "IS_pds_l40_dw_20", "IS_pds_l40_dw_21", "IS_pds_l40_dw_22", "IS_pds_l40_dw_23", "IS_pds_l40_dw_24", "IS_pds_l40_dw_25", "IS_pds_l40_dw_26", "IS_pds_l40_dw_27", "IS_pds_l40_dw_28", "IS_pds_l40_dw_29", "IS_pds_l40_dw_30", "IS_pds_l40_dw_31", "IS_pds_l40_dw_32", "IS_pds_l40_dw_33", "IS_pds_l40_dw_34", "IS_pds_l40_dw_35", "IS_pds_l40_dw_36", "IS_pds_l40_dw_37", "IS_pds_l40_dw_38", "IS_pds_l40_dw_39", "IS_pds_l40_dw_40",  
#                 "IS_pds_l40_dw_1_ct", "IS_pds_l40_dw_2_ct", "IS_pds_l40_dw_3_ct", "IS_pds_l40_dw_4_ct", "IS_pds_l40_dw_5_ct", "IS_pds_l40_dw_6_ct", "IS_pds_l40_dw_7_ct", "IS_pds_l40_dw_8_ct", "IS_pds_l40_dw_9_ct", "IS_pds_l40_dw_10_ct", "IS_pds_l40_dw_11_ct", "IS_pds_l40_dw_12_ct", "IS_pds_l40_dw_13_ct", "IS_pds_l40_dw_14_ct", "IS_pds_l40_dw_15_ct", "IS_pds_l40_dw_16_ct", "IS_pds_l40_dw_17_ct", "IS_pds_l40_dw_18_ct", "IS_pds_l40_dw_19_ct", "IS_pds_l40_dw_20_ct", "IS_pds_l40_dw_21_ct", "IS_pds_l40_dw_22_ct", "IS_pds_l40_dw_23_ct", "IS_pds_l40_dw_24_ct", "IS_pds_l40_dw_25_ct", "IS_pds_l40_dw_26_ct", "IS_pds_l40_dw_27_ct", "IS_pds_l40_dw_28_ct", "IS_pds_l40_dw_29_ct", "IS_pds_l40_dw_30_ct", "IS_pds_l40_dw_31_ct", "IS_pds_l40_dw_32_ct", "IS_pds_l40_dw_33_ct", "IS_pds_l40_dw_34_ct", "IS_pds_l40_dw_35_ct", "IS_pds_l40_dw_36_ct", "IS_pds_l40_dw_37_ct", "IS_pds_l40_dw_38_ct", "IS_pds_l40_dw_39_ct", "IS_pds_l40_dw_40_ct",
#                 "IS_pds_l40_ws", "IS_pds_l40_ws_ct"], axis = 1)

In [29]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 634 to 2046
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   P_Date                         1202 non-null   datetime64[ns]
 1   P_Date_str                     1202 non-null   object        
 2   IS2_Completed                  1202 non-null   float64       
 3   Comp_Date                      1202 non-null   datetime64[ns]
 4   Comp_Date_str                  1202 non-null   object        
 5   DOW                            1202 non-null   object        
 6   DOW_num                        1202 non-null   float64       
 7   Grid Size                      1202 non-null   int64         
 8   IS2_ST(m)                      1202 non-null   float64       
 9   IS_pds_l10_ndw                 1195 non-null   float64       
 10  GMST(m)                        1202 non-null   float64       
 11  GMS_pds_l40_ndw

In [30]:
#Compute standard deviation over the previous 10 day-specific puzzles 

#IS_pds_l10_stdev
#Provides NON-decay-weighted(dw), puzzle day-specific (pds) standard deviation in solve time performance for IS2 over the previous 10 puzzles relative to a given puzzle
# Note that the sort is by completion date for IS2, as completion date was available 
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# Gradual decay
# w = np.arange(1,11)
# w = list(w)

# No decay
w = np.ones(10)
w = list(w)

df3["IS_pds_l10_ndw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
df3["IS_pds_l10_ndw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
df3["IS_pds_l10_ndw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
df3["IS_pds_l10_ndw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
df3["IS_pds_l10_ndw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
df3["IS_pds_l10_ndw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
df3["IS_pds_l10_ndw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
df3["IS_pds_l10_ndw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
df3["IS_pds_l10_ndw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
df3["IS_pds_l10_ndw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]


df3["IS_pds_l10_stdev"] = df3[["IS_pds_l10_ndw_1", "IS_pds_l10_ndw_2", "IS_pds_l10_ndw_3", "IS_pds_l10_ndw_4", "IS_pds_l10_ndw_5", "IS_pds_l10_ndw_6", "IS_pds_l10_ndw_7", "IS_pds_l10_ndw_8", "IS_pds_l10_ndw_9", "IS_pds_l10_ndw_10"]].std(axis=1)

# Deleting transient columns
df3 = df3.drop(["IS_pds_l10_ndw_1", "IS_pds_l10_ndw_2", "IS_pds_l10_ndw_3", "IS_pds_l10_ndw_4", "IS_pds_l10_ndw_5", "IS_pds_l10_ndw_6", "IS_pds_l10_ndw_7", "IS_pds_l10_ndw_8", "IS_pds_l10_ndw_9", "IS_pds_l10_ndw_10"], axis = 1)

In [31]:
#IS_pds_l25_stdev
#Provides decay-weighted(dw), puzzle day-specific (pds) standard deviation of solve time performance for IS2 over the previous 25 puzzles relative to a given puzzle
# Note that the sort is by completion date for IS2, as completion date was avaiable 
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# # Gradual decay
# w = np.arange(1,26)
# w = list(w)

# # No decay
# # w = np.ones(25)
# # w = list(w)

# df3["IS_pds_l25_dw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
# df3["IS_pds_l25_dw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
# df3["IS_pds_l25_dw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
# df3["IS_pds_l25_dw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
# df3["IS_pds_l25_dw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
# df3["IS_pds_l25_dw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
# df3["IS_pds_l25_dw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
# df3["IS_pds_l25_dw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
# df3["IS_pds_l25_dw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
# df3["IS_pds_l25_dw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]
# df3["IS_pds_l25_dw_11"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)*w[10]
# df3["IS_pds_l25_dw_12"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)*w[11]
# df3["IS_pds_l25_dw_13"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)*w[12]
# df3["IS_pds_l25_dw_14"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)*w[13]
# df3["IS_pds_l25_dw_15"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)*w[14]
# df3["IS_pds_l25_dw_16"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)*w[15]
# df3["IS_pds_l25_dw_17"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)*w[16]
# df3["IS_pds_l25_dw_18"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)*w[17]
# df3["IS_pds_l25_dw_19"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)*w[18]
# df3["IS_pds_l25_dw_20"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)*w[19]
# df3["IS_pds_l25_dw_21"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)*w[20]
# df3["IS_pds_l25_dw_22"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)*w[21]
# df3["IS_pds_l25_dw_23"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)*w[22]
# df3["IS_pds_l25_dw_24"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)*w[23]
# df3["IS_pds_l25_dw_25"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)*w[24]

# df3["IS_pds_l25_stdev"] = df3[["IS_pds_l25_dw_1", "IS_pds_l25_dw_2", "IS_pds_l25_dw_3", "IS_pds_l25_dw_4", "IS_pds_l25_dw_5", "IS_pds_l25_dw_6", "IS_pds_l25_dw_7", "IS_pds_l25_dw_8", "IS_pds_l25_dw_9", "IS_pds_l25_dw_10", "IS_pds_l25_dw_11", "IS_pds_l25_dw_12", "IS_pds_l25_dw_13", "IS_pds_l25_dw_14", "IS_pds_l25_dw_15", "IS_pds_l25_dw_16", "IS_pds_l25_dw_17", "IS_pds_l25_dw_18", "IS_pds_l25_dw_19", "IS_pds_l25_dw_20", "IS_pds_l25_dw_21", "IS_pds_l25_dw_22", "IS_pds_l25_dw_23", "IS_pds_l25_dw_24", "IS_pds_l25_dw_25"]].std(axis=1)

# # Deleting transient columns
# df3 = df3.drop(["IS_pds_l25_dw_1", "IS_pds_l25_dw_2", "IS_pds_l25_dw_3", "IS_pds_l25_dw_4", "IS_pds_l25_dw_5", "IS_pds_l25_dw_6", "IS_pds_l25_dw_7", "IS_pds_l25_dw_8", "IS_pds_l25_dw_9", "IS_pds_l25_dw_10", "IS_pds_l25_dw_11", "IS_pds_l25_dw_12", "IS_pds_l25_dw_13", "IS_pds_l25_dw_14", "IS_pds_l25_dw_15", "IS_pds_l25_dw_16", "IS_pds_l25_dw_17", "IS_pds_l25_dw_18", "IS_pds_l25_dw_19", "IS_pds_l25_dw_20", "IS_pds_l25_dw_21", "IS_pds_l25_dw_22", "IS_pds_l25_dw_23", "IS_pds_l25_dw_24", "IS_pds_l25_dw_25"], axis = 1)

In [32]:
#IS_pds_l40_stdev
#Provides decay-weighted, puzzle day-specific (pds) standard deviation of solve time performance for GMS over the previous 40 puzzles relative to a given puzzle
# Note also that, unlike the 10-puzzle moving average, this weighted average does NOT include the "puzzle at hand" itself

# df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# # Gradual decay
# w = np.arange(1,41)
# w = list(w)

# # No decay
# # w = np.ones(40)
# # w = list(w)

# df3["IS_pds_l40_dw_1"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-1)*w[0]
# df3["IS_pds_l40_dw_2"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-2)*w[1]
# df3["IS_pds_l40_dw_3"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-3)*w[2]
# df3["IS_pds_l40_dw_4"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-4)*w[3]
# df3["IS_pds_l40_dw_5"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-5)*w[4]
# df3["IS_pds_l40_dw_6"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-6)*w[5]
# df3["IS_pds_l40_dw_7"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-7)*w[6]
# df3["IS_pds_l40_dw_8"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-8)*w[7]
# df3["IS_pds_l40_dw_9"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-9)*w[8]
# df3["IS_pds_l40_dw_10"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-10)*w[9]
# df3["IS_pds_l40_dw_11"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-11)*w[10]
# df3["IS_pds_l40_dw_12"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-12)*w[11]
# df3["IS_pds_l40_dw_13"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-13)*w[12]
# df3["IS_pds_l40_dw_14"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-14)*w[13]
# df3["IS_pds_l40_dw_15"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-15)*w[14]
# df3["IS_pds_l40_dw_16"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-16)*w[15]
# df3["IS_pds_l40_dw_17"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-17)*w[16]
# df3["IS_pds_l40_dw_18"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-18)*w[17]
# df3["IS_pds_l40_dw_19"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-19)*w[18]
# df3["IS_pds_l40_dw_20"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-20)*w[19]
# df3["IS_pds_l40_dw_21"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-21)*w[20]
# df3["IS_pds_l40_dw_22"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-22)*w[21]
# df3["IS_pds_l40_dw_23"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-23)*w[22]
# df3["IS_pds_l40_dw_24"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-24)*w[23]
# df3["IS_pds_l40_dw_25"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-25)*w[24]
# df3["IS_pds_l40_dw_26"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-26)*w[25]
# df3["IS_pds_l40_dw_27"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-27)*w[26]
# df3["IS_pds_l40_dw_28"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-28)*w[27]
# df3["IS_pds_l40_dw_29"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-29)*w[28]
# df3["IS_pds_l40_dw_30"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-30)*w[29]
# df3["IS_pds_l40_dw_31"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-31)*w[30]
# df3["IS_pds_l40_dw_32"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-32)*w[31]
# df3["IS_pds_l40_dw_33"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-33)*w[32]
# df3["IS_pds_l40_dw_34"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-34)*w[33]
# df3["IS_pds_l40_dw_35"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-35)*w[34]
# df3["IS_pds_l40_dw_36"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-36)*w[35]
# df3["IS_pds_l40_dw_37"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-37)*w[36]
# df3["IS_pds_l40_dw_38"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-38)*w[37]
# df3["IS_pds_l40_dw_39"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-39)*w[38]
# df3["IS_pds_l40_dw_40"] = df3.groupby(['DOW'])['IS2_ST(m)'].shift(-40)*w[39]


# df3["IS_pds_l40_stdev"] = df3[["IS_pds_l40_dw_1", "IS_pds_l40_dw_2", "IS_pds_l40_dw_3", "IS_pds_l40_dw_4", "IS_pds_l40_dw_5", "IS_pds_l40_dw_6", "IS_pds_l40_dw_7", "IS_pds_l40_dw_8", "IS_pds_l40_dw_9", "IS_pds_l40_dw_10", "IS_pds_l40_dw_11", "IS_pds_l40_dw_12", "IS_pds_l40_dw_13", "IS_pds_l40_dw_14", "IS_pds_l40_dw_15", "IS_pds_l40_dw_16", "IS_pds_l40_dw_17", "IS_pds_l40_dw_18", "IS_pds_l40_dw_19", "IS_pds_l40_dw_20", "IS_pds_l40_dw_21", "IS_pds_l40_dw_22", "IS_pds_l40_dw_23", "IS_pds_l40_dw_24", "IS_pds_l40_dw_25",
#                             "IS_pds_l40_dw_26", "IS_pds_l40_dw_27", "IS_pds_l40_dw_28", "IS_pds_l40_dw_29", "IS_pds_l40_dw_30", "IS_pds_l40_dw_31", "IS_pds_l40_dw_32", "IS_pds_l40_dw_33", "IS_pds_l40_dw_34", "IS_pds_l40_dw_35", "IS_pds_l40_dw_36", "IS_pds_l40_dw_37", "IS_pds_l40_dw_38", "IS_pds_l40_dw_39", "IS_pds_l40_dw_40"]].std(axis=1)

# # Deleting transient columns
# df3 = df3.drop(["IS_pds_l40_dw_1", "IS_pds_l40_dw_2", "IS_pds_l40_dw_3", "IS_pds_l40_dw_4", "IS_pds_l40_dw_5", "IS_pds_l40_dw_6", "IS_pds_l40_dw_7", "IS_pds_l40_dw_8", "IS_pds_l40_dw_9", "IS_pds_l40_dw_10", "IS_pds_l40_dw_11", "IS_pds_l40_dw_12", "IS_pds_l40_dw_13", "IS_pds_l40_dw_14", "IS_pds_l40_dw_15", "IS_pds_l40_dw_16", "IS_pds_l40_dw_17", "IS_pds_l40_dw_18", "IS_pds_l40_dw_19", "IS_pds_l40_dw_20", "IS_pds_l40_dw_21", "IS_pds_l40_dw_22", "IS_pds_l40_dw_23", "IS_pds_l40_dw_24", "IS_pds_l40_dw_25", "IS_pds_l40_dw_26", "IS_pds_l40_dw_27", "IS_pds_l40_dw_28", "IS_pds_l40_dw_29", "IS_pds_l40_dw_30", "IS_pds_l40_dw_31", "IS_pds_l40_dw_32", "IS_pds_l40_dw_33", "IS_pds_l40_dw_34", "IS_pds_l40_dw_35", "IS_pds_l40_dw_36", "IS_pds_l40_dw_37", "IS_pds_l40_dw_38", "IS_pds_l40_dw_39", "IS_pds_l40_dw_40"], axis = 1)

In [33]:
col = df3.pop('IS_pds_l10_stdev')
df3.insert(10, col.name, col)

In [34]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 634 to 2046
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   P_Date                         1202 non-null   datetime64[ns]
 1   P_Date_str                     1202 non-null   object        
 2   IS2_Completed                  1202 non-null   float64       
 3   Comp_Date                      1202 non-null   datetime64[ns]
 4   Comp_Date_str                  1202 non-null   object        
 5   DOW                            1202 non-null   object        
 6   DOW_num                        1202 non-null   float64       
 7   Grid Size                      1202 non-null   int64         
 8   IS2_ST(m)                      1202 non-null   float64       
 9   IS_pds_l10_ndw                 1195 non-null   float64       
 10  IS_pds_l10_stdev               1188 non-null   float64       
 11  GMST(m)        

In [35]:
# Checkpoint
df3.to_csv('../data/df3.csv', index=False)

#### Strength of Schedule adjustment for IS2 Recent Performance Baseline
* 'IS_pds_l10_ndw_SOS_adj' (intermediate is 'IS_pds_l10_reldiff_ndw')
* Takes IS2 recent form ('IS_pds_l10_ndw') and adjusts up or down based on the mean of GMS performance relative to THEIR OWN form ('GMST_to_GMS_pds_l40_ndw_ratio') over that same stretch.
* The mean of 'GMST_to_GMS_pds_l40_dw_ratio' ('IS_pds_l10_reldiff') over the last 8 puzzle-day specific puzzles solved by IS2 it itself NON-decay-weighted (change post-initiation of modeling based on early model iterations). 

In [36]:
#'IS_pds_l10_reldiff_ndw'
# Intermediate for generating SOS adjustment-see comments above 

df3 = df3.sort_values(by=['DOW', 'Comp_Date'], ascending = False)

# Gradual decay
# w = np.arange(1,11)
# w = list(w)

# No decay
w = np.ones(10)
w = list(w)

df3["IS_pds_l10_reldiff_ndw_1"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-1)*w[0]
df3["IS_pds_l10_reldiff_ndw_2"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-2)*w[1]
df3["IS_pds_l10_reldiff_ndw_3"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-3)*w[2]
df3["IS_pds_l10_reldiff_ndw_4"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-4)*w[3]
df3["IS_pds_l10_reldiff_ndw_5"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-5)*w[4]
df3["IS_pds_l10_reldiff_ndw_6"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-6)*w[5]
df3["IS_pds_l10_reldiff_ndw_7"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-7)*w[6]
df3["IS_pds_l10_reldiff_ndw_8"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-8)*w[7]
df3["IS_pds_l10_reldiff_ndw_9"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-9)*w[8]
df3["IS_pds_l10_reldiff_ndw_10"] = df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-10)*w[9]

df3["IS_pds_l10_reldiff_ndw_1_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-1)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-1))*w[0]
df3["IS_pds_l10_reldiff_ndw_2_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-2)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-2))*w[1]
df3["IS_pds_l10_reldiff_ndw_3_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-3)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-3))*w[2]
df3["IS_pds_l10_reldiff_ndw_4_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-4)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-4))*w[3]
df3["IS_pds_l10_reldiff_ndw_5_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-5)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-5))*w[4]
df3["IS_pds_l10_reldiff_ndw_6_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-6)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-6))*w[5]
df3["IS_pds_l10_reldiff_ndw_7_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-7)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-7))*w[6]
df3["IS_pds_l10_reldiff_ndw_8_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-8)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-8))*w[7]
df3["IS_pds_l10_reldiff_ndw_9_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-9)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-9))*w[8]
df3["IS_pds_l10_reldiff_ndw_10_ct"] = (df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-10)/df3.groupby(['DOW'])['GMST_to_GMS_pds_l40_ndw_ratio'].shift(-10))*w[9]

df3["IS_pds_l10_reldiff_ndw_ws"] = df3[["IS_pds_l10_reldiff_ndw_1", "IS_pds_l10_reldiff_ndw_2", "IS_pds_l10_reldiff_ndw_3", "IS_pds_l10_reldiff_ndw_4", "IS_pds_l10_reldiff_ndw_5", "IS_pds_l10_reldiff_ndw_6", "IS_pds_l10_reldiff_ndw_7", "IS_pds_l10_reldiff_ndw_8", "IS_pds_l10_reldiff_ndw_9", "IS_pds_l10_reldiff_ndw_10"]].sum(axis=1)
df3["IS_pds_l10_reldiff_ndw_ws_ct"] = df3[["IS_pds_l10_reldiff_ndw_1_ct", "IS_pds_l10_reldiff_ndw_2_ct", "IS_pds_l10_reldiff_ndw_3_ct", "IS_pds_l10_reldiff_ndw_4_ct", "IS_pds_l10_reldiff_ndw_5_ct", "IS_pds_l10_reldiff_ndw_6_ct", "IS_pds_l10_reldiff_ndw_7_ct", "IS_pds_l10_reldiff_ndw_8_ct", "IS_pds_l10_reldiff_ndw_9_ct", "IS_pds_l10_reldiff_ndw_10_ct"]].sum(axis=1)
df3["IS_pds_l10_reldiff_ndw"] = df3["IS_pds_l10_reldiff_ndw_ws"]/df3["IS_pds_l10_reldiff_ndw_ws_ct"]

# Deleting transient columns
df3 = df3.drop(["IS_pds_l10_reldiff_ndw_1", "IS_pds_l10_reldiff_ndw_2", "IS_pds_l10_reldiff_ndw_3", "IS_pds_l10_reldiff_ndw_4", "IS_pds_l10_reldiff_ndw_5", "IS_pds_l10_reldiff_ndw_6", "IS_pds_l10_reldiff_ndw_7", "IS_pds_l10_reldiff_ndw_8", "IS_pds_l10_reldiff_ndw_9", "IS_pds_l10_reldiff_ndw_10", "IS_pds_l10_reldiff_ndw_ws", "IS_pds_l10_reldiff_ndw_1_ct", "IS_pds_l10_reldiff_ndw_2_ct", "IS_pds_l10_reldiff_ndw_3_ct", "IS_pds_l10_reldiff_ndw_4_ct", "IS_pds_l10_reldiff_ndw_5_ct", "IS_pds_l10_reldiff_ndw_6_ct", "IS_pds_l10_reldiff_ndw_7_ct", "IS_pds_l10_reldiff_ndw_8_ct", "IS_pds_l10_reldiff_ndw_9_ct", "IS_pds_l10_reldiff_ndw_10_ct", "IS_pds_l10_reldiff_ndw_ws_ct"], axis = 1)

In [37]:
#'IS_pds_l10_ndw_SOS_adj'
# Now, take the mean difficulty over the last 10 puzzle day-specific puzzles (as determined by GMS performance on those 10 puzzle normalized th GMS 'recent' performance)
# and use this ratio to factor IS2's recent performance (ratio >1 will reduce 'IS_pds_l10_ndw' and ratio <1 will increase 'IS_pds_l10_ndw' in proportion to the ratio.

df3['IS_pds_l10_ndw_SOS_adj'] = df3['IS_pds_l10_ndw']*(1/df3['IS_pds_l10_reldiff_ndw'])

In [38]:
col = df3.pop('IS_pds_l10_ndw_SOS_adj')
df3.insert(11, col.name, col)

In [39]:
#Can drop intermediates used to generate 'IS_pds_l10_ndw_SOS_adj' now
#In fact, it is critical to drop 'GMST_to_GMS_pds_l40_ndw_ratio' because this feature would constitute data leakage as it includes info about GMST performance on the puzzle to be predicted
df3 = df3.drop(['GMST_to_GMS_pds_l40_ndw_ratio', 'IS_pds_l10_reldiff_ndw', 'GMS_pds_l40_ndw'], axis=1) 

In [40]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 634 to 2046
Data columns (total 43 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   P_Date                  1202 non-null   datetime64[ns]
 1   P_Date_str              1202 non-null   object        
 2   IS2_Completed           1202 non-null   float64       
 3   Comp_Date               1202 non-null   datetime64[ns]
 4   Comp_Date_str           1202 non-null   object        
 5   DOW                     1202 non-null   object        
 6   DOW_num                 1202 non-null   float64       
 7   Grid Size               1202 non-null   int64         
 8   IS2_ST(m)               1202 non-null   float64       
 9   IS_pds_l10_ndw          1195 non-null   float64       
 10  IS_pds_l10_stdev        1188 non-null   float64       
 11  IS_pds_l10_ndw_SOS_adj  1195 non-null   float64       
 12  GMST(m)                 1202 non-null   float6

In [41]:
# Checkpoint
df3.to_csv('../data/df3.csv', index=False)

In [42]:
# 'IST_Diff%_from_IS_pds_l10_ndw_SOS_adj'
# Calculate % deviation from puzzle day-specific recent solve form ('IS_pds_l10_ndw_SOS_adj') per individual raw solve time for individual solver (IS2)
# This will be used to create past (relative to a given solve) performance against a specific constructor or constructor team
df3['IST_Diff%_from_IS_pds_l10_ndw_SOS_adj'] = (((df3['IS2_ST(m)'] - df3['IS_pds_l10_ndw_SOS_adj']))/((df3['IS2_ST(m)'] + df3['IS_pds_l10_ndw_SOS_adj'])/2)*100).round(2)

df3 = df3.sort_values(by=['DOW','Comp_Date'], ascending = False)

In [43]:
# Calculate past performance (using COMP date to sort) for IS2 against a given constructor (% difference from RPB) for each puzzle in sample
# Individual Solver mean past performance versus a given constructor(s), per puzzle. 
# This is calculated off of deviation from 10-RPB (SOS_adj), so that it can be collapsed across puzzle days
# Getting the count as well will allow filtering by past number of puzzles for a second version of the figure

df3 = df3.sort_values(by=['Constructors','Comp_Date'], ascending = False)

df3 = df3.iloc[::-1]
df3['IS_per_constr_avg_past_diff_from_RPB'] = df3.groupby(['Constructors'])['IST_Diff%_from_IS_pds_l10_ndw_SOS_adj'].transform(lambda x: x.rolling(window=100, min_periods = 1).mean().round(2).shift(1))
df3['IS_per_constr_past_diff_from_RPB_ct'] = df3.groupby(['Constructors'])['IST_Diff%_from_IS_pds_l10_ndw_SOS_adj'].transform(lambda x: x.rolling(window=100, min_periods = 1).count().shift(1))
df3 = df3.iloc[::-1]

df3['IS_per_constr_past_diff_from_RPB_ct'] =  df3['IS_per_constr_past_diff_from_RPB_ct'].fillna(0) #these are all the first puzzle solved vs a specific constructor(s)

col = df3.pop('IST_Diff%_from_IS_pds_l10_ndw_SOS_adj')
df3.insert(12, col.name, col)

col = df3.pop('IS_per_constr_avg_past_diff_from_RPB')
df3.insert(13, col.name, col)

col = df3.pop('IS_per_constr_past_diff_from_RPB_ct')
df3.insert(14, col.name, col)

In [44]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 142 to 738
Data columns (total 46 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   P_Date                                 1202 non-null   datetime64[ns]
 1   P_Date_str                             1202 non-null   object        
 2   IS2_Completed                          1202 non-null   float64       
 3   Comp_Date                              1202 non-null   datetime64[ns]
 4   Comp_Date_str                          1202 non-null   object        
 5   DOW                                    1202 non-null   object        
 6   DOW_num                                1202 non-null   float64       
 7   Grid Size                              1202 non-null   int64         
 8   IS2_ST(m)                              1202 non-null   float64       
 9   IS_pds_l10_ndw                         1195 non-null   float64

In [45]:
#Let's also create "prior solve experience variables", both by solve day (pds) and overall (npds) (for 15x15 puzzles)

df3 = df3.sort_values(by=['DOW','Comp_Date'], ascending = False)

df3 = df3.iloc[::-1]
df3['IS2_pds_prior_solves_ct'] = df3.groupby(['DOW'])['DOW'].transform(lambda x: x.rolling(window=1000, min_periods = 1).count().shift(1))
df3 = df3.iloc[::-1]

df3 = df3.sort_values(by=['Comp_Date'], ascending = False)

df3 = df3.iloc[::-1]
df3['IS2_npds_prior_solves_ct'] = df3.groupby(['IS2_Completed'])['IS2_Completed'].transform(lambda x: x.rolling(window=10000, min_periods = 1).count().shift(1))
df3 = df3.iloc[::-1]

df3['IS2_pds_prior_solves_ct'] =  df3['IS2_pds_prior_solves_ct'].fillna(0) #these are all the first puzzle solved on a puzzle day of week
df3['IS2_npds_prior_solves_ct'] =  df3['IS2_npds_prior_solves_ct'].fillna(0) #this is the first puzzle solved overall 


col = df3.pop('IS2_pds_prior_solves_ct')
df3.insert(15, col.name, col)

col = df3.pop('IS2_npds_prior_solves_ct')
df3.insert(16, col.name, col)

In [46]:
# Extract solve hour and part of daily cycle into separate columns

# Creates a column where Individual Solver completion timestamps are binned to the hour in 24-hour cycle
df3['Comp_Hr'] = df3['Comp_Date'].dt.hour
df3['Comp_Hr'].value_counts()
df3['Comp_Hr']= df3['Comp_Hr'].astype('int64')
df3 = df3.sort_values('Comp_Hr')

# encode part of daily cycle puzzle was solved in (1 = 12-6 AM; 2 = 6 AM -12 PM; 3 = 12 PM-6 PM; 4= 6 PM-12 AM)
df3.loc[(df3["Comp_Hr"] >= 0) & (df3["Comp_Hr"] < 6) , "Solve_day_phase"] = 1 # late night
df3.loc[(df3["Comp_Hr"] >= 6) & (df3["Comp_Hr"] < 12) , "Solve_day_phase"] = 2 # morning
df3.loc[(df3["Comp_Hr"] >= 12) & (df3["Comp_Hr"] < 18) , "Solve_day_phase"] = 3 # afternoon
df3.loc[(df3["Comp_Hr"] >= 18) & (df3["Comp_Hr"] < 24) , "Solve_day_phase"] = 4 # evening

col = df3.pop('Comp_Hr')
df3.insert(5, col.name, col)

col = df3.pop('Solve_day_phase')
df3.insert(6, col.name, col)

In [47]:
# Get average difference from day-specific, last 10-puzzle time-decay weighted, SOS-adjusted mean for solve phase for past puzzles solved in the same solve phase
# Use metric normalized for puzzle day

df3 = df3.sort_values(by=['Solve_day_phase','Comp_Date'], ascending = False)

df3 = df3.iloc[::-1]
df3['IS_per_sdp_avg_past_diff_from_RPB'] = df3.groupby(['Solve_day_phase'])['IST_Diff%_from_IS_pds_l10_ndw_SOS_adj'].transform(lambda x: x.rolling(window=1000, min_periods = 1).mean().round(2).shift(1))
df3 = df3.iloc[::-1]

col = df3.pop('IS_per_sdp_avg_past_diff_from_RPB')
df3.insert(7, col.name, col)

In [48]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1202 entries, 822 to 2001
Data columns (total 51 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   P_Date                                 1202 non-null   datetime64[ns]
 1   P_Date_str                             1202 non-null   object        
 2   IS2_Completed                          1202 non-null   float64       
 3   Comp_Date                              1202 non-null   datetime64[ns]
 4   Comp_Date_str                          1202 non-null   object        
 5   Comp_Hr                                1202 non-null   int64         
 6   Solve_day_phase                        1202 non-null   float64       
 7   IS_per_sdp_avg_past_diff_from_RPB      1193 non-null   float64       
 8   DOW                                    1202 non-null   object        
 9   DOW_num                                1202 non-null   float6

In [49]:
# Get the number of solves in the calendar week of a given solve, prior to (ie, not including) that solve
#This feature becomes a proxy for recent rate of solving, which I suspected coming out of the EDA may change the learning/improvement rate as it deviates substantially from one puzzle per day

df3 = df3.sort_values(by=['Comp_Date'], ascending = False)

df3 = df3.iloc[::-1]
IS2_solves_l7 = (df3.assign(IS2_solves_l7=1)
   .set_index('Comp_Date')
   .rolling('7d')['IS2_solves_l7'].sum()
   .shift(1)
   .sort_index(level='Comp_Date').reset_index()) 
df3 = df3.iloc[::-1]

IS2_solves_l7['IS2_solves_l7'] = IS2_solves_l7['IS2_solves_l7'].fillna(0) #should only need for the very first solve in the sample

df4 = pd.merge(df3, IS2_solves_l7, how="outer", on=["Comp_Date"])

In [50]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206 entries, 0 to 1205
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   P_Date                                 1206 non-null   datetime64[ns]
 1   P_Date_str                             1206 non-null   object        
 2   IS2_Completed                          1206 non-null   float64       
 3   Comp_Date                              1206 non-null   datetime64[ns]
 4   Comp_Date_str                          1206 non-null   object        
 5   Comp_Hr                                1206 non-null   int64         
 6   Solve_day_phase                        1206 non-null   float64       
 7   IS_per_sdp_avg_past_diff_from_RPB      1197 non-null   float64       
 8   DOW                                    1206 non-null   object        
 9   DOW_num                                1206 non-null   float64 

In [51]:
col = df4.pop('IS2_solves_l7')
df4.insert(20, col.name, col)

In [52]:
# For one of the benchmark models generated in the next stage, calculate IS2's mean solve time across the entire sample for each puzzle day, and pin a copy (with the correct puzzle day) to each row

#Remove the first solve period (2018-2019) to calculate sample averages by day
df4 = df4[df4['Comp_Date_str'].str.contains("2020|2021|2022|2023|2024")]

mean_all_15x15 = (df4.loc[(df4['DOW_num'] != 1), 'IS2_ST(m)'].mean()) 
print(mean_all_15x15)

mean_Sun = (df4.loc[(df4['DOW_num'] == 1), 'IS2_ST(m)'].mean()) 
print(mean_Sun)

mean_Mon = (df4.loc[(df4['DOW_num'] == 2), 'IS2_ST(m)'].mean()) 
print(mean_Mon)

mean_Tue = (df4.loc[(df4['DOW_num'] == 3), 'IS2_ST(m)'].mean()) 
print(mean_Tue)

mean_Wed = (df4.loc[(df4['DOW_num'] == 4), 'IS2_ST(m)'].mean()) 
print(mean_Wed)

mean_Thu = (df4.loc[(df4['DOW_num'] == 5), 'IS2_ST(m)'].mean()) 
print(mean_Thu)

mean_Fri = (df4.loc[(df4['DOW_num'] == 6), 'IS2_ST(m)'].mean()) 
print(mean_Fri)

mean_Sat = (df4.loc[(df4['DOW_num'] == 7), 'IS2_ST(m)'].mean()) 
print(mean_Sat)

df4.loc[(df4['DOW_num'] == 1), 'overall_day_mean_IST(m)'] = mean_Sun
df4.loc[(df4['DOW_num'] == 2), 'overall_day_mean_IST(m)'] = mean_Mon
df4.loc[(df4['DOW_num'] == 3), 'overall_day_mean_IST(m)'] = mean_Tue
df4.loc[(df4['DOW_num'] == 4), 'overall_day_mean_IST(m)'] = mean_Wed
df4.loc[(df4['DOW_num'] == 5), 'overall_day_mean_IST(m)'] = mean_Thu
df4.loc[(df4['DOW_num'] == 6), 'overall_day_mean_IST(m)'] = mean_Fri
df4.loc[(df4['DOW_num'] == 7), 'overall_day_mean_IST(m)'] = mean_Sat

col = df4.pop('overall_day_mean_IST(m)')
df4.insert(20, col.name, col)

18.530125523012526
36.98081140350878
6.451111111111111
9.045474613686531
13.430921052631579
22.63918128654971
23.65814176245211
33.87310126582279


In [53]:
# Drop column that we don't need
df4 = df4.drop(['Grid Size'], axis=1)

In [54]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1108 entries, 0 to 1107
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   P_Date                                 1108 non-null   datetime64[ns]
 1   P_Date_str                             1108 non-null   object        
 2   IS2_Completed                          1108 non-null   float64       
 3   Comp_Date                              1108 non-null   datetime64[ns]
 4   Comp_Date_str                          1108 non-null   object        
 5   Comp_Hr                                1108 non-null   int64         
 6   Solve_day_phase                        1108 non-null   float64       
 7   IS_per_sdp_avg_past_diff_from_RPB      1108 non-null   float64       
 8   DOW                                    1108 non-null   object        
 9   DOW_num                                1108 non-null   float64 

In [55]:
col = df4.pop('IS_pds_l10_stdev')
df4.insert(15, col.name, col)

col = df4.pop('GMST(m)')
df4.insert(10, col.name, col)

col = df4.pop('IS_pds_l10_stdev')
df4.insert(13, col.name, col)

In [56]:
#Drop a few last intermediates that we don't need for modeling
df4 = df4.drop(['IST_Diff%_from_IS_pds_l10_ndw_SOS_adj','IS_per_constr_past_diff_from_RPB_ct'], axis=1)

In [57]:
# Data for preprocessing and training
# df4.to_csv('../data/df_for_modeling.csv', index=False) #old. do not use.
df4.to_csv('../data/df_for_modeling_no_decay_weighting.csv', index=False) #use for full model
# df4.to_csv('../data/df_for_modeling_no_SOS_adj.csv', index=False) #use for -SOS subtraction model only