# 2. Feature Development (No Decay Time-Weighting)

2nd Workbook for the Association of Tennis Professionals (ATP) men's singles predictive modeling project: 
* This version is specifically for the subtraction analysis (see also Notebook 6 and Github summary)
* SOS and CSP are still calculated in this version, so the time-decay weighting machinery is technically still present; however, the weighting for each match (in both the l60 and l10 feature versions) is set to zero

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import warnings
# Suppress every warning category for the remainder of the notebook run.
# NOTE(review): this blanket filter also hides pandas deprecation/performance warnings — consider narrowing it.
warnings.filterwarnings('ignore')

## 2. Load Preprocessed Data

In [2]:
# Read the cleaned, match-level table written by the previous (preprocessing) workbook
preprocessed_matches_csv = '../data/cleaned_data_for_FeatureDev.csv'
df_match1 = pd.read_csv(preprocessed_matches_csv)

In [3]:
# Print column names, non-null counts and dtypes of the freshly loaded match table
df_match1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28534 entries, 0 to 28533
Data columns (total 60 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   t_id                28534 non-null  int64  
 1   t_ident             28534 non-null  int64  
 2   t_nm                28534 non-null  object 
 3   t_co                28534 non-null  object 
 4   t_GMT_diff          28534 non-null  int64  
 5   t_surf              28534 non-null  int64  
 6   t_ind               28534 non-null  int64  
 7   t_alt               28534 non-null  int64  
 8   t_draw_sz           28534 non-null  int64  
 9   t_lvl               28534 non-null  int64  
 10  m_bestof            28534 non-null  int64  
 11  m_num               28534 non-null  int64  
 12  m_date              28534 non-null  object 
 13  m_yr                28534 non-null  int64  
 14  t_rd_num            28534 non-null  int64  
 15  m_t(m)              28250 non-null  float64
 16  w_id

## 3. Functions

In [4]:
def triangular_number(n):
    """Return the n-th triangular number, 1 + 2 + ... + n.

    Used in this workbook as the denominator of weighted-average calculations.
    Computed in closed form as n * (n + 1) // 2 (floor division).
    """
    doubled_total = n * (n + 1)
    return doubled_total // 2

## 4. Feature Creation

#### dataframe in by-match organization (first iteration)

### 1. Target Feature Creation: % of Total Points Played in a Match Won by Each Player

In [5]:
# Target feature for each player in a match: that player's share (in %) of all points played in the match.
# The winner's points are their own serve points won plus the loser's serve points that the loser did not win.
winner_serve_pts_won = df_match1["w_1st_sv_pts_won"] + df_match1["w_2nd_sv_pts_won"]
loser_serve_pts_won = df_match1["l_1st_sv_pts_won"] + df_match1["l_2nd_sv_pts_won"]
winner_return_pts_won = df_match1["l_sv_pts"] - loser_serve_pts_won
match_pts_played = df_match1["w_sv_pts"] + df_match1["l_sv_pts"]

df_match1["w_tot_pts_won%"] = (((winner_serve_pts_won + winner_return_pts_won) / match_pts_played) * 100).round(2)

# The two shares add up to 100, so the loser's share is the complement of the winner's
df_match1["l_tot_pts_won%"] = 100 - df_match1["w_tot_pts_won%"]

In [6]:
# Target broken down into serving and returning, for the winner ("w") and then the loser ("l").
# Column creation order is: w serve %, w return %, l serve %, l return %.
for player, opponent in (("w", "l"), ("l", "w")):
    own_serve_pts_won = df_match1[f"{player}_1st_sv_pts_won"] + df_match1[f"{player}_2nd_sv_pts_won"]
    opp_serve_pts_won = df_match1[f"{opponent}_1st_sv_pts_won"] + df_match1[f"{opponent}_2nd_sv_pts_won"]
    opp_serve_pts = df_match1[f"{opponent}_sv_pts"]

    # % of the player's own serve points that the player won
    df_match1[f"{player}_sv_pts_won%"] = ((own_serve_pts_won / df_match1[f"{player}_sv_pts"]) * 100).round(2)

    # % of the opponent's serve points that the opponent did NOT win (i.e. won by the player on return)
    df_match1[f"{player}_ret_pts_won%"] = (((opp_serve_pts - opp_serve_pts_won) / opp_serve_pts) * 100).round(2)

In [7]:
# Not (currently) target features, but raw counts needed for generating retrospective, match stats accrual-based predictive features (see next section):

# Total points played in the match = serve points of both players
df_match1["m_tot_pts"] = df_match1["l_sv_pts"] + df_match1["w_sv_pts"]

# Each recipe builds one per-player count; `own` / `opp` are the column prefixes ("w" or "l")
# of the player the column belongs to and of their opponent.
# Order matters: later recipes read columns created by earlier ones (2nd_sv_pts, sv_pts_won, ret_pts_won).
count_recipes = [
    # Serve points won = 1st-serve points won + 2nd-serve points won
    ("sv_pts_won", lambda df, own, opp: df[f"{own}_1st_sv_pts_won"] + df[f"{own}_2nd_sv_pts_won"]),
    # 2nd-serve points = serve points minus 1st serves in
    ("2nd_sv_pts", lambda df, own, opp: df[f"{own}_sv_pts"] - df[f"{own}_1st_sv_in"]),
    # Return points = the opponent's serve points
    ("ret_pts", lambda df, own, opp: df[f"{opp}_sv_pts"]),
    # Return points won = opponent's serve points the opponent did not win
    ("ret_pts_won", lambda df, own, opp: df[f"{opp}_sv_pts"] - (df[f"{opp}_1st_sv_pts_won"] + df[f"{opp}_2nd_sv_pts_won"])),
    # 1st-return points = the opponent's 1st serves in
    ("1st_ret_pts", lambda df, own, opp: df[f"{opp}_1st_sv_in"]),
    # 1st-return points won = opponent's 1st serves in that the opponent did not win
    ("1st_ret_pts_won", lambda df, own, opp: df[f"{opp}_1st_sv_in"] - df[f"{opp}_1st_sv_pts_won"]),
    # 2nd-return points = the opponent's 2nd-serve points
    ("2nd_ret_pts", lambda df, own, opp: df[f"{opp}_sv_pts"] - df[f"{opp}_1st_sv_in"]),
    # 2nd-return points won = opponent's 2nd-serve points the opponent did not win
    ("2nd_ret_pts_won", lambda df, own, opp: df[f"{opp}_2nd_sv_pts"] - df[f"{opp}_2nd_sv_pts_won"]),
    # Total points won = serve points won + return points won
    ("tot_pts_won", lambda df, own, opp: df[f"{own}_sv_pts_won"] + df[f"{own}_ret_pts_won"]),
]

# Winner column first, then the mirrored loser column, one statistic at a time
for count_suffix, build_count in count_recipes:
    for player, opponent in (("w", "l"), ("l", "w")):
        df_match1[f"{player}_{count_suffix}"] = build_count(df_match1, player, opponent)

In [8]:
# Not (currently) target features, but additional percentages needed for generating means later on for SOS-adjustment of retrospective, match stats accrual-based predictive features (see next section):

def _as_pct(numerator, denominator):
    """Return numerator / denominator expressed in percent (not yet rounded)."""
    return (numerator / denominator) * 100

match_sides = (("w", "l"), ("l", "w"))  # (player prefix, opponent prefix)

# (new column suffix, numerator side, numerator column, denominator side, denominator column, complement?)
# "own" = the player the new column belongs to, "opp" = their opponent.
# complement=True stores 100 - pct, i.e. the share the server did NOT win.
pct_specs = [
    ("1st_sv_in%",       "own", "1st_sv_in",      "own", "sv_pts",     False),
    ("1st_sv%_yielded",  "opp", "1st_sv_in",      "own", "ret_pts",    False),  # as a returner (1st sv percentage of opponents)
    ("1st_sv_pts_won%",  "own", "1st_sv_pts_won", "own", "1st_sv_in",  False),
    ("1st_ret_pts_won%", "opp", "1st_sv_pts_won", "opp", "1st_sv_in",  True),
    ("2nd_sv_pts_won%",  "own", "2nd_sv_pts_won", "own", "2nd_sv_pts", False),
    ("2nd_ret_pts_won%", "opp", "2nd_sv_pts_won", "opp", "2nd_sv_pts", True),
    ("ace%",             "own", "ace",            "own", "sv_pts",     False),
    ("aced%",            "opp", "ace",            "opp", "sv_pts",     False),
    ("df%",              "own", "df",             "own", "sv_pts",     False),
    ("df_induced%",      "opp", "df",             "opp", "sv_pts",     False),
]

for pct_suffix, num_side, num_col, den_side, den_col, complement in pct_specs:
    for player, opponent in match_sides:
        prefix_of = {"own": player, "opp": opponent}
        pct = _as_pct(df_match1[f"{prefix_of[num_side]}_{num_col}"], df_match1[f"{prefix_of[den_side]}_{den_col}"])
        if complement:
            pct = 100 - pct
        df_match1[f"{player}_{pct_suffix}"] = pct.round(2)

# Break points saved, as % of break points faced
for player, opponent in match_sides:
    bp_saved = df_match1[f"{player}_bp_saved"]
    bp_faced = df_match1[f"{player}_bp_faced"]
    df_match1[f"{player}_bp_save%"] = _as_pct(bp_saved, bp_faced).round(2)
    # if there are no bps to save, we don't want to "penalize" the player with a 0%. So it's treated as if they saved 1 of 1.
    df_match1.loc[(bp_saved == 0) & (bp_faced == 0), f"{player}_bp_save%"] = 100

# Break points converted = complement of the opponent's (already rounded) save percentage
for player, opponent in match_sides:
    opp_bp_faced = df_match1[f"{opponent}_bp_faced"]
    df_match1[f"{player}_bp_conv%"] = 100 - _as_pct(df_match1[f"{opponent}_bp_saved"], opp_bp_faced).round(2)
    # if there are no bps to possibly convert, we DO want to "penalize" the player who failed to generate any opportunities. So it's treated as if they converted 0 of 1.
    df_match1.loc[opp_bp_faced == 0, f"{player}_bp_conv%"] = 0

In [9]:
# Arrange columns: match-level fields, then the winner's block, then the loser's mirrored block,
# then the implied-probability (IP) columns and the comment column.
match_level_cols = ["t_id", "t_ident", "t_nm", "t_co", "t_GMT_diff", "t_surf", "t_ind", "t_alt", "t_draw_sz", "t_lvl", "m_bestof", "m_num", "m_date", "m_yr", "t_rd_num", "m_t(m)", "m_tot_pts"]

# Column suffixes shared by the "w_" and "l_" blocks, in display order
per_player_suffixes = [
    "id", "ent", "nm", "rk", "rk_pts", "hd", "ht", "ioc", "age",
    "sv_pts", "sv_pts_won", "1st_sv_in", "1st_sv_pts_won", "2nd_sv_pts", "2nd_sv_pts_won",
    "ace", "df", "bp_saved", "bp_faced",
    "ret_pts", "ret_pts_won", "1st_ret_pts", "1st_ret_pts_won", "2nd_ret_pts", "2nd_ret_pts_won",
    "tot_pts_won", "tot_pts_won%", "sv_pts_won%", "ret_pts_won%",
    "1st_sv_in%", "1st_sv%_yielded", "1st_sv_pts_won%", "1st_ret_pts_won%", "2nd_sv_pts_won%", "2nd_ret_pts_won%",
    "ace%", "aced%", "df%", "df_induced%", "bp_save%", "bp_conv%",
]

trailing_cols = ["AvgW_C_IP_NV", "AvgL_C_IP_NV", "PSW_C_IP_NV", "PSL_C_IP_NV", "PSW_O_IP_NV", "PSL_O_IP_NV", "Comment"]

ordered_cols = (
    match_level_cols
    + [f"w_{suffix}" for suffix in per_player_suffixes]
    + [f"l_{suffix}" for suffix in per_player_suffixes]
    + trailing_cols
)
# Selecting by this list also discards any column that is not named in it
df_match1 = df_match1[ordered_cols]

In [10]:
# Check the column count and dtype mix after the target/count/percentage columns were added and reordered
df_match1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28534 entries, 0 to 28533
Columns: 106 entries, t_id to Comment
dtypes: float64(57), int64(41), object(8)
memory usage: 23.1+ MB


In [11]:
#df_match1.to_csv('../data/df_match1.csv', index=False)

### 2. Predictive Features: Retrospective, Match Stats Accrual-Derived 

Player match statistics-derived features are accrued over matches previous to the one being predicted (i.e., retrospectively). Most features are calculated on a surface-specific (clay or hard) basis; the exceptions are the features related to player stamina and player fatigue. Grass court matches are likewise included only for the player stamina and player fatigue features.

#### To generate these player-specific features, we will reshape the dataframe from a match-specific organization by-row to a by-player specific organization. 

In [12]:
# Transiently create two separate dataframes (match winners and match losers), and remove columns pertinent only to the other
# The same suffixes are dropped on either side; only the prefix ("l_" vs "w_") and the odds tag ("L" vs "W") differ.
other_side_suffixes = [
    "rk", "rk_pts", "ioc", "ent", "hd", "ht", "age",
    "sv_pts", "1st_sv_pts_won", "2nd_sv_pts", "2nd_sv_pts_won", "sv_pts_won",
    "ret_pts", "ret_pts_won", "1st_ret_pts", "1st_ret_pts_won", "2nd_ret_pts", "2nd_ret_pts_won",
    "tot_pts_won", "tot_pts_won%", "sv_pts_won%", "ret_pts_won%",
    "1st_sv_in%", "1st_sv%_yielded", "1st_sv_pts_won%", "1st_ret_pts_won%", "2nd_sv_pts_won%", "2nd_ret_pts_won%",
    "ace%", "aced%", "df%", "df_induced%", "bp_save%", "bp_conv%",
]

def _other_side_cols(side_prefix, odds_tag):
    """List the columns of one side ("w"/"l") that are removed from the other side's frame."""
    stat_cols = [f"{side_prefix}_{suffix}" for suffix in other_side_suffixes]
    odds_cols = [f"Avg{odds_tag}_C_IP_NV", f"PS{odds_tag}_O_IP_NV", f"PS{odds_tag}_C_IP_NV"]
    return stat_cols + odds_cols

# Winner rows: drop loser-only columns and flag the outcome as a win (1)
df_winners1 = df_match1.drop(columns=_other_side_cols("l", "L"))
df_winners1["m_outcome"] = 1

# Loser rows: drop winner-only columns and flag the outcome as a loss (0)
df_losers1 = df_match1.drop(columns=_other_side_cols("w", "W"))
df_losers1["m_outcome"] = 0

In [13]:
# Inspect the winners frame: columns that survived the drop, plus the new m_outcome flag
df_winners1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28534 entries, 0 to 28533
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   t_id                28534 non-null  int64  
 1   t_ident             28534 non-null  int64  
 2   t_nm                28534 non-null  object 
 3   t_co                28534 non-null  object 
 4   t_GMT_diff          28534 non-null  int64  
 5   t_surf              28534 non-null  int64  
 6   t_ind               28534 non-null  int64  
 7   t_alt               28534 non-null  int64  
 8   t_draw_sz           28534 non-null  int64  
 9   t_lvl               28534 non-null  int64  
 10  m_bestof            28534 non-null  int64  
 11  m_num               28534 non-null  int64  
 12  m_date              28534 non-null  object 
 13  m_yr                28534 non-null  int64  
 14  t_rd_num            28534 non-null  int64  
 15  m_t(m)              28250 non-null  float64
 16  m_to

In [14]:
# Inspect the losers frame: columns that survived the drop, plus the new m_outcome flag
df_losers1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28534 entries, 0 to 28533
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   t_id                28534 non-null  int64  
 1   t_ident             28534 non-null  int64  
 2   t_nm                28534 non-null  object 
 3   t_co                28534 non-null  object 
 4   t_GMT_diff          28534 non-null  int64  
 5   t_surf              28534 non-null  int64  
 6   t_ind               28534 non-null  int64  
 7   t_alt               28534 non-null  int64  
 8   t_draw_sz           28534 non-null  int64  
 9   t_lvl               28534 non-null  int64  
 10  m_bestof            28534 non-null  int64  
 11  m_num               28534 non-null  int64  
 12  m_date              28534 non-null  object 
 13  m_yr                28534 non-null  int64  
 14  t_rd_num            28534 non-null  int64  
 15  m_t(m)              28250 non-null  float64
 16  m_to

In [15]:
# Concatenate Winners and Losers into by-player organization
def _relabel_for_player_view(side_frame, player_prefix, opponent_prefix, odds_tag):
    """Rename a winners/losers frame to the shared by-player column schema.

    Columns are matched by NAME rather than by position, so a change in the
    upstream column order cannot silently attach a label to the wrong data.

    side_frame      -- df_winners1 or df_losers1 (still using "w_"/"l_" column names)
    player_prefix   -- prefix of the columns describing the row's player ("w" or "l")
    opponent_prefix -- prefix of the columns describing the opponent ("l" or "w")
    odds_tag        -- "W" or "L", the letter embedded in the implied-probability column names

    Returns a renamed copy: "<player_prefix>_x" -> "p_x", "<opponent_prefix>_x" -> "opp_x",
    plus the one-off renames listed below. All other columns keep their names.
    """
    one_off_names = {
        "t_rd_num": "m_rd_num",
        f"{player_prefix}_ioc": "p_co",
        f"Avg{odds_tag}_C_IP_NV": "p_AVG_C_IP",
        f"PS{odds_tag}_C_IP_NV": "p_PS_C_IP",
        f"PS{odds_tag}_O_IP_NV": "p_PS_O_IP",
    }
    player_tag = player_prefix + "_"
    opponent_tag = opponent_prefix + "_"

    def _new_name(col):
        if col in one_off_names:
            return one_off_names[col]
        if col.startswith(player_tag):
            return "p_" + col[len(player_tag):]
        if col.startswith(opponent_tag):
            return "opp_" + col[len(opponent_tag):]
        return col

    return side_frame.rename(columns=_new_name)

df_winners1 = _relabel_for_player_view(df_winners1, "w", "l", "W")
df_losers1 = _relabel_for_player_view(df_losers1, "l", "w", "L")

# Both halves must expose exactly the same columns; otherwise concat would silently pad with NaN
_schema_mismatch = set(df_winners1.columns) ^ set(df_losers1.columns)
if _schema_mismatch:
    raise ValueError(f"Winner/loser frames disagree on columns: {sorted(_schema_mismatch)}")

# concat aligns columns by name; the result keeps the winners frame's column order
df_player1 = pd.concat([df_winners1, df_losers1], ignore_index=True)

# read_csv does not parse dates unless told to, so m_date arrives as strings (object dtype).
# We will need it in datetime format for many of the stats accruals.
df_player1["m_date"] = df_player1["m_date"].astype('datetime64[ns]')
#Note: Removed NV (no vig) from IP (implied probability) here in variable names since there's no longer a risk of confusion with pre-vig removed versions, but these ARE vig-removed probabilities.
del df_match1, df_winners1, df_losers1

In [16]:
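#(Disabled) Converts "t_id" to a numerical format as that will be more useful down the line. Kept commented out: "t_id" already arrives as int64 (see the .info() output), so the conversion below is currently unnecessary.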
#df_player1["t_id"] = df_player1["t_id"].str.replace('-','')
#df_player1["t_id"] = df_player1["t_id"].str.replace('M','10') #some tourny IDs contained a letter (M)
#df_player1["t_id"].head(20)
#df_player1["t_id"] = df_player1["t_id"].astype('int64') 

In [17]:
# Inspect the by-player table: row count, column names and dtypes (m_date should now be datetime64[ns])
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57068 entries, 0 to 57067
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   t_id                57068 non-null  int64         
 1   t_ident             57068 non-null  int64         
 2   t_nm                57068 non-null  object        
 3   t_co                57068 non-null  object        
 4   t_GMT_diff          57068 non-null  int64         
 5   t_surf              57068 non-null  int64         
 6   t_ind               57068 non-null  int64         
 7   t_alt               57068 non-null  int64         
 8   t_draw_sz           57068 non-null  int64         
 9   t_lvl               57068 non-null  int64         
 10  m_bestof            57068 non-null  int64         
 11  m_num               57068 non-null  int64         
 12  m_date              57068 non-null  datetime64[ns]
 13  m_yr                57068 non-null  int64     

In [18]:
# 'p_tot_pts_won%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean TOTAL POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Newest match first within each (player, surface) group, so that shift(-k) on the group
# returns the value from the k-th most recent match BEFORE the current row.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

L60_WINDOW = 60  # number of prior same-surface matches to look back over

_by_player_surface = df_player1.groupby(['p_id','t_surf'])
_pts_played_by_group = _by_player_surface['m_tot_pts']
_pts_won_by_group = _by_player_surface['p_tot_pts_won']

# Slot 60 holds the most recent prior match, slot 1 the 60th most recent.
# Columns are created in the order p_tot_pts_60, p_tot_pts_won_60, p_tot_pts_59, ...
# Plain column assignment is used so that re-running the cell overwrites rather than duplicates columns.
for matches_back in range(1, L60_WINDOW + 1):
    slot = L60_WINDOW + 1 - matches_back
    df_player1[f"p_tot_pts_{slot}"] = _pts_played_by_group.shift(-matches_back)
    df_player1[f"p_tot_pts_won_{slot}"] = _pts_won_by_group.shift(-matches_back)

_pts_played_cols = [f"p_tot_pts_{slot}" for slot in range(L60_WINDOW, 0, -1)]
_pts_won_cols = [f"p_tot_pts_won_{slot}" for slot in range(L60_WINDOW, 0, -1)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# Note: because NaN is skipped, a row with no prior matches at all sums to 0 rather than NaN.
df_player1["p_tot_pts_l60_ws"] = df_player1[_pts_played_cols].sum(axis=1)
df_player1["p_tot_pts_won_l60_ws"] = df_player1[_pts_won_cols].sum(axis=1)
df_player1["p_tot_pts_won%_l60_tw_ss"] = ((df_player1["p_tot_pts_won_l60_ws"]/df_player1["p_tot_pts_l60_ws"])*100).round(2)
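#(ws = weighted sum; tw = time-weighted. These labels are carried over from the time-decay version of this workbook; the two sums above apply no per-match weights.)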

In [19]:
# Feature built here: 'p_tot_pts_won%_l10_tw_ss'
# Surface-specific (SS) percentage of TOTAL POINTS WON by the PLAYER over the 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the feature name, but this cell applies no per-match multiplier: it only sums lag columns.

# Lag slots 60..51 are the ten columns that feed the l10 window (they must already exist on df_player1 from the l60 cell above).
l10_slots = range(60, 50, -1)
l10_pts_cols = [f"p_tot_pts_{slot}" for slot in l10_slots]
l10_won_cols = [f"p_tot_pts_won_{slot}" for slot in l10_slots]

# A row-wise sum skips NaN, so missing history is ignored rather than interpolated (preferred for this kind of metric).
# Matches between players with few prior matches on the surface are filtered out at the modeling stage, and the
# multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
df_player1["p_tot_pts_l10_ws"] = df_player1[l10_pts_cols].sum(axis=1)
df_player1["p_tot_pts_won_l10_ws"] = df_player1[l10_won_cols].sum(axis=1)
l10_won_share = df_player1["p_tot_pts_won_l10_ws"] / df_player1["p_tot_pts_l10_ws"]
df_player1["p_tot_pts_won%_l10_tw_ss"] = (l10_won_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

# Remove every transient column: the four l60/l10 sums plus all 60 lag slots of both series.
l60_slots = range(60, 0, -1)
transient_cols = (
    ["p_tot_pts_l60_ws", "p_tot_pts_won_l60_ws", "p_tot_pts_l10_ws", "p_tot_pts_won_l10_ws"]
    + [f"p_tot_pts_{slot}" for slot in l60_slots]
    + [f"p_tot_pts_won_{slot}" for slot in l60_slots]
)
df_player1 = df_player1.drop(columns=transient_cols)

In [20]:
# Print df_player1's schema (column names, non-null counts, dtypes) as a checkpoint after the transient lag columns were dropped
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Data columns (total 72 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   t_id                      57068 non-null  int64         
 1   t_ident                   57068 non-null  int64         
 2   t_nm                      57068 non-null  object        
 3   t_co                      57068 non-null  object        
 4   t_GMT_diff                57068 non-null  int64         
 5   t_surf                    57068 non-null  int64         
 6   t_ind                     57068 non-null  int64         
 7   t_alt                     57068 non-null  int64         
 8   t_draw_sz                 57068 non-null  int64         
 9   t_lvl                     57068 non-null  int64         
 10  m_bestof                  57068 non-null  int64         
 11  m_num                     57068 non-null  int64         
 12  m_date        

In [21]:
# Feature built here: 'p_tot_pts_won%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) percentage of TOTAL POINTS WON by the PLAYER over the
# 60 matches PRIOR TO the match being predicted. The "tw" (time-weighted) tag is kept in the feature name,
# but no per-match multiplier is applied in this cell.

io_group_keys = ['p_id', 't_surf', 't_ind']

# Newest-first ordering inside each (player, surface, indoor/outdoor) group, so that shift(-k) reads the
# row k places further down, i.e. the k-th earlier match of that group.
df_player1 = df_player1.sort_values(by=io_group_keys + ['m_date', 'm_rd_num'], ascending=False)

io_pts_by_group = df_player1.groupby(io_group_keys)['m_tot_pts']
io_won_by_group = df_player1.groupby(io_group_keys)['p_tot_pts_won']

# Slot numbering runs backwards: slot 60 is the most recent prior match (lag 1), slot 1 the oldest (lag 60).
# Columns are added in the order IO_60, won_IO_60, IO_59, won_IO_59, ..., IO_1, won_IO_1.
for lag in range(1, 61):
    slot = 61 - lag
    df_player1[f"p_tot_pts_IO_{slot}"] = io_pts_by_group.shift(-lag)
    df_player1[f"p_tot_pts_won_IO_{slot}"] = io_won_by_group.shift(-lag)

# A row-wise sum skips NaN, so missing history is ignored rather than interpolated (preferred for this kind of metric).
# Matches between players with few prior matches on the surface are filtered out at the modeling stage, and the
# multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
io_l60_slots = range(60, 0, -1)
io_l60_pts_cols = [f"p_tot_pts_IO_{slot}" for slot in io_l60_slots]
io_l60_won_cols = [f"p_tot_pts_won_IO_{slot}" for slot in io_l60_slots]

df_player1["p_tot_pts_IO_l60_ws"] = df_player1[io_l60_pts_cols].sum(axis=1)
df_player1["p_tot_pts_won_IO_l60_ws"] = df_player1[io_l60_won_cols].sum(axis=1)
io_l60_won_share = df_player1["p_tot_pts_won_IO_l60_ws"] / df_player1["p_tot_pts_IO_l60_ws"]
df_player1["p_tot_pts_won%_l60_tw_ss_IO"] = (io_l60_won_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

In [22]:
# Feature built here: 'p_tot_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) percentage of TOTAL POINTS WON by the PLAYER over the
# 10 matches PRIOR TO the match being predicted. The "tw" (time-weighted) tag is kept in the feature name,
# but this cell applies no per-match multiplier: it only sums lag columns.

# Lag slots 60..51 are the ten IO columns that feed the l10 window (they must already exist on df_player1 from the IO l60 cell above).
io_l10_slots = range(60, 50, -1)
io_l10_pts_cols = [f"p_tot_pts_IO_{slot}" for slot in io_l10_slots]
io_l10_won_cols = [f"p_tot_pts_won_IO_{slot}" for slot in io_l10_slots]

# A row-wise sum skips NaN, so missing history is ignored rather than interpolated (preferred for this kind of metric).
# Matches between players with few prior matches on the surface are filtered out at the modeling stage, and the
# multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
df_player1["p_tot_pts_IO_l10_ws"] = df_player1[io_l10_pts_cols].sum(axis=1)
df_player1["p_tot_pts_won_IO_l10_ws"] = df_player1[io_l10_won_cols].sum(axis=1)
io_l10_won_share = df_player1["p_tot_pts_won_IO_l10_ws"] / df_player1["p_tot_pts_IO_l10_ws"]
df_player1["p_tot_pts_won%_l10_tw_ss_IO"] = (io_l10_won_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

# Remove every transient IO column: the four l60/l10 sums plus all 60 lag slots of both series.
io_all_slots = range(60, 0, -1)
io_transient_cols = (
    ["p_tot_pts_IO_l60_ws", "p_tot_pts_won_IO_l60_ws", "p_tot_pts_IO_l10_ws", "p_tot_pts_won_IO_l10_ws"]
    + [f"p_tot_pts_IO_{slot}" for slot in io_all_slots]
    + [f"p_tot_pts_won_IO_{slot}" for slot in io_all_slots]
)
df_player1 = df_player1.drop(columns=io_transient_cols)

In [23]:
# 'p_tot_pts_won%_l60_tw_nss'
# Provides time-weighted (TW), NON-surface-specific (NSS), mean TOTAL POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player1 = df_player1.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

df_player1["p_tot_pts_60"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-1)
df_player1["p_tot_pts_won_60"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-1)

df_player1["p_tot_pts_59"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-2)
df_player1["p_tot_pts_won_59"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-2)

df_player1["p_tot_pts_58"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-3)
df_player1["p_tot_pts_won_58"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-3)

df_player1["p_tot_pts_57"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-4)
df_player1["p_tot_pts_won_57"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-4)

df_player1["p_tot_pts_56"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-5)
df_player1["p_tot_pts_won_56"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-5)

df_player1["p_tot_pts_55"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-6)
df_player1["p_tot_pts_won_55"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-6)

df_player1["p_tot_pts_54"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-7)
df_player1["p_tot_pts_won_54"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-7)

df_player1["p_tot_pts_53"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-8)
df_player1["p_tot_pts_won_53"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-8)

df_player1["p_tot_pts_52"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-9)
df_player1["p_tot_pts_won_52"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-9)

df_player1["p_tot_pts_51"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-10)
df_player1["p_tot_pts_won_51"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-10)

df_player1["p_tot_pts_50"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-11)
df_player1["p_tot_pts_won_50"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-11)

df_player1["p_tot_pts_49"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-12)
df_player1["p_tot_pts_won_49"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-12)

df_player1["p_tot_pts_48"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-13)
df_player1["p_tot_pts_won_48"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-13)

df_player1["p_tot_pts_47"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-14)
df_player1["p_tot_pts_won_47"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-14)

df_player1["p_tot_pts_46"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-15)
df_player1["p_tot_pts_won_46"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-15)

df_player1["p_tot_pts_45"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-16)
df_player1["p_tot_pts_won_45"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-16)

df_player1["p_tot_pts_44"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-17)
df_player1["p_tot_pts_won_44"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-17)

# Slot 43 = the match 18 places back for this player (all surfaces). It is stored unscaled, like every other
# lag slot: in this no-decay workbook each prior match must enter the summed numerator and denominator with
# equal weight, so the slot number is not applied as a multiplier.
df_player1["p_tot_pts_43"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-18)
df_player1["p_tot_pts_won_43"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-18)

df_player1["p_tot_pts_42"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-19)
df_player1["p_tot_pts_won_42"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-19)

df_player1["p_tot_pts_41"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-20)
df_player1["p_tot_pts_won_41"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-20)

df_player1["p_tot_pts_40"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-21)
df_player1["p_tot_pts_won_40"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-21)

df_player1["p_tot_pts_39"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-22)
df_player1["p_tot_pts_won_39"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-22)

df_player1["p_tot_pts_38"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-23)
df_player1["p_tot_pts_won_38"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-23)

df_player1["p_tot_pts_37"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-24)
df_player1["p_tot_pts_won_37"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-24)

df_player1["p_tot_pts_36"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-25)
df_player1["p_tot_pts_won_36"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-25)

df_player1["p_tot_pts_35"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-26)
df_player1["p_tot_pts_won_35"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-26)

df_player1["p_tot_pts_34"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-27)
df_player1["p_tot_pts_won_34"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-27)

df_player1["p_tot_pts_33"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-28)
df_player1["p_tot_pts_won_33"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-28)

df_player1["p_tot_pts_32"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-29)
df_player1["p_tot_pts_won_32"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-29)

df_player1["p_tot_pts_31"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-30)
df_player1["p_tot_pts_won_31"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-30)

df_player1["p_tot_pts_30"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-31)
df_player1["p_tot_pts_won_30"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-31)

df_player1["p_tot_pts_29"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-32)
df_player1["p_tot_pts_won_29"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-32)

df_player1["p_tot_pts_28"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-33)
df_player1["p_tot_pts_won_28"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-33)

df_player1["p_tot_pts_27"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-34)
df_player1["p_tot_pts_won_27"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-34)

df_player1["p_tot_pts_26"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-35)
df_player1["p_tot_pts_won_26"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-35)

df_player1["p_tot_pts_25"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-36)
df_player1["p_tot_pts_won_25"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-36)

df_player1["p_tot_pts_24"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-37)
df_player1["p_tot_pts_won_24"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-37)

df_player1["p_tot_pts_23"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-38)
df_player1["p_tot_pts_won_23"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-38)

df_player1["p_tot_pts_22"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-39)
df_player1["p_tot_pts_won_22"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-39)

df_player1["p_tot_pts_21"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-40)
df_player1["p_tot_pts_won_21"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-40)

df_player1["p_tot_pts_20"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-41)
df_player1["p_tot_pts_won_20"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-41)

df_player1["p_tot_pts_19"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-42)
df_player1["p_tot_pts_won_19"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-42)

df_player1["p_tot_pts_18"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-43)
df_player1["p_tot_pts_won_18"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-43)

df_player1["p_tot_pts_17"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-44)
df_player1["p_tot_pts_won_17"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-44)

df_player1["p_tot_pts_16"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-45)
df_player1["p_tot_pts_won_16"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-45)

df_player1["p_tot_pts_15"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-46)
df_player1["p_tot_pts_won_15"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-46)

df_player1["p_tot_pts_14"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-47)
df_player1["p_tot_pts_won_14"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-47)

df_player1["p_tot_pts_13"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-48)
df_player1["p_tot_pts_won_13"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-48)

df_player1["p_tot_pts_12"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-49)
df_player1["p_tot_pts_won_12"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-49)

df_player1["p_tot_pts_11"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-50)
df_player1["p_tot_pts_won_11"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-50)

df_player1["p_tot_pts_10"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-51)
df_player1["p_tot_pts_won_10"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-51)

df_player1["p_tot_pts_9"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-52)
df_player1["p_tot_pts_won_9"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-52)

df_player1["p_tot_pts_8"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-53)
df_player1["p_tot_pts_won_8"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-53)

df_player1["p_tot_pts_7"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-54)
df_player1["p_tot_pts_won_7"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-54)

df_player1["p_tot_pts_6"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-55)
df_player1["p_tot_pts_won_6"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-55)

df_player1["p_tot_pts_5"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-56)
df_player1["p_tot_pts_won_5"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-56)

df_player1["p_tot_pts_4"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-57)
df_player1["p_tot_pts_won_4"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-57)

df_player1["p_tot_pts_3"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-58)
df_player1["p_tot_pts_won_3"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-58)

df_player1["p_tot_pts_2"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-59)
df_player1["p_tot_pts_won_2"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-59)

df_player1["p_tot_pts_1"] = df_player1.groupby(['p_id'])['m_tot_pts'].shift(-60)
df_player1["p_tot_pts_won_1"] = df_player1.groupby(['p_id'])['p_tot_pts_won'].shift(-60)

# Row-wise sum() skips NaN slots rather than interpolating them, which is the preferred
# behaviour for metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously
# played on a given surface will be filtered out, so very few modeled matches will lack this
# predictive feature. (There is also a multi-year ramp-up phase in which retrospective stats
# accrue; matches from those years are not used in the modeling phase.)
l60_slots = range(60, 0, -1)
l60_pts_played_cols = [f"p_tot_pts_{slot}" for slot in l60_slots]
l60_pts_won_cols = [f"p_tot_pts_won_{slot}" for slot in l60_slots]

df_player1["p_tot_pts_l60_ws"] = df_player1[l60_pts_played_cols].sum(axis=1)
df_player1["p_tot_pts_won_l60_ws"] = df_player1[l60_pts_won_cols].sum(axis=1)

# Percentage of total points won across the (up to) 60 slot columns, rounded to 2 decimals.
l60_pts_won_share = df_player1["p_tot_pts_won_l60_ws"] / df_player1["p_tot_pts_l60_ws"]
df_player1["p_tot_pts_won%_l60_tw_nss"] = (l60_pts_won_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted; the sums above apply no weights of their own)

In [24]:
# 'p_tot_pts_won%_l10_tw_nss'
# Time-weighted (TW), NON-surface-specific (SS), mean TOTAL POINTS WON performance of PLAYER
# over the 10 matches PRIOR TO the match being predicted (slot columns 60 down to 51).

# Row-wise sum() skips NaN slots rather than interpolating them, which is the preferred
# behaviour for metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously
# played on a given surface will be filtered out, so very few modeled matches will lack this
# predictive feature. (There is also a multi-year ramp-up phase in which retrospective stats
# accrue; matches from those years are not used in the modeling phase.)
l10_slots = range(60, 50, -1)
l10_pts_played_cols = [f"p_tot_pts_{slot}" for slot in l10_slots]
l10_pts_won_cols = [f"p_tot_pts_won_{slot}" for slot in l10_slots]

df_player1["p_tot_pts_l10_ws"] = df_player1[l10_pts_played_cols].sum(axis=1)
df_player1["p_tot_pts_won_l10_ws"] = df_player1[l10_pts_won_cols].sum(axis=1)

l10_pts_won_share = df_player1["p_tot_pts_won_l10_ws"] / df_player1["p_tot_pts_l10_ws"]
df_player1["p_tot_pts_won%_l10_tw_nss"] = (l10_pts_won_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four intermediate sums plus all 60 slot columns
# of each kind. Only the two percentage features are kept.
tot_pts_transient_cols = [
    "p_tot_pts_l60_ws",
    "p_tot_pts_won_l60_ws",
    "p_tot_pts_l10_ws",
    "p_tot_pts_won_l10_ws",
]
tot_pts_transient_cols += [f"p_tot_pts_{slot}" for slot in range(60, 0, -1)]
tot_pts_transient_cols += [f"p_tot_pts_won_{slot}" for slot in range(60, 0, -1)]
df_player1 = df_player1.drop(columns=tot_pts_transient_cols)

In [25]:
# Print a summary of df_player1 (index range, column names, non-null counts, dtypes).
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Data columns (total 76 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   t_id                         57068 non-null  int64         
 1   t_ident                      57068 non-null  int64         
 2   t_nm                         57068 non-null  object        
 3   t_co                         57068 non-null  object        
 4   t_GMT_diff                   57068 non-null  int64         
 5   t_surf                       57068 non-null  int64         
 6   t_ind                        57068 non-null  int64         
 7   t_alt                        57068 non-null  int64         
 8   t_draw_sz                    57068 non-null  int64         
 9   t_lvl                        57068 non-null  int64         
 10  m_bestof                     57068 non-null  int64         
 11  m_num                        57068 no

In [26]:
# 'p_1st_sv%_l60_tw_ss'
# Time-weighted (TW), surface-specific (SS), mean FIRST SERVE PERCENTAGE of PLAYER
# over the 60 matches PRIOR TO the match being predicted

# Descending sort on player, surface, date and round: within each (player, surface) group
# the rows run newest-first, so a negative shift pulls values from earlier matches.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

sv_pts_by_player_surf = df_player1.groupby(['p_id','t_surf'])['p_sv_pts']
first_sv_in_by_player_surf = df_player1.groupby(['p_id','t_surf'])['p_1st_sv_in']

# Slot 60 is the row one step away (shift(-1)); slot 1 is sixty steps away (shift(-60)).
# Columns are created interleaved (serve points, then first serves in) for each slot.
for lag in range(1, 61):
    slot = 61 - lag
    df_player1[f"p_sv_pts_{slot}"] = sv_pts_by_player_surf.shift(-lag)
    df_player1[f"p_1st_sv_in_{slot}"] = first_sv_in_by_player_surf.shift(-lag)

# Row-wise sum() skips NaN slots rather than interpolating them, which is the preferred
# behaviour for metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously
# played on a given surface will be filtered out, so very few modeled matches will lack this
# predictive feature. (There is also a multi-year ramp-up phase in which retrospective stats
# accrue; matches from those years are not used in the modeling phase.)
l60_sv_slots = range(60, 0, -1)
l60_sv_pts_cols = [f"p_sv_pts_{slot}" for slot in l60_sv_slots]
l60_first_sv_in_cols = [f"p_1st_sv_in_{slot}" for slot in l60_sv_slots]

df_player1["p_sv_pts_l60_ws"] = df_player1[l60_sv_pts_cols].sum(axis=1)
df_player1["p_1st_sv_in_l60_ws"] = df_player1[l60_first_sv_in_cols].sum(axis=1)

# First serves in as a percentage of serve points, rounded to 2 decimals.
l60_first_sv_share = df_player1["p_1st_sv_in_l60_ws"] / df_player1["p_sv_pts_l60_ws"]
df_player1["p_1st_sv%_l60_tw_ss"] = (l60_first_sv_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

In [27]:
# 'p_1st_sv%_l10_tw_ss'
# Time-weighted (TW), surface-specific (SS), mean FIRST SERVE PERCENTAGE of PLAYER
# over the 10 matches PRIOR TO the match being predicted (slot columns 60 down to 51).

# Row-wise sum() skips NaN slots rather than interpolating them, which is the preferred
# behaviour for metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously
# played on a given surface will be filtered out, so very few modeled matches will lack this
# predictive feature. (There is also a multi-year ramp-up phase in which retrospective stats
# accrue; matches from those years are not used in the modeling phase.)
l10_sv_slots = range(60, 50, -1)
l10_sv_pts_cols = [f"p_sv_pts_{slot}" for slot in l10_sv_slots]
l10_first_sv_in_cols = [f"p_1st_sv_in_{slot}" for slot in l10_sv_slots]

df_player1["p_sv_pts_l10_ws"] = df_player1[l10_sv_pts_cols].sum(axis=1)
df_player1["p_1st_sv_in_l10_ws"] = df_player1[l10_first_sv_in_cols].sum(axis=1)

l10_first_sv_share = df_player1["p_1st_sv_in_l10_ws"] / df_player1["p_sv_pts_l10_ws"]
df_player1["p_1st_sv%_l10_tw_ss"] = (l10_first_sv_share * 100).round(2)
# (ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_sv_pts_l60_ws", "p_1st_sv_in_l60_ws", "p_sv_pts_l10_ws", "p_1st_sv_in_l10_ws", "p_sv_pts_60", "p_sv_pts_59", "p_sv_pts_58", "p_sv_pts_57", "p_sv_pts_56", "p_sv_pts_55", "p_sv_pts_54", "p_sv_pts_53", "p_sv_pts_52", "p_sv_pts_51", "p_sv_pts_50", "p_sv_pts_49", "p_sv_pts_48", "p_sv_pts_47", "p_sv_pts_46", "p_sv_pts_45", "p_sv_pts_44", "p_sv_pts_43", "p_sv_pts_42", "p_sv_pts_41", "p_sv_pts_40", "p_sv_pts_39", "p_sv_pts_38", "p_sv_pts_37", "p_sv_pts_36", "p_sv_pts_35", "p_sv_pts_34", "p_sv_pts_33", "p_sv_pts_32", "p_sv_pts_31", "p_sv_pts_30", "p_sv_pts_29", "p_sv_pts_28", "p_sv_pts_27", "p_sv_pts_26", "p_sv_pts_25", "p_sv_pts_24", "p_sv_pts_23", "p_sv_pts_22", "p_sv_pts_21", "p_sv_pts_20", "p_sv_pts_19", "p_sv_pts_18", "p_sv_pts_17", "p_sv_pts_16", "p_sv_pts_15", "p_sv_pts_14", "p_sv_pts_13", "p_sv_pts_12", "p_sv_pts_11", "p_sv_pts_10", "p_sv_pts_9", "p_sv_pts_8", "p_sv_pts_7", "p_sv_pts_6", "p_sv_pts_5", "p_sv_pts_4", "p_sv_pts_3", "p_sv_pts_2", "p_sv_pts_1", "p_1st_sv_in_60", "p_1st_sv_in_59", "p_1st_sv_in_58", "p_1st_sv_in_57", "p_1st_sv_in_56", "p_1st_sv_in_55", "p_1st_sv_in_54", "p_1st_sv_in_53", "p_1st_sv_in_52", "p_1st_sv_in_51", "p_1st_sv_in_50", "p_1st_sv_in_49", "p_1st_sv_in_48", "p_1st_sv_in_47", "p_1st_sv_in_46", "p_1st_sv_in_45", "p_1st_sv_in_44", "p_1st_sv_in_43", "p_1st_sv_in_42", "p_1st_sv_in_41", "p_1st_sv_in_40", "p_1st_sv_in_39", "p_1st_sv_in_38", "p_1st_sv_in_37", "p_1st_sv_in_36", "p_1st_sv_in_35", "p_1st_sv_in_34", "p_1st_sv_in_33", "p_1st_sv_in_32", "p_1st_sv_in_31", "p_1st_sv_in_30", "p_1st_sv_in_29", "p_1st_sv_in_28", "p_1st_sv_in_27", "p_1st_sv_in_26", "p_1st_sv_in_25", "p_1st_sv_in_24", "p_1st_sv_in_23", "p_1st_sv_in_22", "p_1st_sv_in_21", "p_1st_sv_in_20", "p_1st_sv_in_19", "p_1st_sv_in_18", "p_1st_sv_in_17", "p_1st_sv_in_16", "p_1st_sv_in_15", "p_1st_sv_in_14", "p_1st_sv_in_13", "p_1st_sv_in_12", "p_1st_sv_in_11", "p_1st_sv_in_10", "p_1st_sv_in_9", "p_1st_sv_in_8", "p_1st_sv_in_7", "p_1st_sv_in_6", 
"p_1st_sv_in_5", "p_1st_sv_in_4", "p_1st_sv_in_3", "p_1st_sv_in_2", "p_1st_sv_in_1"], axis = 1)

In [28]:
# 'p_1st_sv%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor specific (IO) mean FIRST SERVE PERCENTAGE of PLAYER over the 60 matches PRIOR TO the match being predicted
# NOTE: the sums in this cell are plain sums (no per-match weights are applied here), so every match in the
# window counts equally; the "tw"/"ws" naming is kept unchanged.

# Descending sort: within each (player, surface, indoor/outdoor) group the rows that FOLLOW a match are the
# ones that sort lower on m_date / m_rd_num (assumes m_date sorts chronologically as stored -- TODO confirm).
df_player1 = df_player1.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# Build the grouping once and reuse it for every lag instead of re-grouping 120 times.
by_player_surf_io = df_player1.groupby(['p_id','t_surf','t_ind'])

# shift(-lag) pulls the value from `lag` rows further down the same group, i.e. from a match that sorts
# earlier. Lag 1 is stored in slot 60, lag 2 in slot 59, ... lag 60 in slot 1. Rows without that many
# following rows in their group get NaN.
for lag in range(1, 61):
    slot = 61 - lag
    df_player1[f"p_sv_pts_{slot}"] = by_player_surf_io['p_sv_pts'].shift(-lag)
    df_player1[f"p_1st_sv_in_{slot}"] = by_player_surf_io['p_1st_sv_in'].shift(-lag)

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type. As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
l60_slots = range(60, 0, -1)
df_player1["p_sv_pts_l60_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in l60_slots]].sum(axis=1)
df_player1["p_1st_sv_in_l60_ws"] = df_player1[[f"p_1st_sv_in_{slot}" for slot in l60_slots]].sum(axis=1)
# First-serve percentage over the window = first serves in / service points, as a percentage with 2 decimals.
df_player1["p_1st_sv%_l60_tw_ss_IO"] = ((df_player1["p_1st_sv_in_l60_ws"]/df_player1["p_sv_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted)

In [29]:
# 'p_1st_sv%_l10_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor specific (IO) mean FIRST SERVE PERCENTAGE of PLAYER over the 10 matches PRIOR TO the match being predicted
# NOTE: the sums in this cell are plain sums (no per-match weights are applied here), so every match in the
# window counts equally; the "tw"/"ws" naming is kept unchanged.

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type. As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# Slots 60..51 are the lag columns built with shift(-1)..shift(-10) in the preceding cell, i.e. the ten
# rows that directly follow each match in that cell's descending sort.
l10_slots = range(60, 50, -1)
df_player1["p_sv_pts_l10_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in l10_slots]].sum(axis=1)
df_player1["p_1st_sv_in_l10_ws"] = df_player1[[f"p_1st_sv_in_{slot}" for slot in l10_slots]].sum(axis=1)
# First-serve percentage over the window = first serves in / service points, as a percentage with 2 decimals.
df_player1["p_1st_sv%_l10_tw_ss_IO"] = ((df_player1["p_1st_sv_in_l10_ws"]/df_player1["p_sv_pts_l10_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four window totals plus all 60 lag columns of each stat
all_slots = range(60, 0, -1)
transient_cols = ["p_sv_pts_l60_ws", "p_1st_sv_in_l60_ws", "p_sv_pts_l10_ws", "p_1st_sv_in_l10_ws"]
transient_cols += [f"p_sv_pts_{slot}" for slot in all_slots]
transient_cols += [f"p_1st_sv_in_{slot}" for slot in all_slots]
df_player1 = df_player1.drop(columns=transient_cols)

In [30]:
# 'p_1st_sv%_yielded_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS) mean FIRST SERVE PERCENTAGE YIELDED (as RETURNER) of PLAYER over the 60 matches PRIOR TO the match being predicted
# NOTE: the sums in this cell are plain sums (no per-match weights are applied here), so every match in the
# window counts equally; the "tw"/"ws" naming is kept unchanged.

# Descending sort: within each (player, surface) group the rows that FOLLOW a match are the ones that sort
# lower on m_date / m_rd_num (assumes m_date sorts chronologically as stored -- TODO confirm).
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the grouping once and reuse it for every lag instead of re-grouping 120 times.
by_player_surf = df_player1.groupby(['p_id','t_surf'])

# shift(-lag) pulls the value from `lag` rows further down the same group, i.e. from a match that sorts
# earlier. Lag 1 is stored in slot 60, lag 2 in slot 59, ... lag 60 in slot 1. Rows without that many
# following rows in their group get NaN.
for lag in range(1, 61):
    slot = 61 - lag
    df_player1[f"p_ret_pts_{slot}"] = by_player_surf['p_ret_pts'].shift(-lag)
    df_player1[f"opp_1st_sv_in_{slot}"] = by_player_surf['opp_1st_sv_in'].shift(-lag)

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type. As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
l60_slots = range(60, 0, -1)
df_player1["p_ret_pts_l60_ws"] = df_player1[[f"p_ret_pts_{slot}" for slot in l60_slots]].sum(axis=1)
df_player1["opp_1st_sv_in_l60_ws"] = df_player1[[f"opp_1st_sv_in_{slot}" for slot in l60_slots]].sum(axis=1)
df_player1["p_1st_sv%_yielded_l60_tw_ss"] = ((df_player1["opp_1st_sv_in_l60_ws"]/df_player1["p_ret_pts_l60_ws"])*100).round(2)
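#(ws = "weighted sum", tw = "time-weighted": suffixes kept from the time-decay version of this workbook; in this no-decay version every lagged match counts equally, so the *_ws columns are plain sums)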

In [31]:
# 'p_1st_sv%_yielded_l10_tw_ss'
# Surface-specific (SS) FIRST SERVE PERCENTAGE YIELDED (AS RETURNER) by PLAYER, pooled over the (up to) 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the column name, but in this no-decay workbook the lagged matches are summed with equal weight.

# Lag slots 60..51 are reused from the l60 feature above; presumably they hold the ten matches immediately preceding the current one (slot k = k-th from the oldest end of the 60-match window).
last10_slots = list(range(60, 50, -1))
last10_ret_pts_cols = [f"p_ret_pts_{slot}" for slot in last10_slots]
last10_first_in_cols = [f"opp_1st_sv_in_{slot}" for slot in last10_slots]

# Row-wise sum() skips NaN rather than interpolating, so a player with fewer than 10 prior matches on the surface is still scored from the matches that do exist.
# A row with no prior matches at all sums to 0 in both columns and the ratio below comes out as NaN (feature absent).
# Author's note: matches between players with little history on a surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so few modeled matches lack this feature.
df_player1["p_ret_pts_l10_ws"] = df_player1[last10_ret_pts_cols].sum(axis=1)
df_player1["opp_1st_sv_in_l10_ws"] = df_player1[last10_first_in_cols].sum(axis=1)

# Pooled percentage: opponents' first serves in / return points played, over the window.
df_player1["p_1st_sv%_yielded_l10_tw_ss"] = ((df_player1["opp_1st_sv_in_l10_ws"]/df_player1["p_ret_pts_l10_ws"])*100).round(2)
#(ws = "weighted sum" suffix, retained from the time-decay workbook; here it is a plain sum)

# Remove the scratch columns: the four window sums plus all 60 lag slots of each stat.
all_slots = range(60, 0, -1)
scratch_cols = ["p_ret_pts_l60_ws", "opp_1st_sv_in_l60_ws", "p_ret_pts_l10_ws", "opp_1st_sv_in_l10_ws"]
scratch_cols += [f"p_ret_pts_{slot}" for slot in all_slots]
scratch_cols += [f"opp_1st_sv_in_{slot}" for slot in all_slots]
df_player1 = df_player1.drop(columns=scratch_cols)

In [32]:
# 'p_1st_sv%_yielded_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) FIRST SERVE PERCENTAGE YIELDED (as RETURNER) by PLAYER, pooled over the (up to) 60 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the column name, but in this no-decay workbook the lagged matches are summed with equal weight.

# Descending sort inside each player / surface / indoor-outdoor group: the row `lag` positions further down is the
# `lag`-th preceding match (assuming m_date and m_rd_num sort chronologically), which is what shift(-lag) picks up below.
df_player1 = df_player1.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

io_group_keys = ['p_id', 't_surf', 't_ind']
lagged_stats = ('p_ret_pts', 'opp_1st_sv_in')

# Build one scratch column per (stat, lag). Slot numbering runs backwards: slot 60 <- shift(-1) (nearest prior match),
# slot 1 <- shift(-60) (furthest prior match in the window). Rows near the end of a group get NaN where no such match exists.
for lag in range(1, 61):
    slot = 61 - lag
    for stat_col in lagged_stats:
        df_player1[f"{stat_col}_{slot}"] = df_player1.groupby(io_group_keys)[stat_col].shift(-lag)

ret_pts_lag_cols = [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
first_in_lag_cols = [f"opp_1st_sv_in_{slot}" for slot in range(60, 0, -1)]

# Row-wise sum() skips NaN rather than interpolating, so a player with fewer than 60 prior matches in this condition is still scored from the matches that do exist.
# A row with no prior matches at all sums to 0 in both columns and the ratio below comes out as NaN (feature absent).
# Author's note: matches between players with little history on a surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so few modeled matches lack this feature.
df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_lag_cols].sum(axis=1)
df_player1["opp_1st_sv_in_l60_ws"] = df_player1[first_in_lag_cols].sum(axis=1)

# Pooled percentage: opponents' first serves in / return points played, over the window.
df_player1["p_1st_sv%_yielded_l60_tw_ss_IO"] = ((df_player1["opp_1st_sv_in_l60_ws"]/df_player1["p_ret_pts_l60_ws"])*100).round(2)
#(ws = "weighted sum" suffix, retained from the time-decay workbook; here it is a plain sum)

In [33]:
# 'p_1st_sv%_yielded_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) FIRST SERVE PERCENTAGE YIELDED (AS RETURNER) by PLAYER, pooled over the (up to) 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the column name, but in this no-decay workbook the lagged matches are summed with equal weight.

# Lag slots 60..51 were filled by shift(-1)..shift(-10) in the l60 IO cell above, i.e. the ten rows nearest the current match in that cell's sort order.
last10_slots = list(range(60, 50, -1))
last10_ret_pts_cols = [f"p_ret_pts_{slot}" for slot in last10_slots]
last10_first_in_cols = [f"opp_1st_sv_in_{slot}" for slot in last10_slots]

# Row-wise sum() skips NaN rather than interpolating, so a player with fewer than 10 prior matches in this condition is still scored from the matches that do exist.
# A row with no prior matches at all sums to 0 in both columns and the ratio below comes out as NaN (feature absent).
# Author's note: matches between players with little history on a surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so few modeled matches lack this feature.
df_player1["p_ret_pts_l10_ws"] = df_player1[last10_ret_pts_cols].sum(axis=1)
df_player1["opp_1st_sv_in_l10_ws"] = df_player1[last10_first_in_cols].sum(axis=1)

# Pooled percentage: opponents' first serves in / return points played, over the window.
df_player1["p_1st_sv%_yielded_l10_tw_ss_IO"] = ((df_player1["opp_1st_sv_in_l10_ws"]/df_player1["p_ret_pts_l10_ws"])*100).round(2)
#(ws = "weighted sum" suffix, retained from the time-decay workbook; here it is a plain sum)

# Remove the scratch columns: the four window sums plus all 60 lag slots of each stat.
all_slots = range(60, 0, -1)
scratch_cols = ["p_ret_pts_l60_ws", "opp_1st_sv_in_l60_ws", "p_ret_pts_l10_ws", "opp_1st_sv_in_l10_ws"]
scratch_cols += [f"p_ret_pts_{slot}" for slot in all_slots]
scratch_cols += [f"opp_1st_sv_in_{slot}" for slot in all_slots]
df_player1 = df_player1.drop(columns=scratch_cols)

In [34]:
# 'p_sv_pts_won%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

df_player1["p_sv_pts_60"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-1)
df_player1["p_sv_pts_won_60"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-1)

df_player1["p_sv_pts_59"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-2)
df_player1["p_sv_pts_won_59"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-2)

df_player1["p_sv_pts_58"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-3)
df_player1["p_sv_pts_won_58"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-3)

df_player1["p_sv_pts_57"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-4)
df_player1["p_sv_pts_won_57"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-4)

df_player1["p_sv_pts_56"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-5)
df_player1["p_sv_pts_won_56"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-5)

df_player1["p_sv_pts_55"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-6)
df_player1["p_sv_pts_won_55"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-6)

df_player1["p_sv_pts_54"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-7)
df_player1["p_sv_pts_won_54"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-7)

df_player1["p_sv_pts_53"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-8)
df_player1["p_sv_pts_won_53"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-8)

df_player1["p_sv_pts_52"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-9)
df_player1["p_sv_pts_won_52"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-9)

df_player1["p_sv_pts_51"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-10)
df_player1["p_sv_pts_won_51"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-10)

df_player1["p_sv_pts_50"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-11)
df_player1["p_sv_pts_won_50"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-11)

df_player1["p_sv_pts_49"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-12)
df_player1["p_sv_pts_won_49"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-12)

df_player1["p_sv_pts_48"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-13)
df_player1["p_sv_pts_won_48"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-13)

df_player1["p_sv_pts_47"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-14)
df_player1["p_sv_pts_won_47"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-14)

df_player1["p_sv_pts_46"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-15)
df_player1["p_sv_pts_won_46"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-15)

df_player1["p_sv_pts_45"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-16)
df_player1["p_sv_pts_won_45"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-16)

df_player1["p_sv_pts_44"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-17)
df_player1["p_sv_pts_won_44"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-17)

df_player1["p_sv_pts_43"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-18)
df_player1["p_sv_pts_won_43"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-18)

df_player1["p_sv_pts_42"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-19)
df_player1["p_sv_pts_won_42"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts_won'].shift(-19)

df_player1["p_sv_pts_41"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-20)
# Remaining lag columns for the surface-specific window. Slot n of a statistic holds the
# value found (61 - n) rows further down within the same (p_id, t_surf) group, so slot 41
# is 20 rows down and slot 1 is 60 rows down. Slots 60-42 (and the total-points column of
# slot 41) are assumed to have been created above in this cell.
_ss_groups = df_player1.groupby(['p_id','t_surf'])

df_player1["p_sv_pts_won_41"] = _ss_groups['p_sv_pts_won'].shift(-20)

for _ss_lag in range(21, 61):
    _ss_slot = 61 - _ss_lag
    df_player1[f"p_sv_pts_{_ss_slot}"] = _ss_groups['p_sv_pts'].shift(-_ss_lag)
    df_player1[f"p_sv_pts_won_{_ss_slot}"] = _ss_groups['p_sv_pts_won'].shift(-_ss_lag)

# Row-wise sum over all 60 slots. DataFrame.sum skips NaN by default, so slots with no
# match available are simply left out rather than interpolated, which is preferred for
# metrics of this type. Author's note: in the modeling stage, matches between players with
# few previous matches on a given surface are filtered out, and the first years only serve
# as a ramp-up period in which retrospective stats accrue, so very few modeled matches
# should lack this feature.
_ss_total_cols = [f"p_sv_pts_{slot}" for slot in range(60, 0, -1)]
_ss_won_cols = [f"p_sv_pts_won_{slot}" for slot in range(60, 0, -1)]

df_player1["p_sv_pts_l60_ws"] = df_player1[_ss_total_cols].sum(axis=1)
df_player1["p_sv_pts_won_l60_ws"] = df_player1[_ss_won_cols].sum(axis=1)

# Serve points won as a percentage of serve points played, rounded to 2 decimals.
_ss_l60_ratio = df_player1["p_sv_pts_won_l60_ws"] / df_player1["p_sv_pts_l60_ws"]
df_player1["p_sv_pts_won%_l60_tw_ss"] = (_ss_l60_ratio * 100).round(2)
#(ws = weighted sum; tw = time-weighted)

In [35]:
# 'p_sv_pts_won%_l10_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# Only slots 60-51 of the lag columns feed the 10-match version. DataFrame.sum skips NaN
# by default, so missing slots are left out rather than interpolated, which is preferred
# for metrics of this type. Author's note: in the modeling stage, matches between players
# with few previous matches on a given surface are filtered out, and the first years only
# serve as a ramp-up period in which retrospective stats accrue, so very few modeled
# matches should lack this feature.
_ss_l10_total_cols = [f"p_sv_pts_{slot}" for slot in range(60, 50, -1)]
_ss_l10_won_cols = [f"p_sv_pts_won_{slot}" for slot in range(60, 50, -1)]

df_player1["p_sv_pts_l10_ws"] = df_player1[_ss_l10_total_cols].sum(axis=1)
df_player1["p_sv_pts_won_l10_ws"] = df_player1[_ss_l10_won_cols].sum(axis=1)

# Serve points won as a percentage of serve points played, rounded to 2 decimals.
_ss_l10_ratio = df_player1["p_sv_pts_won_l10_ws"] / df_player1["p_sv_pts_l10_ws"]
df_player1["p_sv_pts_won%_l10_tw_ss"] = (_ss_l10_ratio * 100).round(2)
#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four intermediate sums plus all 60 lag slots
# of both statistics. Only the finished percentage features stay on the frame.
_ss_transient_cols = (
    ["p_sv_pts_l60_ws", "p_sv_pts_won_l60_ws", "p_sv_pts_l10_ws", "p_sv_pts_won_l10_ws"]
    + [f"p_sv_pts_{slot}" for slot in range(60, 0, -1)]
    + [f"p_sv_pts_won_{slot}" for slot in range(60, 0, -1)]
)
df_player1 = df_player1.drop(columns=_ss_transient_cols)

In [36]:
# 'p_sv_pts_won%_l60_tw_ss_IO'
# Provides surface-specific (SS), indoor/outdoor (IO) specific mean SERVE POINTS WON performance of PLAYER
# over the 60 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) / "ws" (weighted sum) labels are retained in the column names, but the
# sums computed in this cell are plain, unweighted sums: every match in the window counts equally.

# Sort descending so that, within each (player, surface, indoor/outdoor) group, rows further
# down are earlier matches; shift(-k) then pulls the value from k rows further down.
# NOTE(review): assumes m_date sorts chronologically as stored (e.g. ISO-formatted strings) - confirm.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Build the grouping once; it is the same for every lag, so there is no need to redo it 120 times.
_io_groups = df_player1.groupby(['p_id','t_surf','t_ind'])
_io_sv_pts = _io_groups['p_sv_pts']
_io_sv_pts_won = _io_groups['p_sv_pts_won']

# Slot n holds the value from (61 - n) rows down: slot 60 = 1 row down, ..., slot 1 = 60 rows down.
# Rows without that many later rows in their group get NaN.
for _io_lag in range(1, 61):
    _io_slot = 61 - _io_lag
    df_player1[f"p_sv_pts_{_io_slot}"] = _io_sv_pts.shift(-_io_lag)
    df_player1[f"p_sv_pts_won_{_io_slot}"] = _io_sv_pts_won.shift(-_io_lag)

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type. As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
_io_total_cols = [f"p_sv_pts_{slot}" for slot in range(60, 0, -1)]
_io_won_cols = [f"p_sv_pts_won_{slot}" for slot in range(60, 0, -1)]

df_player1["p_sv_pts_l60_ws"] = df_player1[_io_total_cols].sum(axis=1)
df_player1["p_sv_pts_won_l60_ws"] = df_player1[_io_won_cols].sum(axis=1)

# Serve points won as a percentage of serve points played, rounded to 2 decimals.
# A window with no serve points at all gives 0/0, i.e. NaN, for the feature.
df_player1["p_sv_pts_won%_l60_tw_ss_IO"] = ((df_player1["p_sv_pts_won_l60_ws"]/df_player1["p_sv_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted)

In [37]:
# 'p_sv_pts_won%_l10_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# Only slots 60-51 of the lag columns feed the 10-match version. DataFrame.sum skips NaN
# by default, so missing slots are left out rather than interpolated, which is preferred
# for metrics of this type. Author's note: in the modeling stage, matches between players
# with few previous matches on a given surface are filtered out, and the first years only
# serve as a ramp-up period in which retrospective stats accrue, so very few modeled
# matches should lack this feature.
_io_l10_total_cols = [f"p_sv_pts_{slot}" for slot in range(60, 50, -1)]
_io_l10_won_cols = [f"p_sv_pts_won_{slot}" for slot in range(60, 50, -1)]

df_player1["p_sv_pts_l10_ws"] = df_player1[_io_l10_total_cols].sum(axis=1)
df_player1["p_sv_pts_won_l10_ws"] = df_player1[_io_l10_won_cols].sum(axis=1)

# Serve points won as a percentage of serve points played, rounded to 2 decimals.
_io_l10_ratio = df_player1["p_sv_pts_won_l10_ws"] / df_player1["p_sv_pts_l10_ws"]
df_player1["p_sv_pts_won%_l10_tw_ss_IO"] = (_io_l10_ratio * 100).round(2)
#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_sv_pts_l60_ws", "p_sv_pts_won_l60_ws", "p_sv_pts_l10_ws", "p_sv_pts_won_l10_ws", "p_sv_pts_60", "p_sv_pts_59", "p_sv_pts_58", "p_sv_pts_57", "p_sv_pts_56", "p_sv_pts_55", "p_sv_pts_54", "p_sv_pts_53", "p_sv_pts_52", "p_sv_pts_51", "p_sv_pts_50", "p_sv_pts_49", "p_sv_pts_48", "p_sv_pts_47", "p_sv_pts_46", "p_sv_pts_45", "p_sv_pts_44", "p_sv_pts_43", "p_sv_pts_42", "p_sv_pts_41", "p_sv_pts_40", "p_sv_pts_39", "p_sv_pts_38", "p_sv_pts_37", "p_sv_pts_36", "p_sv_pts_35", "p_sv_pts_34", "p_sv_pts_33", "p_sv_pts_32", "p_sv_pts_31", "p_sv_pts_30", "p_sv_pts_29", "p_sv_pts_28", "p_sv_pts_27", "p_sv_pts_26", "p_sv_pts_25", "p_sv_pts_24", "p_sv_pts_23", "p_sv_pts_22", "p_sv_pts_21", "p_sv_pts_20", "p_sv_pts_19", "p_sv_pts_18", "p_sv_pts_17", "p_sv_pts_16", "p_sv_pts_15", "p_sv_pts_14", "p_sv_pts_13", "p_sv_pts_12", "p_sv_pts_11", "p_sv_pts_10", "p_sv_pts_9", "p_sv_pts_8", "p_sv_pts_7", "p_sv_pts_6", "p_sv_pts_5", "p_sv_pts_4", "p_sv_pts_3", "p_sv_pts_2", "p_sv_pts_1", "p_sv_pts_won_60", "p_sv_pts_won_59", "p_sv_pts_won_58", "p_sv_pts_won_57", "p_sv_pts_won_56", "p_sv_pts_won_55", "p_sv_pts_won_54", "p_sv_pts_won_53", "p_sv_pts_won_52", "p_sv_pts_won_51", "p_sv_pts_won_50", "p_sv_pts_won_49", "p_sv_pts_won_48", "p_sv_pts_won_47", "p_sv_pts_won_46", "p_sv_pts_won_45", "p_sv_pts_won_44", "p_sv_pts_won_43", "p_sv_pts_won_42", "p_sv_pts_won_41", "p_sv_pts_won_40", "p_sv_pts_won_39", "p_sv_pts_won_38", "p_sv_pts_won_37", "p_sv_pts_won_36", "p_sv_pts_won_35", "p_sv_pts_won_34", "p_sv_pts_won_33", "p_sv_pts_won_32", "p_sv_pts_won_31", "p_sv_pts_won_30", "p_sv_pts_won_29", "p_sv_pts_won_28", "p_sv_pts_won_27", "p_sv_pts_won_26", "p_sv_pts_won_25", "p_sv_pts_won_24", "p_sv_pts_won_23", "p_sv_pts_won_22", "p_sv_pts_won_21", "p_sv_pts_won_20", "p_sv_pts_won_19", "p_sv_pts_won_18", "p_sv_pts_won_17", "p_sv_pts_won_16", "p_sv_pts_won_15", "p_sv_pts_won_14", "p_sv_pts_won_13", "p_sv_pts_won_12", "p_sv_pts_won_11", "p_sv_pts_won_10", "p_sv_pts_won_9", 
"p_sv_pts_won_8", "p_sv_pts_won_7", "p_sv_pts_won_6", "p_sv_pts_won_5", "p_sv_pts_won_4", "p_sv_pts_won_3", "p_sv_pts_won_2", "p_sv_pts_won_1"], axis = 1)

In [38]:
# 'p_1st_sv_pts_won%_l60_tw_ss'
# Surface-specific (SS) FIRST SERVE POINTS WON percentage of PLAYER over the 60 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) / "ws" (weighted sum) tags are kept in the column names, but this no-decay notebook pools the matches with equal weight:
# the feature is (total first-serve points won) / (total first serves in) * 100, not a mean of per-match percentages.

# Newest-first ordering inside each (player, surface) group, so that a negative shift reaches back to earlier matches.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the 60-match look-back. Slot numbering runs backwards in time:
# slot 60 = the immediately preceding match (shift(-1)), ..., slot 1 = 60 matches back (shift(-60)).
# The groupby is created once and reused for every lag; rows with fewer prior matches get NaN in the older slots.
first_sv_ss_groups = df_player1.groupby(['p_id','t_surf'])
first_sv_ss_in = first_sv_ss_groups['p_1st_sv_in']
first_sv_ss_won = first_sv_ss_groups['p_1st_sv_pts_won']

first_sv_ss_history = {}
for lag in range(1, 61):
    slot = 61 - lag
    first_sv_ss_history[f"p_1st_sv_pts_{slot}"] = first_sv_ss_in.shift(-lag)
    first_sv_ss_history[f"p_1st_sv_pts_won_{slot}"] = first_sv_ss_won.shift(-lag)

# Attach the history columns in the same order as before (pts_60, won_60, pts_59, won_59, ...).
for history_col, history_values in first_sv_ss_history.items():
    df_player1[history_col] = history_values

first_sv_ss_l60_slots = range(60, 0, -1)
first_sv_ss_l60_pts_cols = [f"p_1st_sv_pts_{slot}" for slot in first_sv_ss_l60_slots]
first_sv_ss_l60_won_cols = [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_ss_l60_slots]

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type: a player with fewer than 60 prior matches is pooled over whatever history exists, and with no history at all both sums are 0 so the percentage comes out as NaN (0/0). As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_1st_sv_pts_l60_ws"] = df_player1[first_sv_ss_l60_pts_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won_l60_ws"] = df_player1[first_sv_ss_l60_won_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won%_l60_tw_ss"] = ((df_player1["p_1st_sv_pts_won_l60_ws"]/df_player1["p_1st_sv_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted -- naming only; all matches carry equal weight in this notebook)

In [39]:
# 'p_1st_sv_pts_won%_l10_tw_ss'
# Surface-specific (SS) FIRST SERVE POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) / "ws" (weighted sum) tags are kept in the column names, but this no-decay notebook pools the matches with equal weight:
# the feature is (total first-serve points won) / (total first serves in) * 100, not a mean of per-match percentages.

# Reuses the per-match history columns built by the l60 cell just above, where slot 60 is the most recent prior match (shift(-1)),
# so slots 60..51 are the 10 most recent prior matches on the same surface.
first_sv_ss_l10_slots = range(60, 50, -1)
first_sv_ss_l10_pts_cols = [f"p_1st_sv_pts_{slot}" for slot in first_sv_ss_l10_slots]
first_sv_ss_l10_won_cols = [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_ss_l10_slots]

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type: a player with fewer than 10 prior matches is pooled over whatever history exists, and with no history at all both sums are 0 so the percentage comes out as NaN (0/0). As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_1st_sv_pts_l10_ws"] = df_player1[first_sv_ss_l10_pts_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won_l10_ws"] = df_player1[first_sv_ss_l10_won_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won%_l10_tw_ss"] = ((df_player1["p_1st_sv_pts_won_l10_ws"]/df_player1["p_1st_sv_pts_l10_ws"])*100).round(2)

# Deleting the many transient columns: the four intermediate sums plus all 60 per-match slots of each history series.
first_sv_ss_all_slots = range(60, 0, -1)
first_sv_ss_transient_cols = (
    ["p_1st_sv_pts_l60_ws", "p_1st_sv_pts_won_l60_ws", "p_1st_sv_pts_l10_ws", "p_1st_sv_pts_won_l10_ws"]
    + [f"p_1st_sv_pts_{slot}" for slot in first_sv_ss_all_slots]
    + [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_ss_all_slots]
)
df_player1 = df_player1.drop(columns=first_sv_ss_transient_cols)

In [40]:
# 'p_1st_sv_pts_won%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean FIRST SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

df_player1["p_1st_sv_pts_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-1)
df_player1["p_1st_sv_pts_won_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-1)

df_player1["p_1st_sv_pts_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-2)
df_player1["p_1st_sv_pts_won_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-2)

df_player1["p_1st_sv_pts_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-3)
df_player1["p_1st_sv_pts_won_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-3)

df_player1["p_1st_sv_pts_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-4)
df_player1["p_1st_sv_pts_won_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-4)

df_player1["p_1st_sv_pts_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-5)
df_player1["p_1st_sv_pts_won_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-5)

df_player1["p_1st_sv_pts_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-6)
df_player1["p_1st_sv_pts_won_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-6)

df_player1["p_1st_sv_pts_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-7)
df_player1["p_1st_sv_pts_won_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-7)

df_player1["p_1st_sv_pts_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-8)
df_player1["p_1st_sv_pts_won_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-8)

df_player1["p_1st_sv_pts_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-9)
df_player1["p_1st_sv_pts_won_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-9)

df_player1["p_1st_sv_pts_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-10)
df_player1["p_1st_sv_pts_won_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-10)

df_player1["p_1st_sv_pts_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-11)
df_player1["p_1st_sv_pts_won_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-11)

df_player1["p_1st_sv_pts_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-12)
df_player1["p_1st_sv_pts_won_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-12)

df_player1["p_1st_sv_pts_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-13)
df_player1["p_1st_sv_pts_won_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-13)

df_player1["p_1st_sv_pts_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-14)
df_player1["p_1st_sv_pts_won_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-14)

df_player1["p_1st_sv_pts_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-15)
df_player1["p_1st_sv_pts_won_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-15)

df_player1["p_1st_sv_pts_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-16)
df_player1["p_1st_sv_pts_won_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-16)

df_player1["p_1st_sv_pts_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-17)
df_player1["p_1st_sv_pts_won_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-17)

df_player1["p_1st_sv_pts_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-18)
df_player1["p_1st_sv_pts_won_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-18)

df_player1["p_1st_sv_pts_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-19)
df_player1["p_1st_sv_pts_won_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-19)

df_player1["p_1st_sv_pts_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-20)
df_player1["p_1st_sv_pts_won_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-20)

df_player1["p_1st_sv_pts_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-21)
df_player1["p_1st_sv_pts_won_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-21)

df_player1["p_1st_sv_pts_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-22)
df_player1["p_1st_sv_pts_won_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-22)

df_player1["p_1st_sv_pts_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-23)
df_player1["p_1st_sv_pts_won_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-23)

df_player1["p_1st_sv_pts_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-24)
df_player1["p_1st_sv_pts_won_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-24)

df_player1["p_1st_sv_pts_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-25)
df_player1["p_1st_sv_pts_won_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-25)

df_player1["p_1st_sv_pts_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-26)
df_player1["p_1st_sv_pts_won_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-26)

df_player1["p_1st_sv_pts_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-27)
df_player1["p_1st_sv_pts_won_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-27)

df_player1["p_1st_sv_pts_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-28)
df_player1["p_1st_sv_pts_won_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-28)

df_player1["p_1st_sv_pts_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-29)
df_player1["p_1st_sv_pts_won_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-29)

df_player1["p_1st_sv_pts_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_in'].shift(-30)
df_player1["p_1st_sv_pts_won_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_sv_pts_won'].shift(-30)

# Per-match lag columns for first-serve points (taken from 'p_1st_sv_in') and first-serve points won, slots 30 down to 1.
# Slot k is filled from the row (61 - k) positions further down within the same
# (player, surface, indoor/outdoor) group: slot 30 uses shift(-31), ..., slot 1 uses shift(-60).
# Rows with no such neighbour in their group receive NaN.
first_sv_io_keys = ['p_id','t_surf','t_ind']
for first_sv_slot in range(30, 0, -1):
    first_sv_rows_down = 61 - first_sv_slot
    df_player1[f"p_1st_sv_pts_{first_sv_slot}"] = (
        df_player1.groupby(first_sv_io_keys)['p_1st_sv_in'].shift(-first_sv_rows_down)
    )
    df_player1[f"p_1st_sv_pts_won_{first_sv_slot}"] = (
        df_player1.groupby(first_sv_io_keys)['p_1st_sv_pts_won'].shift(-first_sv_rows_down)
    )

# Row-wise sums over all 60 lag slots (60 down to 1). Using sum skips NaN slots instead of interpolating them,
# which is preferred in metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out, so there will be very few modeled matches where this predictive feature is absent
# (there is also a ramp-up phase of multiple years where retrospective stats accrue but whose matches are not modeled).
first_sv_l60_slots = range(60, 0, -1)
first_sv_pts_l60_cols = [f"p_1st_sv_pts_{slot}" for slot in first_sv_l60_slots]
first_sv_won_l60_cols = [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_l60_slots]

df_player1["p_1st_sv_pts_l60_ws"] = df_player1[first_sv_pts_l60_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won_l60_ws"] = df_player1[first_sv_won_l60_cols].sum(axis=1)

# Percentage of first-serve points won across the window, rounded to two decimals.
first_sv_won_share_l60 = df_player1["p_1st_sv_pts_won_l60_ws"] / df_player1["p_1st_sv_pts_l60_ws"]
df_player1["p_1st_sv_pts_won%_l60_tw_ss_IO"] = (first_sv_won_share_l60 * 100).round(2)
#(ws = weighted sum; tw = time-weighted -- tags kept in the names; the sums above are plain, unweighted sums)

In [41]:
# 'p_1st_sv_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific FIRST SERVE POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted
# The 'tw' (time-weighted) and 'ws' (weighted sum) tags are kept in the column names, but the sums below are plain, unweighted row sums.

# Only lag slots 60 down to 51 of the per-match lag columns feed the 10-match window.
# Using sum skips NaN slots instead of interpolating them, which is preferred in metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out, so there will be very few modeled matches where this predictive feature is absent
# (there is also a ramp-up phase of multiple years where retrospective stats accrue but whose matches are not modeled).
first_sv_l10_slots = range(60, 50, -1)
first_sv_pts_l10_cols = [f"p_1st_sv_pts_{slot}" for slot in first_sv_l10_slots]
first_sv_won_l10_cols = [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_l10_slots]

df_player1["p_1st_sv_pts_l10_ws"] = df_player1[first_sv_pts_l10_cols].sum(axis=1)
df_player1["p_1st_sv_pts_won_l10_ws"] = df_player1[first_sv_won_l10_cols].sum(axis=1)

# Percentage of first-serve points won across the window, rounded to two decimals.
first_sv_won_share_l10 = df_player1["p_1st_sv_pts_won_l10_ws"] / df_player1["p_1st_sv_pts_l10_ws"]
df_player1["p_1st_sv_pts_won%_l10_tw_ss_IO"] = (first_sv_won_share_l10 * 100).round(2)

# Deleting the many transient columns: the four window sums plus every per-match lag column (slots 60 down to 1).
first_sv_every_slot = range(60, 0, -1)
first_sv_transient_cols = (
    ["p_1st_sv_pts_l60_ws", "p_1st_sv_pts_won_l60_ws", "p_1st_sv_pts_l10_ws", "p_1st_sv_pts_won_l10_ws"]
    + [f"p_1st_sv_pts_{slot}" for slot in first_sv_every_slot]
    + [f"p_1st_sv_pts_won_{slot}" for slot in first_sv_every_slot]
)
df_player1 = df_player1.drop(columns=first_sv_transient_cols)

In [42]:
# 'p_2nd_sv_pts_won%_l60_tw_ss'
# Surface-specific (SS) SECOND SERVE POINTS WON percentage of PLAYER over the 60 matches PRIOR TO the match being predicted
# The 'tw' (time-weighted) and 'ws' (weighted sum) tags are kept in the column names, but the sums below are plain, unweighted row sums.

# Descending sort, so within each player/surface group later dates come first and a negative shift
# pulls values from rows further down, i.e. earlier matches.
# NOTE(review): assumes 'm_date' orders chronologically when sorted -- confirm its dtype/format.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the grouping once and reuse it for all 120 lag columns instead of re-grouping per column.
second_sv_by_surface = df_player1.groupby(['p_id','t_surf'])
second_sv_pts_grouped = second_sv_by_surface['p_2nd_sv_pts']
second_sv_won_grouped = second_sv_by_surface['p_2nd_sv_pts_won']

# Lag slot 60 is the next row down in the group (shift(-1)); slot 1 is 60 rows down (shift(-60)).
# Rows with no such neighbour in their group receive NaN.
# Columns are created in the order pts_60, won_60, pts_59, won_59, ..., pts_1, won_1.
for second_sv_rows_down in range(1, 61):
    second_sv_slot = 61 - second_sv_rows_down
    df_player1[f"p_2nd_sv_pts_{second_sv_slot}"] = second_sv_pts_grouped.shift(-second_sv_rows_down)
    df_player1[f"p_2nd_sv_pts_won_{second_sv_slot}"] = second_sv_won_grouped.shift(-second_sv_rows_down)

# Row-wise sums over all 60 lag slots. Using sum skips NaN slots instead of interpolating them,
# which is preferred in metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out, so there will be very few modeled matches where this predictive feature is absent
# (there is also a ramp-up phase of multiple years where retrospective stats accrue but whose matches are not modeled).
second_sv_l60_slots = range(60, 0, -1)
second_sv_pts_l60_cols = [f"p_2nd_sv_pts_{slot}" for slot in second_sv_l60_slots]
second_sv_won_l60_cols = [f"p_2nd_sv_pts_won_{slot}" for slot in second_sv_l60_slots]

df_player1["p_2nd_sv_pts_l60_ws"] = df_player1[second_sv_pts_l60_cols].sum(axis=1)
df_player1["p_2nd_sv_pts_won_l60_ws"] = df_player1[second_sv_won_l60_cols].sum(axis=1)

# Percentage of second-serve points won across the window, rounded to two decimals.
df_player1["p_2nd_sv_pts_won%_l60_tw_ss"] = ((df_player1["p_2nd_sv_pts_won_l60_ws"]/df_player1["p_2nd_sv_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted -- tags kept in the names; the sums above are plain, unweighted sums)

In [43]:
# 'p_2nd_sv_pts_won%_l10_tw_ss'
# Surface-specific (SS) SECOND SERVE POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted
# The 'tw' (time-weighted) and 'ws' (weighted sum) tags are kept in the column names, but the sums below are plain, unweighted row sums.

# Only lag slots 60 down to 51 of the per-match lag columns feed the 10-match window.
# Using sum skips NaN slots instead of interpolating them, which is preferred in metrics of this type.
# Further note: in the modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out, so there will be very few modeled matches where this predictive feature is absent
# (there is also a ramp-up phase of multiple years where retrospective stats accrue but whose matches are not modeled).
second_sv_l10_slots = range(60, 50, -1)
second_sv_pts_l10_cols = [f"p_2nd_sv_pts_{slot}" for slot in second_sv_l10_slots]
second_sv_won_l10_cols = [f"p_2nd_sv_pts_won_{slot}" for slot in second_sv_l10_slots]

df_player1["p_2nd_sv_pts_l10_ws"] = df_player1[second_sv_pts_l10_cols].sum(axis=1)
df_player1["p_2nd_sv_pts_won_l10_ws"] = df_player1[second_sv_won_l10_cols].sum(axis=1)

# Percentage of second-serve points won across the window, rounded to two decimals.
second_sv_won_share_l10 = df_player1["p_2nd_sv_pts_won_l10_ws"] / df_player1["p_2nd_sv_pts_l10_ws"]
df_player1["p_2nd_sv_pts_won%_l10_tw_ss"] = (second_sv_won_share_l10 * 100).round(2)
#(ws = weighted sum; tw = time-weighted -- tags kept in the names; the sums above are plain, unweighted sums)

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_2nd_sv_pts_l60_ws", "p_2nd_sv_pts_won_l60_ws", "p_2nd_sv_pts_l10_ws", "p_2nd_sv_pts_won_l10_ws", "p_2nd_sv_pts_60", "p_2nd_sv_pts_59", "p_2nd_sv_pts_58", "p_2nd_sv_pts_57", "p_2nd_sv_pts_56", "p_2nd_sv_pts_55", "p_2nd_sv_pts_54", "p_2nd_sv_pts_53", "p_2nd_sv_pts_52", "p_2nd_sv_pts_51", "p_2nd_sv_pts_50", "p_2nd_sv_pts_49", "p_2nd_sv_pts_48", "p_2nd_sv_pts_47", "p_2nd_sv_pts_46", "p_2nd_sv_pts_45", "p_2nd_sv_pts_44", "p_2nd_sv_pts_43", "p_2nd_sv_pts_42", "p_2nd_sv_pts_41", "p_2nd_sv_pts_40", "p_2nd_sv_pts_39", "p_2nd_sv_pts_38", "p_2nd_sv_pts_37", "p_2nd_sv_pts_36", "p_2nd_sv_pts_35", "p_2nd_sv_pts_34", "p_2nd_sv_pts_33", "p_2nd_sv_pts_32", "p_2nd_sv_pts_31", "p_2nd_sv_pts_30", "p_2nd_sv_pts_29", "p_2nd_sv_pts_28", "p_2nd_sv_pts_27", "p_2nd_sv_pts_26", "p_2nd_sv_pts_25", "p_2nd_sv_pts_24", "p_2nd_sv_pts_23", "p_2nd_sv_pts_22", "p_2nd_sv_pts_21", "p_2nd_sv_pts_20", "p_2nd_sv_pts_19", "p_2nd_sv_pts_18", "p_2nd_sv_pts_17", "p_2nd_sv_pts_16", "p_2nd_sv_pts_15", "p_2nd_sv_pts_14", "p_2nd_sv_pts_13", "p_2nd_sv_pts_12", "p_2nd_sv_pts_11", "p_2nd_sv_pts_10", "p_2nd_sv_pts_9", "p_2nd_sv_pts_8", "p_2nd_sv_pts_7", "p_2nd_sv_pts_6", "p_2nd_sv_pts_5", "p_2nd_sv_pts_4", "p_2nd_sv_pts_3", "p_2nd_sv_pts_2", "p_2nd_sv_pts_1", "p_2nd_sv_pts_won_60", "p_2nd_sv_pts_won_59", "p_2nd_sv_pts_won_58", "p_2nd_sv_pts_won_57", "p_2nd_sv_pts_won_56", "p_2nd_sv_pts_won_55", "p_2nd_sv_pts_won_54", "p_2nd_sv_pts_won_53", "p_2nd_sv_pts_won_52", "p_2nd_sv_pts_won_51", "p_2nd_sv_pts_won_50", "p_2nd_sv_pts_won_49", "p_2nd_sv_pts_won_48", "p_2nd_sv_pts_won_47", "p_2nd_sv_pts_won_46", "p_2nd_sv_pts_won_45", "p_2nd_sv_pts_won_44", "p_2nd_sv_pts_won_43", "p_2nd_sv_pts_won_42", "p_2nd_sv_pts_won_41", "p_2nd_sv_pts_won_40", "p_2nd_sv_pts_won_39", "p_2nd_sv_pts_won_38", "p_2nd_sv_pts_won_37", "p_2nd_sv_pts_won_36", "p_2nd_sv_pts_won_35", "p_2nd_sv_pts_won_34", "p_2nd_sv_pts_won_33", "p_2nd_sv_pts_won_32", "p_2nd_sv_pts_won_31", "p_2nd_sv_pts_won_30", "p_2nd_sv_pts_won_29", 
"p_2nd_sv_pts_won_28", "p_2nd_sv_pts_won_27", "p_2nd_sv_pts_won_26", "p_2nd_sv_pts_won_25", "p_2nd_sv_pts_won_24", "p_2nd_sv_pts_won_23", "p_2nd_sv_pts_won_22", "p_2nd_sv_pts_won_21", "p_2nd_sv_pts_won_20", "p_2nd_sv_pts_won_19", "p_2nd_sv_pts_won_18", "p_2nd_sv_pts_won_17", "p_2nd_sv_pts_won_16", "p_2nd_sv_pts_won_15", "p_2nd_sv_pts_won_14", "p_2nd_sv_pts_won_13", "p_2nd_sv_pts_won_12", "p_2nd_sv_pts_won_11", "p_2nd_sv_pts_won_10", "p_2nd_sv_pts_won_9", "p_2nd_sv_pts_won_8", "p_2nd_sv_pts_won_7", "p_2nd_sv_pts_won_6", "p_2nd_sv_pts_won_5", "p_2nd_sv_pts_won_4", "p_2nd_sv_pts_won_3", "p_2nd_sv_pts_won_2", "p_2nd_sv_pts_won_1"], axis = 1)

In [44]:
# Feature built in this cell: 'p_2nd_sv_pts_won%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) SECOND SERVE POINTS WON percentage of PLAYER over the
# (up to) 60 matches PRIOR TO the match being predicted.
# Naming note: "ws" (weighted sum) and "tw" (time-weighted) are kept in the column names, but the totals computed here are plain, unweighted sums.

# Sort everything in descending order so that, inside each (player, surface, indoor/outdoor) group, the row that FOLLOWS
# a match is the one played just before it (assumes m_date values sort chronologically -- confirm upstream).
io_group_keys = ['p_id', 't_surf', 't_ind']
df_player1 = df_player1.sort_values(by=io_group_keys + ['m_date', 'm_rd_num'], ascending=False)

# Build the groupby once and reuse it for every lag instead of re-creating it for each of the 120 lag columns.
io_history = df_player1.groupby(io_group_keys)[['p_2nd_sv_pts', 'p_2nd_sv_pts_won']]

# Lag slot k holds the stats from (61 - k) rows further down within the group:
# slot 60 = shift(-1) (the immediately preceding match) ... slot 1 = shift(-60) (the 60th match back).
# Rows without that much history get NaN in the corresponding slots.
io_lag_columns = {}
for slot in range(60, 0, -1):
    io_lagged = io_history.shift(-(61 - slot))
    io_lag_columns[f"p_2nd_sv_pts_{slot}"] = io_lagged['p_2nd_sv_pts']
    io_lag_columns[f"p_2nd_sv_pts_won_{slot}"] = io_lagged['p_2nd_sv_pts_won']

# Attach the lag columns in the same order as before (pts_60, won_60, pts_59, won_59, ...); existing columns of the same name are overwritten.
df_player1 = df_player1.assign(**io_lag_columns)

# Row-wise sum() skips NaN, so missing lag slots are ignored rather than interpolated, which is preferred for metrics of this type.
# Further note from the modeling plan: matches between players with few prior matches on a given surface are filtered out at the
# modeling stage, and the multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
io_l60_pts_cols = [f"p_2nd_sv_pts_{slot}" for slot in range(60, 0, -1)]
io_l60_won_cols = [f"p_2nd_sv_pts_won_{slot}" for slot in range(60, 0, -1)]
df_player1["p_2nd_sv_pts_l60_ws"] = df_player1[io_l60_pts_cols].sum(axis=1)
df_player1["p_2nd_sv_pts_won_l60_ws"] = df_player1[io_l60_won_cols].sum(axis=1)

# Percentage of second-serve points won over the window, rounded to 2 decimals; rows with no history give 0/0 -> NaN.
df_player1["p_2nd_sv_pts_won%_l60_tw_ss_IO"] = ((df_player1["p_2nd_sv_pts_won_l60_ws"]/df_player1["p_2nd_sv_pts_l60_ws"])*100).round(2)
# The per-slot lag columns and the two *_l60_ws totals are intentionally left in place: the following l10 cell sums slots 60..51 and then drops all of them.

In [45]:
# Feature built in this cell: 'p_2nd_sv_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) SECOND SERVE POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted.
# Naming note: "ws" (weighted sum) and "tw" (time-weighted) are kept in the column names, but the totals computed here are plain, unweighted sums.

# Lag slots 60..51 are the ten matches immediately preceding the current one within the (player, surface, indoor/outdoor)
# group, as laid out by the l60 IO cell (slot 60 = shift(-1) ... slot 51 = shift(-10)).
# Slots 60..1 are all removed again at the bottom of this cell.
io_l10_slots = range(60, 50, -1)
io_all_slots = range(60, 0, -1)
io_l10_pts_cols = [f"p_2nd_sv_pts_{slot}" for slot in io_l10_slots]
io_l10_won_cols = [f"p_2nd_sv_pts_won_{slot}" for slot in io_l10_slots]

# Row-wise sum() skips NaN, so missing lag slots are ignored rather than interpolated, which is preferred for metrics of this type.
# Further note from the modeling plan: matches between players with few prior matches on a given surface are filtered out at the
# modeling stage, and the multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
io_l10_pts_total = df_player1[io_l10_pts_cols].sum(axis=1)
df_player1["p_2nd_sv_pts_l10_ws"] = io_l10_pts_total
io_l10_won_total = df_player1[io_l10_won_cols].sum(axis=1)
df_player1["p_2nd_sv_pts_won_l10_ws"] = io_l10_won_total
df_player1["p_2nd_sv_pts_won%_l10_tw_ss_IO"] = ((io_l10_won_total / io_l10_pts_total) * 100).round(2)

# Remove the transient columns: the four l60/l10 totals plus every per-slot lag column.
io_transient_cols = (
    ["p_2nd_sv_pts_l60_ws", "p_2nd_sv_pts_won_l60_ws", "p_2nd_sv_pts_l10_ws", "p_2nd_sv_pts_won_l10_ws"]
    + [f"p_2nd_sv_pts_{slot}" for slot in io_all_slots]
    + [f"p_2nd_sv_pts_won_{slot}" for slot in io_all_slots]
)
df_player1 = df_player1.drop(columns=io_transient_cols)

In [46]:
# 'p_ret_pts_won%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

df_player1["p_ret_pts_60"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-1)
df_player1["p_ret_pts_won_60"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-1)

df_player1["p_ret_pts_59"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-2)
df_player1["p_ret_pts_won_59"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-2)

df_player1["p_ret_pts_58"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-3)
df_player1["p_ret_pts_won_58"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-3)

df_player1["p_ret_pts_57"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-4)
df_player1["p_ret_pts_won_57"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-4)

df_player1["p_ret_pts_56"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-5)
df_player1["p_ret_pts_won_56"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-5)

df_player1["p_ret_pts_55"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-6)
df_player1["p_ret_pts_won_55"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-6)

df_player1["p_ret_pts_54"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-7)
df_player1["p_ret_pts_won_54"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-7)

df_player1["p_ret_pts_53"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-8)
df_player1["p_ret_pts_won_53"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-8)

df_player1["p_ret_pts_52"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-9)
df_player1["p_ret_pts_won_52"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-9)

df_player1["p_ret_pts_51"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-10)
df_player1["p_ret_pts_won_51"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-10)

df_player1["p_ret_pts_50"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-11)
df_player1["p_ret_pts_won_50"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-11)

df_player1["p_ret_pts_49"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-12)
df_player1["p_ret_pts_won_49"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-12)

df_player1["p_ret_pts_48"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-13)
df_player1["p_ret_pts_won_48"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-13)

df_player1["p_ret_pts_47"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-14)
df_player1["p_ret_pts_won_47"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-14)

df_player1["p_ret_pts_46"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-15)
df_player1["p_ret_pts_won_46"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-15)

df_player1["p_ret_pts_45"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-16)
df_player1["p_ret_pts_won_45"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-16)

df_player1["p_ret_pts_44"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-17)
df_player1["p_ret_pts_won_44"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-17)

df_player1["p_ret_pts_43"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-18)
df_player1["p_ret_pts_won_43"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-18)

df_player1["p_ret_pts_42"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-19)
df_player1["p_ret_pts_won_42"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-19)

df_player1["p_ret_pts_41"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-20)
df_player1["p_ret_pts_won_41"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-20)

df_player1["p_ret_pts_40"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-21)
df_player1["p_ret_pts_won_40"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-21)

df_player1["p_ret_pts_39"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-22)
df_player1["p_ret_pts_won_39"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-22)

df_player1["p_ret_pts_38"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-23)
df_player1["p_ret_pts_won_38"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-23)

df_player1["p_ret_pts_37"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-24)
df_player1["p_ret_pts_won_37"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-24)

df_player1["p_ret_pts_36"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-25)
df_player1["p_ret_pts_won_36"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-25)

df_player1["p_ret_pts_35"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-26)
df_player1["p_ret_pts_won_35"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won'].shift(-26)

# Remaining surface-specific lag columns for return points played / won.
# Slot numbering runs opposite to the shift distance (slot = 61 - shift), so this loop
# fills slots 34 down to 1 with shifts of -27 down to -60 inside each (p_id, t_surf) group.
# NOTE(review): slots 60..35 are expected to have been created earlier in this cell - confirm.
ret_pts_by_surface = df_player1.groupby(['p_id','t_surf'])['p_ret_pts']
ret_pts_won_by_surface = df_player1.groupby(['p_id','t_surf'])['p_ret_pts_won']

for ret_shift in range(27, 61):
    ret_slot = 61 - ret_shift
    df_player1[f"p_ret_pts_{ret_slot}"] = ret_pts_by_surface.shift(-ret_shift)
    df_player1[f"p_ret_pts_won_{ret_slot}"] = ret_pts_won_by_surface.shift(-ret_shift)

# Row-wise .sum() skips NaN slots (no interpolation), which is the preferred treatment for this kind of metric.
# Author's note: matches between players with few prior matches on a surface are filtered out at the modeling
# stage, and the multi-year ramp-up period is not modeled, so this feature should rarely be absent there.
# The "ws"/"tw" tags are kept in the column names, but the sums below apply no per-match weights.
ret_pts_l60_cols = [f"p_ret_pts_{ret_slot}" for ret_slot in range(60, 0, -1)]
ret_pts_won_l60_cols = [f"p_ret_pts_won_{ret_slot}" for ret_slot in range(60, 0, -1)]

df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_l60_cols].sum(axis=1)
df_player1["p_ret_pts_won_l60_ws"] = df_player1[ret_pts_won_l60_cols].sum(axis=1)

# Percentage of return points won across all 60 lag slots, rounded to 2 decimals
ret_won_share_l60 = df_player1["p_ret_pts_won_l60_ws"] / df_player1["p_ret_pts_l60_ws"]
df_player1["p_ret_pts_won%_l60_tw_ss"] = (ret_won_share_l60 * 100).round(2)
#(ws = weighted sum; tw = time-weighted - naming convention only in this notebook)

In [47]:
# 'p_ret_pts_won%_l10_tw_ss'
# Surface-specific (SS) RETURN POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted.
# Reuses the lag columns still present from the l60 feature: slots 60..51 only.
# NOTE(review): with the slot = 61 - shift numbering, slots 60..51 should be the ten nearest prior matches - confirm.

# Row-wise .sum() skips NaN slots (no interpolation), which is the preferred treatment for this kind of metric.
# Author's note: matches between players with few prior matches on a surface are filtered out at the modeling
# stage, and the multi-year ramp-up period is not modeled, so this feature should rarely be absent there.
# The "ws"/"tw" tags are kept in the column names, but the sums below apply no per-match weights.
ret_pts_l10_cols = [f"p_ret_pts_{ret_slot}" for ret_slot in range(60, 50, -1)]
ret_pts_won_l10_cols = [f"p_ret_pts_won_{ret_slot}" for ret_slot in range(60, 50, -1)]

df_player1["p_ret_pts_l10_ws"] = df_player1[ret_pts_l10_cols].sum(axis=1)
df_player1["p_ret_pts_won_l10_ws"] = df_player1[ret_pts_won_l10_cols].sum(axis=1)

ret_won_share_l10 = df_player1["p_ret_pts_won_l10_ws"] / df_player1["p_ret_pts_l10_ws"]
df_player1["p_ret_pts_won%_l10_tw_ss"] = (ret_won_share_l10 * 100).round(2)
#(ws = weighted sum; tw = time-weighted - naming convention only in this notebook)

# Remove the helper columns: the four intermediate sums plus all 60 lag slots
# for both return points played and return points won.
ret_transient_cols = (
    ["p_ret_pts_l60_ws", "p_ret_pts_won_l60_ws", "p_ret_pts_l10_ws", "p_ret_pts_won_l10_ws"]
    + [f"p_ret_pts_{ret_slot}" for ret_slot in range(60, 0, -1)]
    + [f"p_ret_pts_won_{ret_slot}" for ret_slot in range(60, 0, -1)]
)
df_player1 = df_player1.drop(columns=ret_transient_cols)

In [48]:
# 'p_ret_pts_won%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific RETURN POINTS WON performance of PLAYER
# over the 60 matches PRIOR TO the match being predicted.
# This part builds the 60 per-match lag columns; the "tw" tag in the feature name is a naming
# convention only, since no per-match weights are applied in this notebook.

ret_io_group_keys = ['p_id','t_surf','t_ind']

# Descending sort within each group, so a negative shift reaches rows further down the sort order.
df_player1 = df_player1.sort_values(by=ret_io_group_keys + ['m_date','m_rd_num'], ascending = False)

ret_pts_by_surface_io = df_player1.groupby(ret_io_group_keys)['p_ret_pts']
ret_pts_won_by_surface_io = df_player1.groupby(ret_io_group_keys)['p_ret_pts_won']

# Slot numbering runs opposite to the shift distance: slot 60 <- shift(-1), ..., slot 1 <- shift(-60).
for ret_shift in range(1, 61):
    ret_slot = 61 - ret_shift
    df_player1[f"p_ret_pts_{ret_slot}"] = ret_pts_by_surface_io.shift(-ret_shift)
    df_player1[f"p_ret_pts_won_{ret_slot}"] = ret_pts_won_by_surface_io.shift(-ret_shift)

# Row-wise .sum() skips NaN slots (no interpolation), which is the preferred treatment for this kind of metric.
# Author's note: matches between players with few prior matches on a surface are filtered out at the modeling
# stage, and the multi-year ramp-up period is not modeled, so this feature should rarely be absent there.
# The "ws"/"tw" tags are kept in the column names, but the sums below apply no per-match weights.
ret_pts_l60_io_cols = [f"p_ret_pts_{ret_slot}" for ret_slot in range(60, 0, -1)]
ret_pts_won_l60_io_cols = [f"p_ret_pts_won_{ret_slot}" for ret_slot in range(60, 0, -1)]

df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_l60_io_cols].sum(axis=1)
df_player1["p_ret_pts_won_l60_ws"] = df_player1[ret_pts_won_l60_io_cols].sum(axis=1)

# Percentage of return points won across all 60 lag slots, rounded to 2 decimals
ret_won_share_l60_io = df_player1["p_ret_pts_won_l60_ws"] / df_player1["p_ret_pts_l60_ws"]
df_player1["p_ret_pts_won%_l60_tw_ss_IO"] = (ret_won_share_l60_io * 100).round(2)
#(ws = weighted sum; tw = time-weighted - naming convention only in this notebook)

In [49]:
# 'p_ret_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific RETURN POINTS WON percentage of PLAYER
# over the 10 matches PRIOR TO the match being predicted.
# Reuses the lag columns still present from the l60 IO feature: slots 60..51, i.e. shifts -1..-10.

# Row-wise .sum() skips NaN slots (no interpolation), which is the preferred treatment for this kind of metric.
# Author's note: matches between players with few prior matches on a surface are filtered out at the modeling
# stage, and the multi-year ramp-up period is not modeled, so this feature should rarely be absent there.
# The "ws"/"tw" tags are kept in the column names, but the sums below apply no per-match weights.
ret_pts_l10_io_cols = [f"p_ret_pts_{ret_slot}" for ret_slot in range(60, 50, -1)]
ret_pts_won_l10_io_cols = [f"p_ret_pts_won_{ret_slot}" for ret_slot in range(60, 50, -1)]

df_player1["p_ret_pts_l10_ws"] = df_player1[ret_pts_l10_io_cols].sum(axis=1)
df_player1["p_ret_pts_won_l10_ws"] = df_player1[ret_pts_won_l10_io_cols].sum(axis=1)

ret_won_share_l10_io = df_player1["p_ret_pts_won_l10_ws"] / df_player1["p_ret_pts_l10_ws"]
df_player1["p_ret_pts_won%_l10_tw_ss_IO"] = (ret_won_share_l10_io * 100).round(2)
#(ws = weighted sum; tw = time-weighted - naming convention only in this notebook)

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_ret_pts_l60_ws", "p_ret_pts_won_l60_ws", "p_ret_pts_l10_ws", "p_ret_pts_won_l10_ws", "p_ret_pts_60", "p_ret_pts_59", "p_ret_pts_58", "p_ret_pts_57", "p_ret_pts_56", "p_ret_pts_55", "p_ret_pts_54", "p_ret_pts_53", "p_ret_pts_52", "p_ret_pts_51", "p_ret_pts_50", "p_ret_pts_49", "p_ret_pts_48", "p_ret_pts_47", "p_ret_pts_46", "p_ret_pts_45", "p_ret_pts_44", "p_ret_pts_43", "p_ret_pts_42", "p_ret_pts_41", "p_ret_pts_40", "p_ret_pts_39", "p_ret_pts_38", "p_ret_pts_37", "p_ret_pts_36", "p_ret_pts_35", "p_ret_pts_34", "p_ret_pts_33", "p_ret_pts_32", "p_ret_pts_31", "p_ret_pts_30", "p_ret_pts_29", "p_ret_pts_28", "p_ret_pts_27", "p_ret_pts_26", "p_ret_pts_25", "p_ret_pts_24", "p_ret_pts_23", "p_ret_pts_22", "p_ret_pts_21", "p_ret_pts_20", "p_ret_pts_19", "p_ret_pts_18", "p_ret_pts_17", "p_ret_pts_16", "p_ret_pts_15", "p_ret_pts_14", "p_ret_pts_13", "p_ret_pts_12", "p_ret_pts_11", "p_ret_pts_10", "p_ret_pts_9", "p_ret_pts_8", "p_ret_pts_7", "p_ret_pts_6", "p_ret_pts_5", "p_ret_pts_4", "p_ret_pts_3", "p_ret_pts_2", "p_ret_pts_1", "p_ret_pts_won_60", "p_ret_pts_won_59", "p_ret_pts_won_58", "p_ret_pts_won_57", "p_ret_pts_won_56", "p_ret_pts_won_55", "p_ret_pts_won_54", "p_ret_pts_won_53", "p_ret_pts_won_52", "p_ret_pts_won_51", "p_ret_pts_won_50", "p_ret_pts_won_49", "p_ret_pts_won_48", "p_ret_pts_won_47", "p_ret_pts_won_46", "p_ret_pts_won_45", "p_ret_pts_won_44", "p_ret_pts_won_43", "p_ret_pts_won_42", "p_ret_pts_won_41", "p_ret_pts_won_40", "p_ret_pts_won_39", "p_ret_pts_won_38", "p_ret_pts_won_37", "p_ret_pts_won_36", "p_ret_pts_won_35", "p_ret_pts_won_34", "p_ret_pts_won_33", "p_ret_pts_won_32", "p_ret_pts_won_31", "p_ret_pts_won_30", "p_ret_pts_won_29", "p_ret_pts_won_28", "p_ret_pts_won_27", "p_ret_pts_won_26", "p_ret_pts_won_25", "p_ret_pts_won_24", "p_ret_pts_won_23", "p_ret_pts_won_22", "p_ret_pts_won_21", "p_ret_pts_won_20", "p_ret_pts_won_19", "p_ret_pts_won_18", "p_ret_pts_won_17", "p_ret_pts_won_16", "p_ret_pts_won_15", 
"p_ret_pts_won_14", "p_ret_pts_won_13", "p_ret_pts_won_12", "p_ret_pts_won_11", "p_ret_pts_won_10", "p_ret_pts_won_9", "p_ret_pts_won_8", "p_ret_pts_won_7", "p_ret_pts_won_6", "p_ret_pts_won_5", "p_ret_pts_won_4", "p_ret_pts_won_3", "p_ret_pts_won_2", "p_ret_pts_won_1"], axis = 1)

In [50]:
# 'p_1st_ret_pts_won%_l60_tw_ss'
# Surface-specific (SS) FIRST (SERVE) RETURN POINTS WON percentage of PLAYER over up to 60 matches
# PRIOR TO the match being predicted. The feature name keeps the time-weighted (TW) tag, but in
# this cell every prior match enters the totals through a plain, unweighted sum.

first_ret_surface_keys = ['p_id', 't_surf']
first_ret_n_prior = 60

# Descending sort: within each (player, surface) group the rows run from the latest m_date /
# m_rd_num to the earliest, so a negative shift reads values from rows further down the group.
# NOTE(review): assumes m_date values sort chronologically as stored -- confirm.
df_player1 = df_player1.sort_values(by=first_ret_surface_keys + ['m_date', 'm_rd_num'], ascending=False)

# Group once and reuse the grouped series for every lag (previously regrouped for each of the
# 120 lag columns).
first_ret_by_surface = df_player1.groupby(first_ret_surface_keys)
first_ret_faced_grouped = first_ret_by_surface['p_1st_ret_pts']
first_ret_won_grouped = first_ret_by_surface['p_1st_ret_pts_won']

# Slot 60 holds shift(-1), slot 59 holds shift(-2), ..., slot 1 holds shift(-60). Columns are
# created in the same order as before: points faced, then points won, from slot 60 down to slot 1.
for first_ret_lag in range(1, first_ret_n_prior + 1):
    first_ret_slot = first_ret_n_prior + 1 - first_ret_lag
    df_player1[f"p_1st_ret_pts_{first_ret_slot}"] = first_ret_faced_grouped.shift(-first_ret_lag)
    df_player1[f"p_1st_ret_pts_won_{first_ret_slot}"] = first_ret_won_grouped.shift(-first_ret_lag)

# Summing row-wise skips NaN slots instead of interpolating them, which is preferred for this
# kind of metric. Author's note: matches between players with few prior matches on a surface are
# filtered out at the modeling stage, and the first years only serve as a ramp-up for
# retrospective stats, so the feature should rarely be missing in the modeled matches.
first_ret_faced_cols = [f"p_1st_ret_pts_{slot}" for slot in range(first_ret_n_prior, 0, -1)]
first_ret_won_cols = [f"p_1st_ret_pts_won_{slot}" for slot in range(first_ret_n_prior, 0, -1)]

df_player1["p_1st_ret_pts_l60_ws"] = df_player1[first_ret_faced_cols].sum(axis=1)
df_player1["p_1st_ret_pts_won_l60_ws"] = df_player1[first_ret_won_cols].sum(axis=1)

# Percentage of first-serve return points won over the window, rounded to two decimals.
df_player1["p_1st_ret_pts_won%_l60_tw_ss"] = ((df_player1["p_1st_ret_pts_won_l60_ws"]/df_player1["p_1st_ret_pts_l60_ws"])*100).round(2)
# (ws = weighted sum; tw = time-weighted -- suffixes kept for naming consistency)

In [51]:
# Feature: 'p_1st_ret_pts_won%_l10_tw_ss'
# Surface-specific (SS) FIRST (SERVE) RETURN POINTS WON percentage of PLAYER over the 10 matches
# PRIOR TO the match being predicted. The name keeps the time-weighted (TW) tag, but the totals
# below are plain sums.

# Slots 60..51 hold the shift(-1)..shift(-10) values built in the preceding cell, i.e. the ten
# nearest lag slots within each (p_id, t_surf) group.
# Summing row-wise skips NaN slots instead of interpolating them, which is preferred for this
# kind of metric. Author's note: matches between players with few prior matches on a surface are
# filtered out at the modeling stage, and the first years only serve as a ramp-up for
# retrospective stats, so the feature should rarely be missing in the modeled matches.
first_ret_slots_l10 = range(60, 50, -1)
first_ret_faced_l10_cols = [f"p_1st_ret_pts_{slot}" for slot in first_ret_slots_l10]
first_ret_won_l10_cols = [f"p_1st_ret_pts_won_{slot}" for slot in first_ret_slots_l10]

df_player1["p_1st_ret_pts_l10_ws"] = df_player1[first_ret_faced_l10_cols].sum(axis=1)
df_player1["p_1st_ret_pts_won_l10_ws"] = df_player1[first_ret_won_l10_cols].sum(axis=1)

first_ret_won_share_l10 = df_player1["p_1st_ret_pts_won_l10_ws"].div(df_player1["p_1st_ret_pts_l10_ws"])
df_player1["p_1st_ret_pts_won%_l10_tw_ss"] = first_ret_won_share_l10.mul(100).round(2)
# (ws = weighted sum; tw = time-weighted)

# Remove the transient columns: the four intermediate totals plus all 60 lag slots of each stat.
first_ret_transient_cols = (
    ["p_1st_ret_pts_l60_ws", "p_1st_ret_pts_won_l60_ws", "p_1st_ret_pts_l10_ws", "p_1st_ret_pts_won_l10_ws"]
    + [f"p_1st_ret_pts_{slot}" for slot in range(60, 0, -1)]
    + [f"p_1st_ret_pts_won_{slot}" for slot in range(60, 0, -1)]
)
df_player1 = df_player1.drop(columns=first_ret_transient_cols)

In [52]:
# 'p_1st_ret_pts_won%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean FIRST (SERVE) RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

df_player1["p_1st_ret_pts_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-1)
df_player1["p_1st_ret_pts_won_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-1)

df_player1["p_1st_ret_pts_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-2)
df_player1["p_1st_ret_pts_won_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-2)

df_player1["p_1st_ret_pts_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-3)
df_player1["p_1st_ret_pts_won_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-3)

df_player1["p_1st_ret_pts_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-4)
df_player1["p_1st_ret_pts_won_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-4)

df_player1["p_1st_ret_pts_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-5)
df_player1["p_1st_ret_pts_won_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-5)

df_player1["p_1st_ret_pts_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-6)
df_player1["p_1st_ret_pts_won_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-6)

df_player1["p_1st_ret_pts_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-7)
df_player1["p_1st_ret_pts_won_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-7)

df_player1["p_1st_ret_pts_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-8)
df_player1["p_1st_ret_pts_won_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-8)

df_player1["p_1st_ret_pts_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-9)
df_player1["p_1st_ret_pts_won_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-9)

df_player1["p_1st_ret_pts_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-10)
df_player1["p_1st_ret_pts_won_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-10)

df_player1["p_1st_ret_pts_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-11)
df_player1["p_1st_ret_pts_won_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-11)

df_player1["p_1st_ret_pts_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-12)
df_player1["p_1st_ret_pts_won_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-12)

df_player1["p_1st_ret_pts_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-13)
df_player1["p_1st_ret_pts_won_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-13)

df_player1["p_1st_ret_pts_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-14)
df_player1["p_1st_ret_pts_won_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-14)

df_player1["p_1st_ret_pts_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-15)
df_player1["p_1st_ret_pts_won_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-15)

df_player1["p_1st_ret_pts_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-16)
df_player1["p_1st_ret_pts_won_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-16)

df_player1["p_1st_ret_pts_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-17)
df_player1["p_1st_ret_pts_won_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-17)

df_player1["p_1st_ret_pts_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-18)
df_player1["p_1st_ret_pts_won_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-18)

df_player1["p_1st_ret_pts_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-19)
df_player1["p_1st_ret_pts_won_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-19)

df_player1["p_1st_ret_pts_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-20)
df_player1["p_1st_ret_pts_won_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-20)

df_player1["p_1st_ret_pts_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-21)
df_player1["p_1st_ret_pts_won_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-21)

df_player1["p_1st_ret_pts_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-22)
df_player1["p_1st_ret_pts_won_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-22)

df_player1["p_1st_ret_pts_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-23)
df_player1["p_1st_ret_pts_won_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-23)

df_player1["p_1st_ret_pts_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-24)
df_player1["p_1st_ret_pts_won_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-24)

df_player1["p_1st_ret_pts_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-25)
df_player1["p_1st_ret_pts_won_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-25)

df_player1["p_1st_ret_pts_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts'].shift(-26)
df_player1["p_1st_ret_pts_won_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won'].shift(-26)

# Remaining lag columns for first-serve return points: lags 27..60 -> column suffixes 34..1 (suffix = 61 - lag).
# shift(-lag) reads the value `lag` rows further down inside the same (p_id, t_surf, t_ind) group.
# NOTE(review): assumes df_player1 is already sorted newest-first within each group, so that "further down"
# means "earlier match" -- that sort is outside this excerpt; confirm.
# The grouped selections are built once and reused instead of re-grouping for every lag column.
ret1_pts_by_psi = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts']
ret1_won_by_psi = df_player1.groupby(['p_id','t_surf','t_ind'])['p_1st_ret_pts_won']

for ret1_lag in range(27, 61):
    ret1_sfx = 61 - ret1_lag
    df_player1[f"p_1st_ret_pts_{ret1_sfx}"] = ret1_pts_by_psi.shift(-ret1_lag)
    df_player1[f"p_1st_ret_pts_won_{ret1_sfx}"] = ret1_won_by_psi.shift(-ret1_lag)

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type.
#As a further note, in the modeling stage, matches between players with few matches previously played on a given
#surface will be filtered out. So there will be very few matches in the modeling phase where this predictive
#feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats
#accrue but matches from those years will not be used in the modeling phase.)
# Columns _60 .. _35 are expected to exist already (they are not created in the loop above).
ret1_l60_pts_cols = [f"p_1st_ret_pts_{sfx}" for sfx in range(60, 0, -1)]
ret1_l60_won_cols = [f"p_1st_ret_pts_won_{sfx}" for sfx in range(60, 0, -1)]
df_player1["p_1st_ret_pts_l60_ws"] = df_player1[ret1_l60_pts_cols].sum(axis=1)
df_player1["p_1st_ret_pts_won_l60_ws"] = df_player1[ret1_l60_won_cols].sum(axis=1)
# Pooled percentage: total points won / total points played over the window, in percent, 2 decimals.
df_player1["p_1st_ret_pts_won%_l60_tw_ss_IO"] = ((df_player1["p_1st_ret_pts_won_l60_ws"]/df_player1["p_1st_ret_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted -- naming tokens only: the sums above are plain, no weights are applied here)

In [53]:
# 'p_1st_ret_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific FIRST (SERVE) RETURN POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted
# (tw = time-weighted, ws = weighted sum are naming tokens; the sums in this cell are plain and apply no weights)

# The 10-match window is built from the lag columns with suffixes 60 down to 51.
ret1_l10_pts_cols = [f"p_1st_ret_pts_{sfx}" for sfx in range(60, 50, -1)]
ret1_l10_won_cols = [f"p_1st_ret_pts_won_{sfx}" for sfx in range(60, 50, -1)]

# sum(axis=1) ignores NaN rather than interpolating, which is preferred for metrics of this type. Matches between
# players with few prior matches on a given surface are filtered out at the modeling stage, and the first years
# only serve as a ramp-up for retrospective stats, so this feature will rarely be absent in the modeling phase.
df_player1["p_1st_ret_pts_l10_ws"] = df_player1[ret1_l10_pts_cols].sum(axis=1)
df_player1["p_1st_ret_pts_won_l10_ws"] = df_player1[ret1_l10_won_cols].sum(axis=1)
ret1_l10_won_share = df_player1["p_1st_ret_pts_won_l10_ws"] / df_player1["p_1st_ret_pts_l10_ws"]
df_player1["p_1st_ret_pts_won%_l10_tw_ss_IO"] = (ret1_l10_won_share * 100).round(2)

# Deleting the many transient columns: the four intermediate sums plus all 60 lag columns of each statistic
ret1_transient_cols = (
    ["p_1st_ret_pts_l60_ws", "p_1st_ret_pts_won_l60_ws", "p_1st_ret_pts_l10_ws", "p_1st_ret_pts_won_l10_ws"]
    + [f"p_1st_ret_pts_{sfx}" for sfx in range(60, 0, -1)]
    + [f"p_1st_ret_pts_won_{sfx}" for sfx in range(60, 0, -1)]
)
df_player1 = df_player1.drop(ret1_transient_cols, axis = 1)

In [54]:
# 'p_2nd_ret_pts_won%_l60_tw_ss'
# Surface-specific (SS) SECOND (SERVE) RETURN POINTS WON percentage of PLAYER over the 60 matches PRIOR TO the match being predicted
# NOTE: in this no-decay workbook the sums below are plain (every prior match counts equally); the "tw"/"ws"
# tokens remain in the column names -- presumably to stay aligned with the time-weighted workbook.

# Newest match first within each player/surface, so shift(-k) below reads the k-th row further down,
# i.e. the k-th most recent earlier match in the same (p_id, t_surf) group.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Group once per statistic and reuse the grouped selection for all 60 lags.
ret2_pts_by_ps = df_player1.groupby(['p_id','t_surf'])['p_2nd_ret_pts']
ret2_won_by_ps = df_player1.groupby(['p_id','t_surf'])['p_2nd_ret_pts_won']

# Column suffix = 61 - lag: suffix 60 holds the most recent prior match (lag 1), suffix 1 the 60th (lag 60).
for ret2_lag in range(1, 61):
    ret2_sfx = 61 - ret2_lag
    df_player1[f"p_2nd_ret_pts_{ret2_sfx}"] = ret2_pts_by_ps.shift(-ret2_lag)
    df_player1[f"p_2nd_ret_pts_won_{ret2_sfx}"] = ret2_won_by_ps.shift(-ret2_lag)

#Using sum function allows ignoring NaN instead of interpolation, which is preferred in metrics of this type.
#As a further note, in the modeling stage, matches between players with few matches previously played on a given
#surface will be filtered out. So there will be very few matches in the modeling phase where this predictive
#feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats
#accrue but matches from those years will not be used in the modeling phase.)
ret2_l60_pts_cols = [f"p_2nd_ret_pts_{sfx}" for sfx in range(60, 0, -1)]
ret2_l60_won_cols = [f"p_2nd_ret_pts_won_{sfx}" for sfx in range(60, 0, -1)]
df_player1["p_2nd_ret_pts_l60_ws"] = df_player1[ret2_l60_pts_cols].sum(axis=1)
df_player1["p_2nd_ret_pts_won_l60_ws"] = df_player1[ret2_l60_won_cols].sum(axis=1)
# Pooled percentage: total points won / total points played over the window, in percent, 2 decimals.
df_player1["p_2nd_ret_pts_won%_l60_tw_ss"] = ((df_player1["p_2nd_ret_pts_won_l60_ws"]/df_player1["p_2nd_ret_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted -- naming tokens only: the sums above are plain, no weights are applied here)

In [55]:
# 'p_2nd_ret_pts_won%_l10_tw_ss'
# Surface-specific (SS) SECOND (SERVE) RETURN POINTS WON percentage of PLAYER over the 10 matches PRIOR TO the match being predicted
# (tw = time-weighted, ws = weighted sum are naming tokens; the sums in this cell are plain and apply no weights)

# The 10-match window is built from the lag columns with suffixes 60 down to 51.
ret2_l10_pts_cols = [f"p_2nd_ret_pts_{sfx}" for sfx in range(60, 50, -1)]
ret2_l10_won_cols = [f"p_2nd_ret_pts_won_{sfx}" for sfx in range(60, 50, -1)]

# sum(axis=1) ignores NaN rather than interpolating, which is preferred for metrics of this type. Matches between
# players with few prior matches on a given surface are filtered out at the modeling stage, and the first years
# only serve as a ramp-up for retrospective stats, so this feature will rarely be absent in the modeling phase.
df_player1["p_2nd_ret_pts_l10_ws"] = df_player1[ret2_l10_pts_cols].sum(axis=1)
df_player1["p_2nd_ret_pts_won_l10_ws"] = df_player1[ret2_l10_won_cols].sum(axis=1)
ret2_l10_won_share = df_player1["p_2nd_ret_pts_won_l10_ws"] / df_player1["p_2nd_ret_pts_l10_ws"]
df_player1["p_2nd_ret_pts_won%_l10_tw_ss"] = (ret2_l10_won_share * 100).round(2)

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_2nd_ret_pts_l60_ws", "p_2nd_ret_pts_won_l60_ws", "p_2nd_ret_pts_l10_ws", "p_2nd_ret_pts_won_l10_ws", "p_2nd_ret_pts_60", "p_2nd_ret_pts_59", "p_2nd_ret_pts_58", "p_2nd_ret_pts_57", "p_2nd_ret_pts_56", "p_2nd_ret_pts_55", "p_2nd_ret_pts_54", "p_2nd_ret_pts_53", "p_2nd_ret_pts_52", "p_2nd_ret_pts_51", "p_2nd_ret_pts_50", "p_2nd_ret_pts_49", "p_2nd_ret_pts_48", "p_2nd_ret_pts_47", "p_2nd_ret_pts_46", "p_2nd_ret_pts_45", "p_2nd_ret_pts_44", "p_2nd_ret_pts_43", "p_2nd_ret_pts_42", "p_2nd_ret_pts_41", "p_2nd_ret_pts_40", "p_2nd_ret_pts_39", "p_2nd_ret_pts_38", "p_2nd_ret_pts_37", "p_2nd_ret_pts_36", "p_2nd_ret_pts_35", "p_2nd_ret_pts_34", "p_2nd_ret_pts_33", "p_2nd_ret_pts_32", "p_2nd_ret_pts_31", "p_2nd_ret_pts_30", "p_2nd_ret_pts_29", "p_2nd_ret_pts_28", "p_2nd_ret_pts_27", "p_2nd_ret_pts_26", "p_2nd_ret_pts_25", "p_2nd_ret_pts_24", "p_2nd_ret_pts_23", "p_2nd_ret_pts_22", "p_2nd_ret_pts_21", "p_2nd_ret_pts_20", "p_2nd_ret_pts_19", "p_2nd_ret_pts_18", "p_2nd_ret_pts_17", "p_2nd_ret_pts_16", "p_2nd_ret_pts_15", "p_2nd_ret_pts_14", "p_2nd_ret_pts_13", "p_2nd_ret_pts_12", "p_2nd_ret_pts_11", "p_2nd_ret_pts_10", "p_2nd_ret_pts_9", "p_2nd_ret_pts_8", "p_2nd_ret_pts_7", "p_2nd_ret_pts_6", "p_2nd_ret_pts_5", "p_2nd_ret_pts_4", "p_2nd_ret_pts_3", "p_2nd_ret_pts_2", "p_2nd_ret_pts_1", "p_2nd_ret_pts_won_60", "p_2nd_ret_pts_won_59", "p_2nd_ret_pts_won_58", "p_2nd_ret_pts_won_57", "p_2nd_ret_pts_won_56", "p_2nd_ret_pts_won_55", "p_2nd_ret_pts_won_54", "p_2nd_ret_pts_won_53", "p_2nd_ret_pts_won_52", "p_2nd_ret_pts_won_51", "p_2nd_ret_pts_won_50", "p_2nd_ret_pts_won_49", "p_2nd_ret_pts_won_48", "p_2nd_ret_pts_won_47", "p_2nd_ret_pts_won_46", "p_2nd_ret_pts_won_45", "p_2nd_ret_pts_won_44", "p_2nd_ret_pts_won_43", "p_2nd_ret_pts_won_42", "p_2nd_ret_pts_won_41", "p_2nd_ret_pts_won_40", "p_2nd_ret_pts_won_39", "p_2nd_ret_pts_won_38", "p_2nd_ret_pts_won_37", "p_2nd_ret_pts_won_36", "p_2nd_ret_pts_won_35", "p_2nd_ret_pts_won_34", "p_2nd_ret_pts_won_33", 
"p_2nd_ret_pts_won_32", "p_2nd_ret_pts_won_31", "p_2nd_ret_pts_won_30", "p_2nd_ret_pts_won_29", "p_2nd_ret_pts_won_28", "p_2nd_ret_pts_won_27", "p_2nd_ret_pts_won_26", "p_2nd_ret_pts_won_25", "p_2nd_ret_pts_won_24", "p_2nd_ret_pts_won_23", "p_2nd_ret_pts_won_22", "p_2nd_ret_pts_won_21", "p_2nd_ret_pts_won_20", "p_2nd_ret_pts_won_19", "p_2nd_ret_pts_won_18", "p_2nd_ret_pts_won_17", "p_2nd_ret_pts_won_16", "p_2nd_ret_pts_won_15", "p_2nd_ret_pts_won_14", "p_2nd_ret_pts_won_13", "p_2nd_ret_pts_won_12", "p_2nd_ret_pts_won_11", "p_2nd_ret_pts_won_10", "p_2nd_ret_pts_won_9", "p_2nd_ret_pts_won_8", "p_2nd_ret_pts_won_7", "p_2nd_ret_pts_won_6", "p_2nd_ret_pts_won_5", "p_2nd_ret_pts_won_4", "p_2nd_ret_pts_won_3", "p_2nd_ret_pts_won_2", "p_2nd_ret_pts_won_1"], axis = 1)

In [56]:
# 'p_2nd_ret_pts_won%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) SECOND (SERVE) RETURN POINTS WON % of PLAYER over the 60 matches PRIOR TO the match being predicted.
# The feature keeps its time-weighted (TW) name, but the sums computed below are plain row sums with no per-match weights.

# Sort descending by date/round within each player / surface / indoor-outdoor group, so that
# shift(-k) inside a group picks up the row k positions further down, i.e. the k-th earlier match.
ret2_io_keys = ['p_id', 't_surf', 't_ind']
df_player1 = df_player1.sort_values(by=ret2_io_keys + ['m_date', 'm_rd_num'], ascending = False)

# Build the two grouped series once, before any lag column is added; every lag below reuses the same grouping.
ret2_io_pts_hist = df_player1.groupby(ret2_io_keys)['p_2nd_ret_pts']
ret2_io_won_hist = df_player1.groupby(ret2_io_keys)['p_2nd_ret_pts_won']

# Transient lag columns: suffix _60 holds the immediately preceding match (shift(-1)),
# _59 the one before that, ..., _1 the 60th match back (shift(-60)). Rows without that much history get NaN.
for ret2_io_lag in range(1, 61):
    ret2_io_suffix = 61 - ret2_io_lag
    df_player1[f"p_2nd_ret_pts_{ret2_io_suffix}"] = ret2_io_pts_hist.shift(-ret2_io_lag)
    df_player1[f"p_2nd_ret_pts_won_{ret2_io_suffix}"] = ret2_io_won_hist.shift(-ret2_io_lag)

# Row-wise sum skips NaN instead of interpolating, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out,
# and the multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
ret2_io_l60_pts_cols = [f"p_2nd_ret_pts_{suffix}" for suffix in range(60, 0, -1)]
ret2_io_l60_won_cols = [f"p_2nd_ret_pts_won_{suffix}" for suffix in range(60, 0, -1)]
df_player1["p_2nd_ret_pts_l60_ws"] = df_player1[ret2_io_l60_pts_cols].sum(axis=1)
df_player1["p_2nd_ret_pts_won_l60_ws"] = df_player1[ret2_io_l60_won_cols].sum(axis=1)

# Points won / points played over the window, as a percentage rounded to 2 decimals.
# A row with no prior matches in its group has both sums equal to 0, so the result is NaN (0/0).
df_player1["p_2nd_ret_pts_won%_l60_tw_ss_IO"] = ((df_player1["p_2nd_ret_pts_won_l60_ws"]/df_player1["p_2nd_ret_pts_l60_ws"])*100).round(2)
#(ws = weighted sum; tw = time-weighted)

In [57]:
# 'p_2nd_ret_pts_won%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor-specific (IO) SECOND (SERVE) RETURN POINTS WON % of PLAYER over the 10 matches PRIOR TO the match being predicted.
# The feature keeps its time-weighted (TW) name, but the sums computed below are plain row sums with no per-match weights.

# The 10-match window is the lag columns suffixed _60 (the immediately preceding match) down to _51,
# as built by the l60 IO cell directly above.
ret2_l10_pts_cols = [f"p_2nd_ret_pts_{suffix}" for suffix in range(60, 50, -1)]
ret2_l10_won_cols = [f"p_2nd_ret_pts_won_{suffix}" for suffix in range(60, 50, -1)]

# Row-wise sum skips NaN instead of interpolating, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out,
# and the multi-year ramp-up period (where retrospective stats accrue) is not modeled, so this feature is rarely absent there.
df_player1["p_2nd_ret_pts_l10_ws"] = df_player1[ret2_l10_pts_cols].sum(axis=1)
df_player1["p_2nd_ret_pts_won_l10_ws"] = df_player1[ret2_l10_won_cols].sum(axis=1)
df_player1["p_2nd_ret_pts_won%_l10_tw_ss_IO"] = (
    df_player1["p_2nd_ret_pts_won_l10_ws"]
    .div(df_player1["p_2nd_ret_pts_l10_ws"])
    .mul(100)
    .round(2)
)
#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four window sums plus all 60 lag columns of each stat
ret2_transient_cols = (
    ["p_2nd_ret_pts_l60_ws", "p_2nd_ret_pts_won_l60_ws", "p_2nd_ret_pts_l10_ws", "p_2nd_ret_pts_won_l10_ws"]
    + [f"p_2nd_ret_pts_{suffix}" for suffix in range(60, 0, -1)]
    + [f"p_2nd_ret_pts_won_{suffix}" for suffix in range(60, 0, -1)]
)
df_player1 = df_player1.drop(columns=ret2_transient_cols)

In [58]:
# Structural check of df_player1 (row count, column count, dtypes, memory use) after the second-serve-return features were added.
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 108 entries, t_id to p_2nd_ret_pts_won%_l10_tw_ss_IO
dtypes: datetime64[ns](1), float64(68), int64(33), object(6)
memory usage: 47.5+ MB


In [59]:
# p_tot_pts_won%_l60_tw_ss_comp
# Alternative, composite total points won% variant: half of the serve points won% plus half of the
# return points won% (both l60_tw_ss), i.e. their equal-weight average. NaN in either input gives NaN.
df_player1["p_tot_pts_won%_l60_tw_ss_comp"] = (
    df_player1["p_sv_pts_won%_l60_tw_ss"].div(2)
    + df_player1["p_ret_pts_won%_l60_tw_ss"].div(2)
)

In [60]:
# p_tot_pts_won%_l10_tw_ss_comp
# Alternative, composite total points won% variant: half of the serve points won% plus half of the
# return points won% (both l10_tw_ss), i.e. their equal-weight average. NaN in either input gives NaN.
df_player1["p_tot_pts_won%_l10_tw_ss_comp"] = (
    df_player1["p_sv_pts_won%_l10_tw_ss"].div(2)
    + df_player1["p_ret_pts_won%_l10_tw_ss"].div(2)
)

In [61]:
# p_tot_pts_won%_l60_tw_ss_IO_comp
# Alternative, composite total points won% variant: half of the serve points won% plus half of the
# return points won% (both l60_tw_ss_IO), i.e. their equal-weight average. NaN in either input gives NaN.
df_player1["p_tot_pts_won%_l60_tw_ss_IO_comp"] = (
    df_player1["p_sv_pts_won%_l60_tw_ss_IO"].div(2)
    + df_player1["p_ret_pts_won%_l60_tw_ss_IO"].div(2)
)

In [62]:
# p_tot_pts_won%_l10_tw_ss_IO_comp
# Alternative, composite total points won% variant: half of the serve points won% plus half of the
# return points won% (both l10_tw_ss_IO), i.e. their equal-weight average. NaN in either input gives NaN.
df_player1["p_tot_pts_won%_l10_tw_ss_IO_comp"] = (
    df_player1["p_sv_pts_won%_l10_tw_ss_IO"].div(2)
    + df_player1["p_ret_pts_won%_l10_tw_ss_IO"].div(2)
)

In [63]:
# 'p_ace%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean ACE performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

df_player1["p_sv_pts_60"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-1)
df_player1["p_ace_60"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-1)

df_player1["p_sv_pts_59"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-2)
df_player1["p_ace_59"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-2)

df_player1["p_sv_pts_58"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-3)
df_player1["p_ace_58"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-3)

df_player1["p_sv_pts_57"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-4)
df_player1["p_ace_57"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-4)

df_player1["p_sv_pts_56"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-5)
df_player1["p_ace_56"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-5)

df_player1["p_sv_pts_55"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-6)
df_player1["p_ace_55"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-6)

df_player1["p_sv_pts_54"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-7)
df_player1["p_ace_54"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-7)

df_player1["p_sv_pts_53"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-8)
df_player1["p_ace_53"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-8)

df_player1["p_sv_pts_52"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-9)
df_player1["p_ace_52"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-9)

df_player1["p_sv_pts_51"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-10)
df_player1["p_ace_51"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-10)

df_player1["p_sv_pts_50"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-11)
df_player1["p_ace_50"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-11)

df_player1["p_sv_pts_49"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-12)
df_player1["p_ace_49"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-12)

df_player1["p_sv_pts_48"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-13)
df_player1["p_ace_48"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-13)

df_player1["p_sv_pts_47"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-14)
df_player1["p_ace_47"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-14)

df_player1["p_sv_pts_46"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-15)
df_player1["p_ace_46"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-15)

df_player1["p_sv_pts_45"] = df_player1.groupby(['p_id','t_surf'])['p_sv_pts'].shift(-16)
df_player1["p_ace_45"] = df_player1.groupby(['p_id','t_surf'])['p_ace'].shift(-16)

# Lags 17 through 60 of serve points and aces within each (player, surface) group.
# shift(-k) reads the row k positions further down inside the group; slot = 61 - k,
# so lag 17 fills the "_44" columns and lag 60 fills the "_1" columns.
# NOTE(review): this relies on the row ordering established earlier in this cell.
surf_sv_pts = df_player1.groupby(['p_id','t_surf'])['p_sv_pts']
surf_ace = df_player1.groupby(['p_id','t_surf'])['p_ace']
for ace_lag in range(17, 61):
    ace_slot = 61 - ace_lag
    df_player1[f"p_sv_pts_{ace_slot}"] = surf_sv_pts.shift(-ace_lag)
    df_player1[f"p_ace_{ace_slot}"] = surf_ace.shift(-ace_lag)

# Row-wise totals across the 60 lag slots. DataFrame.sum skips NaN instead of interpolating,
# which is preferred in metrics of this type. As a further note, in the modeling stage, matches
# between players with few matches previously played on a given surface will be filtered out.
# So there will be very few matches in the modeling phase where this predictive feature is absent
# (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue
# but matches from those years will not be used in the modeling phase.)
l60_slots = range(60, 0, -1)
sv_pts_l60_cols = [f"p_sv_pts_{slot}" for slot in l60_slots]
ace_l60_cols = [f"p_ace_{slot}" for slot in l60_slots]
df_player1["p_sv_pts_l60_ws"] = df_player1[sv_pts_l60_cols].sum(axis=1)
df_player1["p_ace_l60_ws"] = df_player1[ace_l60_cols].sum(axis=1)

# Aces as a percentage of serve points over the window, rounded to two decimals
ace_share_l60 = df_player1["p_ace_l60_ws"] / df_player1["p_sv_pts_l60_ws"]
df_player1["p_ace%_l60_tw_ss"] = (ace_share_l60 * 100).round(2)

# Rates of exactly zero are bumped to a very low rate to avoid divide by zero errors in ratio generation later on
is_zero_ace_l60 = df_player1["p_ace%_l60_tw_ss"] == 0
df_player1.loc[is_zero_ace_l60, "p_ace%_l60_tw_ss"] = 0.1

#(ws = weighted sum; tw = time-weighted. In this no-decay version the sums above are plain, equal-weight sums.)

In [64]:
# 'p_ace%_l10_tw_ss'
# Surface-specific (SS) ACE rate of PLAYER over the 10 matches PRIOR TO the match being predicted.
# Slots 60 down to 51 hold lags 1 through 10 created in the preceding cell. The "tw" (time-weighted)
# tag is part of the column name; the sums below apply no per-match weights.

# Summing (rather than averaging) ignores NaN instead of interpolating, which is preferred in metrics
# of this type. As a further note, in the modeling stage, matches between players with few matches
# previously played on a given surface will be filtered out. So there will be very few matches in the
# modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase
# of multiple years where retrospective stats accrue but matches from those years will not be used in
# the modeling phase.)
l10_slots = range(60, 50, -1)
df_player1["p_sv_pts_l10_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in l10_slots]].sum(axis=1)
df_player1["p_ace_l10_ws"] = df_player1[[f"p_ace_{slot}" for slot in l10_slots]].sum(axis=1)

ace_share_l10 = df_player1["p_ace_l10_ws"] / df_player1["p_sv_pts_l10_ws"]
df_player1["p_ace%_l10_tw_ss"] = (ace_share_l10 * 100).round(2)

# Rates of exactly zero are bumped to a very low rate to avoid divide by zero errors in ratio generation later on
is_zero_ace_l10 = df_player1["p_ace%_l10_tw_ss"] == 0
df_player1.loc[is_zero_ace_l10, "p_ace%_l10_tw_ss"] = 0.1
#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four window sums plus all 60 lag slots of each stat
all_slots = range(60, 0, -1)
transient_ace_cols = (
    ["p_sv_pts_l60_ws", "p_ace_l60_ws", "p_sv_pts_l10_ws", "p_ace_l10_ws"]
    + [f"p_ace_{slot}" for slot in all_slots]
    + [f"p_sv_pts_{slot}" for slot in all_slots]
)
df_player1 = df_player1.drop(columns=transient_ace_cols)

In [65]:
# 'p_ace%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific ACE rate of PLAYER over the 60 matches
# PRIOR TO the match being predicted. The "tw" (time-weighted) tag is kept in the column name,
# but no per-match weights are applied below: every prior match enters the sums equally.

# Descending sort puts the row with the latest m_date / m_rd_num first inside each
# (player, surface, indoor/outdoor) group, so shift(-k) reads the k-th row after the
# current one, i.e. the k-th preceding match.
# NOTE(review): assumes m_date sorts chronologically - confirm its dtype/format.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# One pair of lag columns per prior match. Slot = 61 - lag, so slot 60 is the most recent
# prior match (lag 1) and slot 1 is the oldest one in the window (lag 60). Columns are
# created in the order p_sv_pts_60, p_ace_60, p_sv_pts_59, p_ace_59, ...
io_sv_pts = df_player1.groupby(['p_id','t_surf','t_ind'])['p_sv_pts']
io_ace = df_player1.groupby(['p_id','t_surf','t_ind'])['p_ace']
for io_lag in range(1, 61):
    io_slot = 61 - io_lag
    df_player1[f"p_sv_pts_{io_slot}"] = io_sv_pts.shift(-io_lag)
    df_player1[f"p_ace_{io_slot}"] = io_ace.shift(-io_lag)

# Using sum allows ignoring NaN instead of interpolation, which is preferred in metrics of this type.
# As a further note, in the modeling stage, matches between players with few matches previously played
# on a given surface will be filtered out. So there will be very few matches in the modeling phase where
# this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where
# retrospective stats accrue but matches from those years will not be used in the modeling phase.)
io_l60_slots = range(60, 0, -1)
df_player1["p_sv_pts_l60_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in io_l60_slots]].sum(axis=1)
df_player1["p_ace_l60_ws"] = df_player1[[f"p_ace_{slot}" for slot in io_l60_slots]].sum(axis=1)

# Aces as a percentage of serve points over the window, rounded to two decimals.
# A window with zero summed serve points yields NaN (0/0) and is left as-is.
df_player1["p_ace%_l60_tw_ss_IO"] = ((df_player1["p_ace_l60_ws"]/df_player1["p_sv_pts_l60_ws"])*100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on
df_player1.loc[(df_player1["p_ace%_l60_tw_ss_IO"] == 0), "p_ace%_l60_tw_ss_IO"] = 0.1

#(ws = weighted sum; tw = time-weighted. In this no-decay version the sums above are plain, equal-weight sums.)

In [66]:
# 'p_ace%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific ACE rate of PLAYER over the 10 matches
# PRIOR TO the match being predicted. Slots 60 down to 51 hold lags 1 through 10 created in the
# preceding cell. The "tw" (time-weighted) tag is part of the column name; the sums below apply
# no per-match weights.

# Summing (rather than averaging) ignores NaN instead of interpolating, which is preferred in metrics
# of this type. As a further note, in the modeling stage, matches between players with few matches
# previously played on a given surface will be filtered out. So there will be very few matches in the
# modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase
# of multiple years where retrospective stats accrue but matches from those years will not be used in
# the modeling phase.)
io_l10_slots = range(60, 50, -1)
df_player1["p_sv_pts_l10_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in io_l10_slots]].sum(axis=1)
df_player1["p_ace_l10_ws"] = df_player1[[f"p_ace_{slot}" for slot in io_l10_slots]].sum(axis=1)

io_ace_share_l10 = df_player1["p_ace_l10_ws"] / df_player1["p_sv_pts_l10_ws"]
df_player1["p_ace%_l10_tw_ss_IO"] = (io_ace_share_l10 * 100).round(2)

# Rates of exactly zero are bumped to a very low rate to avoid divide by zero errors in ratio generation later on
is_zero_io_ace_l10 = df_player1["p_ace%_l10_tw_ss_IO"] == 0
df_player1.loc[is_zero_io_ace_l10, "p_ace%_l10_tw_ss_IO"] = 0.1

#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four window sums plus all 60 lag slots of each stat
io_all_slots = range(60, 0, -1)
transient_io_ace_cols = (
    ["p_sv_pts_l60_ws", "p_ace_l60_ws", "p_sv_pts_l10_ws", "p_ace_l10_ws"]
    + [f"p_ace_{slot}" for slot in io_all_slots]
    + [f"p_sv_pts_{slot}" for slot in io_all_slots]
)
df_player1 = df_player1.drop(columns=transient_io_ace_cols)

In [67]:
# 'p_aced%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean ACED performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort puts the row with the latest m_date / m_rd_num first inside each
# (player, surface) group, so shift(-k) reads the k-th row after the current one.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Lags 1 through 29 of return points faced and opponent aces (slot = 61 - lag, i.e. the
# "_60" down to "_32" columns). The remaining lags are written out statement by statement below.
surf_ret_pts = df_player1.groupby(['p_id','t_surf'])['p_ret_pts']
surf_opp_ace = df_player1.groupby(['p_id','t_surf'])['opp_ace']
for aced_lag in range(1, 30):
    aced_slot = 61 - aced_lag
    df_player1[f"p_ret_pts_{aced_slot}"] = surf_ret_pts.shift(-aced_lag)
    df_player1[f"opp_ace_{aced_slot}"] = surf_opp_ace.shift(-aced_lag)

df_player1["p_ret_pts_31"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-30)
df_player1["opp_ace_31"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-30)

df_player1["p_ret_pts_30"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-31)
df_player1["opp_ace_30"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-31)

df_player1["p_ret_pts_29"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-32)
df_player1["opp_ace_29"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-32)

df_player1["p_ret_pts_28"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-33)
df_player1["opp_ace_28"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-33)

df_player1["p_ret_pts_27"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-34)
df_player1["opp_ace_27"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-34)

df_player1["p_ret_pts_26"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-35)
df_player1["opp_ace_26"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-35)

df_player1["p_ret_pts_25"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-36)
df_player1["opp_ace_25"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-36)

df_player1["p_ret_pts_24"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-37)
df_player1["opp_ace_24"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-37)

df_player1["p_ret_pts_23"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-38)
df_player1["opp_ace_23"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-38)

df_player1["p_ret_pts_22"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-39)
df_player1["opp_ace_22"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-39)

df_player1["p_ret_pts_21"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-40)
df_player1["opp_ace_21"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-40)

df_player1["p_ret_pts_20"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-41)
df_player1["opp_ace_20"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-41)

df_player1["p_ret_pts_19"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-42)
df_player1["opp_ace_19"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-42)

df_player1["p_ret_pts_18"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-43)
df_player1["opp_ace_18"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-43)

df_player1["p_ret_pts_17"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-44)
df_player1["opp_ace_17"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-44)

df_player1["p_ret_pts_16"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-45)
df_player1["opp_ace_16"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-45)

df_player1["p_ret_pts_15"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-46)
df_player1["opp_ace_15"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-46)

df_player1["p_ret_pts_14"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-47)
df_player1["opp_ace_14"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-47)

df_player1["p_ret_pts_13"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-48)
df_player1["opp_ace_13"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-48)

df_player1["p_ret_pts_12"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-49)
df_player1["opp_ace_12"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-49)

df_player1["p_ret_pts_11"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-50)
df_player1["opp_ace_11"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-50)

df_player1["p_ret_pts_10"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-51)
df_player1["opp_ace_10"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-51)

df_player1["p_ret_pts_9"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-52)
df_player1["opp_ace_9"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-52)

df_player1["p_ret_pts_8"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-53)
df_player1["opp_ace_8"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-53)

df_player1["p_ret_pts_7"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-54)
df_player1["opp_ace_7"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-54)

df_player1["p_ret_pts_6"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-55)
df_player1["opp_ace_6"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-55)

df_player1["p_ret_pts_5"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-56)
df_player1["opp_ace_5"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-56)

df_player1["p_ret_pts_4"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-57)
df_player1["opp_ace_4"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-57)

df_player1["p_ret_pts_3"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-58)
df_player1["opp_ace_3"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-58)

df_player1["p_ret_pts_2"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-59)
df_player1["opp_ace_2"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-59)

df_player1["p_ret_pts_1"] = df_player1.groupby(['p_id','t_surf'])['p_ret_pts'].shift(-60)
df_player1["opp_ace_1"] = df_player1.groupby(['p_id','t_surf'])['opp_ace'].shift(-60)

# Collapse the 60 lagged columns into the surface-specific aced% feature.
# DataFrame.sum(axis=1) skips NaN by default, so a player with fewer than 60 prior
# matches is summed over whatever history exists (no interpolation of missing slots).
# Author's note: at the modeling stage, matches between players with few previous
# matches on a surface are filtered out, and the multi-year ramp-up period is not
# modeled, so very few modeled matches should be missing this feature.
ret_pts_cols = [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
opp_ace_cols = [f"opp_ace_{slot}" for slot in range(60, 0, -1)]
df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_cols].sum(axis=1)
df_player1["opp_ace_l60_ws"] = df_player1[opp_ace_cols].sum(axis=1)

# Aces conceded per return point, as a percentage rounded to 2 decimals.
# A row with no history at all sums to 0 / 0 and therefore stays NaN.
aced_rate = df_player1["opp_ace_l60_ws"].div(df_player1["p_ret_pts_l60_ws"])
df_player1["p_aced%_l60_tw_ss"] = aced_rate.mul(100).round(2)

# Exact zeroes are lifted to a small floor (0.1) so that ratios generated from this
# feature later on cannot divide by zero.
is_zero = df_player1["p_aced%_l60_tw_ss"] == 0
df_player1.loc[is_zero, "p_aced%_l60_tw_ss"] = 0.1

# Naming key: ws = weighted sum; tw = time-weighted.
# No per-match weights are applied above, so in this notebook the "weighted sum" is a plain sum.

In [68]:
# 'p_aced%_l10_tw_ss'
# Surface-specific (SS) aced% of PLAYER over the 10 matches PRIOR TO the match being predicted.
# Slots 60..51 are the ten smallest shifts (slot = 61 - lag), i.e. the ten nearest prior rows
# in each (p_id, t_surf) group. The "tw" (time-weighted) tag is kept in the column name, but
# no per-match weights are applied here: the sums are plain sums.

# DataFrame.sum(axis=1) skips NaN by default, so players with fewer than 10 prior matches
# are summed over the history that exists instead of being interpolated.
# Author's note: matches between players with few previous matches on a surface are filtered
# out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few
# modeled matches should be missing this feature.
recent_slots = range(60, 50, -1)
ret_pts_cols = [f"p_ret_pts_{slot}" for slot in recent_slots]
opp_ace_cols = [f"opp_ace_{slot}" for slot in recent_slots]
df_player1["p_ret_pts_l10_ws"] = df_player1[ret_pts_cols].sum(axis=1)
df_player1["opp_ace_l10_ws"] = df_player1[opp_ace_cols].sum(axis=1)

# Aces conceded per return point, as a percentage rounded to 2 decimals (0 / 0 stays NaN).
aced_rate = df_player1["opp_ace_l10_ws"].div(df_player1["p_ret_pts_l10_ws"])
df_player1["p_aced%_l10_tw_ss"] = aced_rate.mul(100).round(2)

# Exact zeroes are lifted to a small floor (0.1) so that ratios generated from this
# feature later on cannot divide by zero.
is_zero = df_player1["p_aced%_l10_tw_ss"] == 0
df_player1.loc[is_zero, "p_aced%_l10_tw_ss"] = 0.1

# Naming key: ws = weighted sum; tw = time-weighted.

# Remove the transient helper columns now that both SS aced% features exist:
# the four intermediate sums plus the 2 x 60 lagged per-match columns.
transient_cols = ["p_ret_pts_l60_ws", "opp_ace_l60_ws", "p_ret_pts_l10_ws", "opp_ace_l10_ws"]
transient_cols += [f"opp_ace_{slot}" for slot in range(60, 0, -1)]
transient_cols += [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
df_player1 = df_player1.drop(columns=transient_cols)

In [69]:
# 'p_aced%_l60_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific aced% of PLAYER over the 60 matches
# PRIOR TO the match being predicted. This part of the cell builds the lagged inputs.
# The "tw" (time-weighted) tag is kept in the feature name, but no per-match weights are
# applied anywhere in this cell.

# Descending sort on every key, so that inside each (p_id, t_surf, t_ind) group the rows run
# from the highest (m_date, m_rd_num) to the lowest.
# NOTE(review): this is newest-first only if m_date sorts chronologically -- confirm its format.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Build the grouping once, AFTER the sort (sort_values returns a new frame). The assignments
# below never touch the key columns or the row order, so reusing it gives the same result
# as rebuilding the groupby for every column.
io_groups = df_player1.groupby(['p_id','t_surf','t_ind'])

# "<stat>_<slot>" = group-wise shift(-lag) of <stat>, with slot = 61 - lag:
# slot 60 is the next row down in the group (lag 1), slot 1 is sixty rows down (lag 60).
# Rows with fewer than `lag` rows below them in their group get NaN.
# Columns are created in the original order: p_ret_pts first, then opp_ace, for each lag.
for lag_k in range(1, 61):
    slot_k = 61 - lag_k
    df_player1[f"p_ret_pts_{slot_k}"] = io_groups['p_ret_pts'].shift(-lag_k)
    df_player1[f"opp_ace_{slot_k}"] = io_groups['opp_ace'].shift(-lag_k)

# Collapse the 60 lagged columns into the surface- and indoor/outdoor-specific aced% feature.
# DataFrame.sum(axis=1) skips NaN by default, so a player with fewer than 60 prior
# matches is summed over whatever history exists (no interpolation of missing slots).
# Author's note: at the modeling stage, matches between players with few previous
# matches on a surface are filtered out, and the multi-year ramp-up period is not
# modeled, so very few modeled matches should be missing this feature.
ret_pts_cols = [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
opp_ace_cols = [f"opp_ace_{slot}" for slot in range(60, 0, -1)]
df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_cols].sum(axis=1)
df_player1["opp_ace_l60_ws"] = df_player1[opp_ace_cols].sum(axis=1)

# Aces conceded per return point, as a percentage rounded to 2 decimals.
# A row with no history at all sums to 0 / 0 and therefore stays NaN.
aced_rate = df_player1["opp_ace_l60_ws"].div(df_player1["p_ret_pts_l60_ws"])
df_player1["p_aced%_l60_tw_ss_IO"] = aced_rate.mul(100).round(2)

# Exact zeroes are lifted to a small floor (0.1) so that ratios generated from this
# feature later on cannot divide by zero.
is_zero = df_player1["p_aced%_l60_tw_ss_IO"] == 0
df_player1.loc[is_zero, "p_aced%_l60_tw_ss_IO"] = 0.1

# Naming key: ws = weighted sum; tw = time-weighted.
# No per-match weights are applied above, so in this notebook the "weighted sum" is a plain sum.

In [70]:
# 'p_aced%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific aced% of PLAYER over the 10 matches
# PRIOR TO the match being predicted.
# Slots 60..51 hold shift(-1)..shift(-10) from the previous cell, i.e. the ten nearest prior
# rows in each (p_id, t_surf, t_ind) group. The "tw" (time-weighted) tag is kept in the
# column name, but no per-match weights are applied here: the sums are plain sums.

# DataFrame.sum(axis=1) skips NaN by default, so players with fewer than 10 prior matches
# are summed over the history that exists instead of being interpolated.
# Author's note: matches between players with few previous matches on a surface are filtered
# out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few
# modeled matches should be missing this feature.
recent_slots = range(60, 50, -1)
ret_pts_cols = [f"p_ret_pts_{slot}" for slot in recent_slots]
opp_ace_cols = [f"opp_ace_{slot}" for slot in recent_slots]
df_player1["p_ret_pts_l10_ws"] = df_player1[ret_pts_cols].sum(axis=1)
df_player1["opp_ace_l10_ws"] = df_player1[opp_ace_cols].sum(axis=1)

# Aces conceded per return point, as a percentage rounded to 2 decimals (0 / 0 stays NaN).
aced_rate = df_player1["opp_ace_l10_ws"].div(df_player1["p_ret_pts_l10_ws"])
df_player1["p_aced%_l10_tw_ss_IO"] = aced_rate.mul(100).round(2)

# Exact zeroes are lifted to a small floor (0.1) so that ratios generated from this
# feature later on cannot divide by zero.
is_zero = df_player1["p_aced%_l10_tw_ss_IO"] == 0
df_player1.loc[is_zero, "p_aced%_l10_tw_ss_IO"] = 0.1

# Naming key: ws = weighted sum; tw = time-weighted.

# Deleting the many transient columns
df_player1 = df_player1.drop(["p_ret_pts_l60_ws", "opp_ace_l60_ws", "p_ret_pts_l10_ws", "opp_ace_l10_ws", "opp_ace_60", "opp_ace_59", "opp_ace_58", "opp_ace_57", "opp_ace_56", "opp_ace_55", "opp_ace_54", "opp_ace_53", "opp_ace_52", "opp_ace_51", "opp_ace_50", "opp_ace_49", "opp_ace_48", "opp_ace_47", "opp_ace_46", "opp_ace_45", "opp_ace_44", "opp_ace_43", "opp_ace_42", "opp_ace_41", "opp_ace_40", "opp_ace_39", "opp_ace_38", "opp_ace_37", "opp_ace_36", "opp_ace_35", "opp_ace_34", "opp_ace_33", "opp_ace_32", "opp_ace_31", "opp_ace_30", "opp_ace_29", "opp_ace_28", "opp_ace_27", "opp_ace_26", "opp_ace_25", "opp_ace_24", "opp_ace_23", "opp_ace_22", "opp_ace_21", "opp_ace_20", "opp_ace_19", "opp_ace_18", "opp_ace_17", "opp_ace_16", "opp_ace_15", "opp_ace_14", "opp_ace_13", "opp_ace_12", "opp_ace_11", "opp_ace_10", "opp_ace_9", "opp_ace_8", "opp_ace_7", "opp_ace_6", "opp_ace_5", "opp_ace_4", "opp_ace_3", "opp_ace_2", "opp_ace_1", "p_ret_pts_60", "p_ret_pts_59", "p_ret_pts_58", "p_ret_pts_57", "p_ret_pts_56", "p_ret_pts_55", "p_ret_pts_54", "p_ret_pts_53", "p_ret_pts_52", "p_ret_pts_51", "p_ret_pts_50", "p_ret_pts_49", "p_ret_pts_48", "p_ret_pts_47", "p_ret_pts_46", "p_ret_pts_45", "p_ret_pts_44", "p_ret_pts_43", "p_ret_pts_42", "p_ret_pts_41", "p_ret_pts_40", "p_ret_pts_39", "p_ret_pts_38", "p_ret_pts_37", "p_ret_pts_36", "p_ret_pts_35", "p_ret_pts_34", "p_ret_pts_33", "p_ret_pts_32", "p_ret_pts_31", "p_ret_pts_30", "p_ret_pts_29", "p_ret_pts_28", "p_ret_pts_27", "p_ret_pts_26", "p_ret_pts_25", "p_ret_pts_24", "p_ret_pts_23", "p_ret_pts_22", "p_ret_pts_21", "p_ret_pts_20", "p_ret_pts_19", "p_ret_pts_18", "p_ret_pts_17", "p_ret_pts_16", "p_ret_pts_15", "p_ret_pts_14", "p_ret_pts_13", "p_ret_pts_12", "p_ret_pts_11", "p_ret_pts_10", "p_ret_pts_9", "p_ret_pts_8", "p_ret_pts_7", "p_ret_pts_6", "p_ret_pts_5", "p_ret_pts_4", "p_ret_pts_3", "p_ret_pts_2", "p_ret_pts_1"], axis = 1)

In [71]:
# 'p_df%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean DOUBLE FAULT performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort puts each player's most recent match on a surface first, so within a
# (player, surface) group shift(-k) pulls the value from the row k positions later,
# i.e. the k-th match before the current one.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the grouped series once instead of re-grouping for every lag.
_sv_pts_by_surf = df_player1.groupby(['p_id','t_surf'])['p_sv_pts']
_df_by_surf = df_player1.groupby(['p_id','t_surf'])['p_df']

# Lag columns are numbered so that _60 is the most recent prior match (shift -1)
# and _1 is the 60th prior match (shift -60). Rows without that many prior matches get NaN.
for _lag in range(1, 61):
    _slot = 61 - _lag
    df_player1[f"p_sv_pts_{_slot}"] = _sv_pts_by_surf.shift(-_lag)
    df_player1[f"p_df_{_slot}"] = _df_by_surf.shift(-_lag)

# Using sum(axis=1) ignores NaN instead of interpolating, which is preferred in metrics of this type.
# As a further note, in the modeling stage, matches between players with few matches previously played
# on a given surface will be filtered out, so very few modeled matches will lack this feature
# (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# The sums are plain: every one of the 60 lag columns counts equally.
_df_l60_slots = range(60, 0, -1)
df_player1["p_sv_pts_l60_ws"] = df_player1[[f"p_sv_pts_{_slot}" for _slot in _df_l60_slots]].sum(axis=1)
df_player1["p_df_l60_ws"] = df_player1[[f"p_df_{_slot}" for _slot in _df_l60_slots]].sum(axis=1)

# Double faults as a percentage of service points played, rounded to two decimals
_df_pct_l60 = df_player1["p_df_l60_ws"].div(df_player1["p_sv_pts_l60_ws"]).mul(100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on
# (NaN results, i.e. 0/0 when no lag is populated, are left as NaN)
df_player1["p_df%_l60_tw_ss"] = _df_pct_l60.mask(_df_pct_l60.eq(0), 0.1)

#(ws = weighted sum; tw = time-weighted)

In [72]:
# 'p_df%_l10_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean DOUBLE FAULT performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# NOTE: the sums below are plain, unweighted sums; the "tw"/"ws" labels are naming only in this notebook version.

# Using sum(axis=1) ignores NaN instead of interpolating, which is preferred in metrics of this type.
# As a further note, in the modeling stage, matches between players with few matches previously played
# on a given surface will be filtered out, so very few modeled matches will lack this feature
# (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not modeled).
_df_l10_slots = range(60, 50, -1)  # lag columns _60 .. _51, the ten most recent prior matches
df_player1["p_sv_pts_l10_ws"] = df_player1[[f"p_sv_pts_{_slot}" for _slot in _df_l10_slots]].sum(axis=1)
df_player1["p_df_l10_ws"] = df_player1[[f"p_df_{_slot}" for _slot in _df_l10_slots]].sum(axis=1)

# Double faults as a percentage of service points played, rounded to two decimals
_df_pct_l10 = df_player1["p_df_l10_ws"].div(df_player1["p_sv_pts_l10_ws"]).mul(100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on
df_player1["p_df%_l10_tw_ss"] = _df_pct_l10.mask(_df_pct_l10.eq(0), 0.1)

#(ws = weighted sum; tw = time-weighted)

# Deleting the many transient columns: the four l60/l10 totals plus all 120 lag columns
_df_transient_cols = (
    ["p_sv_pts_l60_ws", "p_df_l60_ws", "p_sv_pts_l10_ws", "p_df_l10_ws"]
    + [f"p_df_{_slot}" for _slot in range(60, 0, -1)]
    + [f"p_sv_pts_{_slot}" for _slot in range(60, 0, -1)]
)
df_player1 = df_player1.drop(columns=_df_transient_cols)

In [73]:
# 'p_df%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean DOUBLE FAULT performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort puts each player's most recent match for a surface + indoor/outdoor
# combination first, so within a (player, surface, indoor flag) group shift(-k) pulls the
# value from the row k positions later, i.e. the k-th match before the current one.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Build the grouped series once instead of re-grouping for every lag.
_sv_pts_by_surf_io = df_player1.groupby(['p_id','t_surf','t_ind'])['p_sv_pts']
_df_by_surf_io = df_player1.groupby(['p_id','t_surf','t_ind'])['p_df']

# Lag columns are numbered so that _60 is the most recent prior match (shift -1)
# and _1 is the 60th prior match (shift -60). Rows without that many prior matches get NaN.
for _lag in range(1, 61):
    _slot = 61 - _lag
    df_player1[f"p_sv_pts_{_slot}"] = _sv_pts_by_surf_io.shift(-_lag)
    df_player1[f"p_df_{_slot}"] = _df_by_surf_io.shift(-_lag)

# Row-wise totals across the 60 look-back slots. DataFrame.sum skips NaN instead of
# interpolating, which is preferred in metrics of this type. As a further note, in the
# modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out. So there will be very few matches in the modeling phase
# where this predictive feature is absent (remember also that there is a ramp-up phase of
# multiple years where retrospective stats accrue but matches from those years will not be
# used in the modeling phase.)
sv_pts_slot_cols = [f"p_sv_pts_{slot}" for slot in range(60, 0, -1)]
df_slot_cols = [f"p_df_{slot}" for slot in range(60, 0, -1)]
df_player1["p_sv_pts_l60_ws"] = df_player1[sv_pts_slot_cols].sum(axis=1)
df_player1["p_df_l60_ws"] = df_player1[df_slot_cols].sum(axis=1)

# p_df total expressed as a percentage of the p_sv_pts total, rounded to 2 decimals.
df_pct_l60 = ((df_player1["p_df_l60_ws"] / df_player1["p_sv_pts_l60_ws"]) * 100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# NaN entries are left as NaN because NaN == 0 is False.
df_player1["p_df%_l60_tw_ss_IO"] = df_pct_l60.mask(df_pct_l60 == 0, 0.1)

# Naming key: ws = weighted sum; tw = time-weighted. The sums above apply no per-match
# weights; they are plain totals.

In [74]:
# 'p_df%_l10_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean df performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# Only slots 60 down to 51 are summed here. The slot columns are not built in this cell;
# they must still be present from earlier in the notebook (they are dropped at the end of this cell).

# DataFrame.sum skips NaN instead of interpolating, which is preferred in metrics of this
# type. As a further note, in the modeling stage, matches between players with few matches
# previously played on a given surface will be filtered out. So there will be very few
# matches in the modeling phase where this predictive feature is absent (remember also that
# there is a ramp-up phase of multiple years where retrospective stats accrue but matches
# from those years will not be used in the modeling phase.)
l10_slots = range(60, 50, -1)
df_player1["p_sv_pts_l10_ws"] = df_player1[[f"p_sv_pts_{slot}" for slot in l10_slots]].sum(axis=1)
df_player1["p_df_l10_ws"] = df_player1[[f"p_df_{slot}" for slot in l10_slots]].sum(axis=1)

# p_df total expressed as a percentage of the p_sv_pts total, rounded to 2 decimals.
df_pct_l10 = ((df_player1["p_df_l10_ws"] / df_player1["p_sv_pts_l10_ws"]) * 100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# NaN entries are left as NaN because NaN == 0 is False.
df_player1["p_df%_l10_tw_ss_IO"] = df_pct_l10.mask(df_pct_l10 == 0, 0.1)

# Naming key: ws = weighted sum; tw = time-weighted. The sums above apply no per-match
# weights; they are plain totals.

# Deleting the many transient columns: the four window totals plus all 60 slots of each stat.
transient_cols = ["p_sv_pts_l60_ws", "p_df_l60_ws", "p_sv_pts_l10_ws", "p_df_l10_ws"]
transient_cols += [f"p_df_{slot}" for slot in range(60, 0, -1)]
transient_cols += [f"p_sv_pts_{slot}" for slot in range(60, 0, -1)]
df_player1 = df_player1.drop(transient_cols, axis = 1)

In [75]:
# 'p_df_induce%_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean DOUBLE FAULT-INDUCE performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort: within each (p_id, t_surf) group, rows further down are earlier matches.
# NOTE(review): assumes m_date sorts chronologically in its stored dtype — TODO confirm.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the 60 look-back slots for return points played and opponent double faults.
# shift(-lag) reads the value lag rows further down in the same group, i.e. the lag-th
# preceding match; slot numbering runs the other way (slot = 61 - lag), so slot 60 is the
# immediately preceding match and slot 1 is the 60th preceding one. Slots beyond a group's
# available history are NaN.
ret_pts_in_group = df_player1.groupby(['p_id','t_surf'])['p_ret_pts']
opp_df_in_group = df_player1.groupby(['p_id','t_surf'])['opp_df']
for lag in range(1, 61):
    slot = 61 - lag
    # The grouped selections are built once above instead of once per statement; adding
    # new columns to df_player1 does not change the grouped source columns.
    df_player1[f"p_ret_pts_{slot}"] = ret_pts_in_group.shift(-lag)
    df_player1[f"opp_df_{slot}"] = opp_df_in_group.shift(-lag)

# Row-wise totals across the 60 look-back slots. DataFrame.sum skips NaN instead of
# interpolating, which is preferred in metrics of this type. As a further note, in the
# modeling stage, matches between players with few matches previously played on a given
# surface will be filtered out. So there will be very few matches in the modeling phase
# where this predictive feature is absent (remember also that there is a ramp-up phase of
# multiple years where retrospective stats accrue but matches from those years will not be
# used in the modeling phase.)
ret_pts_slot_cols = [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
opp_df_slot_cols = [f"opp_df_{slot}" for slot in range(60, 0, -1)]
df_player1["p_ret_pts_l60_ws"] = df_player1[ret_pts_slot_cols].sum(axis=1)
df_player1["opp_df_l60_ws"] = df_player1[opp_df_slot_cols].sum(axis=1)

# opp_df total expressed as a percentage of the p_ret_pts total, rounded to 2 decimals.
induce_pct_l60 = ((df_player1["opp_df_l60_ws"] / df_player1["p_ret_pts_l60_ws"]) * 100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# NaN entries are left as NaN because NaN == 0 is False.
df_player1["p_df_induce%_l60_tw_ss"] = induce_pct_l60.mask(induce_pct_l60 == 0, 0.1)

# Naming key: ws = weighted sum; tw = time-weighted. The sums above apply no per-match
# weights; they are plain totals.

In [76]:
# 'p_df_induce%_l10_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean DOUBLE FAULT-INDUCE performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# Only slots 60 down to 51 are summed here. The slot columns are not built in this cell;
# they must still be present from the preceding l60 cell (they are dropped at the end of this cell).

# DataFrame.sum skips NaN instead of interpolating, which is preferred in metrics of this
# type. As a further note, in the modeling stage, matches between players with few matches
# previously played on a given surface will be filtered out. So there will be very few
# matches in the modeling phase where this predictive feature is absent (remember also that
# there is a ramp-up phase of multiple years where retrospective stats accrue but matches
# from those years will not be used in the modeling phase.)
l10_slots = range(60, 50, -1)
df_player1["p_ret_pts_l10_ws"] = df_player1[[f"p_ret_pts_{slot}" for slot in l10_slots]].sum(axis=1)
df_player1["opp_df_l10_ws"] = df_player1[[f"opp_df_{slot}" for slot in l10_slots]].sum(axis=1)

# opp_df total expressed as a percentage of the p_ret_pts total, rounded to 2 decimals.
induce_pct_l10 = ((df_player1["opp_df_l10_ws"] / df_player1["p_ret_pts_l10_ws"]) * 100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# NaN entries are left as NaN because NaN == 0 is False.
df_player1["p_df_induce%_l10_tw_ss"] = induce_pct_l10.mask(induce_pct_l10 == 0, 0.1)

# Naming key: ws = weighted sum; tw = time-weighted. The sums above apply no per-match
# weights; they are plain totals.

# Deleting the many transient columns: the four window totals plus all 60 slots of each stat.
transient_cols = ["p_ret_pts_l60_ws", "opp_df_l60_ws", "p_ret_pts_l10_ws", "opp_df_l10_ws"]
transient_cols += [f"opp_df_{slot}" for slot in range(60, 0, -1)]
transient_cols += [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]
df_player1 = df_player1.drop(transient_cols, axis = 1)

In [77]:
# 'p_df_induce%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean DOUBLE FAULT-INDUCE performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort: within each (p_id, t_surf, t_ind) group, rows further down are earlier matches.
# NOTE(review): assumes m_date sorts chronologically in its stored dtype — TODO confirm.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Build the 60 look-back slots for return points played and opponent double faults.
# shift(-lag) reads the value lag rows further down in the same group, i.e. the lag-th
# preceding match; slot numbering runs the other way (slot = 61 - lag), so slot 60 is the
# immediately preceding match and slot 1 is the 60th preceding one. Slots beyond a group's
# available history are NaN.
ret_pts_in_group = df_player1.groupby(['p_id','t_surf','t_ind'])['p_ret_pts']
opp_df_in_group = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_df']
for lag in range(1, 61):
    slot = 61 - lag
    # The grouped selections are built once above instead of once per statement; adding
    # new columns to df_player1 does not change the grouped source columns.
    df_player1[f"p_ret_pts_{slot}"] = ret_pts_in_group.shift(-lag)
    df_player1[f"opp_df_{slot}"] = opp_df_in_group.shift(-lag)

# Collapse the 60 lagged per-match columns (slots 60 down to 1) into a single double-fault-induce
# percentage: summed opp_df divided by summed p_ret_pts, times 100, rounded to 2 decimals.
#
# Using sum allows ignoring NaN instead of interpolation, which is preferred in metrics of this type:
# DataFrame.sum(axis=1) skips NaN by default, so a row with fewer than 60 populated slots is summed over
# the slots it does have. A row with no populated slots sums to 0 in both columns, so the ratio is
# 0/0 = NaN and the feature stays missing. As a further note, in the modeling stage, matches between
# players with few matches previously played on a given surface will be filtered out. So there will be
# very few matches in the modeling phase where this predictive feature is absent (remember also that
# there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those
# years will not be used in the modeling phase.)
#
# The column lists are generated instead of typed out; they are in the same 60 -> 1 order as before.
df_player1["p_ret_pts_l60_ws"] = df_player1[[f"p_ret_pts_{slot}" for slot in range(60, 0, -1)]].sum(axis=1)
df_player1["opp_df_l60_ws"] = df_player1[[f"opp_df_{slot}" for slot in range(60, 0, -1)]].sum(axis=1)
df_player1["p_df_induce%_l60_tw_ss_IO"] = ((df_player1["opp_df_l60_ws"]/df_player1["p_ret_pts_l60_ws"])*100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# Only exact zeroes are replaced; NaN rows are left untouched by the == 0 comparison.
df_player1.loc[(df_player1["p_df_induce%_l60_tw_ss_IO"] == 0), "p_df_induce%_l60_tw_ss_IO"] = 0.1

# Naming: ws = weighted sum; tw = time-weighted. The sums above apply no per-match weights (every
# match counts equally); the "ws"/"tw" labels are presumably kept for naming parity with the
# time-decay version of this workbook - TODO confirm.

In [78]:
# 'p_df_induce%_l10_tw_ss_IO'
# Provides surface-specific (SS), indoor/outdoor (IO) specific mean DOUBLE FAULT-INDUCE performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# The "tw" (time-weighted) label is part of the column name, but the sums below apply no per-match weights.
#
# Reuses the lagged p_ret_pts_<slot> / opp_df_<slot> columns that already exist on df_player1 at this
# point; the 10-match window is slots 60 down to 51.

# Using sum allows ignoring NaN instead of interpolation, which is preferred in metrics of this type:
# DataFrame.sum(axis=1) skips NaN by default, and a row with no populated slots yields 0/0 = NaN.
# As a further note, in the modeling stage, matches between players with few matches previously played
# on a given surface will be filtered out. So there will be very few matches in the modeling phase where
# this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where
# retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_ret_pts_l10_ws"] = df_player1[[f"p_ret_pts_{slot}" for slot in range(60, 50, -1)]].sum(axis=1)
df_player1["opp_df_l10_ws"] = df_player1[[f"opp_df_{slot}" for slot in range(60, 50, -1)]].sum(axis=1)
df_player1["p_df_induce%_l10_tw_ss_IO"] = ((df_player1["opp_df_l10_ws"]/df_player1["p_ret_pts_l10_ws"])*100).round(2)

# Set zeroes to a very low rate to avoid divide by zero errors in ratio generation later on.
# Only exact zeroes are replaced; NaN rows are left untouched by the == 0 comparison.
df_player1.loc[(df_player1["p_df_induce%_l10_tw_ss_IO"] == 0), "p_df_induce%_l10_tw_ss_IO"] = 0.1

# Naming: ws = weighted sum; tw = time-weighted (no weights are actually applied in this cell).

# Deleting the many transient columns: the four aggregate sums plus all 60 lagged columns of each stat.
df_player1 = df_player1.drop(
    ["p_ret_pts_l60_ws", "opp_df_l60_ws", "p_ret_pts_l10_ws", "opp_df_l10_ws"]
    + [f"opp_df_{slot}" for slot in range(60, 0, -1)]
    + [f"p_ret_pts_{slot}" for slot in range(60, 0, -1)],
    axis = 1)

In [79]:
# 'p_bp_save%_l60_tw_ss'
# Provides surface-specific (SS), mean BREAK POINT SAVED performance of PLAYER over the 60 matches PRIOR TO the match being predicted
# The "tw" (time-weighted) label is part of the column name, but the sums below apply no per-match weights.

# Sort newest-first inside each (player, surface) group so that a negative groupby shift reaches
# back to earlier matches: shift(-k) pulls the value from the row k positions further down.
# NOTE(review): assumes m_date sorts chronologically in its stored dtype - TODO confirm.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the 60 lagged copies of each break-point stat. Slot numbering is reversed relative to the
# lag: shift(-1) (the row directly below) is stored in slot 60 and shift(-60) in slot 1.
# Columns are created in the same order as the original hand-written statements
# (faced_60, saved_60, faced_59, saved_59, ..., faced_1, saved_1).
# These lagged columns are intentionally left on df_player1 at the end of this cell.
for _lag in range(1, 61):
    _slot = 61 - _lag
    df_player1[f"p_bp_faced_{_slot}"] = df_player1.groupby(['p_id','t_surf'])['p_bp_faced'].shift(-_lag)
    df_player1[f"p_bp_saved_{_slot}"] = df_player1.groupby(['p_id','t_surf'])['p_bp_saved'].shift(-_lag)

# Using sum allows ignoring NaN instead of interpolation: DataFrame.sum(axis=1) skips NaN by default,
# so rows with fewer than 60 populated slots are summed over the slots they do have. A row whose summed
# p_bp_faced is 0 with summed p_bp_saved also 0 yields 0/0 = NaN.
# In the modeling stage, matches between players with few matches previously played on a given surface
# will be filtered out. So there will be very few matches in the modeling phase where this predictive
# feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective
# stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_bp_faced_l60"] = df_player1[[f"p_bp_faced_{slot}" for slot in range(60, 0, -1)]].sum(axis=1)
df_player1["p_bp_saved_l60"] = df_player1[[f"p_bp_saved_{slot}" for slot in range(60, 0, -1)]].sum(axis=1)
# Percentage of break points saved across the window, rounded to 2 decimals.
df_player1["p_bp_save%_l60_tw_ss"] = ((df_player1["p_bp_saved_l60"]/df_player1["p_bp_faced_l60"])*100).round(2)

In [80]:
#'p_bp_save%_l10_tw_ss'
# Provides surface-specific (SS), mean BREAK POINT SAVED performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# The "tw" (time-weighted) label is part of the column name, but the sums below apply no per-match weights.
#
# Reuses the lagged p_bp_faced_<slot> / p_bp_saved_<slot> columns that already exist on df_player1 at
# this point; the 10-match window is slots 60 down to 51.

# Using sum allows ignoring NaN instead of interpolation: DataFrame.sum(axis=1) skips NaN by default,
# and a row whose summed p_bp_faced and p_bp_saved are both 0 yields 0/0 = NaN.
# In the modeling stage, matches between players with few matches previously played on a given surface
# will be filtered out. So there will be very few matches in the modeling phase where this predictive
# feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective
# stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_bp_faced_l10"] = df_player1[[f"p_bp_faced_{slot}" for slot in range(60, 50, -1)]].sum(axis=1)
df_player1["p_bp_saved_l10"] = df_player1[[f"p_bp_saved_{slot}" for slot in range(60, 50, -1)]].sum(axis=1)
# Percentage of break points saved across the window, rounded to 2 decimals.
df_player1["p_bp_save%_l10_tw_ss"] = ((df_player1["p_bp_saved_l10"]/df_player1["p_bp_faced_l10"])*100).round(2)

# Deleting the many transient columns: the four aggregate sums plus all 60 lagged columns of each stat.
df_player1 = df_player1.drop(
    ["p_bp_faced_l60", "p_bp_saved_l60", "p_bp_faced_l10", "p_bp_saved_l10"]
    + [f"p_bp_saved_{slot}" for slot in range(60, 0, -1)]
    + [f"p_bp_faced_{slot}" for slot in range(60, 0, -1)],
    axis = 1)

In [81]:
# 'p_bp_save%_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor (IO) specific mean BREAK POINT SAVED performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

df_player1["p_bp_faced_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-1)
df_player1["p_bp_saved_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-1)

df_player1["p_bp_faced_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-2)
df_player1["p_bp_saved_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-2)

df_player1["p_bp_faced_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-3)
df_player1["p_bp_saved_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-3)

df_player1["p_bp_faced_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-4)
df_player1["p_bp_saved_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-4)

df_player1["p_bp_faced_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-5)
df_player1["p_bp_saved_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-5)

df_player1["p_bp_faced_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-6)
df_player1["p_bp_saved_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-6)

df_player1["p_bp_faced_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-7)
df_player1["p_bp_saved_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-7)

df_player1["p_bp_faced_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-8)
df_player1["p_bp_saved_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-8)

df_player1["p_bp_faced_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-9)
df_player1["p_bp_saved_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-9)

df_player1["p_bp_faced_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-10)
df_player1["p_bp_saved_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-10)

df_player1["p_bp_faced_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-11)
df_player1["p_bp_saved_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-11)

df_player1["p_bp_faced_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-12)
df_player1["p_bp_saved_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-12)

df_player1["p_bp_faced_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-13)
df_player1["p_bp_saved_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-13)

df_player1["p_bp_faced_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-14)
df_player1["p_bp_saved_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-14)

df_player1["p_bp_faced_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-15)
df_player1["p_bp_saved_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-15)

df_player1["p_bp_faced_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-16)
df_player1["p_bp_saved_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-16)

df_player1["p_bp_faced_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-17)
df_player1["p_bp_saved_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-17)

df_player1["p_bp_faced_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-18)
df_player1["p_bp_saved_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-18)

df_player1["p_bp_faced_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-19)
df_player1["p_bp_saved_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-19)

df_player1["p_bp_faced_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-20)
df_player1["p_bp_saved_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-20)

df_player1["p_bp_faced_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-21)
df_player1["p_bp_saved_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-21)

df_player1["p_bp_faced_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-22)
df_player1["p_bp_saved_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-22)

df_player1["p_bp_faced_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-23)
df_player1["p_bp_saved_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-23)

df_player1["p_bp_faced_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-24)
df_player1["p_bp_saved_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-24)

df_player1["p_bp_faced_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-25)
df_player1["p_bp_saved_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-25)

df_player1["p_bp_faced_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-26)
df_player1["p_bp_saved_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-26)

df_player1["p_bp_faced_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-27)
df_player1["p_bp_saved_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-27)

df_player1["p_bp_faced_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-28)
df_player1["p_bp_saved_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-28)

df_player1["p_bp_faced_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-29)
df_player1["p_bp_saved_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-29)

df_player1["p_bp_faced_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-30)
df_player1["p_bp_saved_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-30)

df_player1["p_bp_faced_30"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-31)
df_player1["p_bp_saved_30"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-31)

df_player1["p_bp_faced_29"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-32)
df_player1["p_bp_saved_29"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-32)

df_player1["p_bp_faced_28"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-33)
df_player1["p_bp_saved_28"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-33)

df_player1["p_bp_faced_27"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-34)
df_player1["p_bp_saved_27"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-34)

df_player1["p_bp_faced_26"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-35)
df_player1["p_bp_saved_26"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-35)

df_player1["p_bp_faced_25"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-36)
df_player1["p_bp_saved_25"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-36)

df_player1["p_bp_faced_24"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-37)
df_player1["p_bp_saved_24"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-37)

df_player1["p_bp_faced_23"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-38)
df_player1["p_bp_saved_23"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-38)

df_player1["p_bp_faced_22"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-39)
df_player1["p_bp_saved_22"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-39)

df_player1["p_bp_faced_21"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-40)
df_player1["p_bp_saved_21"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-40)

df_player1["p_bp_faced_20"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-41)
df_player1["p_bp_saved_20"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-41)

df_player1["p_bp_faced_19"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-42)
df_player1["p_bp_saved_19"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-42)

df_player1["p_bp_faced_18"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-43)
df_player1["p_bp_saved_18"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-43)

df_player1["p_bp_faced_17"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-44)
df_player1["p_bp_saved_17"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-44)

df_player1["p_bp_faced_16"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-45)
df_player1["p_bp_saved_16"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-45)

df_player1["p_bp_faced_15"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-46)
df_player1["p_bp_saved_15"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-46)

df_player1["p_bp_faced_14"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-47)
df_player1["p_bp_saved_14"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-47)

df_player1["p_bp_faced_13"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-48)
df_player1["p_bp_saved_13"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-48)

df_player1["p_bp_faced_12"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-49)
df_player1["p_bp_saved_12"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-49)

df_player1["p_bp_faced_11"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-50)
df_player1["p_bp_saved_11"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-50)

df_player1["p_bp_faced_10"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-51)
df_player1["p_bp_saved_10"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-51)

df_player1["p_bp_faced_9"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-52)
df_player1["p_bp_saved_9"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-52)

df_player1["p_bp_faced_8"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-53)
df_player1["p_bp_saved_8"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-53)

df_player1["p_bp_faced_7"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-54)
df_player1["p_bp_saved_7"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-54)

df_player1["p_bp_faced_6"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-55)
df_player1["p_bp_saved_6"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-55)

df_player1["p_bp_faced_5"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-56)
df_player1["p_bp_saved_5"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-56)

df_player1["p_bp_faced_4"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-57)
df_player1["p_bp_saved_4"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-57)

df_player1["p_bp_faced_3"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-58)
df_player1["p_bp_saved_3"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-58)

df_player1["p_bp_faced_2"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-59)
df_player1["p_bp_saved_2"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-59)

df_player1["p_bp_faced_1"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_faced'].shift(-60)
df_player1["p_bp_saved_1"] = df_player1.groupby(['p_id','t_surf','t_ind'])['p_bp_saved'].shift(-60)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["p_bp_faced_l60"] = df_player1[["p_bp_faced_60", "p_bp_faced_59", "p_bp_faced_58", "p_bp_faced_57", "p_bp_faced_56", "p_bp_faced_55", "p_bp_faced_54", "p_bp_faced_53", "p_bp_faced_52", "p_bp_faced_51", "p_bp_faced_50", "p_bp_faced_49", "p_bp_faced_48", "p_bp_faced_47", "p_bp_faced_46", "p_bp_faced_45", "p_bp_faced_44", "p_bp_faced_43", "p_bp_faced_42", "p_bp_faced_41", "p_bp_faced_40", "p_bp_faced_39", "p_bp_faced_38", "p_bp_faced_37", "p_bp_faced_36", "p_bp_faced_35", "p_bp_faced_34", "p_bp_faced_33", "p_bp_faced_32", "p_bp_faced_31", "p_bp_faced_30", "p_bp_faced_29", "p_bp_faced_28", "p_bp_faced_27", "p_bp_faced_26", "p_bp_faced_25", "p_bp_faced_24", "p_bp_faced_23", "p_bp_faced_22", "p_bp_faced_21", "p_bp_faced_20", "p_bp_faced_19", "p_bp_faced_18", "p_bp_faced_17", "p_bp_faced_16", "p_bp_faced_15", "p_bp_faced_14", "p_bp_faced_13", "p_bp_faced_12", "p_bp_faced_11", "p_bp_faced_10", "p_bp_faced_9", "p_bp_faced_8", "p_bp_faced_7", "p_bp_faced_6", "p_bp_faced_5", "p_bp_faced_4", "p_bp_faced_3", "p_bp_faced_2", "p_bp_faced_1"]].sum(axis=1)
df_player1["p_bp_saved_l60"] = df_player1[["p_bp_saved_60", "p_bp_saved_59", "p_bp_saved_58", "p_bp_saved_57", "p_bp_saved_56", "p_bp_saved_55", "p_bp_saved_54", "p_bp_saved_53", "p_bp_saved_52", "p_bp_saved_51", "p_bp_saved_50", "p_bp_saved_49", "p_bp_saved_48", "p_bp_saved_47", "p_bp_saved_46", "p_bp_saved_45", "p_bp_saved_44", "p_bp_saved_43", "p_bp_saved_42", "p_bp_saved_41", "p_bp_saved_40", "p_bp_saved_39", "p_bp_saved_38", "p_bp_saved_37", "p_bp_saved_36", "p_bp_saved_35", "p_bp_saved_34", "p_bp_saved_33", "p_bp_saved_32", "p_bp_saved_31", "p_bp_saved_30", "p_bp_saved_29", "p_bp_saved_28", "p_bp_saved_27", "p_bp_saved_26", "p_bp_saved_25", "p_bp_saved_24", "p_bp_saved_23", "p_bp_saved_22", "p_bp_saved_21", "p_bp_saved_20", "p_bp_saved_19", "p_bp_saved_18", "p_bp_saved_17", "p_bp_saved_16", "p_bp_saved_15", "p_bp_saved_14", "p_bp_saved_13", "p_bp_saved_12", "p_bp_saved_11", "p_bp_saved_10", "p_bp_saved_9", "p_bp_saved_8", "p_bp_saved_7", "p_bp_saved_6", "p_bp_saved_5", "p_bp_saved_4", "p_bp_saved_3", "p_bp_saved_2", "p_bp_saved_1"]].sum(axis=1)
df_player1["p_bp_save%_l60_tw_ss_IO"] = ((df_player1["p_bp_saved_l60"]/df_player1["p_bp_faced_l60"])*100).round(2)
#(ws = weighted sum; note that the sums above are plain, unweighted sums in this no-decay version of the notebook)

In [82]:
#'p_bp_save%_l10_tw_ss_IO'
# Surface-specific (SS), indoor/outdoor (IO) specific BREAK POINT SAVED percentage of PLAYER over the 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the feature name, but no weights are applied below: the feature is a plain ratio of sums.

# Lag slots are numbered _60 down to _1; the first ten of them (_60 ... _51) feed the l10 window.
# NOTE(review): this relies on the per-match lag columns p_bp_faced_<k> / p_bp_saved_<k> created earlier in the notebook;
# slot _60 is presumably the most recent prior match - confirm against the cell that builds them.
_bp_slots = list(range(60, 0, -1))
_bp_slots_l10 = _bp_slots[:10]

# A row-wise sum skips NaN slots rather than interpolating them. In the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so very few modeled matches will lack this feature
# (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not used for modeling).
_faced_l10_cols = [f"p_bp_faced_{slot}" for slot in _bp_slots_l10]
_saved_l10_cols = [f"p_bp_saved_{slot}" for slot in _bp_slots_l10]
df_player1["p_bp_faced_l10"] = df_player1[_faced_l10_cols].sum(axis=1)
df_player1["p_bp_saved_l10"] = df_player1[_saved_l10_cols].sum(axis=1)

# Break points saved / break points faced, as a percentage rounded to 2 decimals
_bp_save_ratio_l10 = df_player1["p_bp_saved_l10"] / df_player1["p_bp_faced_l10"]
df_player1["p_bp_save%_l10_tw_ss_IO"] = (_bp_save_ratio_l10 * 100).round(2)
#(ws = weighted sum)

# Deleting the many transient columns: the four window totals plus every per-slot saved/faced lag column
_bp_transient_cols = (
    ["p_bp_faced_l60", "p_bp_saved_l60", "p_bp_faced_l10", "p_bp_saved_l10"]
    + [f"p_bp_saved_{slot}" for slot in _bp_slots]
    + [f"p_bp_faced_{slot}" for slot in _bp_slots]
)
df_player1 = df_player1.drop(_bp_transient_cols, axis = 1)

In [83]:
# 'p_bp_conv%_l60_tw_ss'
# Surface-specific (SS) BREAK POINT CONVERTED percentage of PLAYER over the 60 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the feature name, but no weights are applied below: the feature is
# 1 - (opponent break points saved / opponent break points faced), computed from plain sums over the window.

# Newest match first within each (player, surface), so shift(-k) on a group reaches the match k rows further down.
# NOTE(review): this assumes m_date sorts chronologically at this point (datetime dtype or ISO-formatted strings) - confirm.
df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Build the per-match lag columns. Slot numbering: _60 is shift(-1), _59 is shift(-2), ..., _1 is shift(-60).
# The groupby is created once and reused; the grouping keys are not modified inside the loop.
_n_lags = 60
_by_player_surface = df_player1.groupby(['p_id','t_surf'])
for _lag in range(1, _n_lags + 1):
    _slot = _n_lags + 1 - _lag
    df_player1[f"opp_bp_faced_{_slot}"] = _by_player_surface['opp_bp_faced'].shift(-_lag)
    df_player1[f"opp_bp_saved_{_slot}"] = _by_player_surface['opp_bp_saved'].shift(-_lag)

# A row-wise sum skips NaN slots rather than interpolating them. In the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so very few modeled matches will lack this feature
# (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not used for modeling).
_opp_faced_cols = [f"opp_bp_faced_{_slot}" for _slot in range(_n_lags, 0, -1)]
_opp_saved_cols = [f"opp_bp_saved_{_slot}" for _slot in range(_n_lags, 0, -1)]
df_player1["opp_bp_faced_l60"] = df_player1[_opp_faced_cols].sum(axis=1)
df_player1["opp_bp_saved_l60"] = df_player1[_opp_saved_cols].sum(axis=1)

# Conversion % = share of the opponents' faced break points that were NOT saved, rounded to 2 decimals.
# The lag columns and the two l60 totals are intentionally left in place here.
df_player1["p_bp_conv%_l60_tw_ss"] = ((1-(df_player1["opp_bp_saved_l60"]/df_player1["opp_bp_faced_l60"]))*100).round(2)
#(ws = weighted sum; not applied in this cell - the sums above are unweighted)

In [84]:
# 'p_bp_conv%_l10_tw_ss'
# Surface-specific (SS) BREAK POINT CONVERTED percentage of PLAYER over the 10 matches PRIOR TO the match being predicted.
# The "tw" (time-weighted) tag is kept in the feature name, but no weights are applied below: the feature is a plain ratio of sums.

# Lag slots are numbered _60 down to _1; the first ten of them (_60 ... _51, i.e. shift(-1) ... shift(-10)) feed the l10 window.
# The opp_bp_faced_<k> / opp_bp_saved_<k> columns read here must already exist on df_player1.
_opp_slots = list(range(60, 0, -1))
_opp_slots_l10 = _opp_slots[:10]

# A row-wise sum skips NaN slots rather than interpolating them. As a further note, in the modeling stage, matches between
# players with few matches previously played on a given surface will be filtered out, so very few modeled matches will lack
# this feature (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not used for modeling).
_opp_faced_l10_cols = [f"opp_bp_faced_{slot}" for slot in _opp_slots_l10]
_opp_saved_l10_cols = [f"opp_bp_saved_{slot}" for slot in _opp_slots_l10]
df_player1["opp_bp_faced_l10"] = df_player1[_opp_faced_l10_cols].sum(axis=1)
df_player1["opp_bp_saved_l10"] = df_player1[_opp_saved_l10_cols].sum(axis=1)

# Conversion % = share of the opponents' faced break points that were NOT saved, rounded to 2 decimals
_opp_save_ratio_l10 = df_player1["opp_bp_saved_l10"] / df_player1["opp_bp_faced_l10"]
df_player1["p_bp_conv%_l10_tw_ss"] = ((1 - _opp_save_ratio_l10) * 100).round(2)

# Deleting the many transient columns: the four window totals plus every per-slot saved/faced lag column
_opp_transient_cols = (
    ["opp_bp_faced_l60", "opp_bp_saved_l60", "opp_bp_faced_l10", "opp_bp_saved_l10"]
    + [f"opp_bp_saved_{slot}" for slot in _opp_slots]
    + [f"opp_bp_faced_{slot}" for slot in _opp_slots]
)
df_player1 = df_player1.drop(_opp_transient_cols, axis = 1)

In [85]:
# 'p_bp_conv%_l60_tw_ss_IO'
# Provides time-weighted(tw), surface-specific (SS), indoor/outdoor (IO) specific mean BREAK POINT CONVERTED performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

df_player1["opp_bp_faced_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-1)
df_player1["opp_bp_saved_60"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-1)

df_player1["opp_bp_faced_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-2)
df_player1["opp_bp_saved_59"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-2)

df_player1["opp_bp_faced_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-3)
df_player1["opp_bp_saved_58"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-3)

df_player1["opp_bp_faced_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-4)
df_player1["opp_bp_saved_57"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-4)

df_player1["opp_bp_faced_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-5)
df_player1["opp_bp_saved_56"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-5)

df_player1["opp_bp_faced_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-6)
df_player1["opp_bp_saved_55"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-6)

df_player1["opp_bp_faced_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-7)
df_player1["opp_bp_saved_54"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-7)

df_player1["opp_bp_faced_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-8)
df_player1["opp_bp_saved_53"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-8)

df_player1["opp_bp_faced_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-9)
df_player1["opp_bp_saved_52"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-9)

df_player1["opp_bp_faced_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-10)
df_player1["opp_bp_saved_51"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-10)

df_player1["opp_bp_faced_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-11)
df_player1["opp_bp_saved_50"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-11)

df_player1["opp_bp_faced_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-12)
df_player1["opp_bp_saved_49"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-12)

df_player1["opp_bp_faced_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-13)
df_player1["opp_bp_saved_48"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-13)

df_player1["opp_bp_faced_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-14)
df_player1["opp_bp_saved_47"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-14)

df_player1["opp_bp_faced_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-15)
df_player1["opp_bp_saved_46"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-15)

df_player1["opp_bp_faced_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-16)
df_player1["opp_bp_saved_45"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-16)

df_player1["opp_bp_faced_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-17)
df_player1["opp_bp_saved_44"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-17)

df_player1["opp_bp_faced_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-18)
df_player1["opp_bp_saved_43"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-18)

df_player1["opp_bp_faced_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-19)
df_player1["opp_bp_saved_42"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-19)

df_player1["opp_bp_faced_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-20)
df_player1["opp_bp_saved_41"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-20)

df_player1["opp_bp_faced_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-21)
df_player1["opp_bp_saved_40"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-21)

df_player1["opp_bp_faced_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-22)
df_player1["opp_bp_saved_39"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-22)

df_player1["opp_bp_faced_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-23)
df_player1["opp_bp_saved_38"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-23)

df_player1["opp_bp_faced_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-24)
df_player1["opp_bp_saved_37"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-24)

df_player1["opp_bp_faced_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-25)
df_player1["opp_bp_saved_36"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-25)

df_player1["opp_bp_faced_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-26)
df_player1["opp_bp_saved_35"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-26)

df_player1["opp_bp_faced_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-27)
df_player1["opp_bp_saved_34"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-27)

df_player1["opp_bp_faced_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-28)
df_player1["opp_bp_saved_33"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-28)

df_player1["opp_bp_faced_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-29)
df_player1["opp_bp_saved_32"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-29)

df_player1["opp_bp_faced_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-30)
df_player1["opp_bp_saved_31"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-30)

df_player1["opp_bp_faced_30"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-31)
df_player1["opp_bp_saved_30"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-31)

df_player1["opp_bp_faced_29"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-32)
df_player1["opp_bp_saved_29"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-32)

df_player1["opp_bp_faced_28"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-33)
df_player1["opp_bp_saved_28"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-33)

df_player1["opp_bp_faced_27"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-34)
df_player1["opp_bp_saved_27"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-34)

df_player1["opp_bp_faced_26"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-35)
df_player1["opp_bp_saved_26"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-35)

df_player1["opp_bp_faced_25"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-36)
df_player1["opp_bp_saved_25"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-36)

df_player1["opp_bp_faced_24"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-37)
df_player1["opp_bp_saved_24"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-37)

df_player1["opp_bp_faced_23"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-38)
df_player1["opp_bp_saved_23"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-38)

df_player1["opp_bp_faced_22"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-39)
df_player1["opp_bp_saved_22"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-39)

df_player1["opp_bp_faced_21"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-40)
df_player1["opp_bp_saved_21"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-40)

df_player1["opp_bp_faced_20"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-41)
df_player1["opp_bp_saved_20"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-41)

df_player1["opp_bp_faced_19"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-42)
df_player1["opp_bp_saved_19"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-42)

df_player1["opp_bp_faced_18"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-43)
df_player1["opp_bp_saved_18"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-43)

df_player1["opp_bp_faced_17"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-44)
df_player1["opp_bp_saved_17"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-44)

df_player1["opp_bp_faced_16"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-45)
df_player1["opp_bp_saved_16"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-45)

df_player1["opp_bp_faced_15"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-46)
df_player1["opp_bp_saved_15"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-46)

df_player1["opp_bp_faced_14"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-47)
df_player1["opp_bp_saved_14"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-47)

df_player1["opp_bp_faced_13"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-48)
df_player1["opp_bp_saved_13"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-48)

df_player1["opp_bp_faced_12"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-49)
df_player1["opp_bp_saved_12"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-49)

df_player1["opp_bp_faced_11"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-50)
df_player1["opp_bp_saved_11"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-50)

df_player1["opp_bp_faced_10"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-51)
df_player1["opp_bp_saved_10"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-51)

df_player1["opp_bp_faced_9"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-52)
df_player1["opp_bp_saved_9"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-52)

df_player1["opp_bp_faced_8"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-53)
df_player1["opp_bp_saved_8"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-53)

df_player1["opp_bp_faced_7"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-54)
df_player1["opp_bp_saved_7"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-54)

df_player1["opp_bp_faced_6"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-55)
df_player1["opp_bp_saved_6"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-55)

df_player1["opp_bp_faced_5"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-56)
df_player1["opp_bp_saved_5"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-56)

df_player1["opp_bp_faced_4"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-57)
df_player1["opp_bp_saved_4"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-57)

df_player1["opp_bp_faced_3"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-58)
df_player1["opp_bp_saved_3"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-58)

df_player1["opp_bp_faced_2"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-59)
df_player1["opp_bp_saved_2"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-59)

df_player1["opp_bp_faced_1"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_faced'].shift(-60)
df_player1["opp_bp_saved_1"] = df_player1.groupby(['p_id','t_surf','t_ind'])['opp_bp_saved'].shift(-60)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player1["opp_bp_faced_l60"] = df_player1[["opp_bp_faced_60", "opp_bp_faced_59", "opp_bp_faced_58", "opp_bp_faced_57", "opp_bp_faced_56", "opp_bp_faced_55", "opp_bp_faced_54", "opp_bp_faced_53", "opp_bp_faced_52", "opp_bp_faced_51", "opp_bp_faced_50", "opp_bp_faced_49", "opp_bp_faced_48", "opp_bp_faced_47", "opp_bp_faced_46", "opp_bp_faced_45", "opp_bp_faced_44", "opp_bp_faced_43", "opp_bp_faced_42", "opp_bp_faced_41", "opp_bp_faced_40", "opp_bp_faced_39", "opp_bp_faced_38", "opp_bp_faced_37", "opp_bp_faced_36", "opp_bp_faced_35", "opp_bp_faced_34", "opp_bp_faced_33", "opp_bp_faced_32", "opp_bp_faced_31", "opp_bp_faced_30", "opp_bp_faced_29", "opp_bp_faced_28", "opp_bp_faced_27", "opp_bp_faced_26", "opp_bp_faced_25", "opp_bp_faced_24", "opp_bp_faced_23", "opp_bp_faced_22", "opp_bp_faced_21", "opp_bp_faced_20", "opp_bp_faced_19", "opp_bp_faced_18", "opp_bp_faced_17", "opp_bp_faced_16", "opp_bp_faced_15", "opp_bp_faced_14", "opp_bp_faced_13", "opp_bp_faced_12", "opp_bp_faced_11", "opp_bp_faced_10", "opp_bp_faced_9", "opp_bp_faced_8", "opp_bp_faced_7", "opp_bp_faced_6", "opp_bp_faced_5", "opp_bp_faced_4", "opp_bp_faced_3", "opp_bp_faced_2", "opp_bp_faced_1"]].sum(axis=1)
df_player1["opp_bp_saved_l60"] = df_player1[["opp_bp_saved_60", "opp_bp_saved_59", "opp_bp_saved_58", "opp_bp_saved_57", "opp_bp_saved_56", "opp_bp_saved_55", "opp_bp_saved_54", "opp_bp_saved_53", "opp_bp_saved_52", "opp_bp_saved_51", "opp_bp_saved_50", "opp_bp_saved_49", "opp_bp_saved_48", "opp_bp_saved_47", "opp_bp_saved_46", "opp_bp_saved_45", "opp_bp_saved_44", "opp_bp_saved_43", "opp_bp_saved_42", "opp_bp_saved_41", "opp_bp_saved_40", "opp_bp_saved_39", "opp_bp_saved_38", "opp_bp_saved_37", "opp_bp_saved_36", "opp_bp_saved_35", "opp_bp_saved_34", "opp_bp_saved_33", "opp_bp_saved_32", "opp_bp_saved_31", "opp_bp_saved_30", "opp_bp_saved_29", "opp_bp_saved_28", "opp_bp_saved_27", "opp_bp_saved_26", "opp_bp_saved_25", "opp_bp_saved_24", "opp_bp_saved_23", "opp_bp_saved_22", "opp_bp_saved_21", "opp_bp_saved_20", "opp_bp_saved_19", "opp_bp_saved_18", "opp_bp_saved_17", "opp_bp_saved_16", "opp_bp_saved_15", "opp_bp_saved_14", "opp_bp_saved_13", "opp_bp_saved_12", "opp_bp_saved_11", "opp_bp_saved_10", "opp_bp_saved_9", "opp_bp_saved_8", "opp_bp_saved_7", "opp_bp_saved_6", "opp_bp_saved_5", "opp_bp_saved_4", "opp_bp_saved_3", "opp_bp_saved_2", "opp_bp_saved_1"]].sum(axis=1)
df_player1["p_bp_conv%_l60_tw_ss_IO"] = ((1-(df_player1["opp_bp_saved_l60"]/df_player1["opp_bp_faced_l60"]))*100).round(2)
#(ws = weighted sum)

In [86]:
# 'p_bp_conv%_l10_tw_ss_IO'
# BREAK POINT CONVERSION % of PLAYER over the 10 matches PRIOR TO the match being predicted, restricted to the same surface (ss) and indoor/outdoor (IO) condition.
# Derived from the opponent's break point columns: conversion % = (1 - opp_bp_saved / opp_bp_faced) * 100.

# Lag suffix k was produced with shift(-(61 - k)), so suffixes 60..51 hold the ten smallest shifts.
bp_lags_l10 = range(60, 50, -1)
bp_lags_all = range(60, 0, -1)

# Row-wise sum() skips NaN lags instead of interpolating them. Per the project notes, matches between players with
# few prior matches on a given surface are filtered out at the modeling stage, so few modeled matches lack this feature.
df_player1["opp_bp_faced_l10"] = df_player1[[f"opp_bp_faced_{lag}" for lag in bp_lags_l10]].sum(axis=1)
df_player1["opp_bp_saved_l10"] = df_player1[[f"opp_bp_saved_{lag}" for lag in bp_lags_l10]].sum(axis=1)

# Share of break points the opponents saved; its complement is the share PLAYER converted.
opp_save_rate_l10 = df_player1["opp_bp_saved_l10"].div(df_player1["opp_bp_faced_l10"])
df_player1["p_bp_conv%_l10_tw_ss_IO"] = (1 - opp_save_rate_l10).mul(100).round(2)

# Remove the transient helper columns (window sums plus all 60 faced/saved lag columns)
bp_transient_cols = (
    ["opp_bp_faced_l60", "opp_bp_saved_l60", "opp_bp_faced_l10", "opp_bp_saved_l10"]
    + [f"opp_bp_saved_{lag}" for lag in bp_lags_all]
    + [f"opp_bp_faced_{lag}" for lag in bp_lags_all]
)
df_player1 = df_player1.drop(columns=bp_transient_cols)

In [87]:
# The remaining raw opponent stat columns are no longer needed, so remove them
df_player1 = df_player1.drop(columns=["opp_1st_sv_in", "opp_ace", "opp_df", "opp_bp_saved", "opp_bp_faced"])

Below a few "efficiency" ratios related to serving are computed that could potentially be useful as predictive features. We will also compute variants of these ratios later on that have other adjustments to the data already baked in.

In [88]:
# 'p_ace_df%_ratio_l60_tw_ss'
# Ace % divided by double fault % for PLAYER across the last 60 surface-specific matches prior to the match being predicted
# NOTE(review): a zero double fault % yields inf (NaN for 0/0) under pandas division -- confirm downstream handling

df_player1["p_ace_df%_ratio_l60_tw_ss"] = df_player1["p_ace%_l60_tw_ss"].div(df_player1["p_df%_l60_tw_ss"]).round(2)

In [89]:
# 'p_ace_df%_ratio_l10_tw_ss'
# Ace % divided by double fault % for PLAYER across the last 10 surface-specific matches prior to the match being predicted
# NOTE(review): a zero double fault % yields inf (NaN for 0/0) under pandas division -- confirm downstream handling

df_player1["p_ace_df%_ratio_l10_tw_ss"] = df_player1["p_ace%_l10_tw_ss"].div(df_player1["p_df%_l10_tw_ss"]).round(2)

In [90]:
# 'p_ace_df%_ratio_l60_tw_ss_IO'
# Ace % divided by double fault % for PLAYER across the last 60 surface and indoor/outdoor-specific matches prior to the match being predicted
# NOTE(review): a zero double fault % yields inf (NaN for 0/0) under pandas division -- confirm downstream handling

df_player1["p_ace_df%_ratio_l60_tw_ss_IO"] = df_player1["p_ace%_l60_tw_ss_IO"].div(df_player1["p_df%_l60_tw_ss_IO"]).round(2)

In [91]:
# 'p_ace_df%_ratio_l10_tw_ss_IO'
# Ace % divided by double fault % for PLAYER across the last 10 surface and indoor/outdoor-specific matches prior to the match being predicted
# NOTE(review): a zero double fault % yields inf (NaN for 0/0) under pandas division -- confirm downstream handling

df_player1["p_ace_df%_ratio_l10_tw_ss_IO"] = df_player1["p_ace%_l10_tw_ss_IO"].div(df_player1["p_df%_l10_tw_ss_IO"]).round(2)

In [92]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss'
# First serve points won % divided by first serves in % for PLAYER across the last 60 surface-specific matches prior to the match being predicted

df_player1["p_1stSvWon_1stSv%_ratio_l60_tw_ss"] = df_player1["p_1st_sv_pts_won%_l60_tw_ss"].div(df_player1["p_1st_sv%_l60_tw_ss"]).round(2)

In [93]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss'
# First serve points won % divided by first serves in % for PLAYER across the last 10 surface-specific matches prior to the match being predicted

df_player1["p_1stSvWon_1stSv%_ratio_l10_tw_ss"] = df_player1["p_1st_sv_pts_won%_l10_tw_ss"].div(df_player1["p_1st_sv%_l10_tw_ss"]).round(2)

In [94]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO'
# First serve points won % divided by first serves in % for PLAYER across the last 60 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"] = df_player1["p_1st_sv_pts_won%_l60_tw_ss_IO"].div(df_player1["p_1st_sv%_l60_tw_ss_IO"]).round(2)

In [95]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO'
# First serve points won % divided by first serves in % for PLAYER across the last 10 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"] = df_player1["p_1st_sv_pts_won%_l10_tw_ss_IO"].div(df_player1["p_1st_sv%_l10_tw_ss_IO"]).round(2)

In [96]:
# 'p_ace_1stSv%_ratio_l60_tw_ss'
# Ace % divided by first serves in % for PLAYER across the last 60 surface-specific matches prior to the match being predicted

df_player1["p_ace_1stSv%_ratio_l60_tw_ss"] = df_player1["p_ace%_l60_tw_ss"].div(df_player1["p_1st_sv%_l60_tw_ss"]).round(2)

In [97]:
# 'p_ace_1stSv%_ratio_l10_tw_ss'
# Ace % divided by first serves in % for PLAYER across the last 10 surface-specific matches prior to the match being predicted

df_player1["p_ace_1stSv%_ratio_l10_tw_ss"] = df_player1["p_ace%_l10_tw_ss"].div(df_player1["p_1st_sv%_l10_tw_ss"]).round(2)

In [98]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO'
# Ace % divided by first serves in % for PLAYER across the last 60 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_ace_1stSv%_ratio_l60_tw_ss_IO"] = df_player1["p_ace%_l60_tw_ss_IO"].div(df_player1["p_1st_sv%_l60_tw_ss_IO"]).round(2)

In [99]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO'
# Ace % divided by first serves in % for PLAYER across the last 10 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_ace_1stSv%_ratio_l10_tw_ss_IO"] = df_player1["p_ace%_l10_tw_ss_IO"].div(df_player1["p_1st_sv%_l10_tw_ss_IO"]).round(2)

In [100]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss'
# Double fault % divided by serve points won % for PLAYER across the last 60 surface-specific matches prior to the match being predicted

df_player1["p_df_SvPtsWon%_ratio_l60_tw_ss"] = df_player1["p_df%_l60_tw_ss"].div(df_player1["p_sv_pts_won%_l60_tw_ss"]).round(2)

In [101]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss'
# Double fault % divided by serve points won % for PLAYER across the last 10 surface-specific matches prior to the match being predicted

df_player1["p_df_SvPtsWon%_ratio_l10_tw_ss"] = df_player1["p_df%_l10_tw_ss"].div(df_player1["p_sv_pts_won%_l10_tw_ss"]).round(2)

In [102]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO'
# Double fault % divided by serve points won % for PLAYER across the last 60 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_df_SvPtsWon%_ratio_l60_tw_ss_IO"] = df_player1["p_df%_l60_tw_ss_IO"].div(df_player1["p_sv_pts_won%_l60_tw_ss_IO"]).round(2)

In [103]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO'
# Double fault % divided by serve points won % for PLAYER across the last 10 surface and indoor/outdoor-specific matches prior to the match being predicted

df_player1["p_df_SvPtsWon%_ratio_l10_tw_ss_IO"] = df_player1["p_df%_l10_tw_ss_IO"].div(df_player1["p_sv_pts_won%_l10_tw_ss_IO"]).round(2)

In [104]:
# Write the current player-level frame to disk as a checkpoint (the row index is not written)
df_player1.to_csv(path_or_buf='../data/df_player1.csv', index=False)

In [105]:
# Print a summary of the player-level frame: index range, column count, dtypes and memory usage
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 147 entries, t_id to p_df_SvPtsWon%_ratio_l10_tw_ss_IO
dtypes: datetime64[ns](1), float64(111), int64(29), object(6)
memory usage: 64.4+ MB


### 3. Predictive Features: Retrospective, Implied Win Probability (IWP) Accrual-Derived
These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches played by a given player is used to derive "perceived player strength" predictive features. 

Importantly, the closing lines from the match being predicted are NOT included in building IWP-based predictive features. We are ultimately interested in building an actionable match wagering model, and prediction with the closing line of a match being predicted would therefore represent data leakage (i.e., an actual bet will have to be made before the closing line is available). Of course, regressing a model-based prediction based on the OPENING LINE is a wholly tangible proposition.

In [106]:
# 'p_AVG_C_IP_l60_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean IMPLIED WIN PROBABILITY performance of PLAYER over the 60 matches PRIOR TO the match being predicted  

# These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches is used as a replacement for match stats or rankings. 
# Importantly, the closing lines from the match being predicted are NOT included in this predictive feature. We are ultimately interested in building a wagering model, and such usage would therefore represent data leakage (ie, a bet will have to be made before the closing line is available).

df_player1 = df_player1.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

df_player1["p_AVG_C_IP_60"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-1)
df_player1["p_AVG_C_IP_59"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-2)
df_player1["p_AVG_C_IP_58"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-3)
df_player1["p_AVG_C_IP_57"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-4)
df_player1["p_AVG_C_IP_56"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-5)
df_player1["p_AVG_C_IP_55"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-6)
df_player1["p_AVG_C_IP_54"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-7)
df_player1["p_AVG_C_IP_53"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-8)
df_player1["p_AVG_C_IP_52"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-9)
df_player1["p_AVG_C_IP_51"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-10)
df_player1["p_AVG_C_IP_50"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-11)
df_player1["p_AVG_C_IP_49"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-12)
df_player1["p_AVG_C_IP_48"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-13)
df_player1["p_AVG_C_IP_47"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-14)
df_player1["p_AVG_C_IP_46"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-15)
df_player1["p_AVG_C_IP_45"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-16)
df_player1["p_AVG_C_IP_44"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-17)
df_player1["p_AVG_C_IP_43"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-18)
df_player1["p_AVG_C_IP_42"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-19)
df_player1["p_AVG_C_IP_41"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-20)
df_player1["p_AVG_C_IP_40"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-21)
df_player1["p_AVG_C_IP_39"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-22)
df_player1["p_AVG_C_IP_38"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-23)
df_player1["p_AVG_C_IP_37"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-24)
df_player1["p_AVG_C_IP_36"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-25)
df_player1["p_AVG_C_IP_35"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-26)
df_player1["p_AVG_C_IP_34"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-27)
df_player1["p_AVG_C_IP_33"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-28)
df_player1["p_AVG_C_IP_32"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-29)
df_player1["p_AVG_C_IP_31"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-30)
df_player1["p_AVG_C_IP_30"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-31)
df_player1["p_AVG_C_IP_29"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-32)
df_player1["p_AVG_C_IP_28"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-33)
df_player1["p_AVG_C_IP_27"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-34)
df_player1["p_AVG_C_IP_26"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-35)
df_player1["p_AVG_C_IP_25"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-36)
df_player1["p_AVG_C_IP_24"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-37)
df_player1["p_AVG_C_IP_23"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-38)
df_player1["p_AVG_C_IP_22"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-39)
df_player1["p_AVG_C_IP_21"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-40)
df_player1["p_AVG_C_IP_20"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-41)
df_player1["p_AVG_C_IP_19"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-42)
df_player1["p_AVG_C_IP_18"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-43)
df_player1["p_AVG_C_IP_17"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-44)
df_player1["p_AVG_C_IP_16"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-45)
df_player1["p_AVG_C_IP_15"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-46)
df_player1["p_AVG_C_IP_14"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-47)
df_player1["p_AVG_C_IP_13"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-48)
df_player1["p_AVG_C_IP_12"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-49)
df_player1["p_AVG_C_IP_11"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-50)
df_player1["p_AVG_C_IP_10"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-51)
df_player1["p_AVG_C_IP_9"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-52)
df_player1["p_AVG_C_IP_8"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-53)
df_player1["p_AVG_C_IP_7"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-54)
df_player1["p_AVG_C_IP_6"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-55)
df_player1["p_AVG_C_IP_5"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-56)
df_player1["p_AVG_C_IP_4"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-57)
df_player1["p_AVG_C_IP_3"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-58)
df_player1["p_AVG_C_IP_2"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-59)
df_player1["p_AVG_C_IP_1"] = df_player1.groupby(['p_id','t_surf'])['p_AVG_C_IP'].shift(-60)

# Row-wise aggregate of the 60 surface-specific lag columns. sum() and count() both skip NaN, so a
# missing lag is simply left out of the average instead of being interpolated, which is preferred in
# metrics of this type. As a further note, in the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so there will be very few matches
# in the modeling phase where this predictive feature is absent (there is also a ramp-up phase of
# multiple years where retrospective stats accrue but whose matches will not be used for modeling).
iwp_ss_lag_cols = ["p_AVG_C_IP_" + str(n) for n in range(60, 0, -1)]  # "p_AVG_C_IP_60" ... "p_AVG_C_IP_1"
iwp_ss_lags = df_player1[iwp_ss_lag_cols]
df_player1["p_AVG_C_IP_l60_ws"] = iwp_ss_lags.sum(axis=1)
df_player1["p_AVG_C_IP_l60_ws_ct"] = iwp_ss_lags.count(axis=1)
# ws / ws_ct is the plain mean of the available lags (NaN when no lag is available).
iwp_ss_l60_mean = df_player1["p_AVG_C_IP_l60_ws"] / df_player1["p_AVG_C_IP_l60_ws_ct"]
df_player1["p_AVG_C_IP_l60_tw_ss"] = iwp_ss_l60_mean.round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)

In [107]:
#df_player1.to_csv('../data/df_player1.csv', index=False)

In [108]:
# 'p_AVG_C_IP_l10_tw_ss'
# Provides time-weighted (TW), surface-specific (SS), mean IMPLIED WIN PROBABILITY performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# (computed below as sum / count of the non-null values among lag columns 60..51, i.e. each available lag counts equally)

# These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches is used as a replacement for match stats or rankings.
# Importantly, the closing lines from the match being predicted are NOT included in this predictive feature. We are ultimately interested in building a wagering model, and such usage would therefore represent data leakage (ie, a bet will have to be made before the closing line is available).

# sum() / count() skip NaN, so missing lags are ignored rather than interpolated, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so this feature will rarely be absent there (there is also a multi-year ramp-up phase whose matches are not modeled).
iwp_ss_all_lag_cols = ["p_AVG_C_IP_" + str(n) for n in range(60, 0, -1)]
iwp_ss_last10 = df_player1[iwp_ss_all_lag_cols[:10]]  # "p_AVG_C_IP_60" ... "p_AVG_C_IP_51"
df_player1["p_AVG_C_IP_l10_ws"] = iwp_ss_last10.sum(axis=1)
df_player1["p_AVG_C_IP_l10_ws_ct"] = iwp_ss_last10.count(axis=1)
iwp_ss_l10_mean = df_player1["p_AVG_C_IP_l10_ws"] / df_player1["p_AVG_C_IP_l10_ws_ct"]
df_player1["p_AVG_C_IP_l10_tw_ss"] = iwp_ss_l10_mean.round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)

#Deleting the many transient columns (the l60 / l10 sums and counts plus all 60 lag columns)
iwp_ss_transient_cols = ["p_AVG_C_IP_l60_ws", "p_AVG_C_IP_l60_ws_ct", "p_AVG_C_IP_l10_ws", "p_AVG_C_IP_l10_ws_ct"] + iwp_ss_all_lag_cols
df_player1 = df_player1.drop(columns=iwp_ss_transient_cols)

# Updating Column Inclusion and Sequence
#df_player1 = df_player1 [["t_nm", "t_co", "t_GMT", "t_surf", "t_lvl", "t_draw_sz", "t_ind", "t_alt", "m_num", "m_date", "m_rd_num", "m_bestof", "m_t(m)", "m_outcome", "p_id", "p_nm", "p_rk", "p_rk_pts", "p_co", "p_ent", "p_hd", "p_ht", "p_age", "m_tot_pts", "p_pts_won%", "p_pts_won%_l60_tw", "p_pts_won%_l10_tw", "p_pts_won%_l60_tw_IO", "p_pts_won%_l10_tw_IO", "p_sv_pts_won%", "p_sv_pts_won%_l60_tw", "p_sv_pts_won%_l10_tw", "p_ret_pts_won%", "p_ret_pts_won%_l60_tw", "p_ret_pts_won%_l10_tw", "p_ace%_l60_tw", "p_ace%_l10_tw", "p_aced%_l60_tw", "p_aced%_l10_tw", "p_bp_save%_l60", "p_bp_save%_l10", "p_bp_conv%_l60", "p_bp_conv%_l10", "p_AVG_C_IP_NV", "p_AVG_C_IP_l60_tw", "p_AVG_C_IP_l10_tw", "p_PS_O_IP_NV", "p_PS_C_IP_NV", "opp_id", "Comment"]]

In [109]:
# 'p_AVG_C_IP_l60_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor-specific (IO) mean IMPLIED WIN PROBABILITY performance of PLAYER over the 60 matches PRIOR TO the match being predicted
# (computed below as sum / count of the non-null lags, i.e. each available lag counts equally)

# These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches is used as a replacement for match stats or rankings.
# Importantly, the closing lines from the match being predicted are NOT included in this predictive feature. We are ultimately interested in building a wagering model, and such usage would therefore represent data leakage (ie, a bet will have to be made before the closing line is available).

# Descending sort: within each (p_id, t_surf, t_ind) group the latest m_date / m_rd_num comes first,
# so shift(-k) pulls the value from k rows later in this order (the row itself is never included).
df_player1 = df_player1.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Build the grouped selection once and reuse it for all 60 lags instead of regrouping per column.
iwp_io_by_group = df_player1.groupby(['p_id','t_surf','t_ind'])['p_AVG_C_IP']
for iwp_io_lag in range(1, 61):
    # lag 1 -> "p_AVG_C_IP_IO_60", ..., lag 60 -> "p_AVG_C_IP_IO_1"
    df_player1["p_AVG_C_IP_IO_" + str(61 - iwp_io_lag)] = iwp_io_by_group.shift(-iwp_io_lag)

# sum() / count() skip NaN, so missing lags are ignored rather than interpolated, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so this feature will rarely be absent there (there is also a multi-year ramp-up phase whose matches are not modeled).
iwp_io_lag_cols = ["p_AVG_C_IP_IO_" + str(n) for n in range(60, 0, -1)]  # "..._IO_60" ... "..._IO_1"
iwp_io_lags = df_player1[iwp_io_lag_cols]
df_player1["p_AVG_C_IP_IO_l60_ws"] = iwp_io_lags.sum(axis=1)
df_player1["p_AVG_C_IP_IO_l60_ws_ct"] = iwp_io_lags.count(axis=1)
# ws / ws_ct is the plain mean of the available lags (NaN when no lag is available).
df_player1["p_AVG_C_IP_l60_tw_ss_IO"] = (df_player1["p_AVG_C_IP_IO_l60_ws"]/df_player1["p_AVG_C_IP_IO_l60_ws_ct"]).round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)
# The lag columns and the ws / ws_ct helpers stay in df_player1: the l10 IO cell reads lags 60..51 and then drops them all.

In [110]:
# 'p_AVG_C_IP_l10_tw_ss_IO'
# Provides time-weighted (TW), surface-specific (SS), indoor/outdoor-specific (IO) mean IMPLIED WIN PROBABILITY performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# (computed below as sum / count of the non-null values among IO lag columns 60..51)

# sum() / count() skip NaN, so missing lags are ignored rather than interpolated, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so this feature will rarely be absent there (there is also a multi-year ramp-up phase whose matches are not modeled).
iwp_io_all_lag_cols = ["p_AVG_C_IP_IO_" + str(n) for n in range(60, 0, -1)]
iwp_io_last10 = df_player1[iwp_io_all_lag_cols[:10]]  # "p_AVG_C_IP_IO_60" ... "p_AVG_C_IP_IO_51"
df_player1["p_AVG_C_IP_IO_l10_ws"] = iwp_io_last10.sum(axis=1)
df_player1["p_AVG_C_IP_IO_l10_ws_ct"] = iwp_io_last10.count(axis=1)
iwp_io_l10_mean = df_player1["p_AVG_C_IP_IO_l10_ws"] / df_player1["p_AVG_C_IP_IO_l10_ws_ct"]
df_player1["p_AVG_C_IP_l10_tw_ss_IO"] = iwp_io_l10_mean.round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)

# Remove the transient IO helpers: the l60 / l10 sums and counts plus all 60 IO lag columns.
iwp_io_transient_cols = ["p_AVG_C_IP_IO_l60_ws", "p_AVG_C_IP_IO_l60_ws_ct", "p_AVG_C_IP_IO_l10_ws", "p_AVG_C_IP_IO_l10_ws_ct"] + iwp_io_all_lag_cols
df_player1 = df_player1.drop(columns=iwp_io_transient_cols)

In [111]:
# 'p_AVG_C_IP_l60_tw_nss'
# Provides time-weighted (TW), NON-surface-specific (NSS), mean IMPLIED WIN PROBABILITY performance of PLAYER over the 60 matches PRIOR TO the match being predicted
# (computed below as sum / count of the non-null lags, i.e. each available lag counts equally)

# These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches is used as a replacement for match stats or rankings.
# Importantly, the closing lines from the match being predicted are NOT included in this predictive feature. We are ultimately interested in building a wagering model, and such usage would therefore represent data leakage (ie, a bet will have to be made before the closing line is available).

# Descending sort: within each p_id group the latest m_date / m_rd_num comes first,
# so shift(-k) pulls the value from k rows later in this order (the row itself is never included).
df_player1 = df_player1.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

# Build the grouped selection once and reuse it for all 60 lags instead of regrouping per column.
iwp_nss_by_player = df_player1.groupby(['p_id'])['p_AVG_C_IP']
for iwp_nss_lag in range(1, 61):
    # lag 1 -> "p_AVG_C_IP_60", ..., lag 60 -> "p_AVG_C_IP_1"
    df_player1["p_AVG_C_IP_" + str(61 - iwp_nss_lag)] = iwp_nss_by_player.shift(-iwp_nss_lag)

# sum() / count() skip NaN, so missing lags are ignored rather than interpolated, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few previously played matches will be filtered out,
# so this feature will rarely be absent there (there is also a multi-year ramp-up phase whose matches are not modeled).
iwp_nss_lag_cols = ["p_AVG_C_IP_" + str(n) for n in range(60, 0, -1)]  # "p_AVG_C_IP_60" ... "p_AVG_C_IP_1"
iwp_nss_lags = df_player1[iwp_nss_lag_cols]
df_player1["p_AVG_C_IP_l60_ws"] = iwp_nss_lags.sum(axis=1)
df_player1["p_AVG_C_IP_l60_ws_ct"] = iwp_nss_lags.count(axis=1)
# ws / ws_ct is the plain mean of the available lags (NaN when no lag is available).
df_player1["p_AVG_C_IP_l60_tw_nss"] = (df_player1["p_AVG_C_IP_l60_ws"]/df_player1["p_AVG_C_IP_l60_ws_ct"]).round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)
# The lag columns and the ws / ws_ct helpers stay in df_player1: the l10 NSS cell reads lags 60..51 and then drops them all.

In [112]:
# 'p_AVG_C_IP_l10_tw_nss'
# Provides time-weighted (TW), NON-surface-specific (NSS), mean IMPLIED WIN PROBABILITY performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# (computed below as sum / count of the non-null values among lag columns 60..51)

# These IWPs are derived from the AVERAGED CLOSING LINES across multiple sports books, with the vig removed. Thus, the 'wisdom of the markets' from past matches is used as a replacement for match stats or rankings.
# Importantly, the closing lines from the match being predicted are NOT included in this predictive feature. We are ultimately interested in building a wagering model, and such usage would therefore represent data leakage (ie, a bet will have to be made before the closing line is available).

# sum() / count() skip NaN, so missing lags are ignored rather than interpolated, which is preferred in metrics of this type.
# In the modeling stage, matches between players with few previously played matches will be filtered out,
# so this feature will rarely be absent there (there is also a multi-year ramp-up phase whose matches are not modeled).
iwp_nss_all_lag_cols = ["p_AVG_C_IP_" + str(n) for n in range(60, 0, -1)]
iwp_nss_last10 = df_player1[iwp_nss_all_lag_cols[:10]]  # "p_AVG_C_IP_60" ... "p_AVG_C_IP_51"
df_player1["p_AVG_C_IP_l10_ws"] = iwp_nss_last10.sum(axis=1)
df_player1["p_AVG_C_IP_l10_ws_ct"] = iwp_nss_last10.count(axis=1)
iwp_nss_l10_mean = df_player1["p_AVG_C_IP_l10_ws"] / df_player1["p_AVG_C_IP_l10_ws_ct"]
df_player1["p_AVG_C_IP_l10_tw_nss"] = iwp_nss_l10_mean.round(2)
#(ws = weighted sum; ws_ct = count of non-null lags; tw = time-weighted)

#Deleting the many transient columns (the l60 / l10 sums and counts plus all 60 lag columns)
iwp_nss_transient_cols = ["p_AVG_C_IP_l60_ws", "p_AVG_C_IP_l60_ws_ct", "p_AVG_C_IP_l10_ws", "p_AVG_C_IP_l10_ws_ct"] + iwp_nss_all_lag_cols
df_player1 = df_player1.drop(columns=iwp_nss_transient_cols)

In [113]:
# Print a summary of df_player1 (index, column count, dtypes, memory usage) once the transient lag columns have been dropped.
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 153 entries, t_id to p_AVG_C_IP_l10_tw_nss
dtypes: datetime64[ns](1), float64(117), int64(29), object(6)
memory usage: 67.1+ MB


### 4. Predictive Features: Retrospective, Accrual-Derived Fatigue and Stamina

In [114]:
# 'p_tot_time_l7d_tw'
# Player time-weighted (tw) total time spent on court playing top level ATP matches over the 7 days prior to a given match.
# Caveat: matches carried over across days (much more likely at a major) are credited to the day that match was initiated.
# Qualifiers and Lucky Losers are credited an estimated match duration (mean completed best-of-3 duration for the surface)
# for each qualifying round, to reflect the added workload of the qualifying process.
# TODO: eventually include data from qualifying tournaments directly; too much is currently missing from Jeff's files on
# these matches (will require resourceful scraping and/or painful manual input).

# Lists of potential decay weights to empirically test (list index = whole days elapsed since the earlier match)
#dw = [1, .5, .25, .125, .06, .03, .02, .01] # player "sheds" 1/2 of a match's residual fatigue per day elapsed since that match
dw = [1, .67, .45, .3, .2, .13, .09, .06] # player "sheds" 1/3 of a match's residual fatigue per day elapsed since that match
#dw = [1, .75, .56, .42, .32, .24, .18, .135] # player "sheds" 1/4 of a match's residual fatigue per day elapsed since that match


def add_p_tot_time_l7d_tw(player_df, decay_weights):
    """Return `player_df` sorted newest-first per player with a new 'p_tot_time_l7d_tw' column.

    For every row, the player's 10 preceding rows are inspected. A preceding match played exactly
    d days earlier (d = 0..7) contributes its 'm_t(m)' value times decay_weights[d]; anything older
    contributes nothing. Qualifiers / Lucky Losers additionally get estimated qualifying-round loads
    written into look-back slots that are still zero. The contributions are summed and rounded to 2 dp.

    Args:
        player_df: frame with columns 'p_id', 'm_date' (datetime), 'm_rd_num', 'm_t(m)', 'm_bestof',
            't_surf', 't_ind', 't_lvl', 'p_ent' and 'Comment'.
        decay_weights: sequence with at least 8 entries; entry d is the weight for a match d days ago.

    Returns:
        A new DataFrame (the input object is not modified) sorted by p_id, m_date, m_rd_num
        descending, with 'p_tot_time_l7d_tw' added and no transient columns left behind.

    Raises:
        KeyError: if a required column is missing.
        IndexError: if decay_weights has fewer than 8 entries.
    """
    n_prior = 10   # preceding matches examined per row (covers the unlikely case of two matches on one day)
    max_days = 7   # oldest day offset that still carries weight
    slots = range(1, n_prior + 1)
    date_cols = [f"m_date{k}" for k in slots]
    time_cols = [f"m_tot_time{k}" for k in slots]
    load_cols = [f"m_tot_time{k}_dw" for k in slots]

    # Newest row first within each player, so shift(-k) reads from the k-th preceding match.
    player_df = player_df.sort_values(by=['p_id', 'm_date', 'm_rd_num'], ascending=False)

    # Date and duration of each of the player's 10 preceding matches.
    for k, date_col, time_col in zip(slots, date_cols, time_cols):
        player_df[date_col] = player_df.groupby(['p_id'])['m_date'].shift(-k)
        player_df[time_col] = player_df.groupby(['p_id'])['m_t(m)'].shift(-k)

    # Decay-weight each preceding match by how many whole days before the current match it was played.
    # Gaps that are not exactly 0..7 days leave the slot as NaN.
    for date_col, time_col, load_col in zip(date_cols, time_cols, load_cols):
        elapsed = player_df["m_date"] - player_df[date_col]
        for days_ago in range(max_days + 1):
            player_df.loc[elapsed == pd.Timedelta(days=days_ago), load_col] = player_df[time_col] * decay_weights[days_ago]

    # NaN -> 0 so the "slot is still empty" checks below are plain equality tests.
    # Note: a preceding match inside the window whose duration is missing also ends up as 0 here.
    player_df[load_cols] = player_df[load_cols].fillna(0)

    # Surface / venue masks (t_surf: 1 = clay, 2 = hard, 3 = grass; t_ind: 1 = indoor, 0 = outdoor).
    on_clay = player_df["t_surf"] == 1
    on_grass = player_df["t_surf"] == 3
    on_hard = player_df["t_surf"] == 2
    indoors = player_df["t_ind"] == 1
    outdoors = player_df["t_ind"] == 0

    # Qualifying matches are best of 3, so the per-surface estimate is the mean duration of completed best-of-3 matches.
    completed_bo3 = (player_df["m_bestof"] == 3) & (player_df["Comment"] == "Completed")
    est_clay = player_df.loc[completed_bo3 & on_clay, "m_t(m)"].mean()
    est_grass = player_df.loc[completed_bo3 & on_grass, "m_t(m)"].mean()
    est_hard_out = player_df.loc[completed_bo3 & on_hard & outdoors, "m_t(m)"].mean()
    est_hard_in = player_df.loc[completed_bo3 & on_hard & indoors, "m_t(m)"].mean()

    # For player entry type ("p_ent"), 2 is Qualifier and 2.5 is 'Lucky Loser'; for tourney level ("t_lvl"), 4 is Grand Slam.
    came_through_qualies = (player_df["p_ent"] == 2) | (player_df["p_ent"] == 2.5)
    at_slam = player_df["t_lvl"] == 4
    not_slam = player_df["t_lvl"] != 4

    def impute_qualifying_load(rows, est_minutes, slot_offsets):
        """For `rows`, write est_minutes * weight into each listed look-back slot that is still 0."""
        for slot, days_ago in slot_offsets:
            load_col = f"m_tot_time{slot}_dw"
            player_df.loc[rows & (player_df[load_col] == 0), load_col] = est_minutes * decay_weights[days_ago]

    # Non-Grand Slams: 2 qualifying matches, the last typically 1 day before the 1st round, and main-draw
    # play every day (5 or 6 rounds max). In round r the two qualifying matches are therefore taken to be
    # r and r + 1 days back, and are stored in look-back slots r and r + 1.
    for surface_rows, est_minutes in (
        (on_clay, est_clay),
        (on_grass, est_grass),
        (on_hard & indoors, est_hard_in),
        (on_hard & outdoors, est_hard_out),
    ):
        for rd in range(1, 7):
            impute_qualifying_load(
                not_slam & (player_df["m_rd_num"] == rd) & surface_rows & came_through_qualies,
                est_minutes,
                [(rd, rd), (rd + 1, rd + 1)],
            )

    # Grand Slams: 3 qualifying matches (best of 3, so the same estimates apply), the last typically 3 days
    # before the 1st round; main-draw play is typically every other day. Entries are (slot, days ago).
    # NOTE(review): with play every other day, round 3 would put the last qualifying match 7 days back
    # (slot 3 -> 7 days, nothing in slot 4); the offsets below are kept exactly as originally written - confirm intent.
    slam_slot_offsets = {
        1: [(1, 3), (2, 4), (3, 5)],
        2: [(2, 5), (3, 6), (4, 7)],
        3: [(3, 6), (4, 7)],
    }
    # No indoor hard courts for Slams (ignoring retractable roofs), so all hard-court Slam rows use the outdoor estimate.
    for surface_rows, est_minutes in (
        (on_clay, est_clay),
        (on_grass, est_grass),
        (on_hard, est_hard_out),
    ):
        for rd, slot_offsets in slam_slot_offsets.items():
            impute_qualifying_load(
                at_slam & (player_df["m_rd_num"] == rd) & surface_rows & came_through_qualies,
                est_minutes,
                slot_offsets,
            )

    # Sum across the look-back slots to get the weighted "time load" per player prior to a given match.
    player_df["p_tot_time_l7d_tw"] = player_df[load_cols].sum(axis=1).round(2)

    # Drop the transient look-back columns.
    return player_df.drop(time_cols + date_cols + load_cols, axis=1)


df_player1 = add_p_tot_time_l7d_tw(df_player1, dw)

In [115]:
# 'p_tot_pts_l7d_tw'
# Player time-weighted (tw) total points played in top-level ATP matches over the 7 days prior to a given match.
# Caveat: matches carried over across days (much more likely at a major) are credited to the day the match was initiated.
# Qualifiers and Lucky Losers are credited with a surface-specific mean points total for each assumed qualifying match,
# to reflect the added workload of the qualifying process.
# I'd like to eventually include data from qualifying tournaments directly, but too much data is missing from Jeff's files
# on these matches at this point. Will require resourceful scraping and/or painful manual input.

# Candidate decay weights to test empirically; dw[d] is the weight applied to a match played d days earlier (d = 0..7)
#dw = [1, .5, .25, .125, .06, .03, .02, .01] #player "sheds" 1/2 of residual fatigue from a given match per day past that match he is
dw = [1, .67, .45, .3, .2, .13, .09, .06] #player "sheds" 1/3 of residual fatigue from a given match per day past that match he is
#dw = [1, .75, .56, .42, .32, .24, .18, .135] #player "sheds" 1/4 of residual fatigue from a given match per day past that match he is


def _add_l7d_pts_load(player_df, decay_weights):
    """Add 'p_tot_pts_l7d_tw': decay-weighted points played in the 7 days before each match.

    Parameters
    ----------
    player_df : pandas.DataFrame
        One row per player per match. Columns read: 'p_id', 'm_date', 'm_rd_num', 'm_tot_pts',
        'm_bestof', 'Comment', 't_surf', 't_ind', 't_lvl', 'p_ent'. The frame passed in is not modified.
        Assumes 'm_date' is datetime-like, so date differences can be compared against "N days".
    decay_weights : sequence of float
        decay_weights[d] weights a match played d days before the current one.
        Indices 0..7 are read, so at least 8 entries are required.

    Returns
    -------
    pandas.DataFrame
        The frame sorted by 'p_id', 'm_date', 'm_rd_num' (all descending) with 'p_tot_pts_l7d_tw'
        added (rounded to 2 decimals). All helper columns are dropped before returning.
    """
    # Most recent match first within each player, so shift(-k) reaches the k-th prior match
    out = player_df.sort_values(by=['p_id', 'm_date', 'm_rd_num'], ascending=False)

    # Look back over the player's past 10 matches (covers the highly unlikely event that a player
    # had multiple days in the 7-day span with two matches)
    slots = range(1, 11)
    date_cols = [f"m_date{k}" for k in slots]
    pts_cols = [f"m_tot_pts{k}" for k in slots]
    pts_dw_cols = [f"m_tot_pts{k}_dw" for k in slots]

    for k, date_col, pts_col in zip(slots, date_cols, pts_cols):
        out[date_col] = out.groupby(['p_id'])['m_date'].shift(-k)
        out[pts_col] = out.groupby(['p_id'])['m_tot_pts'].shift(-k)

    # For each prior match, work out how many days ago it was and apply the matching decay weight.
    # Matches more than 7 days back match none of the comparisons and stay NaN.
    for date_col, pts_col, dw_col in zip(date_cols, pts_cols, pts_dw_cols):
        days_since = out["m_date"] - out[date_col]
        for days_ago, weight in enumerate(decay_weights):
            out.loc[days_since == f"{days_ago} days", dw_col] = out[pts_col] * weight

    # Filling NaNs with 0s (helps avoid annoying syntax limitations with Booleans and NaNs)
    out[pts_dw_cols] = out[pts_dw_cols].fillna(0)

    # For Qualifiers (and Lucky Losers), an estimated points load from qualifying matches is added before summing,
    # because qualifying matches are not currently captured in the dataframe.
    # Qualies are best of 3 (at Grand Slams too), so the base estimate is the mean points total of completed
    # best-of-3 matches, per surface (and indoor/outdoor for hard).
    bo3_completed = (out["m_bestof"] == 3) & (out["Comment"] == "Completed")
    on_clay = out["t_surf"] == 1
    on_hard = out["t_surf"] == 2
    on_grass = out["t_surf"] == 3
    indoors = out["t_ind"] == 1
    outdoors = out["t_ind"] == 0

    mean_pts_clay = out.loc[bo3_completed & on_clay, "m_tot_pts"].mean()
    mean_pts_grass = out.loc[bo3_completed & on_grass, "m_tot_pts"].mean()
    mean_pts_hard_o = out.loc[bo3_completed & on_hard & outdoors, "m_tot_pts"].mean()
    mean_pts_hard_i = out.loc[bo3_completed & on_hard & indoors, "m_tot_pts"].mean()

    # For player entry type ("p_ent"), 2 is Qualifier and 2.5 is 'Lucky Loser'; for tourny level ("t_lvl"), 4 is Grand Slam
    is_qualifier = (out["p_ent"] == 2) | (out["p_ent"] == 2.5)
    is_slam = out["t_lvl"] == 4

    non_slam_surfaces = [
        (on_clay, mean_pts_clay),
        (on_grass, mean_pts_grass),
        (on_hard & indoors, mean_pts_hard_i),
        (on_hard & outdoors, mean_pts_hard_o),
    ]
    # No indoor hard courts for Slams (well, ignoring retractable roofs at least), so all hard-court
    # Slam rows use the outdoor hard mean regardless of 't_ind'
    slam_surfaces = [
        (on_clay, mean_pts_clay),
        (on_grass, mean_pts_grass),
        (on_hard, mean_pts_hard_o),
    ]

    # {main-draw round: [(prior-match slot to fill, decay index to apply), ...]}
    # Non-Grand Slams: 2 qualifying matches; final round of qualies is typically 1 day before the first day
    # of the 1st round, and players play every day during the main draw; 5 or 6 rounds max
    # (highest 3 numbers are QF/SF/F). In round r the qualifying matches sit in slots r and r+1.
    non_slam_fills = {rd: [(rd, rd), (rd + 1, rd + 1)] for rd in range(1, 7)}
    # Grand Slams: 3 qualifying matches; final round of qualies is typically 3 days before the first day
    # of the 1st round, and players typically play every other day; 7 rounds max (5-7 are QF/SF/F).
    # NOTE(review): at that cadence round 3 falls ~7 days after the last qualifying match, which would
    # suggest decay index 7 for slot 3 only; the (6, 7) offsets below are kept unchanged because the
    # time-load feature above uses the same ones -- confirm intent.
    slam_fills = {
        1: [(1, 3), (2, 4), (3, 5)],
        2: [(2, 5), (3, 6), (4, 7)],
        3: [(3, 6), (4, 7)],
    }

    for at_level, surfaces, fills_by_round in (
        (~is_slam, non_slam_surfaces, non_slam_fills),
        (is_slam, slam_surfaces, slam_fills),
    ):
        for rd, fills in fills_by_round.items():
            qualifier_in_round = at_level & is_qualifier & (out["m_rd_num"] == rd)
            for slot, dw_idx in fills:
                dw_col = f"m_tot_pts{slot}_dw"
                for on_surface, surf_mean in surfaces:
                    # Only fill slots that hold no real match load, so recorded matches are never overwritten
                    slot_empty = out[dw_col] == 0
                    out.loc[qualifier_in_round & on_surface & slot_empty, dw_col] = surf_mean * decay_weights[dw_idx]

    # Summing across the weighted slot columns to get the weighted "pts load" per player prior to a given match
    out["p_tot_pts_l7d_tw"] = out[pts_dw_cols].sum(axis=1).round(2)

    # Dropping the many transient columns
    return out.drop(columns=pts_cols + date_cols + pts_dw_cols)


df_player1 = _add_l7d_pts_load(df_player1, dw)

In [116]:
# 'p_matches_ss'
# Number of matches in the overall sample that a given player has played PER SURFACE (clay, hard, grass) PRIOR TO a given match
# SS = Surface selective

# Sort newest-first, then flip the frame so every (player, surface) group runs oldest -> newest for the running count
surface_sort_cols = ['p_id', 't_surf', 'm_date', 'm_rd_num']
df_player1 = df_player1.sort_values(by=surface_sort_cols, ascending=False).iloc[::-1]

# Running row count per (player, surface); shift(1) drops the match at hand so only PRIOR matches are counted.
# The 50000-row window is intended to be larger than any group, i.e. it acts as an expanding window.
prior_surface_matches = df_player1.groupby(['p_id', 't_surf'])['p_id'].transform(
    lambda ids: ids.rolling(window=50000, min_periods=1).count().shift(1)
)

# The first match of a player on a surface has no prior rows (NaN); it becomes 1 to avoid divide-by-zero issues
# down the road (per the original note, these matches are filtered out before modeling anyhow)
df_player1['p_matches_ss'] = prior_surface_matches.fillna(1)

# Flip back to the newest-first ordering
df_player1 = df_player1.iloc[::-1]

In [117]:
# 'p_matches_nss'
# Number of matches in the overall sample that a given player has played ACROSS SURFACES PRIOR TO a given match
# NSS = non-surface selective

# Sort newest-first, then flip the frame so every player's rows run oldest -> newest for the running count
player_sort_cols = ['p_id', 'm_date', 'm_rd_num']
df_player1 = df_player1.sort_values(by=player_sort_cols, ascending=False).iloc[::-1]

# Running row count per player; shift(1) drops the match at hand so only PRIOR matches are counted.
# The 50000-row window is intended to be larger than any group, i.e. it acts as an expanding window.
prior_matches = df_player1.groupby(['p_id'])['p_id'].transform(
    lambda ids: ids.rolling(window=50000, min_periods=1).count().shift(1)
)

# A player's first match in the sample has no prior rows (NaN); it becomes 1 to avoid divide-by-zero issues
# down the road (per the original note, these matches are filtered out before modeling anyhow)
df_player1['p_matches_nss'] = prior_matches.fillna(1)

# Flip back to the newest-first ordering
df_player1 = df_player1.iloc[::-1]

In [118]:
# 'p_body_battery_t_tw' and 'p_body_battery_pts_tw'
# Each integrates a "stamina" term and a "fatigue" term into a single feature.
# It's a little counterintuitive, but a higher number indicates more "wear" on the battery.

# Stamina term shared by both features: the player's prior match count across surfaces, taken to the 4th root
# (the 4th root was chosen based on some prediction quality feedback from a simple (linear) model)
stamina_term = df_player1["p_matches_nss"] ** (1/4)

# Version using time-weighted (tw) match time over the last 7 days as the fatigue proxy
df_player1["p_body_battery_t_tw"] = (df_player1["p_tot_time_l7d_tw"] / stamina_term).round(2)

# Version using time-weighted (tw) points played over the last 7 days as the fatigue proxy
df_player1["p_body_battery_pts_tw"] = (df_player1["p_tot_pts_l7d_tw"] / stamina_term).round(2)

In [119]:
# 'p_surf_chg'
# 1 means the player has changed (chg) surfaces AT LEAST ONCE within their past 3 matches (before the one at hand),
# i.e. the surface of the match at hand differs from the surface of at least one of those matches; 0 means short term surface 'continuity'
# Does NOT count an indoor/outdoor switch (on the same surface) as a surface switch (only 't_surf' is compared)
df_player1 = df_player1.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

# The frame is newest-first within each player, so shift(-k) pulls the surface of the k-th previous match
surf_by_player = df_player1.groupby(['p_id'])['t_surf']
current_surf = df_player1["t_surf"]

surf_changed = None
for lag in (1, 2, 3):
    prior_surf = surf_by_player.shift(-lag)
    # Only a KNOWN prior surface can register a change. In pandas `NaN != x` is True, so without the notna() guard
    # every match with fewer than 3 prior matches would be flagged as a surface change.
    differs = prior_surf.notna() & (prior_surf != current_surf)
    surf_changed = differs if surf_changed is None else (surf_changed | differs)

# Assign the whole column at once (float 1.0 / 0.0, as before) so a re-run of this cell cannot keep stale values
df_player1["p_surf_chg"] = surf_changed.astype(float)

In [120]:
# 'p_tz_chg'
# Number of time zones (tz) between the site of the match at hand and the site of the player's last (tour level) match,
# counted only when that last match was within the previous 4 days; 0 otherwise.
df_player1 = df_player1.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

# The frame is newest-first within each player, so shift(-1) pulls values from the player's previous match
rows_by_player = df_player1.groupby(['p_id'])
prev_gmt_offset = rows_by_player['t_GMT_diff'].shift(-1)
prev_match_date = rows_by_player['m_date'].shift(-1)

# False when there is no previous match (the date difference is NaT)
played_recently = (df_player1["m_date"] - prev_match_date) <= "4 days"
tz_distance = abs(df_player1["t_GMT_diff"] - prev_gmt_offset)

# Keep the distance only for recent previous matches; everything else (including no previous match) is 0
df_player1["p_tz_chg"] = tz_distance.where(played_recently).fillna(0)

In [121]:
#df_player1.to_csv('../data/df_player1.csv', index=False)

### 5. Predictive Features: Retrospective, Accrual-Derived Player Head-to-Heads vs Match Opponent

In [122]:
# Surface-specific (SS) head-to-head features: for each match, the player's record against THAT opponent on THAT surface,
# using every earlier match in the sample.
h2h_ss_sort_cols = ['p_id', 't_surf', 'm_date', 'm_rd_num']
h2h_ss_group_cols = ['p_id', 'opp_id', 't_surf']

# 'p_H2H_w_ss'
# Past head-to-head wins vs the opponent on the same surface (0 when there is no earlier meeting)

# Newest-first sort followed by a flip gives oldest -> newest order within each group
df_player1 = df_player1.sort_values(by=h2h_ss_sort_cols, ascending=False).iloc[::-1]
# Running sum of outcomes; shift(1) excludes the match at hand
prior_h2h_wins_ss = df_player1.groupby(h2h_ss_group_cols)['m_outcome'].transform(
    lambda outcomes: outcomes.rolling(window=50000, min_periods=1).sum().shift(1)
)
df_player1['p_H2H_w_ss'] = prior_h2h_wins_ss.fillna(0)
df_player1 = df_player1.iloc[::-1]

# 'p_H2H_tot_pts_won%_ss'
# Past mean PTS WON% vs the opponent on the same surface (left as NaN when there is no earlier meeting)

df_player1 = df_player1.sort_values(by=h2h_ss_sort_cols, ascending=False).iloc[::-1]
# Running mean of points-won%; shift(1) excludes the match at hand
prior_h2h_pts_ss = df_player1.groupby(h2h_ss_group_cols)['p_tot_pts_won%'].transform(
    lambda pts_won: pts_won.rolling(window=50000, min_periods=1).mean().shift(1)
)
df_player1['p_H2H_tot_pts_won%_ss'] = prior_h2h_pts_ss.round(2)
df_player1 = df_player1.iloc[::-1]

In [123]:
# Non-surface-specific (NSS) head-to-head features: for each match, the player's record against THAT opponent
# across all surfaces, using every earlier match in the sample.
h2h_nss_sort_cols = ['p_id', 'm_date', 'm_rd_num']
h2h_nss_group_cols = ['p_id', 'opp_id']

# 'p_H2H_w_nss'
# Past head-to-head wins vs the opponent regardless of surface (0 when there is no earlier meeting)

# Newest-first sort followed by a flip gives oldest -> newest order within each group
df_player1 = df_player1.sort_values(by=h2h_nss_sort_cols, ascending=False).iloc[::-1]
# Running sum of outcomes; shift(1) excludes the match at hand
prior_h2h_wins_nss = df_player1.groupby(h2h_nss_group_cols)['m_outcome'].transform(
    lambda outcomes: outcomes.rolling(window=50000, min_periods=1).sum().shift(1)
)
df_player1['p_H2H_w_nss'] = prior_h2h_wins_nss.fillna(0)
df_player1 = df_player1.iloc[::-1]

# 'p_H2H_tot_pts_won%_nss'
# Past mean PTS WON% vs the opponent regardless of surface (left as NaN when there is no earlier meeting)

df_player1 = df_player1.sort_values(by=h2h_nss_sort_cols, ascending=False).iloc[::-1]
# Running mean of points-won%; shift(1) excludes the match at hand
prior_h2h_pts_nss = df_player1.groupby(h2h_nss_group_cols)['p_tot_pts_won%'].transform(
    lambda pts_won: pts_won.rolling(window=50000, min_periods=1).mean().shift(1)
)
df_player1['p_H2H_tot_pts_won%_nss'] = prior_h2h_pts_nss.round(2)
df_player1 = df_player1.iloc[::-1]

In [124]:
# Sanity check: print the index range, column count, dtype breakdown and memory usage of df_player1
df_player1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 165 entries, t_id to p_H2H_tot_pts_won%_nss
dtypes: datetime64[ns](1), float64(129), int64(29), object(6)
memory usage: 72.3+ MB


In [125]:
# Checkpoint: write df_player1 to CSV (row index omitted) under ../data
df_player1.to_csv('../data/df_player1_no_dtw.csv', index=False)

### 6. Predictive Features: Strength of Schedule (SOS) Calculation and Application to Stats-Derived Features

Above we computed a number of retrospective, (mostly) time-decay weighted, averaged stats on a surface-specific basis (and a few also on a non-surface-specific basis) per player, per match being predicted. SOS adjustment corrects these stats for the aggregate overall performance of the opponents faced over the stretch of matches from which those features were generated. For example, a player winning 60% of their serve points against a schedule of opponents who had historically lost 65% of points on their opponents' serve is not as strong as a player winning 60% of their serve points against a schedule of opponents who had historically lost 55% of their opponents' serve points. SOS adjustment therefore ensures that the latter average serve performance is rated higher than the former.

In [126]:
# For the SOS calculations, every row gets one 'p_opp_<feature>' column per relevant 'p_<feature>' column, holding the
# OPPONENT's value of that feature (past-60 or past-10 match version) for the same match to be predicted.
# NSS = non-surface specific; SS = surface-specific; TW = time-weighted


def _opponent_feature_names():
    """Return the feature names (without the 'p_' prefix) that get an opponent column.

    Names follow '<stat>_<window>_tw_<variant>' and are returned in the order in
    which the opponent columns are appended: stat by stat, variant by variant,
    'l60' before 'l10'.
    """
    ss_variants = ("ss", "ss_IO")
    stat_variants = [
        ("tot_pts_won%", ("ss", "ss_IO", "nss", "ss_comp", "ss_IO_comp")),
        ("1st_sv%", ss_variants),
        ("1st_sv%_yielded", ss_variants),
        ("sv_pts_won%", ss_variants),
        ("1st_sv_pts_won%", ss_variants),
        ("2nd_sv_pts_won%", ss_variants),
        ("ret_pts_won%", ss_variants),
        ("1st_ret_pts_won%", ss_variants),
        ("2nd_ret_pts_won%", ss_variants),
        ("ace%", ss_variants),
        ("aced%", ss_variants),
        ("df%", ss_variants),
        ("df_induce%", ss_variants),
        ("bp_save%", ss_variants),
        ("bp_conv%", ss_variants),
        ("AVG_C_IP", ("ss", "ss_IO", "nss")),
    ]
    return [
        f"{stat}_{window}_tw_{variant}"
        for stat, variants in stat_variants
        for variant in variants
        for window in ("l60", "l10")
    ]


def _attach_opponent_features(frame, features):
    """Add a 'p_opp_<feature>' column to ``frame`` for every name in ``features``.

    ``frame`` is modified in place and also returned. Within each 'm_num' group
    the opponent's value is taken from the next row (shift(-1)); where that is
    missing, it is taken from the previous row (shift(1)).
    NOTE(review): this presumes exactly two rows per 'm_num' (one per player) - confirm.
    """
    for feature in features:
        own_by_match = frame.groupby(['m_num'])["p_" + feature]
        frame["p_opp_" + feature] = own_by_match.shift(-1).fillna(own_by_match.shift(1))
    return frame


# sort_values returns a new frame, so df_player1 itself is left untouched
df_player2 = _attach_opponent_features(
    df_player1.sort_values(by=['m_num','m_outcome'], ascending = False),
    _opponent_feature_names(),
)

In [127]:
# Print a structural summary of df_player2 (row count, column range, dtype tally, memory use).
df_player2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 28533 to 28534
Columns: 237 entries, t_id to p_opp_AVG_C_IP_l10_tw_nss
dtypes: datetime64[ns](1), float64(201), int64(29), object(6)
memory usage: 103.6+ MB


In [128]:
#df_player2.to_csv('../data/df_player2.csv', index=False)

In [129]:
# 'p_tot_pts_won%_l60_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), surface-specific (SS) TOTAL POINTS WON performance of the PLAYER
# over the 60 matches PRIOR TO the match being predicted.
# Naming: ws = weighted sum; tw = time-weighted. In this cell the mean over the prior matches is
# computed as sum / count of the non-missing lag values.

# Newest match first within each player/surface, so a negative shift reaches back to earlier matches.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Step 1: surface-specific TOTAL POINTS WON performance of the player's OPPONENTS over the maximum
# interval of interest (60 matches) prior to the match being predicted.
# Column suffix _60 is the most recent prior match (shift(-1)); suffix _1 is the oldest (shift(-60)).
sos_lag_cols = []
for sos_lag in range(1, 61):
    sos_lag_col = f"SOS_tw_l60_opp_tot_pts_won%_{61 - sos_lag}"
    df_player2[sos_lag_col] = df_player2.groupby(['p_id','t_surf'])['p_opp_tot_pts_won%_l60_tw_ss'].shift(-sos_lag)
    sos_lag_cols.append(sos_lag_col)

# Step 2: mean opponent performance over the available lags.
# sum()/count() skip NaN instead of interpolating. In the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so very few modeled matches will lack
# this feature (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches
# are not used for modeling).
sos_lag_values = df_player2[sos_lag_cols]
df_player2["SOS_tw_l60_opp_tot_pts_won%_ws"] = sos_lag_values.sum(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%_ws_ct"] = sos_lag_values.count(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%"] = (df_player2["SOS_tw_l60_opp_tot_pts_won%_ws"]/df_player2["SOS_tw_l60_opp_tot_pts_won%_ws_ct"]).round(2)

# EY = EXPECTED YIELD: % total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches
# on this surface is expected to yield.
df_player2["EY"] = 100-df_player2["SOS_tw_l60_opp_tot_pts_won%"]

# Step 3: mean % total pts performance (l60_tw_ss) across ALL players per surface, expressed as the % of
# total pts the field ALLOWS on average (100 - mean). Needed to calibrate the schedule actually faced by a player.
# Because the game changes over time, the mean is taken per surface over 2-year intervals (2009 by itself at the
# front end; that year is only used for retrospective stats accrual [and only for clay], not for modeling), and
# each interval is matched to the same match-date range during the SOS adjustment.
# t_surf 1 = clay, 2 = hard. Each interval is (exclusive start, inclusive end).
# Values noted in the original cell - clay: (49.21161401098898, 49.72861111111113, 49.76280884676153, 49.90836550836534, 49.95065056360718, 49.9303169469599)
#                                     hard: (49.34777266288949, 49.867599932249114, 49.97316354344139, 49.93523910358221, 50.05344892061886, 49.94715427793844)
sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
sos_windows = []  # (row mask, field mean) per surface/interval: clay intervals first, then hard
for sos_surf in (1, 2):
    for sos_start, sos_end in sos_periods:
        sos_mask = (df_player2['t_surf'] == sos_surf) & (df_player2['m_date'] > sos_start) & (df_player2['m_date'] <= sos_end)
        sos_field_mean = 100 - (df_player2.loc[sos_mask, 'p_tot_pts_won%_l60_tw_ss'].mean())
        sos_windows.append((sos_mask, sos_field_mean))

# Step 4: scale the player's actual performance over the last 60 by the ratio of the field mean to the
# expected yield of the schedule of opponents (their aggregate performance over THEIR l60 prior to facing the player).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or
# blunted versions attempted.
for sos_mask, sos_field_mean in sos_windows:
    df_player2.loc[sos_mask, "p_tot_pts_won%_l60_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss"])*(sos_field_mean/df_player2["EY"])).round(2)

# Cleanup: drop the intermediates and lag columns _50 ... _1. Lag columns _60 ... _51 stay in the frame
# because the l10 SOS cell that follows reads them.
df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_tot_pts_won%_ws", "SOS_tw_l60_opp_tot_pts_won%_ws_ct", "SOS_tw_l60_opp_tot_pts_won%"] + sos_lag_cols[10:], axis=1)
del sos_lag_cols, sos_lag, sos_lag_col, sos_lag_values, sos_periods, sos_windows, sos_surf, sos_start, sos_end, sos_mask, sos_field_mean

In [130]:
# 'p_tot_pts_won%_l10_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), surface-specific (SS) TOTAL POINTS WON performance of the PLAYER
# over the 10 matches PRIOR TO the match being predicted.
# Naming: ws = weighted sum; tw = time-weighted.

# Lag columns _60 ... _51 are the ten most recent prior matches (shift(-1) ... shift(-10)) kept from the l60 cell.
sos_l10_lag_cols = [f"SOS_tw_l60_opp_tot_pts_won%_{sos_l10_k}" for sos_l10_k in range(60, 50, -1)]
sos_l10_lag_values = df_player2[sos_l10_lag_cols]

# sum()/count() skip NaN instead of interpolating. In the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so very few modeled matches will lack
# this feature (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches
# are not used for modeling).
df_player2["SOS_tw_l10_opp_tot_pts_won%_ws"] = sos_l10_lag_values.sum(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%_ws_ct"] = sos_l10_lag_values.count(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%"] = (df_player2["SOS_tw_l10_opp_tot_pts_won%_ws"]/df_player2["SOS_tw_l10_opp_tot_pts_won%_ws_ct"]).round(2)
del sos_l10_lag_cols, sos_l10_lag_values

# EY = EXPECTED YIELD: % total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches
# on this surface is expected to yield.
df_player2["EY"] = 100-df_player2["SOS_tw_l10_opp_tot_pts_won%"]

# Mean % total pts performance (l10_tw_ss) across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean per durface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)

mean_clay_SOS_1 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_2 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_3 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_4 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_5 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_6 = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# (49.23693681318672, 49.72714330808086, 49.80182938388632, 49.902046332046375, 49.950579710144865, 49.87507438551096)
mean_hard_SOS_1 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_2 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_3 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_4 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_5 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_6 = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_tot_pts_won%_l10_tw_ss'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# (49.39852691218135, 49.86438346883477, 49.971897785349526, 49.903541556378926, 50.00057793642685, 49.939435278108455)

# Puts together the above- factors the player's actual performance over the last 10 by schedule of opponents' aggregrate performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_1/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_2/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_3/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_4/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_5/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_clay_SOS_6/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_1/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_2/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_3/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_4/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_5/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss"])*(mean_hard_SOS_6/df_player2["EY"])).round(2) 

del mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6, mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_tot_pts_won%_ws", "SOS_tw_l10_opp_tot_pts_won%_ws_ct", "SOS_tw_l10_opp_tot_pts_won%","SOS_tw_l60_opp_tot_pts_won%_60", "SOS_tw_l60_opp_tot_pts_won%_59", "SOS_tw_l60_opp_tot_pts_won%_58", "SOS_tw_l60_opp_tot_pts_won%_57", "SOS_tw_l60_opp_tot_pts_won%_56", "SOS_tw_l60_opp_tot_pts_won%_55", "SOS_tw_l60_opp_tot_pts_won%_54", "SOS_tw_l60_opp_tot_pts_won%_53", "SOS_tw_l60_opp_tot_pts_won%_52", "SOS_tw_l60_opp_tot_pts_won%_51"],axis=1)

In [131]:
# Inspect the frame's row count, column count and dtypes after the l10 SOS-adjusted feature was added
df_player2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 239 entries, t_id to p_tot_pts_won%_l10_tw_ss_SOS_adj
dtypes: datetime64[ns](1), float64(203), int64(29), object(6)
memory usage: 107.0+ MB


In [132]:
# 'p_tot_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS),
# indoor/outdoor-specific (IO) TOTAL POINTS WON % over the 60 matches PRIOR TO the match being predicted.

# Newest-first ordering inside each player / surface / indoor-flag group.
_io_group_keys = ['p_id', 't_surf', 't_ind']
df_player2 = df_player2.sort_values(by=_io_group_keys + ['m_date', 'm_rd_num'], ascending = False)

# Opponent IO performance for each of the up-to-60 previous matches in the same group.
# With the descending sort, shift(-lag) pulls the value from the row `lag` positions further down,
# i.e. the lag-th earlier match; lag 1 is stored as ..._IO_60 and lag 60 as ..._IO_1.
# Rows with fewer than `lag` earlier matches in their group get NaN.
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_tot_pts_won%_IO_{61 - _lag}"] = (
        df_player2.groupby(_io_group_keys)['p_opp_tot_pts_won%_l60_tw_ss_IO'].shift(-_lag)
    )
del _io_group_keys, _lag

# Row-wise sum and non-null count over the 60 IO opponent lag columns: both skip NaN, so missing lags
# are ignored rather than interpolated. Author's note: in the modeling stage, matches between players
# with few prior matches on a surface are filtered out, and the early ramp-up years only accrue
# retrospective stats, so very few modeled matches should lack this feature.
# (ws = weighted sum; tw = time-weighted)
_io_lag_cols = [f"SOS_tw_l60_opp_tot_pts_won%_IO_{_k}" for _k in range(60, 0, -1)]
_io_lags = df_player2[_io_lag_cols]
df_player2["SOS_tw_l60_opp_tot_pts_won%_IO_ws"] = _io_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%_IO_ws_ct"] = _io_lags.count(axis=1)

# Mean opponent performance = sum / count (NaN when no lag is available), rounded to 2 decimals.
_io_ws = df_player2["SOS_tw_l60_opp_tot_pts_won%_IO_ws"]
_io_ws_ct = df_player2["SOS_tw_l60_opp_tot_pts_won%_IO_ws_ct"]
df_player2["SOS_tw_l60_opp_tot_pts_won%_IO"] = (_io_ws / _io_ws_ct).round(2)

# EXPECTED YIELD (EY): % of total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last
# 60 matches on this surface, specifically indoor or outdoor, is expected to yield.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_tot_pts_won%_IO"]
del _io_lag_cols, _io_lags, _io_ws, _io_ws_ct

# Field means used to calibrate the schedule a player actually faced before the match being predicted.
# Each value is 100 minus the mean of 'p_tot_pts_won%_l60_tw_ss' over the rows of one surface
# (t_surf 1 = clay, 2 = hard), one indoor flag (t_ind 1 -> "...i" names, t_ind 0 -> "...o" names) and
# one date window, i.e. the % of total pts the field ALLOWS on average.
# Because the game changes over time the mean is taken over 2-year intervals; 2009 stands alone at the
# front (author's note: that year is only used for retrospective stats accrual, not for modeling).
# These should all be 50%; computing them from the sample doubles as a QC.
# NOTE(review): the mean is taken on the non-IO column 'p_tot_pts_won%_l60_tw_ss' although the feature
# being adjusted is the IO-specific one - confirm this is intended.


def _field_allows_pct(surf_code, indoor_flag, after, through):
    """Return 100 - mean('p_tot_pts_won%_l60_tw_ss') for one surface / indoor flag with after < m_date <= through."""
    in_scope = (
        (df_player2['t_surf'] == surf_code)
        & (df_player2['t_ind'] == indoor_flag)
        & (df_player2['m_date'] > after)
        & (df_player2['m_date'] <= through)
    )
    return 100 - df_player2.loc[in_scope, 'p_tot_pts_won%_l60_tw_ss'].mean()


# Date windows: exclusive start, inclusive end.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Clay, t_ind == 1
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = [
    _field_allows_pct(1, 1, _after, _through) for _after, _through in _sos_windows
]
# Recorded on a previous run: (49.999999999999986, 50.000000000000014, 49.99999999999999, 50.000000000000014, 50.000000000000014, 50.0)

# Clay, t_ind == 0
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = [
    _field_allows_pct(1, 0, _after, _through) for _after, _through in _sos_windows
]
# Recorded on a previous run: (49.99999999999994, 50.000000000000036, 50.00000000000008, 50.00000000000002, 49.99999999999998, 49.99999999999998)

# Hard, t_ind == 1
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = [
    _field_allows_pct(2, 1, _after, _through) for _after, _through in _sos_windows
]
# Recorded on a previous run: (50.00000000000004, 50.00000000000001, 50.000000000000036, 50.00000000000006, 49.99999999999995, 50.000000000000036)

# Hard, t_ind == 0
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = [
    _field_allows_pct(2, 0, _after, _through) for _after, _through in _sos_windows
]
# Recorded on a previous run: (49.99999999999999, 49.99999999999991, 50.000000000000085, 50.00000000000009, 49.99999999999994, 50.000000000000036)

del _field_allows_pct, _sos_windows
                                         
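# Puts together the above: scales the player's actual performance over the last 60 by the ratio of the field-mean expected yield to the expected yield (EY) of the schedule of opponents actually faced (opponents' aggregate performance over THEIR l60 prior to when they faced the player).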
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 
                                         
del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_tot_pts_won%_IO_ws", "SOS_tw_l60_opp_tot_pts_won%_IO_ws_ct", "SOS_tw_l60_opp_tot_pts_won%_IO", "SOS_tw_l60_opp_tot_pts_won%_IO_50", "SOS_tw_l60_opp_tot_pts_won%_IO_49", "SOS_tw_l60_opp_tot_pts_won%_IO_48", "SOS_tw_l60_opp_tot_pts_won%_IO_47", "SOS_tw_l60_opp_tot_pts_won%_IO_46", "SOS_tw_l60_opp_tot_pts_won%_IO_45", "SOS_tw_l60_opp_tot_pts_won%_IO_44", "SOS_tw_l60_opp_tot_pts_won%_IO_43", "SOS_tw_l60_opp_tot_pts_won%_IO_42", "SOS_tw_l60_opp_tot_pts_won%_IO_41", "SOS_tw_l60_opp_tot_pts_won%_IO_40", "SOS_tw_l60_opp_tot_pts_won%_IO_39", "SOS_tw_l60_opp_tot_pts_won%_IO_38", "SOS_tw_l60_opp_tot_pts_won%_IO_37", "SOS_tw_l60_opp_tot_pts_won%_IO_36", "SOS_tw_l60_opp_tot_pts_won%_IO_35", "SOS_tw_l60_opp_tot_pts_won%_IO_34", "SOS_tw_l60_opp_tot_pts_won%_IO_33", "SOS_tw_l60_opp_tot_pts_won%_IO_32", "SOS_tw_l60_opp_tot_pts_won%_IO_31", "SOS_tw_l60_opp_tot_pts_won%_IO_30", "SOS_tw_l60_opp_tot_pts_won%_IO_29", "SOS_tw_l60_opp_tot_pts_won%_IO_28", "SOS_tw_l60_opp_tot_pts_won%_IO_27", "SOS_tw_l60_opp_tot_pts_won%_IO_26", "SOS_tw_l60_opp_tot_pts_won%_IO_25", "SOS_tw_l60_opp_tot_pts_won%_IO_24", "SOS_tw_l60_opp_tot_pts_won%_IO_23", "SOS_tw_l60_opp_tot_pts_won%_IO_22", "SOS_tw_l60_opp_tot_pts_won%_IO_21", "SOS_tw_l60_opp_tot_pts_won%_IO_20", "SOS_tw_l60_opp_tot_pts_won%_IO_19", "SOS_tw_l60_opp_tot_pts_won%_IO_18", "SOS_tw_l60_opp_tot_pts_won%_IO_17", "SOS_tw_l60_opp_tot_pts_won%_IO_16", "SOS_tw_l60_opp_tot_pts_won%_IO_15", "SOS_tw_l60_opp_tot_pts_won%_IO_14", "SOS_tw_l60_opp_tot_pts_won%_IO_13", "SOS_tw_l60_opp_tot_pts_won%_IO_12", "SOS_tw_l60_opp_tot_pts_won%_IO_11", "SOS_tw_l60_opp_tot_pts_won%_IO_10", "SOS_tw_l60_opp_tot_pts_won%_IO_9", "SOS_tw_l60_opp_tot_pts_won%_IO_8", "SOS_tw_l60_opp_tot_pts_won%_IO_7", "SOS_tw_l60_opp_tot_pts_won%_IO_6", "SOS_tw_l60_opp_tot_pts_won%_IO_5", "SOS_tw_l60_opp_tot_pts_won%_IO_4", "SOS_tw_l60_opp_tot_pts_won%_IO_3", "SOS_tw_l60_opp_tot_pts_won%_IO_2", "SOS_tw_l60_opp_tot_pts_won%_IO_1"],axis=1)

In [133]:
# 'p_tot_pts_won%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj) version of the PLAYER's mean, time-weighted, surface-specific (SS),
# indoor/outdoor-specific (IO) TOTAL POINTS WON performance over the 10 matches PRIOR TO the match being predicted.

# --- Step 1: average opponent performance over the last-10 slots -------------------------------------------
# The l10 feature reuses slots 60..51 of the l60 opponent columns (presumably the 10 most recent prior matches).
sos_l10_slot_cols = ["SOS_tw_l60_opp_tot_pts_won%_IO_" + str(slot) for slot in range(60, 50, -1)]

# sum()/count() both skip NaN, so missing slots are ignored rather than interpolated. In the modeling stage,
# matches between players with few prior matches on a given surface are filtered out, so very few modeled
# matches will lack this feature (there is also a multi-year ramp-up phase where retrospective stats accrue
# but whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_tot_pts_won%_IO_ws"] = df_player2[sos_l10_slot_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%_IO_ws_ct"] = df_player2[sos_l10_slot_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%_IO"] = (
    df_player2["SOS_tw_l10_opp_tot_pts_won%_IO_ws"] / df_player2["SOS_tw_l10_opp_tot_pts_won%_IO_ws_ct"]
).round(2)

# --- Step 2: expected yield of the schedule actually faced ------------------------------------------------
# EY (EXPECTED YIELD) = % of total points the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10
# matches on this surface (indoor or outdoor specifically) is expected to give up.
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_tot_pts_won%_IO"]

# --- Step 3: field baseline per segment, then the adjustment ----------------------------------------------
# Because the game changes over time, the field mean of 'p_tot_pts_won%_l10_tw_ss' is taken separately for each
# surface (t_surf 1 = clay, 2 = hard), each indoor/outdoor flag (t_ind 1 = indoor, 0 = outdoor) and each date
# window below. 2009 stands alone at the front end (retrospective stats accrual only, not modeling); every
# other window spans two years. Each window is (exclusive start, inclusive end).
sos_l10_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field values the author recorded from a previous run (windows 1-6, quoted as-is):
#   clay indoor  : (nan, 50.57401960784313, 50.255046728971934, 50.368669950738926, 50.10459119496857, 50.09632075471699)
#   clay outdoor : (49.23693681318672, 49.69896934116122, 49.768963063368474, 49.86943889845101, 49.94226748133058, 49.867220361687835)
#   hard indoor  : (49.7200524658972, 49.995944924406004, 50.15107405313729, 49.90918408631151, 50.04850582590809, 49.90125000000003)
#   hard outdoor : (49.23475681453775, 49.804252221125445, 49.894608632040956, 49.9015415869979, 49.984771699819355, 49.95246862879304)

for surf_code in (1, 2):
    for indoor_flag in (1, 0):
        for window_start, window_end in sos_l10_windows:
            in_segment = (
                (df_player2["t_surf"] == surf_code)
                & (df_player2["t_ind"] == indoor_flag)
                & (df_player2["m_date"] > window_start)
                & (df_player2["m_date"] <= window_end)
            )
            # Expressed as the % of total points the field ALLOWS on average within this segment.
            field_allowed_pct = 100 - (df_player2.loc[in_segment, "p_tot_pts_won%_l10_tw_ss"].mean())
            # Scale the player's own l10 performance by (field baseline / schedule expected yield).
            # Adjustment proportional to opponents' deviation from the field mean performed better than the
            # boosted or blunted variants that were tried.
            df_player2.loc[in_segment, "p_tot_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
                (df_player2["p_tot_pts_won%_l10_tw_ss_IO"]) * (field_allowed_pct / df_player2["EY"])
            ).round(2)

# --- Step 4: clean up helper columns and temporaries ------------------------------------------------------
sos_l10_helper_cols = [
    "EY",
    "SOS_tw_l10_opp_tot_pts_won%_IO_ws",
    "SOS_tw_l10_opp_tot_pts_won%_IO_ws_ct",
    "SOS_tw_l10_opp_tot_pts_won%_IO",
]
df_player2 = df_player2.drop(sos_l10_helper_cols + sos_l10_slot_cols, axis=1)
del sos_l10_slot_cols, sos_l10_windows, sos_l10_helper_cols, surf_code, indoor_flag, window_start, window_end, in_segment, field_allowed_pct

In [134]:
# Sanity check after the l60/l10 IO SOS-adjusted features were added and their helper columns dropped:
# confirm the row count, column count and that the last column is the newly created SOS-adjusted feature.
df_player2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 241 entries, t_id to p_tot_pts_won%_l10_tw_ss_IO_SOS_adj
dtypes: datetime64[ns](1), float64(205), int64(29), object(6)
memory usage: 107.9+ MB


In [135]:
#df_player2.to_csv('../data/df_player2_test.csv', index=False)

In [136]:
# 'p_tot_pts_won%_l60_tw_nss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, NON-surface-specific (NSS) TOTAL POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player2 = df_player2.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, NON-surface-specific (NSS) TOTAL POINTS WON performance of player OPPONENTS over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player2["SOS_tw_l60_opp_tot_pts_won%_60"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-1)
df_player2["SOS_tw_l60_opp_tot_pts_won%_59"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-2)
df_player2["SOS_tw_l60_opp_tot_pts_won%_58"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-3)
df_player2["SOS_tw_l60_opp_tot_pts_won%_57"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-4)
df_player2["SOS_tw_l60_opp_tot_pts_won%_56"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-5)
df_player2["SOS_tw_l60_opp_tot_pts_won%_55"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-6)
df_player2["SOS_tw_l60_opp_tot_pts_won%_54"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-7)
df_player2["SOS_tw_l60_opp_tot_pts_won%_53"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-8)
df_player2["SOS_tw_l60_opp_tot_pts_won%_52"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-9)
df_player2["SOS_tw_l60_opp_tot_pts_won%_51"] = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss'].shift(-10)
# Remaining opponent-strength lag columns (suffixes 50 down to 1) for the L60 non-surface-specific SOS feature.
# Suffix and shift distance are tied together by suffix = 61 - distance, so suffix 50 holds the value found
# 11 rows further down within the same player's rows, and suffix 1 the value 60 rows further down.
# NOTE(review): "further down" only means "earlier match" if df_player2 was sorted newest-first per player
# before this point (the sort is not visible in this part of the cell) -- confirm.
_opp_l60_nss_by_player = df_player2.groupby(['p_id'])['p_opp_tot_pts_won%_l60_tw_nss']
for _lag in range(11, 61):
    df_player2[f"SOS_tw_l60_opp_tot_pts_won%_{61 - _lag}"] = _opp_l60_nss_by_player.shift(-_lag)
del _opp_l60_nss_by_player, _lag

# Aggregate the 60 opponent-strength lag columns into one schedule-strength figure per row, then use it to
# scale the player's own 'p_tot_pts_won%_l60_tw_nss' into 'p_tot_pts_won%_l60_tw_nss_SOS_adj'.
_sos60 = "SOS_tw_l60_opp_tot_pts_won%"
_sos60_lags = df_player2[[f"{_sos60}_{slot}" for slot in range(60, 0, -1)]]

# sum(axis=1) skips NaN, so no interpolation is needed when fewer than 60 prior matches exist; count(axis=1)
# gives the number of lags actually present, so the ratio below is the mean over available lags only.
# In the modeling stage, matches between players with few previous matches on a given surface are filtered
# out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which
# retrospective stats accrue but whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted -- in this cell the sum itself is a plain, unweighted sum)
df_player2[f"{_sos60}_ws"] = _sos60_lags.sum(axis=1)
df_player2[f"{_sos60}_ws_ct"] = _sos60_lags.count(axis=1)
df_player2[_sos60] = (df_player2[f"{_sos60}_ws"] / df_player2[f"{_sos60}_ws_ct"]).round(2)

# EY = EXPECTED YIELD: % of total points the schedule of opponents faced by this player over their last 60
# matches (across surfaces) is expected to yield.
df_player2["EY"] = 100 - df_player2[_sos60]

# Field mean of 'p_tot_pts_won%_l60_tw_nss' over rows with t_surf != 0, taken per date window because the
# game changes over time: 2009 on its own at the front end (that year is only used for retrospective stats
# accrual, not for modeling), then two-year windows. Expressed as 100 - mean, i.e. the % of total points
# the field ALLOWS on average. This should be about 50%; it is computed from the sample as a QC.
# Values recorded from a previous run:
# (49.55924332935077, 49.8853512416657, 49.95044602771806, 49.947263042612235, 50.05061103555464, 49.98865384615413)
# NOTE(review): the window bounds are plain strings compared against m_date; that is a chronological
# comparison only if m_date is datetime-like at this point -- confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
_window_masks = [
    (df_player2['t_surf'] != 0) & (df_player2['m_date'] > start) & (df_player2['m_date'] <= end)
    for start, end in _sos_windows
]
_field_allowed = [
    100 - (df_player2.loc[mask, 'p_tot_pts_won%_l60_tw_nss'].mean())
    for mask in _window_masks
]

# Scale the player's own L60 performance by (field mean allowed / expected yield of the schedule faced).
# Rows outside every window, or with t_surf == 0, are never assigned and therefore stay NaN.
# Author's note: an adjustment proportional to the opponents' deviation from the field mean performed better
# than the boosted or blunted variants that were tried.
for _mask, _allowed in zip(_window_masks, _field_allowed):
    df_player2.loc[_mask, "p_tot_pts_won%_l60_tw_nss_SOS_adj"] = (
        (df_player2["p_tot_pts_won%_l60_tw_nss"]) * (_allowed / df_player2["EY"])
    ).round(2)

# Remove the intermediates. Lag columns 60..51 are deliberately NOT dropped here: only suffixes 50..1 go.
df_player2 = df_player2.drop(
    ["EY", f"{_sos60}_ws", f"{_sos60}_ws_ct", _sos60]
    + [f"{_sos60}_{slot}" for slot in range(50, 0, -1)],
    axis=1,
)
del _sos60, _sos60_lags, _sos_windows, _window_masks, _field_allowed, _mask, _allowed

In [137]:
# 'p_tot_pts_won%_l10_tw_nss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the player's mean, time-weighted, NON-surface-specific (NSS)
# TOTAL POINTS WON performance over the 10 matches PRIOR TO the match being predicted.
# This cell consumes the lag columns SOS_tw_l60_opp_tot_pts_won%_60 ... _51 that must already exist in
# df_player2, and drops them at the end, so it cannot be re-run on its own without rebuilding them.
# NOTE(review): the opponent-strength inputs are the SOS_tw_l60_* lag columns, whereas the original comment
# described them as the opponents' performance over THEIR l10 -- confirm which is intended.
_l60_lag_names = [f"SOS_tw_l60_opp_tot_pts_won%_{slot}" for slot in range(60, 50, -1)]
_sos10 = "SOS_tw_l10_opp_tot_pts_won%"
_sos10_lags = df_player2[_l60_lag_names]

# sum(axis=1) skips NaN, so no interpolation is needed when fewer than 10 prior matches exist; count(axis=1)
# gives the number of lags actually present, so the ratio below is the mean over available lags only.
# In the modeling stage, matches between players with few previous matches on a given surface are filtered
# out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which
# retrospective stats accrue but whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted -- in this cell the sum itself is a plain, unweighted sum)
df_player2[f"{_sos10}_ws"] = _sos10_lags.sum(axis=1)
df_player2[f"{_sos10}_ws_ct"] = _sos10_lags.count(axis=1)
df_player2[_sos10] = (df_player2[f"{_sos10}_ws"] / df_player2[f"{_sos10}_ws_ct"]).round(2)

# EY = EXPECTED YIELD: % of total points the schedule of opponents faced by this player over their last 10
# matches (across surfaces) is expected to yield.
df_player2["EY"] = 100 - df_player2[_sos10]

# Field mean of 'p_tot_pts_won%_l10_tw_nss' over rows with t_surf != 0, taken per date window because the
# game changes over time: 2009 on its own at the front end (that year is only used for retrospective stats
# accrual, not for modeling), then two-year windows. Expressed as 100 - mean, i.e. the % of total points
# the field ALLOWS on average. This should be about 50%; it is computed from the sample as a QC.
# Values recorded from a previous run:
# (49.6064774990043, 49.90529906271136, 49.95271520593417, 49.94840700915975, 50.042730716922385, 49.96629467754469)
# NOTE(review): the window bounds are plain strings compared against m_date; that is a chronological
# comparison only if m_date is datetime-like at this point -- confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
_window_masks = [
    (df_player2['t_surf'] != 0) & (df_player2['m_date'] > start) & (df_player2['m_date'] <= end)
    for start, end in _sos_windows
]
_field_allowed = [
    100 - (df_player2.loc[mask, 'p_tot_pts_won%_l10_tw_nss'].mean())
    for mask in _window_masks
]

# Scale the player's own L10 performance by (field mean allowed / expected yield of the schedule faced).
# Rows outside every window, or with t_surf == 0, are never assigned and therefore stay NaN.
# Author's note: an adjustment proportional to the opponents' deviation from the field mean performed better
# than the boosted or blunted variants that were tried.
for _mask, _allowed in zip(_window_masks, _field_allowed):
    df_player2.loc[_mask, "p_tot_pts_won%_l10_tw_nss_SOS_adj"] = (
        (df_player2["p_tot_pts_won%_l10_tw_nss"]) * (_allowed / df_player2["EY"])
    ).round(2)

# Remove the intermediates, including the ten shared lag columns (suffixes 60..51).
df_player2 = df_player2.drop(
    ["EY", f"{_sos10}_ws", f"{_sos10}_ws_ct", _sos10] + _l60_lag_names,
    axis=1,
)
del _l60_lag_names, _sos10, _sos10_lags, _sos_windows, _window_masks, _field_allowed, _mask, _allowed

In [138]:
# 'p_tot_pts_won%_l60_tw_ss_SOS_comp_adj'
# Strength of Schedule adjustment (SOS_adj) of the player's mean, time-weighted, surface-specific (SS)
# TOTAL POINTS WON performance over the 60 matches PRIOR TO the match being predicted.
# This variant of p_tot_pts is a 'composite': the summation of p_sv_pts_won%_l60_tw_ss and p_ret_pts_won%_l60_tw_ss.

# Newest-first ordering within each (player, surface) group, so that a negative shift below reaches
# rows that sort lower, i.e. earlier m_date / m_rd_num.
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# One lag column per prior same-surface match, over the maximum interval of interest (60 matches): each holds
# the opponent's composite surface-specific total-points-won value taken from the row `_lag` positions further
# down in the same (p_id, t_surf) group. Column suffix = 61 - _lag, so suffix 60 is the nearest prior row and
# suffix 1 the farthest. Groups with fewer than `_lag` following rows get NaN.
_opp_l60_ss_comp = df_player2.groupby(['p_id', 't_surf'])['p_opp_tot_pts_won%_l60_tw_ss_comp']
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_tot_pts_won%_comp_{61 - _lag}"] = _opp_l60_ss_comp.shift(-_lag)
del _opp_l60_ss_comp, _lag

#Using sum function allows ignoring of NaN instead of interpolation. In modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# The 60 opponent component columns, in the order comp_60 ... comp_1 (same order as the original hand-written list).
_sos_l60_cols = [f"SOS_tw_l60_opp_tot_pts_won%_comp_{n}" for n in range(60, 0, -1)]
_sos_l60_parts = df_player2[_sos_l60_cols]
# sum(axis=1) skips NaN and count(axis=1) counts the non-NaN components, so ws / ws_ct is the mean of the components that exist for the row.
df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws"] = _sos_l60_parts.sum(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct"] = _sos_l60_parts.count(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%"] = (df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws"]/df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct"]).round(2)
#(ws = weighted sum; tw = time-weighted)
# Note: in this code the components are summed as-is (no per-component weights are applied here).

# % total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface is expected to yield
# EY is EXPECTED YIELD
df_player2["EY"] = 100-df_player2["SOS_tw_l60_opp_tot_pts_won%"]

# Mean % total pts performance (l60_tw_ss_comp) across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and matched to match date ranges during the SOS adjustment
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)
# Values recorded by the author, presumably the six clay windows followed by the six hard windows:
# (49.158732829670214, 49.70487847222243, 49.76722116903628, 49.90398648648641, 49.953119162640945, 49.93562742561447)
# (49.3052567280454, 49.8541725948511, 49.965574105621855, 49.919214752073344, 50.04900220975689, 49.940256846402484)

# Date windows are (exclusive start, inclusive end]. Rows outside every window, or on a surface other than 1 or 2, are not assigned by this block.
# NOTE(review): the bounds are compared to m_date as strings; this assumes m_date is datetime-like at this point - confirm.
_sos_windows = [("1/1/2009", "1/1/2010"), ("1/1/2010", "1/1/2012"), ("1/1/2012", "1/1/2014"),
                ("1/1/2014", "1/1/2016"), ("1/1/2016", "1/1/2018"), ("1/1/2018", "1/1/2020")]

# Puts together the above - scales the player's actual performance over the last 60 by the ratio of
# (what the field allows on average in the same surface/date window) to (what this player's schedule of opponents was expected to yield).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Each window's mask is computed once and used both for the field mean and for the assignment. The assignment writes only the
# adjusted column, which none of the means read, so doing mean-then-assign per window gives the same values as computing all means first.
for _surf in (1, 2):  # 1 = clay, 2 = hard
    for _start, _end in _sos_windows:
        _in_window = (df_player2['t_surf'] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        # We want in terms of pct total pts the field ALLOWS on average
        _field_allowed = 100 - (df_player2.loc[_in_window, 'p_tot_pts_won%_l60_tw_ss_comp'].mean())
        df_player2.loc[_in_window, "p_tot_pts_won%_l60_tw_ss_SOS_comp_adj"] = ((df_player2["p_tot_pts_won%_l60_tw_ss_comp"])*(_field_allowed/df_player2["EY"])).round(2)

# Remove the intermediates. comp_60 ... comp_51 are deliberately NOT dropped here: the l10 cell that follows reads them.
df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_tot_pts_won%_comp_ws", "SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct", "SOS_tw_l60_opp_tot_pts_won%"] + _sos_l60_cols[10:], axis=1)
del _sos_l60_cols, _sos_l60_parts, _sos_windows, _surf, _start, _end, _in_window, _field_allowed

In [139]:
# 'p_tot_pts_won%_l10_tw_ss_SOS_comp_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) TOTAL POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# This variant of p_tot_pts is derived as a 'composite' - summation of p_sv_pts_won%_l10_tw_ss and p_ret_pts_won%_l10_tw_ss

#Using sum function allows ignoring of NaN instead of interpolation. In modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# The ten opponent component columns comp_60 ... comp_51. They are not created in this cell: they are the columns the preceding
# l60 cell left in place when it dropped comp_50 ... comp_1.
# NOTE(review): these columns carry the "_l60_" name and presumably hold the opponents' l60 composite, whereas the original comment
# here described the opponents' performance over THEIR l10 - confirm which was intended.
_sos_l10_cols = [f"SOS_tw_l60_opp_tot_pts_won%_comp_{n}" for n in range(60, 50, -1)]
_sos_l10_parts = df_player2[_sos_l10_cols]
# sum(axis=1) skips NaN and count(axis=1) counts the non-NaN components, so ws / ws_ct is the mean of the components that exist for the row.
df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws"] = _sos_l10_parts.sum(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct"] = _sos_l10_parts.count(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%"] = (df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws"]/df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct"]).round(2)
#(ws = weighted sum; tw = time-weighted)
# Note: in this code the components are summed as-is (no per-component weights are applied here).

# % total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to yield
# EY is EXPECTED YIELD
df_player2["EY"] = 100-df_player2["SOS_tw_l10_opp_tot_pts_won%"]

# Mean % total pts performance (l10_tw_ss_comp) across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and matched to match date ranges during the SOS adjustment
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)
# Values recorded by the author, presumably the six clay windows followed by the six hard windows:
# (49.175587225274654, 49.708458017676875, 49.8038135860979, 49.884549549549554, 49.95308695652173, 49.875334734799495)
# (49.360625000000084, 49.85264989837409, 49.95782708688235, 49.89260014116794, 49.99241033486341, 49.93035975506046)

# Date windows are (exclusive start, inclusive end]. Rows outside every window, or on a surface other than 1 or 2, are not assigned by this cell.
# NOTE(review): the bounds are compared to m_date as strings; this assumes m_date is datetime-like at this point - confirm.
_sos_windows = [("1/1/2009", "1/1/2010"), ("1/1/2010", "1/1/2012"), ("1/1/2012", "1/1/2014"),
                ("1/1/2014", "1/1/2016"), ("1/1/2016", "1/1/2018"), ("1/1/2018", "1/1/2020")]

# Puts together the above - scales the player's actual performance over the last 10 by the ratio of
# (what the field allows on average in the same surface/date window) to (what this player's schedule of opponents was expected to yield).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Each window's mask is computed once and used both for the field mean and for the assignment. The assignment writes only the
# adjusted column, which none of the means read, so doing mean-then-assign per window gives the same values as computing all means first.
for _surf in (1, 2):  # 1 = clay, 2 = hard
    for _start, _end in _sos_windows:
        _in_window = (df_player2['t_surf'] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        # We want in terms of pct total pts the field ALLOWS on average
        _field_allowed = 100 - (df_player2.loc[_in_window, 'p_tot_pts_won%_l10_tw_ss_comp'].mean())
        df_player2.loc[_in_window, "p_tot_pts_won%_l10_tw_ss_SOS_comp_adj"] = ((df_player2["p_tot_pts_won%_l10_tw_ss_comp"])*(_field_allowed/df_player2["EY"])).round(2)

# Remove the intermediates, including the ten component columns carried over from the l60 cell.
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_tot_pts_won%_comp_ws", "SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct", "SOS_tw_l10_opp_tot_pts_won%"] + _sos_l10_cols, axis=1)
del _sos_l10_cols, _sos_l10_parts, _sos_windows, _surf, _start, _end, _in_window, _field_allowed

In [140]:
# 'p_tot_pts_won%_l60_tw_ss_SOS_IO_comp_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor specific TOTAL POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted
# This variant of p_tot_pts is derived as a 'composite' - summation of p_sv_pts_won%_l60_tw_ss_IO and p_ret_pts_won%_l60_tw_ss_IO

# Descending sort: within each (p_id, t_surf, t_ind) group the rows run from the latest m_date / m_rd_num to the earliest,
# so a negative shift below reaches towards earlier rows of the same group.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind', 'm_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, I/O specific TOTAL POINTS WON performance of player OPPONENTS (composite variant) on the same surface over the maximum interval of interest (60 matches) prior to the match being predicted
# comp_60 is the value one row down in the group (shift(-1)), comp_59 two rows down, ... comp_1 sixty rows down (shift(-60)).
# Rows that do not have that many following rows in their group get NaN.
# The groupby is built once and reused: shift() only reads the source column, and the columns being added
# are neither grouping keys nor the source column, so the result is the same as rebuilding it for every lag.
_opp_l60_io_grouped = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_tot_pts_won%_l60_tw_ss_IO_comp']
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_tot_pts_won%_comp_{61 - _lag}"] = _opp_l60_io_grouped.shift(-_lag)
del _opp_l60_io_grouped, _lag  # temporaries only; keep the kernel namespace as the original code left it
# Row-wise sum() skips NaN instead of requiring interpolation, and count() gives the number of components
# actually present, so the ratio below is the mean over the available components only.
# In the modeling stage, matches between players with few matches previously played on a given surface will be
# filtered out, so very few modeled matches will lack this predictive feature (there is also a ramp-up phase of
# multiple years where retrospective stats accrue but matches from those years are not used for modeling).
# (ws = weighted sum; tw = time-weighted)
_l60_sos_cols = [f"SOS_tw_l60_opp_tot_pts_won%_comp_{k}" for k in range(60, 0, -1)]  # comp_60 ... comp_1
_l60_sos_values = df_player2[_l60_sos_cols]

df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws"] = _l60_sos_values.sum(axis=1)
df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct"] = _l60_sos_values.count(axis=1)

# Rows with no component at all give 0 / 0, i.e. NaN.
_l60_sos_ratio = df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws"] / df_player2["SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct"]
df_player2["SOS_tw_l60_opp_tot_pts_won%"] = _l60_sos_ratio.round(2)

del _l60_sos_cols, _l60_sos_values, _l60_sos_ratio  # scratch names only

# EY = EXPECTED YIELD: % of total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches
# on this surface is expected to yield (100 minus the opponents' mean pts-won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_tot_pts_won%"]

# Mean % total pts performance (l60_tw_ss_comp) across ALL players per surface. We need this to evaluate and
# calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface over 2-year intervals (2009 by itself at
# the front end; that year is only used for retrospective stats accrual [and only for clay], not for modeling),
# and is matched to match-date ranges during the SOS adjustment.
# For this feature the value should be 50%; computing what it actually is in this sample serves as a QC.


def _l60_field_allowed(surf_code, indoor_flag):
    """Return one value per date window: 100 minus the mean of 'p_tot_pts_won%_l60_tw_ss_IO_comp'.

    The mean is taken over the rows of df_player2 whose t_surf equals `surf_code`, whose t_ind equals
    `indoor_flag`, and whose m_date lies in the window (start exclusive, end inclusive). Expressing it as
    100 - mean gives the % of total pts the field ALLOWS on average.
    """
    windows = (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
    allowed = []
    for win_start, win_end in windows:
        in_scope = (
            (df_player2['t_surf'] == surf_code)
            & (df_player2['t_ind'] == indoor_flag)
            & (df_player2['m_date'] > win_start)
            & (df_player2['m_date'] <= win_end)
        )
        allowed.append(100 - (df_player2.loc[in_scope, 'p_tot_pts_won%_l60_tw_ss_IO_comp'].mean()))
    return allowed


# t_surf == 1, t_ind == 1 ("clay ... i" series)
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = _l60_field_allowed(1, 1)
# values recorded by the author: (nan, 45.05339285714285, 48.85859281437127, 49.24119883040936, 49.345915492957765, 49.19522988505746)

# t_surf == 1, t_ind == 0 ("clay ... o" series)
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = _l60_field_allowed(1, 0)
# values recorded by the author: (49.158732829670214, 49.68776925587477, 49.75494576271191, 49.87982407726795, 49.955332880434575, 49.93434505862655)

# t_surf == 2, t_ind == 1 ("hard ... i" series)
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = _l60_field_allowed(2, 1)
# values recorded by the author: (48.50623780487805, 49.64146341463407, 49.85093016599886, 49.82317871759887, 50.090616724738716, 49.6791095890411)

# t_surf == 2, t_ind == 0 ("hard ... o" series)
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = _l60_field_allowed(2, 0)
# values recorded by the author: (49.05568928379107, 49.796769383697864, 49.9013458241491, 49.96678973867169, 50.00875425073683, 49.99122285714285)

del _l60_field_allowed  # helper is local to this step
                                         
# Puts together the above: scales the player's actual performance over the last 60 by the schedule of opponents'
# aggregate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or
# blunted versions attempted.


def _l60_apply_sos_adjustment(surf_code, indoor_flag, field_means):
    """Write 'p_tot_pts_won%_l60_tw_ss_IO_SOS_comp_adj' for one surface / t_ind segment.

    `field_means` holds one field mean per date window, in window order. For the rows of df_player2 inside
    each window (start exclusive, end inclusive) the value written is
    raw performance * (field mean / EY), rounded to 2 decimals. Rows outside every window are not touched.
    """
    windows = (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
    for (win_start, win_end), field_mean in zip(windows, field_means):
        in_scope = (
            (df_player2["t_surf"] == surf_code)
            & (df_player2["t_ind"] == indoor_flag)
            & (df_player2['m_date'] > win_start)
            & (df_player2['m_date'] <= win_end)
        )
        # The right-hand side is computed for the whole frame; .loc keeps only the rows selected by the mask.
        df_player2.loc[in_scope, "p_tot_pts_won%_l60_tw_ss_IO_SOS_comp_adj"] = (
            (df_player2["p_tot_pts_won%_l60_tw_ss_IO_comp"]) * (field_mean / df_player2["EY"])
        ).round(2)


# Same segment order as before: t_surf 1 with t_ind 1 then 0, followed by t_surf 2 with t_ind 1 then 0.
_l60_apply_sos_adjustment(1, 1, (mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i))
_l60_apply_sos_adjustment(1, 0, (mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o))
_l60_apply_sos_adjustment(2, 1, (mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i))
_l60_apply_sos_adjustment(2, 0, (mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o))

del _l60_apply_sos_adjustment  # helper is local to this step

                                         
# The per-window field means have served their purpose; release all 24 of them.
del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i
del mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o
del mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i
del mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

# Drop the scratch columns of the l60 calculation. Only comp_50 ... comp_1 are removed here;
# comp_60 ... comp_51 stay in the frame because the l10 calculation in the next cell reads them.
_l60_scratch_cols = [
    "EY",
    "SOS_tw_l60_opp_tot_pts_won%_comp_ws",
    "SOS_tw_l60_opp_tot_pts_won%_comp_ws_ct",
    "SOS_tw_l60_opp_tot_pts_won%",
]
_l60_scratch_cols += [f"SOS_tw_l60_opp_tot_pts_won%_comp_{k}" for k in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_l60_scratch_cols)
del _l60_scratch_cols

In [141]:
# 'p_tot_pts_won%_l10_tw_ss_IO_SOS_comp_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor specific (IO) TOTAL POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# This variant of p_tot_pts is derived as a 'composite' - summation of p_sv_pts_won%_l10_tw_ss_IO and p_ret_pts_won%_l10_tw_ss_IO

# Shifted opponent columns for the 10 most recent prior matches (slots 60..51).
# They must already exist on df_player2; the clean-up of the l60 variant drops only slots 50..1 and leaves these in place.
sos_l10_cols = ["SOS_tw_l60_opp_tot_pts_won%_comp_" + str(slot) for slot in range(60, 50, -1)]

# Row-wise sum and non-null count ignore NaN instead of interpolating. In the modeling stage, matches between players with few
# matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling phase where
# this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue
# but matches from those years will not be used in the modeling phase.)
# (ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws"] = df_player2[sos_l10_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct"] = df_player2[sos_l10_cols].count(axis=1)
# Rows with no prior opponent values give 0/0 here, i.e. NaN, which then propagates into EY and the adjusted feature.
df_player2["SOS_tw_l10_opp_tot_pts_won%"] = (df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws"]/df_player2["SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct"]).round(2)

# % total pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to yield
# EY is EXPECTED YIELD
df_player2["EY"] = 100-df_player2["SOS_tw_l10_opp_tot_pts_won%"]

# Mean % total pts performance (l10_tw_ss_IO_comp) across ALL players per surface and indoor/outdoor condition. We need this to
# evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface over 2 year intervals (2009 by itself at the front end;
# that year is only used for retrospective stats accrual [and only for clay] not for modeling), and matched to match date ranges
# during the SOS adjustment.
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)
#
# Field "allowed" % noted in the original cell for windows 1..6 (which run produced them is not shown here):
#   clay indoor : (nan, 45.05339285714285, 48.85197604790421, 49.233918128654985, 49.38992957746479, 49.17787356321838)
#   clay outdoor: (49.175587225274654, 49.69800261096617, 49.774525423728775, 49.86351155570889, 49.95626528532608, 49.880671691792294)
#   hard indoor : (48.51064024390244, 49.7200471175166, 49.87448196908984, 49.86491132332885, 50.0180452961673, 49.55334246575339)
#   hard outdoor: (49.13672590199256, 49.78524105367792, 49.91254714670587, 49.89163270198996, 49.92917365676733, 49.96010742857155)
sos_raw_col = "p_tot_pts_won%_l10_tw_ss_IO_comp"
sos_adj_col = "p_tot_pts_won%_l10_tw_ss_IO_SOS_comp_adj"

# (t_surf, t_ind) pairs in the order the adjustment is applied: clay indoor, clay outdoor, hard indoor, hard outdoor
sos_conditions = [(1, 1), (1, 0), (2, 1), (2, 0)]

# Date windows are half-open: start < m_date <= end. Rows outside every window (or on another surface) are left without a value.
# NOTE(review): the bounds are plain strings, so these comparisons are only chronological if m_date is a datetime64 column by
# this point (it is read from CSV as object at the top of the notebook) - confirm an earlier cell converts it.
sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Puts together the above - factors the player's actual performance over the last 10 by schedule of opponents' aggregate
# performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
for sos_surf, sos_ind in sos_conditions:
    for sos_start, sos_end in sos_windows:
        sos_mask = (
            (df_player2["t_surf"] == sos_surf)
            & (df_player2["t_ind"] == sos_ind)
            & (df_player2["m_date"] > sos_start)
            & (df_player2["m_date"] <= sos_end)
        )
        # We want this in terms of pct total pts the field ALLOWS on average (NaN when the window has no usable rows)
        sos_field_mean = 100 - df_player2.loc[sos_mask, sos_raw_col].mean()
        # The right-hand side is computed for every row; .loc aligns on the index and writes only the masked rows.
        df_player2.loc[sos_mask, sos_adj_col] = (df_player2[sos_raw_col]*(sos_field_mean/df_player2["EY"])).round(2)

# Remove the scratch columns (including slots 60..51, which are no longer needed) and the temporary names
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_tot_pts_won%_comp_ws", "SOS_tw_l10_opp_tot_pts_won%_comp_ws_ct", "SOS_tw_l10_opp_tot_pts_won%"] + sos_l10_cols, axis=1)
del sos_l10_cols, sos_raw_col, sos_adj_col, sos_conditions, sos_windows, sos_surf, sos_ind, sos_start, sos_end, sos_mask, sos_field_mean

In [142]:
# 'p_1st_sv%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) FIRST SERVE PERCENTAGE performance of PLAYER (as a server) over the 60 matches PRIOR TO the match being predicted

# Newest-first ordering within each player/surface, so a negative shift reaches back to earlier matches
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific (SS) FIRST SERVE PERCENTAGE YIELDED (as returners) performance of player OPPONENTS over the maximum interval (60 matches) PRIOR TO the match being predicted
# Slot numbering runs backwards in time: ..._yielded_60 is the value from the immediately preceding row of the same
# (p_id, t_surf) group (shift -1), and ..._yielded_1 is the value from 60 rows back (shift -60).
# Columns are created in the order 60, 59, ..., 1.
for sv_lag in range(1, 61):
    df_player2["SOS_tw_l60_opp_1st_sv%_yielded_" + str(61 - sv_lag)] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_sv%_yielded_l60_tw_ss'].shift(-sv_lag)
del sv_lag

# ---- Strength-of-schedule (SOS) adjustment of 'p_1st_sv%_l60_tw_ss' -> 'p_1st_sv%_l60_tw_ss_SOS_adj' ----

# The 60 per-match opponent columns, in the same order the sums were originally written (_60 down to _1).
_sos_l60_cols = [f"SOS_tw_l60_opp_1st_sv%_yielded_{_k}" for _k in range(60, 0, -1)]
_opp_yield = df_player2[_sos_l60_cols]

# Using sum (and count) ignores NaN instead of interpolating. In the modeling stage, matches between players with
# few matches previously played on a given surface will be filtered out, so there will be very few matches in the
# modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple
# years where retrospective stats accrue but matches from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted. The names are kept, but what is computed here is a plain sum divided by
# the number of non-null entries. A row with no prior matches gives 0/0 = NaN.)
#
# EY is EXPECTED YIELD: the % FIRST SERVE (as returners) the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface is expected to YIELD.
_expected_yield = (_opp_yield.sum(axis=1) / _opp_yield.count(axis=1)).round(2)

# Mean % FIRST SERVE YIELDED (as returners) performance (l60_tw_ss) for ALL players per surface (clay, hard) is the
# reference the schedule actually faced is calibrated against: we want the pct 1st serve the field ALLOWS on average.
# Because the game changes over time, the mean is taken per surface over 2 year intervals (2009 by itself at the
# front end; that year is only used for retrospective stats accrual [and only for clay], not for modeling), and the
# adjustment for a match uses the mean of the interval its date falls in.
# Values recorded in the original notebook comments:
#   clay: (61.10502747252756, 62.013276515151404, 61.748135860979446, 61.69176640926659, 61.502766505636004, 61.76918175937891)
#   hard: (60.14978399433424, 60.199471544715465, 60.044596252129544, 60.232138697723805, 59.61270440251568, 60.61603503997275)
#
# NOTE(review): each window is (start, end] -- a match dated exactly 1 January falls in the PRECEDING window, and
# rows dated on/before 1/1/2009 or after 1/1/2020 match no window and are left NaN. Confirm this is intended.
_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Factor the player's actual performance over the last 60 by the schedule of opponents' aggregate performance.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted
# versions attempted.
for _surf in (1, 2):  # t_surf: 1 = clay, 2 = hard
    for _start, _end in _date_windows:
        _in_window = (
            (df_player2["t_surf"] == _surf)
            & (df_player2['m_date'] > _start)
            & (df_player2['m_date'] <= _end)
        )
        _field_mean = df_player2.loc[_in_window, 'p_1st_sv%_yielded_l60_tw_ss'].mean()
        # The right-hand side is full-length; .loc aligns it on the index and writes only the rows in the window.
        df_player2.loc[_in_window, "p_1st_sv%_l60_tw_ss_SOS_adj"] = (
            df_player2["p_1st_sv%_l60_tw_ss"] * (_field_mean / _expected_yield)
        ).round(2)

# Drop only the _50 .. _1 helper columns: _60 .. _51 are read (and then dropped) by the l10 feature that follows.
df_player2 = df_player2.drop(columns=[f"SOS_tw_l60_opp_1st_sv%_yielded_{_k}" for _k in range(50, 0, -1)])

del _sos_l60_cols, _opp_yield, _expected_yield, _date_windows, _surf, _start, _end, _in_window, _field_mean

In [143]:
# 'p_1st_sv%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) FIRST SERVE PERCENTAGE performance of PLAYER (as a server) over the 10 matches PRIOR TO the match being predicted

# The ten per-match opponent columns with the highest suffixes (_60 down to _51) are reused from the l60 feature.
# NOTE(review): these columns are named SOS_tw_l60_* (the visible ones are shifted from
# 'p_opp_1st_sv%_yielded_l60_tw_ss'), so the opponents' figures are presumably l60-based, while the field mean
# below uses the l10 column -- confirm that this mix is intended.
_sos_l10_cols = [f"SOS_tw_l60_opp_1st_sv%_yielded_{_k}" for _k in range(60, 50, -1)]
_opp_yield = df_player2[_sos_l10_cols]

# Using sum (and count) ignores NaN instead of interpolating. In the modeling stage, matches between players with
# few matches previously played on a given surface will be filtered out, so there will be very few matches in the
# modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple
# years where retrospective stats accrue but matches from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted. The names are kept, but what is computed here is a plain sum divided by
# the number of non-null entries. A row with no prior matches gives 0/0 = NaN.)
#
# EY is EXPECTED YIELD: the % FIRST SERVE (as returners) the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches on this surface is expected to YIELD.
_expected_yield = (_opp_yield.sum(axis=1) / _opp_yield.count(axis=1)).round(2)

# Mean % FIRST SERVE YIELDED (as returners) performance (l10_tw_ss) for ALL players per surface (clay, hard) is the
# reference the schedule actually faced is calibrated against: we want the pct 1st serve the field ALLOWS on average.
# Because the game changes over time, the mean is taken per surface over 2 year intervals (2009 by itself at the
# front end; that year is only used for retrospective stats accrual [and only for clay], not for modeling), and the
# adjustment for a match uses the mean of the interval its date falls in.
# Values recorded in the original notebook comments:
#   clay: (61.18486263736269, 62.05340593434351, 61.73751026856238, 61.676682754183, 61.46850563607095, 61.67998382923676)
#   hard: (60.22989022662888, 60.10563685636848, 60.016918228279295, 60.17662961002288, 59.548482066972674, 60.93329307705378)
#
# NOTE(review): each window is (start, end] -- a match dated exactly 1 January falls in the PRECEDING window, and
# rows dated on/before 1/1/2009 or after 1/1/2020 match no window and are left NaN. Confirm this is intended.
_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Factor the player's actual performance over the last 10 by the schedule of opponents' aggregate performance.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted
# versions attempted.
for _surf in (1, 2):  # t_surf: 1 = clay, 2 = hard
    for _start, _end in _date_windows:
        _in_window = (
            (df_player2["t_surf"] == _surf)
            & (df_player2['m_date'] > _start)
            & (df_player2['m_date'] <= _end)
        )
        _field_mean = df_player2.loc[_in_window, 'p_1st_sv%_yielded_l10_tw_ss'].mean()
        # The right-hand side is full-length; .loc aligns it on the index and writes only the rows in the window.
        df_player2.loc[_in_window, "p_1st_sv%_l10_tw_ss_SOS_adj"] = (
            df_player2["p_1st_sv%_l10_tw_ss"] * (_field_mean / _expected_yield)
        ).round(2)

# The last ten per-match helper columns are no longer needed once the l10 feature exists.
df_player2 = df_player2.drop(columns=_sos_l10_cols)

del _sos_l10_cols, _opp_yield, _expected_yield, _date_windows, _surf, _start, _end, _in_window, _field_mean

In [144]:
# 'p_1st_sv%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor (IO) specific FIRST SERVE PERCENTAGE performance of PLAYER (as a server) over the 60 matches PRIOR TO the match being predicted

# Order every key descending so that, within a (player, surface, indoor/outdoor) group, later dates/rounds come first
# (assuming m_date and m_rd_num sort chronologically -- TODO confirm).
_io_sort_keys = ['p_id','t_surf','t_ind','m_date','m_rd_num']
df_player2 = df_player2.sort_values(by=_io_sort_keys, ascending=False)

# First obtain mean, time-weighted, surface-specific (SS), indoor/outdoor-specific (IO), FIRST SERVE PERCENTAGE YIELDED (as returners) performance of player OPPONENTS over the maximum interval (60 matches) PRIOR TO the match being predicted
# Column suffix s is filled from the row (61 - s) positions further down in the same group:
# _IO_60 uses shift(-1) (the next row down), ..., _IO_1 uses shift(-60).
_io_opp_grouped = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_1st_sv%_yielded_l60_tw_ss_IO']
for _lag, _slot in enumerate(range(60, 0, -1), start=1):
    df_player2[f"SOS_tw_l60_opp_1st_sv%_yielded_IO_{_slot}"] = _io_opp_grouped.shift(-_lag)
del _io_sort_keys, _io_opp_grouped, _lag, _slot

# Expected yield (EY): the % FIRST SERVE (as returners) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER
# over their last 60 matches on this surface, specifically indoor or outdoor, is expected to yield.
#
# Row-wise .sum() skips NaN instead of interpolating, and .count() gives the number of non-null lags, so
# ws / ws_ct is the mean over the lags that are actually present (0 / 0 -> NaN when none are).
# In the modeling stage, matches between players with few matches previously played on a given surface are
# filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase
# where retrospective stats accrue but whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted)
_l60_lag_cols = [f"SOS_tw_l60_opp_1st_sv%_yielded_IO_{lag}" for lag in range(60, 0, -1)]  # _60, _59, ..., _1

df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO_ws"] = df_player2[_l60_lag_cols].sum(axis=1)
df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO_ws_ct"] = df_player2[_l60_lag_cols].count(axis=1)

_l60_expected_yield = df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO_ws"] / df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO_ws_ct"]
df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO"] = _l60_expected_yield.round(2)

# EY is EXPECTED YIELD; short alias read by the SOS adjustment that follows.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_1st_sv%_yielded_IO"]

# Field calibration + strength-of-schedule (SOS) adjustment for the l60 FIRST SERVE % feature.
#
# Mean % FIRST SERVE YIELDED (as returners) performance (l60_tw_ss) for ALL players is needed to evaluate and
# calibrate the schedule actually faced by a player prior to a given match being predicted. Because the game
# changes over time, the mean is taken per surface (clay, hard), per indoor/outdoor status and per date window:
# 2009 by itself at the front end (only used for retrospective stats accrual [and only for clay], not for
# modeling), then two-year windows, matched to the same date ranges during the SOS adjustment.
#
# Field means recorded from a previous run (windows in chronological order):
#   clay indoor : (nan, 60.96535714285715, 60.38502994011974, 61.88385964912283, 59.69387323943663, 61.02333333333332)
#   clay outdoor: (61.10502747252756, 61.96529046997368, 61.76758983050848, 61.65881338392559, 61.57023777173908, 61.78296482412066)
#   hard indoor : (59.72854878048779, 60.9568847006653, 60.89603892386963, 60.95005457025914, 60.55820905923349, 60.866369863013745)
#   hard outdoor: (60.21469574582667, 59.8990581510933, 59.71887337741854, 60.010525053944065, 59.40792790750394, 60.31720228571418)
def _apply_l60_1st_sv_sos_adjustment(df, perf_col, field_col, out_col, ey_col="EY"):
    """Write the SOS-adjusted version of ``perf_col`` into ``out_col`` of ``df`` (in place).

    For every surface (t_surf 1 then 2), indoor flag (t_ind 1 then 0) and date window, the rows inside
    the window get ``perf_col * (field_mean / ey_col)`` rounded to 2 decimals, where ``field_mean`` is the
    mean of ``field_col`` over those same rows. This factors the player's actual performance by the
    schedule of opponents' aggregate performance; an adjustment proportional to the opponents' deviation
    from the field mean performed better than the boosted or blunted versions attempted.

    Windows are half-open ``(start, end]``: a row dated exactly on a boundary belongs to the earlier
    window, and rows dated on or before 1/1/2009 or after 1/1/2020 are not written by this function.

    Parameters
    ----------
    df : DataFrame with columns t_surf, t_ind, m_date, ``perf_col``, ``field_col`` and ``ey_col``.
    perf_col : column holding the player's own performance to adjust.
    field_col : column averaged over the window to obtain the field mean.
    out_col : column receiving the adjusted values.
    ey_col : column holding the expected yield of the schedule faced (default "EY").
    """
    # NOTE(review): the bounds are strings; this assumes m_date is datetime-like so pandas parses them
    # as dates rather than comparing text - TODO confirm against where m_date is converted.
    date_windows = (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
    # Same order as the original unrolled statements: clay indoor, clay outdoor, hard indoor, hard outdoor.
    for surf in (1, 2):
        for indoor in (1, 0):
            for after, upto in date_windows:
                in_window = (df['t_surf'] == surf) & (df['t_ind'] == indoor) & (df['m_date'] > after) & (df['m_date'] <= upto)
                # An empty window gives a NaN mean, and the assignment below then touches no rows.
                field_mean = df.loc[in_window, field_col].mean()
                df.loc[in_window, out_col] = (df[perf_col] * (field_mean / df[ey_col])).round(2)


_apply_l60_1st_sv_sos_adjustment(
    df_player2,
    perf_col="p_1st_sv%_l60_tw_ss_IO",
    field_col='p_1st_sv%_yielded_l60_tw_ss_IO',
    out_col="p_1st_sv%_l60_tw_ss_IO_SOS_adj",
)

# Remove the intermediates of the l60 calculation. Lag columns _60 ... _51 are deliberately kept:
# the l10 calculation in the next cell reads them.
_l60_temp_cols = [
    "EY",
    "SOS_tw_l60_opp_1st_sv%_yielded_IO_ws",
    "SOS_tw_l60_opp_1st_sv%_yielded_IO_ws_ct",
    "SOS_tw_l60_opp_1st_sv%_yielded_IO",
]
_l60_temp_cols += [f"SOS_tw_l60_opp_1st_sv%_yielded_IO_{lag}" for lag in range(50, 0, -1)]  # _50, _49, ..., _1
df_player2 = df_player2.drop(columns=_l60_temp_cols)

In [145]:
# 'p_1st_sv%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor (IO) specific FIRST SERVE PERCENTAGE performance of PLAYER (as a server) over the 10 matches PRIOR TO the match being predicted 

# Expected yield (EY): the % FIRST SERVE (as returners) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER
# over their last 10 matches on this surface, specifically indoor or outdoor, is expected to yield.
#
# Row-wise .sum() skips NaN instead of interpolating, and .count() gives the number of non-null lags, so
# ws / ws_ct is the mean over the lags that are actually present (0 / 0 -> NaN when none are).
# In the modeling stage, matches between players with few matches previously played on a given surface are
# filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase
# where retrospective stats accrue but whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted)
#
# NOTE(review): the ten lags read here are the SOS_tw_l60_* columns (_60 ... _51), presumably built from the
# opponents' l60 stat like the other lag columns - confirm that the opponents' l10 stat was not intended.
_l10_lag_cols = [f"SOS_tw_l60_opp_1st_sv%_yielded_IO_{lag}" for lag in range(60, 50, -1)]  # _60, _59, ..., _51

df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO_ws"] = df_player2[_l10_lag_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO_ws_ct"] = df_player2[_l10_lag_cols].count(axis=1)

_l10_expected_yield = df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO_ws"] / df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO_ws_ct"]
df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO"] = _l10_expected_yield.round(2)

# EY is EXPECTED YIELD; short alias read by the SOS adjustment that follows.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_1st_sv%_yielded_IO"]

# Mean % FIRST SERVE YIELDED (as returners) performance (l10_tw_ss) for ALL players per surface (clay, hard)
# and indoor/outdoor status. This is needed to evaluate and calibrate the schedule actually faced by a player
# prior to a given match being predicted.
# Because the game changes over time, the mean is taken over 2 year intervals (2009 by itself at the front
# end; that year is only used for retrospective stats accrual [and only for clay], not for modeling), and
# matched to the same date ranges during the SOS adjustment.
_L10_SOS_WINDOWS = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)


def _l10_yielded_field_mean(surf, indoor, window):
    """Mean of 'p_1st_sv%_yielded_l10_tw_ss_IO' over the df_player2 rows with the given t_surf and t_ind
    whose m_date lies in the half-open window (start, end]."""
    after, upto = window
    in_window = (df_player2['t_surf'] == surf) & (df_player2['t_ind'] == indoor) & (df_player2['m_date'] > after) & (df_player2['m_date'] <= upto)
    return df_player2.loc[in_window, 'p_1st_sv%_yielded_l10_tw_ss_IO'].mean()


# Clay (t_surf == 1), indoor (t_ind == 1)
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = [_l10_yielded_field_mean(1, 1, w) for w in _L10_SOS_WINDOWS]
# (nan, 60.96535714285715, 60.40317365269459, 61.9652046783626, 59.73274647887323, 61.08126436781609)

# Clay (t_surf == 1), outdoor (t_ind == 0)
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = [_l10_yielded_field_mean(1, 0, w) for w in _L10_SOS_WINDOWS]
# (61.18486263736269, 62.04073433420363, 61.73752881355925, 61.645605381166064, 61.582697010869644, 61.646700167504164)

# Hard (t_surf == 2), indoor (t_ind == 1)
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = [_l10_yielded_field_mean(2, 1, w) for w in _L10_SOS_WINDOWS]
# (59.8187926829268, 60.967145232815916, 60.80080137378356, 60.839911323328614, 60.1748222996514, 61.16520547945199)

# Hard (t_surf == 2), outdoor (t_ind == 0)
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = [_l10_yielded_field_mean(2, 0, w) for w in _L10_SOS_WINDOWS]
# (60.111475498115226, 59.816351888667874, 59.61435464119517, 60.10037401102852, 59.22781228746312, 60.743206857142795)
                                         
# Puts together the above: scales the player's actual performance over the last 10 matches
# ("p_1st_sv%_l10_tw_ss_IO") by field_mean / EY, where EY is the expected yield of the schedule of
# opponents faced and field_mean is the mean for the same surface, indoor/outdoor flag and date window.
# The result is rounded to 2 decimals and written only to the rows inside that window.
# NOTE(review): EY here was averaged from the SOS_tw_l60_* lag columns, so "opponents' aggregate
# performance over THEIR l10" may not be what is actually used - confirm.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Each statement handles one half-open date window (start, end]; a row dated exactly on a boundary
# belongs to the earlier window.

# Clay (t_surf == 1), indoor (t_ind == 1)
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2) 

# Clay (t_surf == 1), outdoor (t_ind == 0)
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

# Hard (t_surf == 2), indoor (t_ind == 1)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_l10_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 
                                         
del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_1st_sv%_yielded_IO_ws", "SOS_tw_l10_opp_1st_sv%_yielded_IO_ws_ct", "SOS_tw_l10_opp_1st_sv%_yielded_IO", "SOS_tw_l60_opp_1st_sv%_yielded_IO_60", "SOS_tw_l60_opp_1st_sv%_yielded_IO_59", "SOS_tw_l60_opp_1st_sv%_yielded_IO_58", "SOS_tw_l60_opp_1st_sv%_yielded_IO_57", "SOS_tw_l60_opp_1st_sv%_yielded_IO_56", "SOS_tw_l60_opp_1st_sv%_yielded_IO_55", "SOS_tw_l60_opp_1st_sv%_yielded_IO_54", "SOS_tw_l60_opp_1st_sv%_yielded_IO_53", "SOS_tw_l60_opp_1st_sv%_yielded_IO_52", "SOS_tw_l60_opp_1st_sv%_yielded_IO_51"],axis=1)

In [146]:
# 'p_1st_sv%_yielded_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) FIRST SERVE PERCENTAGE YIELDED performance of PLAYER (as a returner) over the 60 matches PRIOR TO the match being predicted
#
# Effects of this cell on df_player2:
#   * it is re-sorted by ['p_id','t_surf','m_date','m_rd_num'], all descending
#   * it gains the output column "p_1st_sv%_yielded_l60_tw_ss_SOS_adj"
#   * it gains the helper columns "SOS_tw_l60_opp_1st_sv%_60" ... "SOS_tw_l60_opp_1st_sv%_51",
#     which are deliberately NOT dropped here because the l10 version of this feature reads them
#   * every other helper column created below (EY, _ws, _ws_ct, the SOS mean, and lags _50 ... _1) is dropped again at the end

# Descending sort: within each (p_id, t_surf) group, shift(-k) then pulls the value from k rows further down,
# i.e. the k-th previous match (presumes m_date / m_rd_num sort chronologically -- TODO confirm m_date is datetime-like here).
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific (SS) FIRST SERVE PERCENTAGE (as servers) performance of player OPPONENTS over the maximum interval (60 matches) PRIOR TO the match being predicted
_sos = "SOS_tw_l60_opp_1st_sv%"

# The grouping is loop-invariant, so build it once instead of once per lag.
_opp_sv_grouped = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_sv%_l60_tw_ss']

# Column suffix counts down as the lag grows: lag 1 (most recent prior match) -> "_60", ..., lag 60 (oldest) -> "_1".
# Columns are created in the order _60, _59, ..., _1.
_lag_cols = []
for _lag in range(1, 61):
    _col = _sos + "_" + str(61 - _lag)
    df_player2[_col] = _opp_sv_grouped.shift(-_lag)
    _lag_cols.append(_col)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player2[_sos + "_ws"] = df_player2[_lag_cols].sum(axis=1)
# count() gives the number of non-NaN lags per row, so the quotient below is the mean over the lags actually available.
df_player2[_sos + "_ws_ct"] = df_player2[_lag_cols].count(axis=1)
# Rows with no prior matches give 0/0 -> NaN here.
df_player2[_sos] = (df_player2[_sos + "_ws"]/df_player2[_sos + "_ws_ct"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# % FIRST SERVE (as servers) the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface is expected to have
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2[_sos]

# Mean % FIRST SERVE MADE (as servers) performance (l60_tw_ss) for ALL players per surface (clay, hard). We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, we will calculate the mean per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment
# Each era is the half-open interval (start, end]: start date excluded, end date included.
# NOTE(review): these comparisons use 'M/D/YYYY' strings; they are only chronological if m_date is datetime-like at this point -- confirm.
_era_bounds = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means recorded by the author from an earlier run (eras in the order above):
#   clay: (61.38809065934069, 62.25705808080828, 61.59658451816754, 61.30044079794062, 61.26026731078898, 61.90919793014239)
#   hard: (60.36063031161469, 60.42149898373973, 59.94460477001693, 60.096126698429444, 59.53475267720558, 60.59268242898443)

# Puts together the above- factors the player's actual performance over the last 60 by schedule of opponents' aggregate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Rows on other surfaces or outside all eras are not assigned by this loop.
for _surf in (1, 2):  # t_surf: 1 = clay, 2 = hard
    for _start, _end in _era_bounds:
        _in_era = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        # We want in terms of pct 1st serve the field MADE on average
        _field_mean = df_player2.loc[_in_era, 'p_1st_sv%_l60_tw_ss'].mean()
        df_player2.loc[_in_era, "p_1st_sv%_yielded_l60_tw_ss_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# Drop the scratch columns, but keep lags _60 ... _51 (the first ten entries of _lag_cols) for the l10 feature that follows.
df_player2 = df_player2.drop(["EY", _sos + "_ws", _sos + "_ws_ct", _sos] + _lag_cols[10:], axis=1)

# Keep the notebook namespace free of this cell's temporaries.
del _sos, _opp_sv_grouped, _lag_cols, _lag, _col, _era_bounds, _surf, _start, _end, _in_era, _field_mean

In [147]:
# 'p_1st_sv%_yielded_l10_tw_ss_SOS_adj'
# Strength-of-Schedule adjusted (SOS_adj) version of the surface-specific (ss) FIRST SERVE PERCENTAGE YIELDED
# by the PLAYER (as returner) over the 10 matches PRIOR TO the match being predicted.

# Opponent lag columns _60 down to _51 (the ten columns the l10 window is built from).
recent_lag_cols = [f"SOS_tw_l60_opp_1st_sv%_{lag}" for lag in range(60, 50, -1)]
recent_lags = df_player2[recent_lag_cols]

# sum() and count() both skip NaN, so the mean below is taken over whichever lags exist instead of interpolating.
# Matches between players with few prior matches on a surface are filtered out at the modeling stage, and the
# early ramp-up years only accrue retrospective stats, so few modeled matches will be missing this feature.
# (ws = weighted sum, tw = time-weighted: naming is kept, but the value computed here is a plain sum / count mean.)
df_player2["SOS_tw_l10_opp_1st_sv%_ws"] = recent_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_1st_sv%_ws_ct"] = recent_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_1st_sv%"] = (
    df_player2["SOS_tw_l10_opp_1st_sv%_ws"] / df_player2["SOS_tw_l10_opp_1st_sv%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the first-serve % (as servers) that the schedule of opponents this player faced over
# their last 10 matches on this surface is expected to have.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_1st_sv%"]

# Because the game changes over time, the field mean is taken per surface over two-year windows
# (2009 stands alone at the front end; it is only used for retrospective stats accrual, not for modeling).
# Each window is (exclusive start, inclusive end).
date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means recorded in the original notebook, in window order:
#   clay (t_surf == 1): (61.65590084643305, 62.06802318486902, 61.69227244582065, 61.671667715544544, 61.358530895334404, 62.02806543837379)
#   hard (t_surf == 2): (60.49880936890067, 59.950358803986816, 60.14602280348777, 60.026589403973624, 59.64408252264333, 61.22972718086884)

# For every surface / window bucket: take the field's mean first-serve % MADE ('p_1st_sv%_l10_tw_ss') and scale
# the player's yielded figure by (field mean / expected yield of the opponents faced).
# An adjustment proportional to the opponents' deviation from the field mean performed better than the
# boosted or blunted variants that were tried.
for surface_code in (1, 2):  # 1 = clay, 2 = hard
    for window_start, window_end in date_windows:
        in_bucket = (
            (df_player2["t_surf"] == surface_code)
            & (df_player2["m_date"] > window_start)
            & (df_player2["m_date"] <= window_end)
        )
        field_mean = df_player2.loc[in_bucket, "p_1st_sv%_l10_tw_ss"].mean()
        df_player2.loc[in_bucket, "p_1st_sv%_yielded_l10_tw_ss_SOS_adj"] = (
            df_player2["p_1st_sv%_yielded_l10_tw_ss"] * (field_mean / df_player2["EY"])
        ).round(2)

# Remove the scratch columns, including the ten lag columns that are no longer needed.
df_player2 = df_player2.drop(
    columns=[
        "EY",
        "SOS_tw_l10_opp_1st_sv%_ws",
        "SOS_tw_l10_opp_1st_sv%_ws_ct",
        "SOS_tw_l10_opp_1st_sv%",
    ]
    + recent_lag_cols
)

# Leave no cell-local temporaries behind in the kernel namespace.
del recent_lag_cols, recent_lags, date_windows, surface_code, window_start, window_end, in_bucket, field_mean

In [148]:
# 'p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj'
# Strength-of-Schedule adjusted (SOS_adj) version of the surface-specific (ss), indoor/outdoor-specific (IO)
# FIRST SERVE PERCENTAGE YIELDED by the PLAYER (as returner) over the 60 matches PRIOR TO the match being predicted.

# Order rows descending by player, surface, indoor flag, match date and round, so that within each
# (player, surface, indoor) group a negative shift pulls values from rows further down that ordering.
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num'], ascending=False)

# Lay out the opponents' first-serve % (as servers) from the up-to-60 following rows of the same group as
# columns: ..._60 holds the row one step down (shift -1), ..._1 the row sixty steps down (shift -60).
opp_1st_sv_by_group = df_player2.groupby(['p_id', 't_surf', 't_ind'])['p_opp_1st_sv%_l60_tw_ss_IO']
for rows_down in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_1st_sv%_{61 - rows_down}"] = opp_1st_sv_by_group.shift(-rows_down)

del opp_1st_sv_by_group, rows_down

# sum() and count() both skip NaN, so the mean below is taken over whichever lags exist instead of interpolating.
# Matches between players with few prior matches on a surface are filtered out at the modeling stage, and the
# early ramp-up years only accrue retrospective stats, so few modeled matches will be missing this feature.
# (ws = weighted sum, tw = time-weighted: naming is kept, but the value computed here is a plain sum / count mean.)
all_lag_cols = [f"SOS_tw_l60_opp_1st_sv%_{lag}" for lag in range(60, 0, -1)]
all_lags = df_player2[all_lag_cols]

df_player2["SOS_tw_l60_opp_1st_sv%_ws"] = all_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_1st_sv%_ws_ct"] = all_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_1st_sv%"] = (
    df_player2["SOS_tw_l60_opp_1st_sv%_ws"] / df_player2["SOS_tw_l60_opp_1st_sv%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the first-serve % (as servers) that the schedule of opponents this player faced over
# their last 60 matches on this surface is expected to have.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_1st_sv%"]

del all_lag_cols, all_lags

# Field-wide mean of 'p_1st_sv%_l60_tw_ss_IO' (first-serve % the field MADE on average) on clay (t_surf == 1),
# split by indoor (t_ind == 1) and outdoor (t_ind == 0). These calibrate the schedule a player actually faced.
# Because the game changes over time, the mean is taken over two-year windows (2009 stands alone at the front
# end; it is only used for retrospective stats accrual, not for modeling). Windows are (exclusive start, inclusive end).
clay_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Clay, indoor
(
    mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
    mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i,
) = [
    df_player2.loc[
        (df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1)
        & (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end),
        'p_1st_sv%_l60_tw_ss_IO',
    ].mean()
    for window_start, window_end in clay_windows
]
# Values recorded in the original notebook:
# (nan, 63.16714285714288, 61.76550898203588, 61.447076023391794, 60.26246478873241, 61.94850574712643)

# Clay, outdoor
(
    mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
    mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o,
) = [
    df_player2.loc[
        (df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0)
        & (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end),
        'p_1st_sv%_l60_tw_ss_IO',
    ].mean()
    for window_start, window_end in clay_windows
]
# Values recorded in the original notebook:
# (61.38809065934069, 62.235296997389135, 61.59443389830507, 61.28912038634013, 61.279629755434726, 61.897728643216034)

del clay_windows

# Field-wide mean of 'p_1st_sv%_l60_tw_ss_IO' (first-serve % the field MADE on average) for HARD court, INDOOR
# matches, one value per date window (exclusive start, inclusive end).
# Hard court is t_surf == 2, the same code used by the hard-outdoor means and by the masks of the rows these
# means are applied to. Filtering on t_surf == 1 here would silently reuse the clay-indoor means.
(
    mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
    mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i,
) = [
    df_player2.loc[
        (df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1)
        & (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end),
        'p_1st_sv%_l60_tw_ss_IO',
    ].mean()
    for window_start, window_end in [
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    ]
]
# Values recorded in the original notebook:
# (60.17274390243901, 61.28734478935706, 60.94774470520887, 60.42612551159601, 60.11593728222994, 60.84636301369854)

# Field-wide mean of 'p_1st_sv%_l60_tw_ss_IO' (first-serve % the field MADE on average) for hard court
# (t_surf == 2), outdoor (t_ind == 0) matches, one value per date window (exclusive start, inclusive end).
(
    mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
    mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o,
) = [
    df_player2.loc[
        (df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0)
        & (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end),
        'p_1st_sv%_l60_tw_ss_IO',
    ].mean()
    for window_start, window_end in [
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    ]
]
# Values recorded in the original notebook:
# (60.61414647280567, 60.12488568588459, 59.652691648297775, 59.863895948213866, 59.436656087055056, 60.27742628571435)

# Combine the pieces: scale the player's own yielded first-serve % over the last 60 matches by
# (field mean / expected yield of the opponents faced), where the opponents' figures are their aggregate
# performance over THEIR l60 prior to facing the player.
# An adjustment proportional to the opponents' deviation from the field mean performed better than the
# boosted or blunted variants that were tried.
adj_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# (t_surf code, t_ind code, field means in date-window order)
adj_buckets = (
    # clay, indoor
    (1, 1, (mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i)),
    # clay, outdoor
    (1, 0, (mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o)),
    # hard, indoor: only the first four windows are handled here (zip stops at the shorter sequence);
    # the later windows are presumably assigned by the statements that follow this block.
    (2, 1, (mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i)),
)

for surface_code, indoor_code, field_means in adj_buckets:
    for (window_start, window_end), field_mean in zip(adj_windows, field_means):
        in_bucket = (
            (df_player2["t_surf"] == surface_code)
            & (df_player2["t_ind"] == indoor_code)
            & (df_player2['m_date'] > window_start)
            & (df_player2['m_date'] <= window_end)
        )
        df_player2.loc[in_bucket, "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = (
            df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"] * (field_mean / df_player2["EY"])
        ).round(2)

# Remove loop temporaries; the mean_* scalars stay defined for the remainder of the cell.
del adj_windows, adj_buckets, surface_code, indoor_code, field_means, window_start, window_end, field_mean, in_bucket
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_1st_sv%_yielded_l60_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_1st_sv%_ws", "SOS_tw_l60_opp_1st_sv%_ws_ct", "SOS_tw_l60_opp_1st_sv%", "SOS_tw_l60_opp_1st_sv%_50", "SOS_tw_l60_opp_1st_sv%_49", "SOS_tw_l60_opp_1st_sv%_48", "SOS_tw_l60_opp_1st_sv%_47", "SOS_tw_l60_opp_1st_sv%_46", "SOS_tw_l60_opp_1st_sv%_45", "SOS_tw_l60_opp_1st_sv%_44", "SOS_tw_l60_opp_1st_sv%_43", "SOS_tw_l60_opp_1st_sv%_42", "SOS_tw_l60_opp_1st_sv%_41", "SOS_tw_l60_opp_1st_sv%_40", "SOS_tw_l60_opp_1st_sv%_39", "SOS_tw_l60_opp_1st_sv%_38", "SOS_tw_l60_opp_1st_sv%_37", "SOS_tw_l60_opp_1st_sv%_36", "SOS_tw_l60_opp_1st_sv%_35", "SOS_tw_l60_opp_1st_sv%_34", "SOS_tw_l60_opp_1st_sv%_33", "SOS_tw_l60_opp_1st_sv%_32", "SOS_tw_l60_opp_1st_sv%_31", "SOS_tw_l60_opp_1st_sv%_30", "SOS_tw_l60_opp_1st_sv%_29", "SOS_tw_l60_opp_1st_sv%_28", "SOS_tw_l60_opp_1st_sv%_27", "SOS_tw_l60_opp_1st_sv%_26", "SOS_tw_l60_opp_1st_sv%_25", "SOS_tw_l60_opp_1st_sv%_24", "SOS_tw_l60_opp_1st_sv%_23", "SOS_tw_l60_opp_1st_sv%_22", "SOS_tw_l60_opp_1st_sv%_21", "SOS_tw_l60_opp_1st_sv%_20", "SOS_tw_l60_opp_1st_sv%_19", "SOS_tw_l60_opp_1st_sv%_18", "SOS_tw_l60_opp_1st_sv%_17", "SOS_tw_l60_opp_1st_sv%_16", "SOS_tw_l60_opp_1st_sv%_15", "SOS_tw_l60_opp_1st_sv%_14", "SOS_tw_l60_opp_1st_sv%_13", "SOS_tw_l60_opp_1st_sv%_12", "SOS_tw_l60_opp_1st_sv%_11", "SOS_tw_l60_opp_1st_sv%_10", "SOS_tw_l60_opp_1st_sv%_9", "SOS_tw_l60_opp_1st_sv%_8", "SOS_tw_l60_opp_1st_sv%_7", "SOS_tw_l60_opp_1st_sv%_6", "SOS_tw_l60_opp_1st_sv%_5", "SOS_tw_l60_opp_1st_sv%_4", "SOS_tw_l60_opp_1st_sv%_3", "SOS_tw_l60_opp_1st_sv%_2", "SOS_tw_l60_opp_1st_sv%_1"],axis=1)

In [149]:
# 'p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's surface-specific (ss), indoor/outdoor-specific (IO)
# FIRST SERVE PERCENTAGE YIELDED (as returner) over the 10 matches PRIOR TO the match being predicted.
#
# Steps performed by this cell:
#   1. EY (expected yield): mean of the opponents' first-serve % stored in the ten lag columns _60 ... _51.
#   2. Field mean of 'p_1st_sv%_l10_tw_ss_IO' for every (surface, indoor/outdoor, era) segment.
#   3. adjusted value = player's yielded value * (field mean / EY), rounded to 2 decimals, written per segment.
#   4. Drop the ten lag columns consumed here.

# The ten lag columns this notebook uses for the "last 10" window, in the original 60 -> 51 order.
l10_lag_cols = ["SOS_tw_l60_opp_1st_sv%_" + str(lag) for lag in range(60, 50, -1)]
lag_values = df_player2[l10_lag_cols]

# Sum / count both ignore NaN, so the mean is taken over the matches actually available instead of interpolating.
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out, so
# very few modeled matches lack this feature (there is also a multi-year ramp-up phase in which retrospective
# stats accrue but whose matches are not used for modeling).
# A row with no lag values at all yields 0 / 0 = NaN.
# EY = EXPECTED YIELD: the first-serve % (as servers) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over
# their last 10 matches on this surface is expected to have.
expected_yield = (lag_values.sum(axis=1) / lag_values.count(axis=1)).round(2)

# Field means are taken per surface and indoor/outdoor setting over 2-year intervals because the game changes over
# time (2009 stands alone at the front end; per the original notes that year is only used for retrospective stats
# accrual and is not modeled). Each interval is (exclusive lower bound, inclusive upper bound).
# NOTE(review): the comparisons below pass date strings straight to pandas, exactly as the original did; this
# presumably relies on 'm_date' being datetime64 at this point - confirm.
era_bounds = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# (t_surf, t_ind) pairs in the original processing order: clay indoor, clay outdoor, hard indoor, hard outdoor.
surface_settings = [(1, 1), (1, 0), (2, 1), (2, 0)]

# Reference field means noted in the original cell comments (eras 1-6; not re-verified here):
#   clay indoor : (nan, 63.16714285714288, 61.76299401197599, 61.52853801169588, 60.30105633802818, 61.936206896551724)
#   clay outdoor: (61.51053571428575, 62.25780678851181, 61.71395932203388, 61.437630217316254, 61.5653668478261, 61.81955778894477)
#   hard indoor : (60.2639268292683, 61.241335920177455, 60.77828277046371, 60.45150068212812, 60.07275261324033, 61.171116438356194)
#   hard outdoor: (60.38831987075935, 59.95947813121277, 59.562341415625674, 60.075032366339045, 59.27353888007249, 60.75386285714277)

# Factor the player's actual performance over the last 10 by the schedule of opponents' aggregate performance over
# THEIR l10 prior to facing the player. Adjustment proportional to the opponents' deviation from the field mean
# performed better than various boosted or blunted versions attempted.
for surf_code, indoor_flag in surface_settings:
    for era_start, era_end in era_bounds:
        in_segment = (
            (df_player2["t_surf"] == surf_code)
            & (df_player2["t_ind"] == indoor_flag)
            & (df_player2["m_date"] > era_start)
            & (df_player2["m_date"] <= era_end)
        )
        # Field mean of first-serve % MADE in this segment. The same mask selects the rows to average and the rows
        # to adjust, so hard-indoor rows are calibrated against the hard-indoor mean (the original computed the
        # "hard indoor" means from t_surf == 1, i.e. clay, rows).
        field_mean = df_player2.loc[in_segment, "p_1st_sv%_l10_tw_ss_IO"].mean()
        # The right-hand side is a full-length Series; .loc aligns it on the index and writes only the segment rows.
        # Rows that fall in no segment keep NaN in the adjusted column.
        df_player2.loc[in_segment, "p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"] = (
            df_player2["p_1st_sv%_yielded_l10_tw_ss_IO"] * (field_mean / expected_yield)
        ).round(2)

# The lag columns are no longer needed once the adjusted feature exists.
df_player2 = df_player2.drop(l10_lag_cols, axis=1)

# Keep the kernel namespace free of this cell's temporaries.
del l10_lag_cols, lag_values, expected_yield, era_bounds, surface_settings, surf_code, indoor_flag, era_start, era_end, in_segment, field_mean

In [150]:
# 'p_sv_pts_won%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

# Order rows newest-first within each (player, surface) so a negative shift pulls values from earlier matches.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Opponents' surface-specific (SS) RETURN POINTS WON values for up to 60 matches PRIOR TO the match being predicted.
# Column "_60" takes the value one row further down within the (p_id, t_surf) group (shift(-1)), "_59" two rows
# down, ... and "_1" sixty rows down; rows without that many later rows in the group get NaN.
for rows_back in range(1, 61):
    df_player2["SOS_tw_l60_opp_ret_pts_won%_" + str(61 - rows_back)] = df_player2.groupby(['p_id','t_surf'])['p_opp_ret_pts_won%_l60_tw_ss'].shift(-rows_back)
del rows_back

# Row-wise average of the 60 lagged opponent values (slots 60 ... 1).
# A NaN-skipping sum divided by the non-null count is used instead of interpolating missing lags.
# Per the original author's note: matches between players with few prior matches on a surface are
# filtered out at the modeling stage, and the early "ramp-up" years only accrue retrospective stats,
# so very few modeled matches should lack this feature.
# (ws = weighted sum; tw = time-weighted. No per-lag weights are applied in this block, so the
# result is the plain mean of whichever lags are present.)
_sos_l60_cols = [f"SOS_tw_l60_opp_ret_pts_won%_{_slot}" for _slot in range(60, 0, -1)]
_sos_l60_lags = df_player2[_sos_l60_cols]
df_player2["SOS_tw_l60_opp_ret_pts_won%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_ret_pts_won%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_ret_pts_won%_ws"] / df_player2["SOS_tw_l60_opp_ret_pts_won%_ws_ct"]
).round(2)
del _sos_l60_cols, _sos_l60_lags

# EY = EXPECTED YIELD: the % of SERVE points the schedule of opponents faced by this player over the
# lagged matches on this surface is expected to concede (100 minus their mean return-points-won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_ret_pts_won%"]

# Field baseline and Strength-of-Schedule adjustment for 'p_sv_pts_won%_l60_tw_ss'.
#
# Step 1 - field baseline: for each surface and date window, take 100 minus the mean of
# 'p_ret_pts_won%_l60_tw_ss' over ALL rows in that cell, i.e. the % of serve points the field
# ALLOWS on average. It is needed to calibrate the schedule a player actually faced.
# Because the game changes over time, the mean is taken per surface over 2-year windows (2009 stands
# alone at the front end; per the original author that year is only used for retrospective stats
# accrual [and only for clay], not for modeling).
# NOTE(review): the original comment here said the baseline "should be 50%"; the values recorded
# from a prior run (below) are roughly 60-64, so that remark looks copied from another feature - confirm.
# Recorded QC values, windows 1..6:
#   clay: (60.36497939560435, 60.957935606060545, 61.14281200631905, 61.66031531531527, 62.06639291465379, 61.95438551099616)
#   hard: (62.5628222379603, 63.49542174796736, 63.3019812606473, 63.962841009352324, 63.830066292707855, 64.01893349209047)
#
# Step 2 - adjustment: scale the player's own l60 serve-points-won % by (field baseline / EY), where
# EY is the yield expected from the opponents actually faced. The adjustment is proportional to the
# opponents' deviation from the field mean; per the original author this performed better than
# various boosted or blunted versions attempted.
#
# Windows are (exclusive start, inclusive end). Rows outside every window, or on another surface,
# are never assigned by this block.
# NOTE(review): assumes 'm_date' is datetime-like so that comparing against these "m/d/yyyy" strings
# is chronological rather than lexical - confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Build every (row mask, field baseline) pair up front, clay windows first and then hard windows,
# which is the order the adjustments are applied in below.
_sos_cells = []
for _surf_code in (1, 2):  # t_surf codes: 1 = clay, 2 = hard
    for _win_start, _win_end in _sos_windows:
        _in_cell = (df_player2['t_surf'] == _surf_code) & (df_player2['m_date'] > _win_start) & (df_player2['m_date'] <= _win_end)
        _field_mean = 100 - (df_player2.loc[_in_cell, 'p_ret_pts_won%_l60_tw_ss'].mean())
        _sos_cells.append((_in_cell, _field_mean))

# The right-hand side is computed for the whole frame; .loc aligns on the index and writes only the
# rows selected by the mask.
for _in_cell, _field_mean in _sos_cells:
    df_player2.loc[_in_cell, "p_sv_pts_won%_l60_tw_ss_SOS_adj"] = ((df_player2["p_sv_pts_won%_l60_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# Leave no scratch names behind in the notebook namespace.
del _sos_windows, _sos_cells, _surf_code, _win_start, _win_end, _in_cell, _field_mean

# Remove the l60 scratch columns: EY, the sum/count/mean helpers and lag slots 50 ... 1.
# Lag slots 60 ... 51 are deliberately kept; the l10 cell that follows reads them and drops them itself.
_sos_l60_scratch = [
    "EY",
    "SOS_tw_l60_opp_ret_pts_won%_ws",
    "SOS_tw_l60_opp_ret_pts_won%_ws_ct",
    "SOS_tw_l60_opp_ret_pts_won%",
]
_sos_l60_scratch.extend(f"SOS_tw_l60_opp_ret_pts_won%_{_slot}" for _slot in range(50, 0, -1))
df_player2 = df_player2.drop(columns=_sos_l60_scratch)
del _sos_l60_scratch

In [151]:
# 'p_sv_pts_won%_l10_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS)
# SERVE POINTS WON performance of the PLAYER over the 10 matches PRIOR TO the match being predicted.

# Row-wise average of the 10 most recent lag slots (60 ... 51) left in the frame by the l60 cell.
# A NaN-skipping sum divided by the non-null count is used instead of interpolating missing lags.
# Per the original author's note: matches between players with few prior matches on a surface are
# filtered out at the modeling stage, and the early "ramp-up" years only accrue retrospective stats,
# so very few modeled matches should lack this feature.
# (ws = weighted sum; tw = time-weighted. No per-lag weights are applied in this block, so the
# result is the plain mean of whichever lags are present.)
_sos_l10_cols = [f"SOS_tw_l60_opp_ret_pts_won%_{_slot}" for _slot in range(60, 50, -1)]
_sos_l10_lags = df_player2[_sos_l10_cols]
df_player2["SOS_tw_l10_opp_ret_pts_won%_ws"] = _sos_l10_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_ret_pts_won%_ws_ct"] = _sos_l10_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_ret_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_ret_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_ret_pts_won%_ws_ct"]
).round(2)
del _sos_l10_cols, _sos_l10_lags

# EY = EXPECTED YIELD: the % of SERVE points the schedule of opponents faced by this player over
# their last 10 matches on this surface is expected to concede (100 minus their mean return-points-won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_ret_pts_won%"]

# Field baseline and Strength-of-Schedule adjustment for 'p_sv_pts_won%_l10_tw_ss'.
#
# Step 1 - field baseline: for each surface and date window, take 100 minus the mean of
# 'p_ret_pts_won%_l10_tw_ss' over ALL rows in that cell, i.e. the % of serve points the field
# ALLOWS on average. It is needed to calibrate the schedule a player actually faced.
# Because the game changes over time, the mean is taken per surface over 2-year windows (2009 stands
# alone at the front end; per the original author that year is only used for retrospective stats
# accrual [and only for clay], not for modeling).
# NOTE(review): the original comment here said the baseline "should be 50%"; the values recorded
# from a prior run (below) are roughly 60-64, so that remark looks copied from another feature - confirm.
# Recorded QC values, windows 1..6:
#   clay: (60.431964285714265, 60.93922348484849, 61.30464454976309, 61.825604890604886, 62.18508534621577, 61.9458764553687)
#   hard: (62.740198300283346, 63.51234756097569, 63.28705110732531, 64.14400917593093, 63.7061363250042, 64.22008334750817)
#
# Step 2 - adjustment: scale the player's own l10 serve-points-won % by (field baseline / EY), where
# EY is the yield expected from the opponents faced over the last 10 matches. The adjustment is
# proportional to the opponents' deviation from the field mean; per the original author this
# performed better than various boosted or blunted versions attempted.
# NOTE(review): the original comment described the opponents' performance as being over "THEIR l10",
# but the lag columns averaged into EY are named SOS_tw_l60_* and presumably hold the opponents'
# l60-based values - confirm which is intended.
#
# Windows are (exclusive start, inclusive end). Rows outside every window, or on another surface,
# are never assigned by this block.
# NOTE(review): assumes 'm_date' is datetime-like so that comparing against these "m/d/yyyy" strings
# is chronological rather than lexical - confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Build every (row mask, field baseline) pair up front, clay windows first and then hard windows,
# which is the order the adjustments are applied in below.
_sos_cells = []
for _surf_code in (1, 2):  # t_surf codes: 1 = clay, 2 = hard
    for _win_start, _win_end in _sos_windows:
        _in_cell = (df_player2['t_surf'] == _surf_code) & (df_player2['m_date'] > _win_start) & (df_player2['m_date'] <= _win_end)
        _field_mean = 100 - (df_player2.loc[_in_cell, 'p_ret_pts_won%_l10_tw_ss'].mean())
        _sos_cells.append((_in_cell, _field_mean))

# The right-hand side is computed for the whole frame; .loc aligns on the index and writes only the
# rows selected by the mask.
for _in_cell, _field_mean in _sos_cells:
    df_player2.loc[_in_cell, "p_sv_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_sv_pts_won%_l10_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# Leave no scratch names behind in the notebook namespace.
del _sos_windows, _sos_cells, _surf_code, _win_start, _win_end, _in_cell, _field_mean

# Remove the l10 scratch columns: EY, the sum/count/mean helpers and the last remaining
# lag slots (60 ... 51).
_sos_l10_scratch = [
    "EY",
    "SOS_tw_l10_opp_ret_pts_won%_ws",
    "SOS_tw_l10_opp_ret_pts_won%_ws_ct",
    "SOS_tw_l10_opp_ret_pts_won%",
]
_sos_l10_scratch.extend(f"SOS_tw_l60_opp_ret_pts_won%_{_slot}" for _slot in range(60, 50, -1))
df_player2 = df_player2.drop(columns=_sos_l10_scratch)
del _sos_l10_scratch

In [152]:
# 'p_sv_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS)
# SERVE POINTS WON performance of the PLAYER over the 60 matches PRIOR TO the match being predicted,
# here additionally split by 't_ind' (the "IO" variant - presumably indoor/outdoor; confirm).

# Order rows descending on every key, so that within a (p_id, t_surf, t_ind) group the rows run from
# the latest m_date / m_rd_num to the earliest.
df_player2 = df_player2.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# Lay out the opponents' RETURN POINTS WON values ('p_opp_ret_pts_won%_l60_tw_ss_IO') for up to 60
# preceding rows of the same group as 60 side-by-side columns: slot 60 holds the row immediately
# below (shift(-1)), slot 1 the row 60 below (shift(-60)). Rows without that much history get NaN.
# The groupby is built once and reused; none of the columns added here are grouping keys or the
# shifted source column, so every shift sees the same groups as a freshly built groupby would.
_sos_io_hist = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_ret_pts_won%_l60_tw_ss_IO']
for _sos_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_ret_pts_won%_{61 - _sos_lag}"] = _sos_io_hist.shift(-_sos_lag)
del _sos_io_hist, _sos_lag

# Average the opponents' return-points-won % over whichever of the 60 slots are populated.
# Row-wise sum() skips NaN and count() tallies only the non-missing slots, so sum / count is the mean of
# the values that exist and no interpolation is needed. Author's note: in the modeling stage, matches
# between players with few prior matches on a given surface are filtered out, so very few modeled matches
# will lack this feature (there is also a multi-year ramp-up phase in which retrospective stats accrue but
# whose matches are not used for modeling).
# (ws = weighted sum; tw = time-weighted)
_sos_l60_window = df_player2[[f"SOS_tw_l60_opp_ret_pts_won%_{slot}" for slot in range(60, 0, -1)]]
df_player2["SOS_tw_l60_opp_ret_pts_won%_ws"] = _sos_l60_window.sum(axis=1)
df_player2["SOS_tw_l60_opp_ret_pts_won%_ws_ct"] = _sos_l60_window.count(axis=1)
del _sos_l60_window  # temporary 60-column selection, not needed afterwards

# Rows with no populated slot give 0 / 0, which pandas evaluates to NaN.
df_player2["SOS_tw_l60_opp_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_ret_pts_won%_ws"]
    .div(df_player2["SOS_tw_l60_opp_ret_pts_won%_ws_ct"])
    .round(2)
)

# EY = EXPECTED YIELD: the % of SERVE points the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface and IO status is expected to yield (100 minus the opponents' mean
# return-points-won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_ret_pts_won%"]

# Field-wide baseline: mean % SERVE pts won performance (l60_tw_ss) for ALL players, per surface
# (clay, hard) and IO status. It is needed to evaluate and calibrate the schedule a player actually faced
# prior to the match being predicted. For each stratum and period the baseline is
# 100 - mean('p_ret_pts_won%_l60_tw_ss_IO'), i.e. expressed as the % of total points the field ALLOWS on average.
# Because the game changes over time, the mean is taken per surface over 2-year intervals (2009 by itself
# at the front end; per the author that year is only used for retrospective stats accrual [and only for
# clay], not for modeling), and each mean is applied to the rows in the same date range.
#
# Field means noted in the original comments, presumably from an earlier run (periods 1-6):
#   clay indoor : (nan, 56.41428571428571, 61.05137724550896, 63.2390058479532, 61.729366197183104, 62.90666666666666)
#   clay outdoor: (60.36497939560435, 60.93853459530024, 61.08033898305085, 61.50788892721632, 62.06317595108697, 61.88410720268)
#   hard indoor : (62.94398780487806, 64.31386917960089, 64.18406410990264, 64.3583697135061, 64.04655749128916, 64.00767808219177)
#   hard outdoor: (62.03459343026393, 62.948884194830924, 62.89086211119271, 63.77561016542802, 63.7604420766266, 63.89965714285724)
#
# The adjustment puts the pieces together: the player's actual serve performance over the last 60 is scaled
# by field_mean / EY, where EY reflects the schedule of opponents' aggregate performance over THEIR l60
# prior to facing the player. Author's note: an adjustment proportional to the opponents' deviation from
# the field mean performed better than the various boosted or blunted versions attempted.
#
# NOTE(review): the period bounds are strings compared against 'm_date'; the comparison is chronological
# only if 'm_date' is datetime64 at this point (plain strings would compare lexicographically) - confirm.
# NOTE(review): each period mean is computed over every row in that period and then applied to rows of
# the same period - confirm this is the intended calibration window.

# (exclusive lower bound, inclusive upper bound) for periods 1-6
_sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# (t_surf, t_ind) pairs in the original processing order: clay indoor, clay outdoor, hard indoor, hard outdoor
_sos_strata = [(1, 1), (1, 0), (2, 1), (2, 0)]

for _sos_surf, _sos_ind in _sos_strata:
    for _sos_start, _sos_end in _sos_periods:
        _sos_rows = (
            (df_player2['t_surf'] == _sos_surf)
            & (df_player2['t_ind'] == _sos_ind)
            & (df_player2['m_date'] > _sos_start)
            & (df_player2['m_date'] <= _sos_end)
        )
        # An empty selection yields NaN here, and the assignment below then touches no rows.
        _sos_field_mean = 100 - df_player2.loc[_sos_rows, 'p_ret_pts_won%_l60_tw_ss_IO'].mean()
        # Only the selected rows are written; the right-hand side is aligned to them by index.
        df_player2.loc[_sos_rows, "p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
            df_player2["p_sv_pts_won%_l60_tw_ss_IO"] * (_sos_field_mean / df_player2["EY"])
        ).round(2)

# Clear the temporaries so nothing leaks into later cells.
del _sos_periods, _sos_strata, _sos_surf, _sos_ind, _sos_start, _sos_end, _sos_rows, _sos_field_mean

# Drop the l60 working columns. Slots 60..51 are deliberately kept: they are not in this list and the
# l10 calculation that follows reads them.
df_player2 = df_player2.drop(
    ["EY", "SOS_tw_l60_opp_ret_pts_won%_ws", "SOS_tw_l60_opp_ret_pts_won%_ws_ct", "SOS_tw_l60_opp_ret_pts_won%"]
    + [f"SOS_tw_l60_opp_ret_pts_won%_{slot}" for slot in range(50, 0, -1)],
    axis=1,
)

In [153]:
# 'p_sv_pts_won%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

# The ten opponent return-performance lag columns (suffixes _60 down to _51) used for the last-10 schedule average.
# They are generated once here and reused for the sum, the count and the final clean-up drop.
sos_l10_cols = ["SOS_tw_l60_opp_ret_pts_won%_" + str(sos_lag) for sos_lag in range(60, 50, -1)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# sum / count of the non-NaN lag values = plain mean over however many of the ten lag columns are populated for the row.
df_player2["SOS_tw_l10_opp_ret_pts_won%_ws"] = df_player2[sos_l10_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_ret_pts_won%_ws_ct"] = df_player2[sos_l10_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_ret_pts_won%"] = (df_player2["SOS_tw_l10_opp_ret_pts_won%_ws"]/df_player2["SOS_tw_l10_opp_ret_pts_won%_ws_ct"]).round(2) #see note above on sum/count
#(ws = weighted sum; tw = time-weighted)

# % SERVE pts won the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface and IO status is expected to YIELD
# EY is EXPECTED YIELD
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_ret_pts_won%"]

# Mean % SERVE pts won performance (l10_tw_ss) for ALL players per surface (clay, hard) and IO status. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment

# (lower bound, upper bound) of each date window; the lower bound is exclusive (>) and the upper bound inclusive (<=).
# NOTE(review): the bounds are "M/D/YYYY" strings compared directly against m_date; this is only a chronological
# comparison if m_date is a datetime column at this point (a plain string column would compare lexically) - confirm.
sos_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Puts together the above- factors the player's actual performance over the last 10 by schedule of opponents' aggregate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 

# One pass per (surface, indoor/outdoor, date window) bucket: t_surf 1 = clay, 2 = hard; t_ind 1 = indoor, 0 = outdoor.
# The same mask selects the rows for the field mean and the rows that receive the adjusted value, so the two cannot drift apart.
# Every bucket uses the l10 return column for its field mean (the 2014-2016 clay/indoor bucket previously read the l60
# column, which was inconsistent with the other 23 buckets and with the l10 feature being built here).
for sos_surf in (1, 2):
    for sos_ind in (1, 0):
        for sos_start, sos_end in sos_date_windows:
            sos_mask = (df_player2["t_surf"] == sos_surf) & (df_player2["t_ind"] == sos_ind) & (df_player2['m_date'] > sos_start) & (df_player2['m_date'] <= sos_end)
            # We want in terms of pct total pts the field ALLOWS on average
            sos_field_mean = 100 - (df_player2.loc[sos_mask, 'p_ret_pts_won%_l10_tw_ss_IO'].mean())
            # Scale the player's own l10 serve performance by (field mean yield / expected yield of the schedule faced).
            df_player2.loc[sos_mask, "p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_sv_pts_won%_l10_tw_ss_IO"])*(sos_field_mean/df_player2["EY"])).round(2)

# Remove the intermediate columns (expected yield, sum/count/mean helpers and the ten lag columns) so only the adjusted feature remains.
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_ret_pts_won%_ws", "SOS_tw_l10_opp_ret_pts_won%_ws_ct", "SOS_tw_l10_opp_ret_pts_won%"] + sos_l10_cols, axis=1)

# Keep the notebook namespace free of this cell's temporaries.
del sos_l10_cols, sos_date_windows, sos_surf, sos_ind, sos_start, sos_end, sos_mask, sos_field_mean


In [154]:
# 'p_1st_sv_pts_won%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) FIRST SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific (SS) FIRST SERVE RETURN POINTS WON performance of player OPPONENTS over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_60"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-1)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_59"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-2)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_58"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-3)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_57"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-4)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_56"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-5)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_55"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-6)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_54"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-7)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_53"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-8)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_52"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-9)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_51"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-10)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_50"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-11)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_49"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-12)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_48"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-13)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_47"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-14)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_46"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-15)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_45"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-16)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_44"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-17)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_43"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-18)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_42"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-19)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_41"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-20)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_40"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-21)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_39"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-22)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_38"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-23)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_37"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-24)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_36"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-25)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_35"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-26)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_34"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-27)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_33"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-28)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_32"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-29)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_31"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-30)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_30"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-31)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_29"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-32)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_28"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-33)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_27"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-34)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_26"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-35)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_25"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-36)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_24"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-37)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_23"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-38)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_22"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-39)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_21"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-40)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_20"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-41)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_19"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-42)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_18"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-43)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_17"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-44)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_16"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-45)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_15"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-46)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_14"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-47)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_13"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-48)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_12"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-49)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_11"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-50)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_10"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-51)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_9"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-52)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_8"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-53)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_7"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-54)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_6"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-55)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_5"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-56)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_4"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-57)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_3"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-58)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_2"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-59)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_1"] = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_ret_pts_won%_l60_tw_ss'].shift(-60)

# --- Strength-of-schedule (SOS) adjustment for 'p_1st_sv_pts_won%_l60_tw_ss' ---------------------------------
# Step 1: collapse the 60 lagged opponent columns (SOS_tw_l60_opp_1st_ret_pts_won%_60 ... _1) into one figure per row.
# Row-wise sum() and count() both skip NaN, so a player with fewer than 60 prior matches on this surface is averaged
# over the lags that exist instead of being interpolated. In the modeling stage, matches between players with few
# prior matches on a surface are filtered out, and the multi-year ramp-up phase only accrues retrospective stats,
# so very few modeled matches will lack this feature.
# Naming: ws = weighted sum; tw = time-weighted. Here the sum is a plain (unweighted) sum, so ws / ws_ct is a simple mean.
# The column list is kept in 60 -> 1 order so the row-wise summation order is unchanged.
_sos_l60_cols = [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(60, 0, -1)]
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws"] = df_player2[_sos_l60_cols].sum(axis=1)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct"] = df_player2[_sos_l60_cols].count(axis=1)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws"] / df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct"]
).round(2)

# Step 2: EY (EXPECTED YIELD) = % 1st SERVE pts won that the SCHEDULE OF OPPONENTS faced by this player over their
# last 60 matches on this surface is expected to yield, i.e. 100 minus the opponents' mean 1st-return pts won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_1st_ret_pts_won%"]

# Step 3: per surface and per date bin, compute the field-wide mean % of 1st serve pts ALLOWED
# (100 - mean of 'p_1st_ret_pts_won%_l60_tw_ss') and scale the player's own l60 stat by field_mean / EY.
# Because the game changes over time, the field mean is taken over 2-year intervals (2009 stands alone at the front;
# per the author that year is used only for retrospective stats accrual [and only for clay], not for modeling).
# Each bin is (exclusive lower bound, inclusive upper bound). Surface codes: 1 = clay, 2 = hard.
# The author reports that an adjustment proportional to the opponents' deviation from the field mean performed
# better than various boosted or blunted versions attempted.
# NOTE(review): an earlier remark here said the field mean "should be 50%"; the recorded values below are ~67-73,
# so that remark looks carried over from a different feature — confirm.
# NOTE(review): m_date is compared against 'M/D/YYYY' string literals; that is only chronological if m_date is
# datetime-typed at this point — confirm.
# Values recorded by the author, bins 1-6:
#   clay: (67.5574587912088, 67.7812689393939, 68.07616429699843, 68.87905727155727, 69.22866022544278, 69.11749029754205)
#   hard: (71.06310906515586, 71.9643275745256, 71.64766609880755, 72.50704076230814, 72.69757776644558, 72.69365708453816)
_sos_l60_bins = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
for _sos_l60_surf in (1, 2):
    for _sos_l60_lo, _sos_l60_hi in _sos_l60_bins:
        _sos_l60_mask = (
            (df_player2["t_surf"] == _sos_l60_surf)
            & (df_player2["m_date"] > _sos_l60_lo)
            & (df_player2["m_date"] <= _sos_l60_hi)
        )
        # Field mean in terms of pct 1st serve pts the field ALLOWS on average for this surface/bin.
        _sos_l60_field_mean = 100 - (df_player2.loc[_sos_l60_mask, "p_1st_ret_pts_won%_l60_tw_ss"].mean())
        # Player's actual l60 performance factored by the schedule of opponents' aggregate performance
        # over THEIR l60 prior to when they faced the player.
        df_player2.loc[_sos_l60_mask, "p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"] = (
            (df_player2["p_1st_sv_pts_won%_l60_tw_ss"]) * (_sos_l60_field_mean / df_player2["EY"])
        ).round(2)

# Step 4: cleanup. Scratch names are removed from the kernel namespace, and the working columns are dropped.
# Lag columns _60 ... _51 are deliberately NOT dropped here (only _50 ... _1 are).
del _sos_l60_cols, _sos_l60_bins, _sos_l60_surf, _sos_l60_lo, _sos_l60_hi, _sos_l60_mask, _sos_l60_field_mean

df_player2 = df_player2.drop(
    ["EY", "SOS_tw_l60_opp_1st_ret_pts_won%_ws", "SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct", "SOS_tw_l60_opp_1st_ret_pts_won%"]
    + [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(50, 0, -1)],
    axis=1,
)

In [155]:
# 'p_1st_sv_pts_won%_l10_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj) version of the mean, time-weighted, surface-specific (SS)
# FIRST SERVE POINTS WON performance of the PLAYER over the 10 matches PRIOR TO the match being predicted.

# Step 1: average the ten most recent lagged opponent columns (SOS_tw_l60_opp_1st_ret_pts_won%_60 ... _51).
# Row-wise sum() and count() both skip NaN, so a player with fewer than 10 prior matches on this surface is averaged
# over the lags that exist instead of being interpolated. In the modeling stage, matches between players with few
# prior matches on a surface are filtered out, and the multi-year ramp-up phase only accrues retrospective stats,
# so very few modeled matches will lack this feature.
# Naming: ws = weighted sum; tw = time-weighted. Here the sum is a plain (unweighted) sum, so ws / ws_ct is a simple mean.
# NOTE(review): judging by their names, the lag columns averaged here hold the opponents' l60 statistic, while the
# field mean below uses the l10 statistic — confirm this mix is intended.
_sos_l10_cols = [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(60, 50, -1)]
df_player2["SOS_tw_l10_opp_1st_ret_pts_won%_ws"] = df_player2[_sos_l10_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_1st_ret_pts_won%_ws_ct"] = df_player2[_sos_l10_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_1st_ret_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_1st_ret_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_1st_ret_pts_won%_ws_ct"]
).round(2)

# Step 2: EY (EXPECTED YIELD) = % FIRST SERVE pts won that the SCHEDULE OF OPPONENTS faced by this player over their
# last 10 matches on this surface is expected to yield, i.e. 100 minus the opponents' mean 1st-return pts won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_1st_ret_pts_won%"]

# Step 3: per surface and per date bin, compute the field-wide mean % of first serve pts ALLOWED
# (100 - mean of 'p_1st_ret_pts_won%_l10_tw_ss') and scale the player's own l10 stat by field_mean / EY.
# Because the game changes over time, the field mean is taken over 2-year intervals (2009 stands alone at the front;
# per the author that year is used only for retrospective stats accrual [and only for clay], not for modeling).
# Each bin is (exclusive lower bound, inclusive upper bound). Surface codes: 1 = clay, 2 = hard.
# The author reports that an adjustment proportional to the opponents' deviation from the field mean performed
# better than various boosted or blunted versions attempted.
# NOTE(review): an earlier remark here said the field mean "should be 50%"; the recorded values below are ~67-73,
# so that remark looks carried over from a different feature — confirm.
# NOTE(review): m_date is compared against 'M/D/YYYY' string literals; that is only chronological if m_date is
# datetime-typed at this point — confirm.
# Values recorded by the author, bins 1-6:
#   clay: (67.63615384615382, 67.74870580808093, 68.28806951026863, 69.06286357786357, 69.3423671497585, 69.17749353169465)
#   hard: (71.23715297450426, 72.02117378048779, 71.68543100511081, 72.82641079936475, 72.56895801461846, 72.83070590236437)
_sos_l10_bins = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
for _sos_l10_surf in (1, 2):
    for _sos_l10_lo, _sos_l10_hi in _sos_l10_bins:
        _sos_l10_mask = (
            (df_player2["t_surf"] == _sos_l10_surf)
            & (df_player2["m_date"] > _sos_l10_lo)
            & (df_player2["m_date"] <= _sos_l10_hi)
        )
        # Field mean in terms of pct serve pts the field ALLOWS on average for this surface/bin.
        _sos_l10_field_mean = 100 - (df_player2.loc[_sos_l10_mask, "p_1st_ret_pts_won%_l10_tw_ss"].mean())
        # Player's actual l10 performance factored by the aggregate performance of the schedule of opponents faced.
        df_player2.loc[_sos_l10_mask, "p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"] = (
            (df_player2["p_1st_sv_pts_won%_l10_tw_ss"]) * (_sos_l10_field_mean / df_player2["EY"])
        ).round(2)

# Step 4: cleanup. Scratch names are removed from the kernel namespace, and the working columns
# (including the remaining lag columns _60 ... _51) are dropped.
del _sos_l10_cols, _sos_l10_bins, _sos_l10_surf, _sos_l10_lo, _sos_l10_hi, _sos_l10_mask, _sos_l10_field_mean

df_player2 = df_player2.drop(
    ["EY", "SOS_tw_l10_opp_1st_ret_pts_won%_ws", "SOS_tw_l10_opp_1st_ret_pts_won%_ws_ct", "SOS_tw_l10_opp_1st_ret_pts_won%"]
    + [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(60, 50, -1)],
    axis=1,
)

In [156]:
# 'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj) version of the mean, time-weighted, surface-specific (SS), IO-specific
# FIRST SERVE POINTS WON performance of the PLAYER over the 60 matches PRIOR TO the match being predicted.
# (IO is keyed by the 't_ind' column — presumably indoor/outdoor; confirm.)

# Order rows newest-first within each (player, surface, t_ind) so that a negative shift reaches back to earlier matches.
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num'], ascending=False)

# Build the 60 lag columns of the OPPONENTS' surface- and IO-specific 1st-return pts won % (l60).
# With the descending sort above, shift(-k) within a group pulls the value from k rows further down,
# i.e. the k-th match before the current row. Column suffix = 61 - k, so _60 is the most recent
# prior match (k = 1) and _1 is the oldest (k = 60).
# The grouping is rebuilt on every iteration, exactly as each original statement did.
for _sos_io_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_1st_ret_pts_won%_{61 - _sos_io_lag}"] = (
        df_player2.groupby(['p_id', 't_surf', 't_ind'])['p_opp_1st_ret_pts_won%_l60_tw_ss_IO'].shift(-_sos_io_lag)
    )
del _sos_io_lag  # keep the kernel namespace free of loop scratch names

# Average the 60 lagged opponent values per row WITHOUT interpolating gaps:
#   * sum(axis=1) skips NaN, count(axis=1) counts the non-NaN lags, so sum / count is the mean
#     over whichever lags are actually present (rows with no lags at all give 0 / 0 -> NaN).
#   * Matches between players with little history on a surface are filtered out at the modeling
#     stage, and the early ramp-up years are used only to accrue retrospective stats, so very few
#     modeled matches end up missing this feature.
# (ws = weighted sum; tw = time-weighted)
_sos_l60_lag_cols = [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(60, 0, -1)]  # "_60" ... "_1", same order as before
_sos_l60_lags = df_player2[_sos_l60_lag_cols]

df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_1st_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws"]
    / df_player2["SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct"]
).round(2)
del _sos_l60_lag_cols, _sos_l60_lags  # scratch objects only

# EY = EXPECTED YIELD: the % of FIRST SERVE points the schedule of opponents faced by this player
# (last 60 matches on this surface / IO status) is expected to concede, i.e. 100 minus the
# opponents' average first-serve-return points won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_1st_ret_pts_won%"]

# Strength-of-schedule (SOS) adjustment of the player's l60 FIRST SERVE points won %.
#
# Step 1 - field baseline: for every (surface, indoor/outdoor) stratum and date window, take the
#   mean of 'p_1st_ret_pts_won%_l60_tw_ss_IO' over ALL player rows in that stratum/window and
#   subtract it from 100, giving the % of first-serve points the field ALLOWS on average.
#   Because the game changes over time, the baseline is computed per surface over 2-year windows
#   (2009 on its own at the front end; that year is only used for retrospective stats accrual
#   [and only for clay], not for modeling).
# Step 2 - adjustment: scale the player's own l60 stat by (field baseline / EY), where EY is the
#   yield expected from the opponents the player actually faced (their aggregate performance over
#   THEIR l60 prior to facing the player). An adjustment proportional to the opponents' deviation
#   from the field mean performed better than the boosted or blunted variants that were tried.
#
# Rows outside the strata/windows listed below are not written by this block.
# NOTE(review): the window bounds are compared as-is against 'm_date'; this presumes 'm_date' is
# datetime-like at this point - confirm.
#
# Field baselines noted in the original notebook comments (windows 1-6 per stratum):
#   clay indoor : (nan, 63.88589285714286, 68.847125748503, 70.53847953216373, 69.65295774647889, 71.19528735632184)
#   clay outdoor: (67.5574587912088, 67.77936357702347, 67.9961559322033, 68.71040358744392, 69.18491508152172, 69.01497152428817)
#   hard indoor : (72.30870731707316, 72.91347006651888, 72.59073840870067, 72.87051841746246, 72.70080836236936, 72.77317808219176)
#   hard outdoor: (70.33680667743678, 71.31724652087476, 71.20963507225078, 72.2585734835771, 72.66678757651331, 72.58674285714282)

# (t_surf, t_ind) pairs in the original processing order: clay indoor, clay outdoor, hard indoor, hard outdoor
_sos_strata = [(1, 1), (1, 0), (2, 1), (2, 0)]
# (exclusive lower bound, inclusive upper bound) on m_date
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

for _sos_surf, _sos_ind in _sos_strata:
    for _sos_start, _sos_end in _sos_windows:
        _sos_mask = (
            (df_player2['t_surf'] == _sos_surf)
            & (df_player2['t_ind'] == _sos_ind)
            & (df_player2['m_date'] > _sos_start)
            & (df_player2['m_date'] <= _sos_end)
        )
        # % of first-serve points the field allows on average in this stratum/window
        _sos_field_mean = 100 - (df_player2.loc[_sos_mask, 'p_1st_ret_pts_won%_l60_tw_ss_IO'].mean())
        # The right-hand side is a full-length Series; .loc aligns on the index and writes only the masked rows.
        df_player2.loc[_sos_mask, "p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
            (df_player2["p_1st_sv_pts_won%_l60_tw_ss_IO"]) * (_sos_field_mean / df_player2["EY"])
        ).round(2)

# Scratch names only; remove them so nothing leaks into later cells.
del _sos_strata, _sos_windows, _sos_surf, _sos_ind, _sos_start, _sos_end, _sos_mask, _sos_field_mean

# Discard the scratch columns of the l60 SOS computation. Lag columns "_60" ... "_51" are
# intentionally NOT dropped here: the following l10 SOS cell still reads them.
_sos_l60_scratch_cols = [
    "EY",
    "SOS_tw_l60_opp_1st_ret_pts_won%_ws",
    "SOS_tw_l60_opp_1st_ret_pts_won%_ws_ct",
    "SOS_tw_l60_opp_1st_ret_pts_won%",
] + [f"SOS_tw_l60_opp_1st_ret_pts_won%_{k}" for k in range(50, 0, -1)]  # "_50" ... "_1"
df_player2 = df_player2.drop(columns=_sos_l60_scratch_cols)
del _sos_l60_scratch_cols

In [157]:
# 'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the player's mean, surface-specific (ss),
# indoor/outdoor-specific (IO) FIRST SERVE POINTS WON performance over the 10 matches
# PRIOR TO the match being predicted.
# (ws = weighted sum; tw = time-weighted)

_sos_l10 = "SOS_tw_l10_opp_1st_ret_pts_won%"
# The ten most recent opponent columns (_60 down to _51) left in place by the previous cell.
_sos_last10_cols = ["SOS_tw_l60_opp_1st_ret_pts_won%_" + str(_k) for _k in range(60, 50, -1)]

# sum()/count() skip NaN, so the mean is taken over the opponents that actually have a value
# rather than interpolating. In the modeling stage, matches between players with few prior
# matches on a given surface are filtered out, so this feature should rarely be missing there
# (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches
# are not used for modeling).
df_player2[_sos_l10 + "_ws"] = df_player2[_sos_last10_cols].sum(axis=1)
df_player2[_sos_l10 + "_ws_ct"] = df_player2[_sos_last10_cols].count(axis=1)
df_player2[_sos_l10] = (df_player2[_sos_l10 + "_ws"] / df_player2[_sos_l10 + "_ws_ct"]).round(2)

# EY = EXPECTED YIELD: the % of first-serve points the schedule of opponents faced by this
# player over their last 10 matches (same surface and IO status) is expected to yield.
df_player2["EY"] = 100 - df_player2[_sos_l10]

# Field baseline: mean first-serve-return performance (l10_tw_ss_IO) of ALL players, per
# surface, IO status and date window, expressed as the % of points the field ALLOWS (100 - mean).
# Because the game changes over time the baseline is computed over 2-year windows (2009 on its
# own at the front end; that year is only used for retrospective stats accrual, not modeling).
# Each window is exclusive of its start date and inclusive of its end date.
# NOTE(review): the date bounds are compared as written against m_date; this presumably relies
# on m_date being datetime-like at this point -- confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Pass 1: build every (row mask, field mean) pair before anything is written.
# Order: t_surf 1 then 2 (clay, hard); within each, t_ind 1 then 0 (presumably indoor, outdoor).
_sos_segments = []
for _surf in (1, 2):
    for _ind in (1, 0):
        for _start, _end in _sos_windows:
            _seg = (
                (df_player2['t_surf'] == _surf)
                & (df_player2['t_ind'] == _ind)
                & (df_player2['m_date'] > _start)
                & (df_player2['m_date'] <= _end)
            )
            _field_mean = 100 - (df_player2.loc[_seg, 'p_1st_ret_pts_won%_l10_tw_ss_IO'].mean())
            _sos_segments.append((_seg, _field_mean))

# Pass 2: scale the player's actual l10 performance by (field mean / expected yield of the
# schedule actually faced). The opponents' values come from THEIR own l60 prior to facing the player.
# An adjustment proportional to the opponents' deviation from the field mean performed better
# than the various boosted or blunted versions attempted.
# Rows outside every window (or with another surface / IO code) are left without a value.
for _seg, _field_mean in _sos_segments:
    df_player2.loc[_seg, "p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
        (df_player2["p_1st_sv_pts_won%_l10_tw_ss_IO"]) * (_field_mean / df_player2["EY"])
    ).round(2)

# Drop the scratch columns, including the _60.._51 opponent columns consumed above.
df_player2 = df_player2.drop(
    columns=["EY", _sos_l10 + "_ws", _sos_l10 + "_ws_ct", _sos_l10] + _sos_last10_cols
)

# Leave no temporaries behind in the kernel namespace.
del _sos_l10, _sos_last10_cols, _sos_windows, _sos_segments
del _surf, _ind, _start, _end, _seg, _field_mean

In [158]:
# 'p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the player's mean, surface-specific (ss)
# SECOND SERVE POINTS WON performance over the 60 matches PRIOR TO the match being predicted.

# Descending sort: within each (p_id, t_surf) group a negative shift then pulls values from
# rows further down, i.e. from earlier matches (assuming m_date / m_rd_num sort chronologically).
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# Collect the opponents' SECOND SERVE RETURN POINTS WON performance (each opponent's own l60,
# surface-specific value) for up to 60 matches prior to the match being predicted.
# Column _60 is the value one row down (shift -1), _59 two rows down, ... and _1 sixty rows down.
_opp_2nd_ret_by_player_surf = df_player2.groupby(['p_id', 't_surf'])['p_opp_2nd_ret_pts_won%_l60_tw_ss']
for _rows_back in range(1, 61):
    df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_" + str(61 - _rows_back)] = _opp_2nd_ret_by_player_surf.shift(-_rows_back)

# Leave no temporaries behind in the kernel namespace.
del _opp_2nd_ret_by_player_surf, _rows_back
# Collapse the 60 lag columns into one per-row mean of the opponents' 2nd-serve-return points won %.
# sum(axis=1) skips NaN lags and count(axis=1) counts only the lags that are present, so a row with fewer than 60
# available lags is averaged over what exists instead of being interpolated.
# Original author's note: in the modeling stage, matches between players with few prior matches on a given surface are
# filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which
# retrospective stats accrue but whose matches are not used for modeling).
# Naming: ws = weighted sum, ws_ct = number of non-missing lags, tw = time-weighted. No per-lag weights are applied in
# the code below, so the result is a plain mean.
_sos_l60_cols = [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{k}" for k in range(60, 0, -1)]
_sos_l60_lags = df_player2[_sos_l60_cols]
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws"] / df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct"]
).round(2)
del _sos_l60_cols, _sos_l60_lags

# EY (EXPECTED YIELD): the % of 2nd SERVE points the SCHEDULE OF OPPONENTS faced by this player over their last 60
# matches on this surface is expected to yield, i.e. 100 minus those opponents' mean 2nd-serve-return points won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%"]

# Field baseline: 100 minus the mean 2nd-serve-return performance (l60_tw_ss) of ALL player rows, per surface and per
# date window, expressed as the % of 2nd serve points the field ALLOWS on average. It calibrates the schedule a player
# actually faced before the match being predicted.
# Original author's notes: because the game changes over time the mean is taken over 2-year windows (2009 stands alone
# at the front; that year is only used for retrospective stats accrual [and only for clay], not for modeling). The
# value is expected to be about 50% and is computed from the sample as a QC.
# Values recorded by the original author from an earlier run:
#   clay: (50.643270858524765, 50.36309029896282, 50.524402476780104, 50.784848961610976, 51.307890920554875, 50.86440597204575)
#   hard: (51.09679036458339, 51.299672757474994, 51.13884305835022, 51.57946148483779, 51.078274069104424, 51.351385988548365)
# Each window is (exclusive lower bound, inclusive upper bound) on m_date.
_sos_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# All baselines are computed first (clay windows, then hard windows) and only then applied, as in the original ordering.
_sos_buckets = []
for _sos_surf in (1, 2):  # t_surf 1 = clay, 2 = hard
    for _sos_lo, _sos_hi in _sos_date_windows:
        _sos_in_bucket = (df_player2['t_surf'] == _sos_surf) & (df_player2['m_date'] > _sos_lo) & (df_player2['m_date'] <= _sos_hi)
        _sos_field_mean = 100 - (df_player2.loc[_sos_in_bucket, 'p_2nd_ret_pts_won%_l60_tw_ss'].mean())
        _sos_buckets.append((_sos_in_bucket, _sos_field_mean))

# SOS adjustment: scale the player's own l60 2nd-serve points won % by (field baseline / EY), so the correction is
# proportional to how far the opponents faced deviate from the field mean.
# Original author's note: this proportional form performed better than various boosted or blunted versions attempted.
# Rows outside every (surface, window) bucket are never assigned a value here.
for _sos_in_bucket, _sos_field_mean in _sos_buckets:
    df_player2.loc[_sos_in_bucket, "p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"] = (
        (df_player2["p_2nd_sv_pts_won%_l60_tw_ss"]) * (_sos_field_mean / df_player2["EY"])
    ).round(2)

del _sos_date_windows, _sos_buckets, _sos_surf, _sos_lo, _sos_hi, _sos_in_bucket, _sos_field_mean

# Remove the l60 scratch columns: EY, the sum/count/mean helpers and lag columns _50 down to _1.
# Lag columns _60.._51 are deliberately left in place here (they are not in the list being dropped).
_sos_l60_scratch = [
    "EY",
    "SOS_tw_l60_opp_2nd_ret_pts_won%_ws",
    "SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct",
    "SOS_tw_l60_opp_2nd_ret_pts_won%",
]
_sos_l60_scratch += [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{k}" for k in range(50, 0, -1)]
df_player2 = df_player2.drop(_sos_l60_scratch, axis=1)
del _sos_l60_scratch

In [159]:
# 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS) SECOND SERVE
# POINTS WON performance over the 10 matches PRIOR TO the match being predicted.

# Per-row mean over lag columns _60 down to _51 only.
# NOTE(review): presumably these are the 10 most recent prior matches (suffix 60 = nearest lag) - confirm against the
# code that created them.
# sum(axis=1) skips NaN lags and count(axis=1) counts only the lags that are present, so missing lags are ignored
# instead of being interpolated.
# Original author's note: in the modeling stage, matches between players with few prior matches on a given surface are
# filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which
# retrospective stats accrue but whose matches are not used for modeling).
# Naming: ws = weighted sum, ws_ct = number of non-missing lags, tw = time-weighted. No per-lag weights are applied in
# the code below, so the result is a plain mean.
_sos_l10_cols = [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{k}" for k in range(60, 50, -1)]
_sos_l10_lags = df_player2[_sos_l10_cols]
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws"] = _sos_l10_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct"] = _sos_l10_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct"]
).round(2)
del _sos_l10_cols, _sos_l10_lags

# EY (EXPECTED YIELD): the % of 2nd SERVE points the SCHEDULE OF OPPONENTS faced by this player over their last 10
# matches on this surface is expected to yield, i.e. 100 minus those opponents' mean 2nd-serve-return points won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%"]

# Field baseline: 100 minus the mean 2nd-serve-return performance (l10_tw_ss) of ALL player rows, per surface and per
# date window, expressed as the % of 2nd serve points the field ALLOWS on average. It calibrates the schedule a player
# actually faced before the match being predicted.
# Original author's notes: because the game changes over time the mean is taken over 2-year windows (2009 stands alone
# at the front; that year is only used for retrospective stats accrual [and only for clay], not for modeling). The
# value is expected to be about 50% and is computed from the sample as a QC.
# Values recorded by the original author from an earlier run:
#   clay: (49.198997252747226, 49.86771780303037, 50.10273617693512, 50.25192406692413, 50.84107890499212, 50.35167529107377)
#   hard: (49.98617209631733, 50.75324695121934, 50.732821124361244, 51.06764072701609, 50.685294917559204, 50.83958496342911)
# Each window is (exclusive lower bound, inclusive upper bound) on m_date.
_sos_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# All baselines are computed first (clay windows, then hard windows) and only then applied, as in the original ordering.
_sos_buckets = []
for _sos_surf in (1, 2):  # t_surf 1 = clay, 2 = hard
    for _sos_lo, _sos_hi in _sos_date_windows:
        _sos_in_bucket = (df_player2['t_surf'] == _sos_surf) & (df_player2['m_date'] > _sos_lo) & (df_player2['m_date'] <= _sos_hi)
        _sos_field_mean = 100 - (df_player2.loc[_sos_in_bucket, 'p_2nd_ret_pts_won%_l10_tw_ss'].mean())
        _sos_buckets.append((_sos_in_bucket, _sos_field_mean))

# SOS adjustment: scale the player's own l10 2nd-serve points won % by (field baseline / EY), so the correction is
# proportional to how far the opponents faced deviate from the field mean.
# Original author's notes: the opponents' aggregate performance is meant to be over THEIR l10 prior to facing the
# player; this proportional form performed better than various boosted or blunted versions attempted.
# NOTE(review): EY here is built from columns named SOS_tw_l60_* - presumably the opponents' l60 stat over the player's
# last 10 matches, not the opponents' l10 stat. Confirm which was intended.
# Rows outside every (surface, window) bucket are never assigned a value here.
for _sos_in_bucket, _sos_field_mean in _sos_buckets:
    df_player2.loc[_sos_in_bucket, "p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"] = (
        (df_player2["p_2nd_sv_pts_won%_l10_tw_ss"]) * (_sos_field_mean / df_player2["EY"])
    ).round(2)

del _sos_date_windows, _sos_buckets, _sos_surf, _sos_lo, _sos_hi, _sos_in_bucket, _sos_field_mean

# Remove the l10 scratch columns (EY and the sum/count/mean helpers) together with lag columns _60 down to _51.
_sos_l10_scratch = [
    "EY",
    "SOS_tw_l10_opp_2nd_ret_pts_won%_ws",
    "SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct",
    "SOS_tw_l10_opp_2nd_ret_pts_won%",
]
_sos_l10_scratch += [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{k}" for k in range(60, 50, -1)]
df_player2 = df_player2.drop(_sos_l10_scratch, axis=1)
del _sos_l10_scratch

In [160]:
# 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS), IO-specific
# SECOND SERVE POINTS WON performance over the 60 matches PRIOR TO the match being predicted.
# (IO is keyed on the t_ind column - presumably indoor/outdoor.)

# Order rows descending on every key, so that within a (p_id, t_surf, t_ind) group later rows have earlier or equal
# m_date / m_rd_num.
df_player2 = df_player2.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# Build the lag columns of the OPPONENTS' 2nd-serve-RETURN points won stat (l60_tw_ss_IO) over the maximum interval
# (60 matches) prior to the match being predicted. With the descending order above, shift(-k) pulls the value from the
# row k positions further down the same group, i.e. k matches back. Lag k is stored under suffix 61-k, so _60 holds
# the nearest lag and _1 the farthest.
for _sos_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_2nd_ret_pts_won%_{61 - _sos_lag}"] = (
        df_player2
        .groupby(['p_id', 't_surf', 't_ind'])['p_opp_2nd_ret_pts_won%_l60_tw_ss_IO']
        .shift(-_sos_lag)
    )
del _sos_lag

# Row-wise aggregation of the 60 lag columns (_60 ... _1).
# sum(axis=1) skips NaN and count(axis=1) counts only the non-missing lags, so a
# player with fewer than 60 usable prior values is averaged over what exists
# instead of being interpolated. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the first years
# are a ramp-up period used only to accrue retrospective stats, so very few
# modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted -- the sum taken here is a plain sum)
_l60_lag_cols = [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{_k}" for _k in range(60, 0, -1)]
_l60_lags = df_player2[_l60_lag_cols]
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws"] = _l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct"] = _l60_lags.count(axis=1)
# Mean over the available lags, 2 decimals; rows with zero available lags give 0/0 -> NaN
df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws"]
    / df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct"]
).round(2)
del _l60_lag_cols, _l60_lags

# EY = EXPECTED YIELD: the % of SECOND SERVE pts the SCHEDULE OF OPPONENTS FACED BY
# THIS PLAYER over their last 60 matches on this surface and IO status is expected
# to give up (100 minus the opponents' mean 2nd-serve-return pts won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_2nd_ret_pts_won%"]

# Strength-of-schedule adjustment of the player's l60 SECOND SERVE pts won %.
#
# For every (surface, indoor/outdoor, date window) cell:
#   1. field mean = 100 - mean('p_2nd_ret_pts_won%_l60_tw_ss_IO') over the rows in the
#      cell, i.e. the % of 2nd-serve pts the field ALLOWS on average;
#   2. adjusted value = player's 'p_2nd_sv_pts_won%_l60_tw_ss_IO' * (field mean / EY),
#      rounded to 2 decimals and written only to the rows in that cell.
# The adjustment is proportional to the opponents' deviation from the field mean; this
# performed better than the various boosted or blunted versions attempted.
#
# Because the game changes over time, the field mean is taken per surface over 2-year
# windows (2009 stands alone at the front end; that year is only used for retrospective
# stats accrual [and only for clay], not for modeling). Windows are (start, end]: rows
# dated on/before 1/1/2009 or after 1/1/2020, and rows with any other t_surf / t_ind
# code, match no cell and receive no adjusted value here.
# NOTE(review): the window bounds are strings -- assumes 'm_date' is datetime-like so the
# comparisons are chronological; confirm the dtype set earlier in the notebook.
#
# Field means recorded from a previous run (windows 1..6):
#   clay (t_surf 1), t_ind 1: (nan, 44.89714285714286, 49.48011976047904, 51.59187134502923, 50.11091549295774, 50.16919540229883)
#   clay (t_surf 1), t_ind 0: (49.18883241758239, 49.83844321148842, 49.9520067796612, 49.99689203173504, 50.69872622282608, 50.38858626465655)
#   hard (t_surf 2), t_ind 1: (49.17482926829275, 50.98665188470069, 51.10810532341151, 51.12737380627565, 50.802550522648076, 50.429034246575284)
#   hard (t_surf 2), t_ind 0: (49.5766020463112, 50.50592942345924, 50.589277492040196, 51.06830256533208, 50.75752890501028, 50.72430171428569)
def _sos_adjust_l60_2nd_sv(frame):
    """Write 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj' into ``frame`` in place, one cell at a time."""
    date_windows = (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
    # (t_surf, t_ind) pairs, kept in the same order as the original statements
    segments = ((1, 1), (1, 0), (2, 1), (2, 0))

    for surf_code, ind_code in segments:
        for window_start, window_end in date_windows:
            in_cell = (
                (frame['t_surf'] == surf_code)
                & (frame['t_ind'] == ind_code)
                & (frame['m_date'] > window_start)
                & (frame['m_date'] <= window_end)
            )
            # The mean reads a column that is never written below, so computing it
            # right before the write gives the same value as computing all means first.
            field_allows = 100 - frame.loc[in_cell, 'p_2nd_ret_pts_won%_l60_tw_ss_IO'].mean()
            frame.loc[in_cell, "p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
                frame["p_2nd_sv_pts_won%_l60_tw_ss_IO"] * (field_allows / frame["EY"])
            ).round(2)


_sos_adjust_l60_2nd_sv(df_player2)
# The helper is single-use; remove it so no scratch names linger in the notebook namespace
del _sos_adjust_l60_2nd_sv

# Remove the l60 scratch columns: EY, the sum / count / mean helpers, and lag columns
# _50 ... _1. Lag columns _60 ... _51 are intentionally kept -- the l10 version of this
# feature in the next cell still reads them.
_l60_scratch_cols = [
    "EY",
    "SOS_tw_l60_opp_2nd_ret_pts_won%_ws",
    "SOS_tw_l60_opp_2nd_ret_pts_won%_ws_ct",
    "SOS_tw_l60_opp_2nd_ret_pts_won%",
] + [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{_k}" for _k in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_l60_scratch_cols)
del _l60_scratch_cols

In [161]:
# 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj) version of the PLAYER's mean, time-weighted, surface-specific (SS),
# IO-specific SECOND SERVE POINTS WON performance over the 10 matches PRIOR TO the match being predicted.
# (ws = weighted sum; tw = time-weighted)

# Opponent second-serve-return columns for slots 60..51, in the order the sum/count below consume them.
_sos_l10_cols = [f"SOS_tw_l60_opp_2nd_ret_pts_won%_{slot}" for slot in range(60, 50, -1)]

# sum()/count() skip NaN, so the average is taken over whichever slots are populated instead of interpolating.
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out, so
# very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which retrospective
# stats accrue but whose matches are not used for modeling).
_sos_l10_block = df_player2[_sos_l10_cols]
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws"] = _sos_l10_block.sum(axis=1)
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct"] = _sos_l10_block.count(axis=1)
df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % of SECOND SERVE points the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches (this surface and IO status) is expected to give up, i.e. 100 minus what those opponents win on return.
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_2nd_ret_pts_won%"]

# Field baselines. Because the game changes over time, the field mean is taken per surface and IO status over
# 2-year windows (2009 stands alone at the front end; that year only feeds retrospective stats accrual [and only
# for clay], not modeling). Each window is open on the left and closed on the right: start < m_date <= end.
_sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# (t_surf, t_ind) pairs in processing order: clay "i", clay "o", hard "i", hard "o" (t_surf 1 = clay, 2 = hard).
_sos_segments = [(1, 1), (1, 0), (2, 1), (2, 0)]

# Pass 1: for every segment/window, build the row mask and the field mean expressed as the % of second serve
# points the field ALLOWS on average (100 minus the mean second-serve-return % won).
_sos_cells = []
for _surf, _ind in _sos_segments:
    for _start, _end in _sos_periods:
        _mask = (
            (df_player2["t_surf"] == _surf)
            & (df_player2["t_ind"] == _ind)
            & (df_player2["m_date"] > _start)
            & (df_player2["m_date"] <= _end)
        )
        _field_mean = 100 - df_player2.loc[_mask, "p_2nd_ret_pts_won%_l10_tw_ss_IO"].mean()
        _sos_cells.append((_mask, _field_mean))

# Pass 2: scale the player's actual l10 performance by (field mean / expected yield of the schedule faced), where the
# schedule's strength comes from the opponents' aggregate performance over THEIR l60 prior to facing the player.
# An adjustment proportional to the opponents' deviation from the field mean performed better than the various
# boosted or blunted versions that were attempted.
for _mask, _field_mean in _sos_cells:
    df_player2.loc[_mask, "p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
        df_player2["p_2nd_sv_pts_won%_l10_tw_ss_IO"] * (_field_mean / df_player2["EY"])
    ).round(2)

# Discard the working columns (including opponent slots 60..51) and the cell-local helpers.
df_player2 = df_player2.drop(
    columns=[
        "EY",
        "SOS_tw_l10_opp_2nd_ret_pts_won%_ws",
        "SOS_tw_l10_opp_2nd_ret_pts_won%_ws_ct",
        "SOS_tw_l10_opp_2nd_ret_pts_won%",
    ]
    + _sos_l10_cols
)
del _sos_l10_cols, _sos_l10_block, _sos_periods, _sos_segments, _sos_cells
del _surf, _ind, _start, _end, _mask, _field_mean

In [162]:
# 'p_ret_pts_won%_l60_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj) version of the PLAYER's mean, time-weighted, surface-specific (SS)
# RETURN POINTS WON performance over the 60 matches PRIOR TO the match being predicted.

# Newest-first ordering within each player/surface, so a negative shift reaches back to earlier matches.
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# First gather the OPPONENTS' mean, time-weighted, surface-specific SERVE POINTS WON figures for the 60 prior matches:
# slot 60 is the row one step back (shift(-1)) and slot 1 is the row sixty steps back (shift(-60)).
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_sv_pts_won%_{61 - _lag}"] = (
        df_player2.groupby(['p_id', 't_surf'])['p_opp_sv_pts_won%_l60_tw_ss'].shift(-_lag)
    )
del _lag

# Row-wise mean of the 60 lagged opponent serve-points-won values, built from a sum and a non-null count.
# sum(axis=1) skips NaN and count(axis=1) counts only non-NaN entries, so missing lags are ignored rather than interpolated.
# Matches between players with few prior matches on a surface are filtered out at the modeling stage (and the first years are a ramp-up
# period used only to accrue retrospective stats), so very few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
_sos_lag_cols = ["SOS_tw_l60_opp_sv_pts_won%_" + str(k) for k in range(60, 0, -1)]  # "_60", "_59", ..., "_1"
_sos_lags = df_player2[_sos_lag_cols]
df_player2["SOS_tw_l60_opp_sv_pts_won%_ws"] = _sos_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_sv_pts_won%_ws_ct"] = _sos_lags.count(axis=1)
# Rows with no available lag at all give 0/0 here, i.e. NaN.
df_player2["SOS_tw_l60_opp_sv_pts_won%"] = df_player2["SOS_tw_l60_opp_sv_pts_won%_ws"].div(df_player2["SOS_tw_l60_opp_sv_pts_won%_ws_ct"]).round(2)
del _sos_lag_cols, _sos_lags

# EY (EXPECTED YIELD): % of RETURN points the schedule of opponents faced by this player over their last 60 matches
# on this surface is expected to give up, i.e. 100 minus the opponents' mean serve-points-won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_sv_pts_won%"]

# Field baseline: for each surface (1 = clay, 2 = hard) and date window, 100 minus the mean of 'p_sv_pts_won%_l60_tw_ss' over all rows
# in that window, i.e. the % of return points the field ALLOWS on average.
# Because the game changes over time the baseline is computed per 2-year window (2009 stands alone at the front end; that year is only
# used for retrospective stats accrual [and only for clay], not for modeling). Windows are (start, end]: start exclusive, end inclusive.
# The original author's note: "For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC."
# Values recorded by the author for the six windows:
#   clay: (37.95248626373632, 38.451821338383816, 38.391630331753596, 38.14765765765777, 37.839845410628115, 37.91686934023291)
#   hard: (36.04769121813039, 36.212923441734304, 36.629166950596286, 35.875588494794485, 36.267938126805916, 35.86158020071457)
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# SOS adjustment: scale the player's own l60 return-points-won % by (field baseline / EY) of the schedule they actually faced.
# Per the author, an adjustment proportional to the opponents' deviation from the field mean performed better than the
# boosted or blunted versions that were tried.
for _surf in (1, 2):
    for _start, _end in _sos_windows:
        _in_window = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        _field_mean = 100 - (df_player2.loc[_in_window, 'p_sv_pts_won%_l60_tw_ss'].mean())
        df_player2.loc[_in_window, "p_ret_pts_won%_l60_tw_ss_SOS_adj"] = ((df_player2["p_ret_pts_won%_l60_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

del _sos_windows, _surf, _start, _end, _in_window, _field_mean

# Remove the temporary helper columns of the l60 calculation.
# Lag columns "_60".."_51" are intentionally NOT dropped here: the following l10 cell still reads them and drops them itself.
_sos_tmp_cols = ["EY", "SOS_tw_l60_opp_sv_pts_won%_ws", "SOS_tw_l60_opp_sv_pts_won%_ws_ct", "SOS_tw_l60_opp_sv_pts_won%"]
_sos_tmp_cols += ["SOS_tw_l60_opp_sv_pts_won%_" + str(k) for k in range(50, 0, -1)]  # "_50", "_49", ..., "_1"
df_player2 = df_player2.drop(columns=_sos_tmp_cols)
del _sos_tmp_cols

In [163]:
# 'p_ret_pts_won%_l10_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS) RETURN POINTS WON
# performance over the 10 matches PRIOR TO the match being predicted.

# The ten most recent lag columns ("_60".."_51") left in place by the preceding l60 cell.
# NOTE(review): these columns carry the "l60" name, so the values averaged here are presumably the opponents' l60 stat at the
# ten most recent matches rather than an opponents' l10 stat - confirm this is intended.
_l10_lag_cols = ["SOS_tw_l60_opp_sv_pts_won%_" + str(k) for k in range(60, 50, -1)]
_l10_lags = df_player2[_l10_lag_cols]

# Row-wise mean via sum / non-null count, so missing lags are ignored instead of interpolated.
# (ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_sv_pts_won%_ws"] = _l10_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_sv_pts_won%_ws_ct"] = _l10_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_sv_pts_won%"] = df_player2["SOS_tw_l10_opp_sv_pts_won%_ws"].div(df_player2["SOS_tw_l10_opp_sv_pts_won%_ws_ct"]).round(2)

# EY (EXPECTED YIELD): % of RETURN points the schedule of opponents faced by this player over their last 10 matches
# on this surface is expected to give up.
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_sv_pts_won%"]

# Field baseline: for each surface (1 = clay, 2 = hard) and date window, 100 minus the mean of 'p_sv_pts_won%_l10_tw_ss' over all rows
# in that window, i.e. the % of return points the field ALLOWS on average.
# Because the game changes over time the baseline is computed per surface over 2-year windows (2009 alone at the front end).
# Windows are (start, end]: start exclusive, end inclusive.
# The original author's note: "For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC."
# Values recorded by the author for the six windows:
#   clay: (37.91921016483517, 38.477692550505154, 38.302982622433106, 37.94349420849421, 37.721088566827774, 37.80479301423045)
#   hard: (35.981051699716744, 36.19295223577232, 36.62860306643934, 35.64119110640557, 36.27868434472218, 35.640636162612694)
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# SOS adjustment: scale the player's own l10 return-points-won % by (field baseline / EY) of the schedule they actually faced.
# Per the author, an adjustment proportional to the opponents' deviation from the field mean performed better than the
# boosted or blunted versions that were tried.
for _surf in (1, 2):
    for _start, _end in _sos_windows:
        _in_window = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        _field_mean = 100 - (df_player2.loc[_in_window, 'p_sv_pts_won%_l10_tw_ss'].mean())
        df_player2.loc[_in_window, "p_ret_pts_won%_l10_tw_ss_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# Remove every helper column of this cell, including the ten lag columns inherited from the l60 cell.
df_player2 = df_player2.drop(columns=["EY", "SOS_tw_l10_opp_sv_pts_won%_ws", "SOS_tw_l10_opp_sv_pts_won%_ws_ct", "SOS_tw_l10_opp_sv_pts_won%"] + _l10_lag_cols)

del _l10_lag_cols, _l10_lags, _sos_windows, _surf, _start, _end, _in_window, _field_mean

In [164]:
# 'p_ret_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS), indoor/outdoor (IO) specific
# RETURN POINTS WON performance over the 60 matches PRIOR TO the match being predicted.

# Order rows in descending order of player, surface, indoor/outdoor flag, match date and round.
df_player2 = df_player2.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# Step 1: collect the opponents' mean, time-weighted, surface- and IO-specific SERVE POINTS WON stat ('p_opp_sv_pts_won%_l60_tw_ss_IO')
# for up to 60 other rows of the same (player, surface, indoor/outdoor) group.
# With the descending sort above, shift(-lag) pulls the value from the row `lag` positions further down within the group.
# Column suffix and shift distance are tied together as suffix = 61 - lag, so lag 1 -> "_60" and lag 60 -> "_1".
for _lag in range(1, 61):
    df_player2["SOS_tw_l60_opp_sv_pts_won%_" + str(61 - _lag)] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_sv_pts_won%_l60_tw_ss_IO'].shift(-_lag)
del _lag

# Row-wise average of the 60 lag columns, computed as sum / count so that
# missing lags (NaN) are skipped rather than interpolated: sum(axis=1) ignores
# NaN and count(axis=1) tallies only the non-NaN cells.
# Author's rationale: matches between players with little prior history on a
# surface are filtered out at the modeling stage, and the first years only
# accrue retrospective stats, so few modeled matches end up without this feature.
# (ws = weighted sum; tw = time-weighted)
_sos_l60_cols = [f"SOS_tw_l60_opp_sv_pts_won%_{_k}" for _k in range(60, 0, -1)]
_sos_l60_lags = df_player2[_sos_l60_cols]

# The _ws / _ws_ct helper columns are kept on the frame; they are dropped by
# name at the end of this cell.
df_player2["SOS_tw_l60_opp_sv_pts_won%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_sv_pts_won%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_sv_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_sv_pts_won%_ws"]
    / df_player2["SOS_tw_l60_opp_sv_pts_won%_ws_ct"]
).round(2)
del _sos_l60_cols, _sos_l60_lags

# EY (EXPECTED YIELD): the % of RETURN points the schedule of opponents faced
# over the last 60 matches (same surface and indoor/outdoor status) is expected
# to give up, i.e. the complement of their averaged serve-points-won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_sv_pts_won%"]

# Strength-of-schedule adjustment of the player's l60 return-points-won %.
#
# For every (surface, indoor flag, date window) segment:
#   1. field mean = 100 - mean serve-points-won % (l60_tw_ss_IO) of all rows in
#      the segment, i.e. the % of points the field ALLOWS on return on average;
#   2. the player's return-points-won % is scaled by field mean / EY, so the
#      correction is proportional to how far the opponents actually faced
#      deviate from the field. Per the author, this proportional form performed
#      better than the boosted or blunted variants that were tried.
#
# Windows are two years wide because the game changes over time; 2009 stands
# alone at the front (per the author it only feeds retrospective stats accrual,
# and only for clay, not modeling). Each window is open at its start and closed
# at its end: start < m_date <= end.
# NOTE(review): the bounds are plain strings, so the comparison is only
# chronological if m_date holds datetimes rather than text - confirm upstream.
#
# Field means recorded by the original author (windows 1..6):
#   clay indoor : (nan, 33.69250000000001, 36.66580838323349, 35.2433918128655, 36.9624647887324, 35.48379310344828)
#   clay outdoor: (37.95248626373632, 38.43700391644919, 38.429552542372846, 38.251759227319795, 37.84748980978268, 37.98458291457287)
#   hard indoor : (34.068487804878046, 34.96905764966748, 35.51779622209504, 35.287987721691735, 36.1346759581882, 35.35054109589039)
#   hard outdoor: (36.076785137318204, 36.644654572564576, 36.91182953710506, 36.15796931191582, 36.25706642484706, 36.08278857142863)

# (t_surf, t_ind) pairs in processing order:
# clay indoor, clay outdoor, hard indoor, hard outdoor.
_sos_segments = ((1, 1), (1, 0), (2, 1), (2, 0))
_sos_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)

for _surf, _ind in _sos_segments:
    _in_segment = (df_player2['t_surf'] == _surf) & (df_player2['t_ind'] == _ind)
    for _start, _end in _sos_windows:
        _rows = _in_segment & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        # The mean reads the serve column and the write targets the adjusted
        # return column, so computing and assigning per segment gives the same
        # values as computing every mean up front.
        _field_mean = 100 - df_player2.loc[_rows, 'p_sv_pts_won%_l60_tw_ss_IO'].mean()
        df_player2.loc[_rows, "p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
            df_player2["p_ret_pts_won%_l60_tw_ss_IO"] * (_field_mean / df_player2["EY"])
        ).round(2)

del _sos_segments, _sos_windows, _surf, _ind, _in_segment, _start, _end, _rows, _field_mean

# Discard the l60 scratch columns: EY, the sum/count/mean helpers and lag
# columns _50 ... _1. Lags _60 ... _51 are deliberately left in place because
# the l10 feature built in the following cell reads them.
df_player2 = df_player2.drop(
    columns=[
        "EY",
        "SOS_tw_l60_opp_sv_pts_won%_ws",
        "SOS_tw_l60_opp_sv_pts_won%_ws_ct",
        "SOS_tw_l60_opp_sv_pts_won%",
    ] + [f"SOS_tw_l60_opp_sv_pts_won%_{_k}" for _k in range(50, 0, -1)]
)

In [165]:
# 'p_ret_pts_won%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player2["SOS_tw_l10_opp_sv_pts_won%_ws"] = df_player2[["SOS_tw_l60_opp_sv_pts_won%_60", "SOS_tw_l60_opp_sv_pts_won%_59", "SOS_tw_l60_opp_sv_pts_won%_58", "SOS_tw_l60_opp_sv_pts_won%_57", "SOS_tw_l60_opp_sv_pts_won%_56", "SOS_tw_l60_opp_sv_pts_won%_55", "SOS_tw_l60_opp_sv_pts_won%_54", "SOS_tw_l60_opp_sv_pts_won%_53", "SOS_tw_l60_opp_sv_pts_won%_52", "SOS_tw_l60_opp_sv_pts_won%_51"]].sum(axis=1)
df_player2["SOS_tw_l10_opp_sv_pts_won%_ws_ct"] = df_player2[["SOS_tw_l60_opp_sv_pts_won%_60", "SOS_tw_l60_opp_sv_pts_won%_59", "SOS_tw_l60_opp_sv_pts_won%_58", "SOS_tw_l60_opp_sv_pts_won%_57", "SOS_tw_l60_opp_sv_pts_won%_56", "SOS_tw_l60_opp_sv_pts_won%_55", "SOS_tw_l60_opp_sv_pts_won%_54", "SOS_tw_l60_opp_sv_pts_won%_53", "SOS_tw_l60_opp_sv_pts_won%_52", "SOS_tw_l60_opp_sv_pts_won%_51"]].count(axis=1)
df_player2["SOS_tw_l10_opp_sv_pts_won%"] = (df_player2["SOS_tw_l10_opp_sv_pts_won%_ws"]/df_player2["SOS_tw_l10_opp_sv_pts_won%_ws_ct"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# % RETURN pts won the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface and IO status is expected to YIELD
# EY is EXPECTED YIELD
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_sv_pts_won%"]

# Mean % RETURN pts won performance (l10_tw_ss) for ALL players per surface (clay, hard) and IO status. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment

mean_clay_SOS_1i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_2i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_3i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_4i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_5i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_6i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# 

mean_clay_SOS_1o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_2o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_3o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_4o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_5o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_6o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
#   

mean_hard_SOS_1i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_2i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_3i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_4i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_5i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_6i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# 
                                         
mean_hard_SOS_1o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_2o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_3o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_4o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_5o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_6o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average                                         
# (36.076785137318204, 36.644654572564576, 36.91182953710506, 36.15796931191582, 36.25706642484706, 36.08278857142863)

# Puts together the above- factors the player's actual performance over the last 10 by schedule of opponents' aggregrate performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 
                                         
del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_sv_pts_won%_ws", "SOS_tw_l10_opp_sv_pts_won%_ws_ct", "SOS_tw_l10_opp_sv_pts_won%", "SOS_tw_l60_opp_sv_pts_won%_60", "SOS_tw_l60_opp_sv_pts_won%_59", "SOS_tw_l60_opp_sv_pts_won%_58", "SOS_tw_l60_opp_sv_pts_won%_57", "SOS_tw_l60_opp_sv_pts_won%_56", "SOS_tw_l60_opp_sv_pts_won%_55", "SOS_tw_l60_opp_sv_pts_won%_54", "SOS_tw_l60_opp_sv_pts_won%_53", "SOS_tw_l60_opp_sv_pts_won%_52", "SOS_tw_l60_opp_sv_pts_won%_51"],axis=1)

In [166]:
# 'p_1st_ret_pts_won%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) FIRST RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

# Descending sort: within each (p_id, t_surf) group the rows run from latest to earliest (m_date, then m_rd_num)
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific (SS) FIRST SERVE POINTS WON performance of player OPPONENTS over the 60 matches PRIOR TO the match being predicted 
# With the descending sort, shift(-k) inside a group pulls the value from the row k positions further down, i.e. the k-th earlier match.
# Column suffix is 61 - k, so _60 holds the immediately preceding match and _1 the 60th match back; rows with fewer than k earlier matches in the group get NaN.
# The grouped selection is built once and reused; the columns added inside the loop are neither grouping keys nor the shifted column.
_opp_1st_sv_by_player_surf = df_player2.groupby(['p_id','t_surf'])['p_opp_1st_sv_pts_won%_l60_tw_ss']
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_1st_sv_pts_won%_{61 - _lag}"] = _opp_1st_sv_by_player_surf.shift(-_lag)

# Keep the notebook namespace free of this step's helpers
del _opp_1st_sv_by_player_surf, _lag

# --- Schedule strength over the last 60 matches -------------------------------------------------
# Average the 60 lagged opponent columns ("_60" ... "_1") row by row.
# sum()/count() skips NaN lags instead of interpolating them, so a player with fewer than 60 prior
# matches on the surface is averaged over the lags that exist (a row with no lags at all gives 0/0 = NaN).
# In the modeling stage, matches between players with few matches previously played on a given surface
# will be filtered out, so very few modeled matches will lack this feature (there is also a multi-year
# ramp-up phase where retrospective stats accrue but whose matches are not used for modeling).
# Naming used throughout: ws = weighted sum, tw = time-weighted. Here the "weighted sum" is a plain
# row sum, i.e. every available lag counts equally.
_l60_lag_cols = [f"SOS_tw_l60_opp_1st_sv_pts_won%_{k}" for k in range(60, 0, -1)]
_lag_sum = df_player2[_l60_lag_cols].sum(axis=1)
_lag_ct = df_player2[_l60_lag_cols].count(axis=1)
_sched_opp_1st_sv = (_lag_sum / _lag_ct).round(2)

# EXPECTED YIELD (EY): % FIRST RETURN pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface is expected to give up (100 - opponents' 1st-serve-points-won %).
_expected_yield = 100 - _sched_opp_1st_sv

# --- Field baseline and adjustment --------------------------------------------------------------
# Baseline = 100 - mean 1st-serve-points-won % (l60_tw_ss) of ALL players on the surface, i.e. the
# % of 1st return pts the field ALLOWS on average. Because the game changes over time the baseline is
# computed per surface over 2-year windows (2009 stands alone at the front end; that year is only used
# for retrospective stats accrual [and only for clay], not for modeling). Each window is (start, end]:
# exclusive lower bound, inclusive upper bound.
# Values recorded from a previous run (periods 1..6), useful as a QC reference:
#   clay: (30.71741071428579, 31.537193813131466, 31.234830963665118, 30.876705276705252, 30.65352012882454, 30.97303040103499)
#   hard: (27.563133852691266, 27.75973746612452, 28.193027257240118, 27.153700370566554, 27.459269080401114, 27.21882803197809)
# NOTE(review): the bounds are "m/d/yyyy" strings; this is only chronological if m_date is a datetime
# column at this point - confirm.
_sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Scale the player's actual l60 return performance by (field baseline / expected yield of the schedule
# actually faced). Adjustment proportional to opponents' deviation from the field mean performs better
# than various boosted or blunted versions attempted.
for _surf in (1, 2):  # 1 = clay, 2 = hard
    for _start, _end in _sos_periods:
        # One mask per (surface, window), shared by the baseline and the assignment.
        _in_bucket = (
            (df_player2['t_surf'] == _surf)
            & (df_player2['m_date'] > _start)
            & (df_player2['m_date'] <= _end)
        )
        _field_allows = 100 - df_player2.loc[_in_bucket, 'p_1st_sv_pts_won%_l60_tw_ss'].mean()
        # The right-hand side covers every row; .loc aligns on the index and writes only the masked rows.
        df_player2.loc[_in_bucket, "p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
            df_player2["p_1st_ret_pts_won%_l60_tw_ss"] * (_field_allows / _expected_yield)
        ).round(2)

# --- Cleanup ------------------------------------------------------------------------------------
# Drop lags "_50" ... "_1" only: lags "_60" ... "_51" stay in the frame because the l10 cell that
# follows reads them and drops them itself.
df_player2 = df_player2.drop([f"SOS_tw_l60_opp_1st_sv_pts_won%_{k}" for k in range(50, 0, -1)], axis=1)

del _l60_lag_cols, _lag_sum, _lag_ct, _sched_opp_1st_sv, _expected_yield, _sos_periods
del _surf, _start, _end, _in_bucket, _field_allows

In [167]:
# 'p_1st_ret_pts_won%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS)
# FIRST RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# --- Schedule strength over the last 10 matches -------------------------------------------------
# Average the ten shallowest lag columns ("_60" ... "_51"), which are still in the frame from the l60
# cell above. sum()/count() skips NaN lags instead of interpolating them, so a player with fewer than
# 10 prior matches on the surface is averaged over the lags that exist (no lags at all gives 0/0 = NaN).
# In the modeling stage, matches between players with few matches previously played on a given surface
# will be filtered out, so very few modeled matches will lack this feature (there is also a multi-year
# ramp-up phase where retrospective stats accrue but whose matches are not used for modeling).
# Naming used throughout: ws = weighted sum, tw = time-weighted. Here the "weighted sum" is a plain
# row sum, i.e. every available lag counts equally.
# NOTE(review): the reused columns are named SOS_tw_l60_*; if they hold the opponents' l60 stat (as the
# lags built from 'p_opp_1st_sv_pts_won%_l60_tw_ss' in the previous cell do), this feature rates the
# last 10 opponents by THEIR l60 form rather than their l10 form - confirm that is intended.
_l10_lag_cols = [f"SOS_tw_l60_opp_1st_sv_pts_won%_{k}" for k in range(60, 50, -1)]
_lag_sum = df_player2[_l10_lag_cols].sum(axis=1)
_lag_ct = df_player2[_l10_lag_cols].count(axis=1)
_sched_opp_1st_sv = (_lag_sum / _lag_ct).round(2)

# EXPECTED YIELD (EY): % FIRST RETURN pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches on this surface is expected to give up (100 - opponents' 1st-serve-points-won %).
_expected_yield = 100 - _sched_opp_1st_sv

# --- Field baseline and adjustment --------------------------------------------------------------
# Baseline = 100 - mean 1st-serve-points-won % (l10_tw_ss) of ALL players on the surface, i.e. the
# % of 1st return pts the field ALLOWS on average. Because the game changes over time the baseline is
# computed per surface over 2-year windows (2009 stands alone at the front end; that year is only used
# for retrospective stats accrual [and only for clay], not for modeling). Each window is (start, end]:
# exclusive lower bound, inclusive upper bound.
# Values recorded from a previous run (periods 1..6), useful as a QC reference:
#   clay: (30.716847527472638, 31.563686868686872, 31.183146919431195, 30.627252252252347, 30.547858293075677, 30.686575032341338)
#   hard: (27.46297096317265, 27.726683604335847, 28.150037478705457, 26.92364566790171, 27.396959034506168, 27.024039802687597)
# NOTE(review): the bounds are "m/d/yyyy" strings; this is only chronological if m_date is a datetime
# column at this point - confirm.
_sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Scale the player's actual l10 return performance by (field baseline / expected yield of the schedule
# actually faced). Adjustment proportional to opponents' deviation from the field mean performs better
# than various boosted or blunted versions attempted.
for _surf in (1, 2):  # 1 = clay, 2 = hard
    for _start, _end in _sos_periods:
        # One mask per (surface, window), shared by the baseline and the assignment.
        _in_bucket = (
            (df_player2['t_surf'] == _surf)
            & (df_player2['m_date'] > _start)
            & (df_player2['m_date'] <= _end)
        )
        _field_allows = 100 - df_player2.loc[_in_bucket, 'p_1st_sv_pts_won%_l10_tw_ss'].mean()
        # The right-hand side covers every row; .loc aligns on the index and writes only the masked rows.
        df_player2.loc[_in_bucket, "p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
            df_player2["p_1st_ret_pts_won%_l10_tw_ss"] * (_field_allows / _expected_yield)
        ).round(2)

# --- Cleanup ------------------------------------------------------------------------------------
# The last ten lag columns are no longer needed once the l10 feature exists.
df_player2 = df_player2.drop(_l10_lag_cols, axis=1)

del _l10_lag_cols, _lag_sum, _lag_ct, _sched_opp_1st_sv, _expected_yield, _sos_periods
del _surf, _start, _end, _in_bucket, _field_allows

In [168]:
# 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), time-weighted, surface-specific (SS), indoor/outdoor (IO) specific
# FIRST RETURN POINTS WON performance of the PLAYER over the 60 matches PRIOR TO the match being predicted

# Sort descending on player, surface, indoor/outdoor flag, match date and round, so that within one
# (player, surface, indoor/outdoor) group a negative shift reads from rows further down this ordering
# (earlier matches, assuming m_date sorts chronologically).
df_player2 = df_player2.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# Step 1: lag the opponents' l60, surface- and IO-specific FIRST SERVE POINTS WON stat by 1..60 rows
# within each (player, surface, indoor/outdoor) group, covering the maximum interval (60 matches)
# PRIOR TO the match being predicted.
# Column naming counts down as the shift grows: shift(-1) -> "_60", shift(-2) -> "_59", ..., shift(-60) -> "_1".
for _sos_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_1st_sv_pts_won%_{61 - _sos_lag}"] = (
        df_player2.groupby(['p_id', 't_surf', 't_ind'])['p_opp_1st_sv_pts_won%_l60_tw_ss_IO'].shift(-_sos_lag)
    )
del _sos_lag  # keep the notebook namespace free of loop scratch variables

# Average the 60 opponent slots per row without interpolating gaps: sum() and count() both
# skip NaN, so the ratio is the mean of whatever slots are populated (rows with no populated
# slot end up NaN because 0 / 0). Author's note: matches between players with few prior matches
# on a given surface are filtered out at the modeling stage, and the early ramp-up years are
# used only to accrue retrospective stats, so this feature is rarely missing where it matters.
# Naming: ws = weighted sum, ws_ct = number of non-missing slots, tw = time-weighted.
_l60_slots = [f"SOS_tw_l60_opp_1st_sv_pts_won%_{slot}" for slot in range(60, 0, -1)]
_l60_window = df_player2[_l60_slots]
df_player2["SOS_tw_l60_opp_1st_sv_pts_won%_ws"] = _l60_window.sum(axis=1)
df_player2["SOS_tw_l60_opp_1st_sv_pts_won%_ws_ct"] = _l60_window.count(axis=1)
df_player2["SOS_tw_l60_opp_1st_sv_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_1st_sv_pts_won%_ws"]
    .div(df_player2["SOS_tw_l60_opp_1st_sv_pts_won%_ws_ct"])
    .round(2)
)

# EY = EXPECTED YIELD: the % of FIRST RETURN pts the schedule of opponents faced by this player
# (last 60 matches on this surface and IO status) is expected to give up, i.e. the complement
# of the opponents' average first-serve points won.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_1st_sv_pts_won%"]
del _l60_slots, _l60_window

# Field baseline for the SOS adjustment of FIRST RETURN pts won (l60_tw_ss_IO).
# For every surface / indoor-outdoor / date-window segment we take the mean of
# 'p_1st_sv_pts_won%_l60_tw_ss_IO' over ALL player rows in that segment and flip it
# (100 - mean) so it is expressed as the % of points the field ALLOWS on average.
# Because the game changes over time the baseline is computed per 2-year window; 2009 stands
# alone at the front end (author's note: that year only accrues retrospective stats, and only
# for clay, and is not used for modeling).
# NOTE(review): the window bounds are compared to 'm_date' as string literals - this relies on
# 'm_date' being datetime-like at this point in the notebook; confirm.
_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# (t_surf, t_ind) pairs in processing order: clay indoor, clay outdoor, hard indoor, hard outdoor
_court_types = [(1, 1), (1, 0), (2, 1), (2, 0)]

# Values previously recorded by the author for the six windows of each court type:
#   clay indoor : (nan, 26.81053571428572, 29.143772455089803, 27.162222222222184, 29.074507042253515, 28.148160919540217)
#   clay outdoor: (30.71741071428579, 31.51635117493494, 31.283593220339014, 31.007637116247082, 30.695244565217507, 31.088542713567918)
#   hard indoor : (24.780817073170724, 26.46917960088699, 27.08777332570118, 26.5702660300135, 27.300473867595855, 26.61621917808226)
#   hard outdoor: (27.797555196553617, 28.27525844930429, 28.47468283125167, 27.484783025653172, 27.492883699841258, 27.434285714285835)
_segments = []
for _surf, _ind in _court_types:
    _on_court = (df_player2['t_surf'] == _surf) & (df_player2['t_ind'] == _ind)
    for _after, _through in _date_windows:
        # window is open on the left and closed on the right: after < m_date <= through
        _rows = _on_court & (df_player2['m_date'] > _after) & (df_player2['m_date'] <= _through)
        _field_allows = 100 - (df_player2.loc[_rows, 'p_1st_sv_pts_won%_l60_tw_ss_IO'].mean())
        _segments.append((_rows, _field_allows))

# Put it together: scale the player's own first-return performance over the last 60 by how the
# field baseline compares with what their actual schedule of opponents was expected to yield (EY).
# Opponents' aggregate performance is taken over THEIR l60 prior to facing the player.
# Author's note: an adjustment proportional to the opponents' deviation from the field mean
# performed better than the boosted or blunted variants that were tried.
for _rows, _field_allows in _segments:
    df_player2.loc[_rows, "p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
        (df_player2["p_1st_ret_pts_won%_l60_tw_ss_IO"]) * (_field_allows / df_player2["EY"])
    ).round(2)

# Leave no baseline scratch names behind in the kernel namespace.
del _date_windows, _court_types, _segments, _surf, _ind, _on_court, _after, _through, _rows, _field_allows

# Remove the l60 scratch columns (EY, sum, count, window mean) and lag slots 50..1.
# Slots 60..51 are intentionally kept: the l10 computation that follows reads them.
_l60_scratch = [
    "EY",
    "SOS_tw_l60_opp_1st_sv_pts_won%_ws",
    "SOS_tw_l60_opp_1st_sv_pts_won%_ws_ct",
    "SOS_tw_l60_opp_1st_sv_pts_won%",
] + [f"SOS_tw_l60_opp_1st_sv_pts_won%_{slot}" for slot in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_l60_scratch)
del _l60_scratch

In [169]:
# 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS), indoor/outdoor (IO) specific
# FIRST RETURN POINTS WON performance of the PLAYER over the 10 matches PRIOR TO the match being predicted.

# The l10 window is read from lag columns _60 .. _51 of the l60 first-serve SOS columns.
_l10_lag_cols = [f"SOS_tw_l60_opp_1st_sv_pts_won%_{lag_no}" for lag_no in range(60, 50, -1)]
_l10_window = df_player2[_l10_lag_cols]

# sum() skips NaN and count() tallies only the non-NaN lags, so missing lags are ignored instead of interpolated.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is
# a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_1st_sv_pts_won%_ws"] = _l10_window.sum(axis=1)
df_player2["SOS_tw_l10_opp_1st_sv_pts_won%_ws_ct"] = _l10_window.count(axis=1)
df_player2["SOS_tw_l10_opp_1st_sv_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_1st_sv_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_1st_sv_pts_won%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % of FIRST RETURN pts the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches
# on this surface and IO status is expected to give up (100 minus the opponents' first-serve pts won %).
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_1st_sv_pts_won%"]

del _l10_lag_cols, _l10_window

# Field baselines for the SOS adjustment of FIRST RETURN pts won (l10_tw_ss_IO).
# For every (surface, indoor/outdoor, date window) segment the baseline is 100 minus the mean of
# 'p_1st_sv_pts_won%_l10_tw_ss_IO' over the rows of that segment, i.e. the pct of pts the field ALLOWS on average.
# We need this to evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, baselines are taken over 2-year windows (2009 by itself at the front end; that year is
# only used for retrospective stats accrual [and only for clay], not for modeling) and matched to match-date ranges below.
#
# NOTE(review): the window bounds are plain strings compared against 'm_date'; this presumably relies on 'm_date' being
# datetime-typed at this point so pandas parses them -- confirm.
# Rows that fall in none of the segments below are not assigned a value by this code.

# Each window is (exclusive lower bound, inclusive upper bound).
_sos_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# (t_surf, t_ind) pairs in processing order: clay indoor, clay outdoor, hard indoor, hard outdoor.
_sos_surface_io = [(1, 1), (1, 0), (2, 1), (2, 0)]

# Pass 1: row mask and field baseline for each of the 24 segments.
_sos_segments = []
for _surf_code, _indoor_flag in _sos_surface_io:
    for _window_start, _window_end in _sos_date_windows:
        _in_segment = (
            (df_player2["t_surf"] == _surf_code)
            & (df_player2["t_ind"] == _indoor_flag)
            & (df_player2["m_date"] > _window_start)
            & (df_player2["m_date"] <= _window_end)
        )
        _field_allows = 100 - df_player2.loc[_in_segment, "p_1st_sv_pts_won%_l10_tw_ss_IO"].mean()
        _sos_segments.append((_in_segment, _field_allows))

# Pass 2: scale the player's actual l10 performance by (field baseline / expected yield of the schedule faced),
# i.e. by the opponents' aggregate performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
for _in_segment, _field_allows in _sos_segments:
    df_player2.loc[_in_segment, "p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
        df_player2["p_1st_ret_pts_won%_l10_tw_ss_IO"] * (_field_allows / df_player2["EY"])
    ).round(2)

del _sos_date_windows, _sos_surface_io, _sos_segments
del _surf_code, _indoor_flag, _window_start, _window_end, _in_segment, _field_allows

# Clean up: remove the "EY" helper, the l10 _ws / _ws_ct / mean intermediates, and the l60 first-serve
# lag columns _60 .. _51 that fed the l10 window.
_sos_1st_sv_l10 = "SOS_tw_l10_opp_1st_sv_pts_won%"
_spent_l10_cols = ["EY", f"{_sos_1st_sv_l10}_ws", f"{_sos_1st_sv_l10}_ws_ct", _sos_1st_sv_l10]
_spent_l10_cols += [f"SOS_tw_l60_opp_1st_sv_pts_won%_{lag_no}" for lag_no in range(60, 50, -1)]
df_player2 = df_player2.drop(columns=_spent_l10_cols)
del _sos_1st_sv_l10, _spent_l10_cols

In [170]:
# 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS) SECOND RETURN POINTS WON
# performance of the PLAYER over the 60 matches PRIOR TO the match being predicted.

# Descending sort: within each (p_id, t_surf) group the latest m_date (then highest m_rd_num) comes first,
# so a negative shift pulls values up from rows further down the group, i.e. from earlier matches.
df_player2 = df_player2.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# First obtain the SECOND SERVE POINTS WON performance ('p_opp_2nd_sv_pts_won%_l60_tw_ss') of the player's OPPONENTS
# for each of the 60 matches PRIOR TO the match being predicted.
# Column suffix _60 holds the value 1 row back (shift(-1)); suffix _1 holds the value 60 rows back (shift(-60)).
_opp_2nd_sv_by_player_surface = df_player2.groupby(['p_id', 't_surf'])['p_opp_2nd_sv_pts_won%_l60_tw_ss']
for _matches_back in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_2nd_sv_pts_won%_{61 - _matches_back}"] = _opp_2nd_sv_by_player_surface.shift(-_matches_back)
del _opp_2nd_sv_by_player_surface, _matches_back

# Collapse the 60 per-match opponent slots (_60 .. _1) into one number per row.
# sum(axis=1) skips NaN slots and count(axis=1) tallies only the populated ones, so the ratio is the mean
# over whatever history exists — no interpolation is needed. A row with no populated slot gives 0/0 = NaN.
# Author's note: in the modeling stage, matches between players with few prior matches on a surface are
# filtered out, and the first years only serve as a ramp-up for retrospective stats, so very few modeled
# matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
_opp_l60_slots = df_player2[[f"SOS_tw_l60_opp_2nd_sv_pts_won%_{_k}" for _k in range(60, 0, -1)]]
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws"] = _opp_l60_slots.sum(axis=1)
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct"] = _opp_l60_slots.count(axis=1)
del _opp_l60_slots

# Mean second-serve points won % of the opponents faced, rounded to 2 decimals.
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws"] / df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % of SECOND RETURN points the schedule of opponents faced by this player over
# their last 60 matches on this surface is expected to give up (complement of the opponents' serve-side mean).
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%"]

# Strength-of-schedule adjustment of the player's l60 second-return performance.
#
# For every surface (t_surf 1 = clay, 2 = hard) and every date window below:
#   1. field mean = 100 - mean('p_2nd_sv_pts_won%_l60_tw_ss') over the rows in that surface/window, i.e. the
#      % of second-return points the field ALLOWS on average;
#   2. adjusted value = player's 'p_2nd_ret_pts_won%_l60_tw_ss' * (field mean / EY), rounded to 2 decimals,
#      written only to the rows of that surface/window.
# Because the game changes over time the field mean is taken over 2-year windows; 2009 stands alone at the
# front (author's note: that year only feeds retrospective stats accrual [and only for clay], not modeling).
# Author's note: scaling proportionally to the opponents' deviation from the field mean performed better
# than the boosted or blunted variants that were tried.
# QC: the author expected the field means to sit near 50%. Values recorded in the original notebook
# (windows in the order listed below):
#   clay: 49.09524038461528, 49.54225694444442, 49.5925971563981, 49.45010296010288, 49.0471594202899, 49.0315653298835
#   hard: 48.73842067988659, 49.00169037940372, 49.15574787052807, 48.92828127757199, 49.08781234064256, 49.00359244769502
# NOTE(review): the window bounds are month/day/year strings compared directly with m_date; this presumes
# m_date is a datetime column at this point — confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
for _surf_code in (1, 2):
    for _window_start, _window_end in _sos_windows:
        # Lower bound exclusive, upper bound inclusive.
        _in_window = (df_player2["t_surf"] == _surf_code) & (df_player2['m_date'] > _window_start) & (df_player2['m_date'] <= _window_end)
        _field_allows = 100 - df_player2.loc[_in_window, 'p_2nd_sv_pts_won%_l60_tw_ss'].mean()
        # The right-hand side is computed for every row; .loc keeps only the rows selected by the mask.
        df_player2.loc[_in_window, "p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
            df_player2["p_2nd_ret_pts_won%_l60_tw_ss"] * (_field_allows / df_player2["EY"])
        ).round(2)

# Discard the scratch columns. Slots _60 .. _51 are deliberately kept: the following l10 cell reads them.
_spent_columns = ["EY", "SOS_tw_l60_opp_2nd_sv_pts_won%_ws", "SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct", "SOS_tw_l60_opp_2nd_sv_pts_won%"]
_spent_columns += [f"SOS_tw_l60_opp_2nd_sv_pts_won%_{_k}" for _k in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_spent_columns)

del _sos_windows, _surf_code, _window_start, _window_end, _in_window, _field_allows, _spent_columns

In [171]:
# 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) SECOND RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# Collapse the 10 most recent opponent slots (_60 .. _51) into one number per row.
# sum(axis=1) skips NaN slots and count(axis=1) tallies only the populated ones, so the ratio is the mean
# over whatever history exists — no interpolation is needed. A row with no populated slot gives 0/0 = NaN.
# Author's note: in the modeling stage, matches between players with few prior matches on a surface are
# filtered out, and the first years only serve as a ramp-up for retrospective stats, so very few modeled
# matches will lack this feature.
# NOTE(review): the slots read here are the l60-named columns left behind by the preceding cell (built from
# the opponents' l60 stat); there is no separate l10 opponent column in this computation — confirm intended.
# (ws = weighted sum; tw = time-weighted)
_opp_l10_slots = df_player2[[f"SOS_tw_l60_opp_2nd_sv_pts_won%_{_k}" for _k in range(60, 50, -1)]]
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws"] = _opp_l10_slots.sum(axis=1)
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct"] = _opp_l10_slots.count(axis=1)
del _opp_l10_slots

# Mean second-serve points won % of the opponents faced, rounded to 2 decimals.
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%"] = (
    df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws"] / df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % of SECOND RETURN points the schedule of opponents faced by this player over
# their last 10 matches on this surface is expected to give up (complement of the opponents' serve-side mean).
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%"]

# Strength-of-schedule adjustment of the player's l10 second-return performance.
#
# For every surface (t_surf 1 = clay, 2 = hard) and every date window below:
#   1. field mean = 100 - mean('p_2nd_sv_pts_won%_l10_tw_ss') over the rows in that surface/window, i.e. the
#      % of second-return points the field ALLOWS on average;
#   2. adjusted value = player's 'p_2nd_ret_pts_won%_l10_tw_ss' * (field mean / EY), rounded to 2 decimals,
#      written only to the rows of that surface/window.
# Because the game changes over time the field mean is taken over 2-year windows; 2009 stands alone at the
# front (author's note: that year only feeds retrospective stats accrual [and only for clay], not modeling).
# Author's note: scaling proportionally to the opponents' deviation from the field mean performed better
# than the boosted or blunted variants that were tried.
# QC: the author expected the field means to sit near 50%. Values recorded in the original notebook
# (windows in the order listed below):
#   clay: 49.06131868131865, 49.5478724747475, 49.45854660347554, 49.39018983268983, 48.96871175523334, 49.12830206985776
#   hard: 48.689344900849925, 48.92008468834694, 49.19004429301557, 48.694612669842854, 49.14239503654612, 48.93656404150387
# NOTE(review): the window bounds are month/day/year strings compared directly with m_date; this presumes
# m_date is a datetime column at this point — confirm.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
for _surf_code in (1, 2):
    for _window_start, _window_end in _sos_windows:
        # Lower bound exclusive, upper bound inclusive.
        _in_window = (df_player2["t_surf"] == _surf_code) & (df_player2['m_date'] > _window_start) & (df_player2['m_date'] <= _window_end)
        _field_allows = 100 - df_player2.loc[_in_window, 'p_2nd_sv_pts_won%_l10_tw_ss'].mean()
        # The right-hand side is computed for every row; .loc keeps only the rows selected by the mask.
        df_player2.loc[_in_window, "p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
            df_player2["p_2nd_ret_pts_won%_l10_tw_ss"] * (_field_allows / df_player2["EY"])
        ).round(2)

# Discard the scratch columns, including the remaining opponent slots _60 .. _51.
_spent_columns = ["EY", "SOS_tw_l10_opp_2nd_sv_pts_won%_ws", "SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct", "SOS_tw_l10_opp_2nd_sv_pts_won%"]
_spent_columns += [f"SOS_tw_l60_opp_2nd_sv_pts_won%_{_k}" for _k in range(60, 50, -1)]
df_player2 = df_player2.drop(columns=_spent_columns)

del _sos_windows, _surf_code, _window_start, _window_end, _in_window, _field_allows, _spent_columns

In [172]:
# 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor (IO)-specific SECOND RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Order every player's matches newest-first within each surface / indoor-outdoor combination, so that a
# negative group shift reaches back to earlier matches.
df_player2 = df_player2.sort_values(['p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num'], ascending=False)

# Lay out, per row, the opponents' mean, time-weighted, surface- and IO-specific SECOND SERVE POINTS WON stat
# for each of the up-to-60 matches PRIOR TO the match being predicted.
# Slot _60 is the most recent prior match (shift(-1)) and slot _1 the oldest (shift(-60)); players with a
# shorter history in the group simply get NaN in the older slots.
_opp_2nd_sv_by_venue = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_2nd_sv_pts_won%_l60_tw_ss_IO']
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_2nd_sv_pts_won%_{61 - _lag}"] = _opp_2nd_sv_by_venue.shift(-_lag)
del _opp_2nd_sv_by_venue, _lag  # keep the notebook namespace free of loop temporaries

# Aggregate the 60 lagged opponent columns ("..._60" down to "..._1") into one per-row value.
# sum(axis=1) skips NaN and count(axis=1) counts only non-null entries, so the ratio is the
# mean over whichever lags are present - no interpolation of missing lags is performed.
# Rows with few prior matches on a surface are expected to be filtered out in the modeling
# stage (and the early ramp-up years are used only to accrue retrospective stats), so few
# modeled matches should lack this feature.
# Naming: ws = weighted sum; tw = time-weighted.
_sos_l60_lags = df_player2[[f"SOS_tw_l60_opp_2nd_sv_pts_won%_{k}" for k in range(60, 0, -1)]]
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%"] = (
    df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws"]
    / df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct"]
).round(2)
del _sos_l60_lags  # temporary slice only

# EY = EXPECTED YIELD: the % of SECOND RETURN pts that the SCHEDULE OF OPPONENTS faced by
# this player (last 60 matches on this surface and IO status) is expected to give up,
# computed as 100 minus the opponents' aggregated 2nd-serve pts won %.
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_2nd_sv_pts_won%"]

# Strength-of-schedule (SOS) adjustment for 'p_2nd_ret_pts_won%_l60_tw_ss_IO'.
#
# For every (surface, indoor/outdoor, date window) cell:
#   1. field yield = 100 - mean('p_2nd_sv_pts_won%_l60_tw_ss_IO') over the rows in the cell,
#      i.e. the % of 2nd-serve pts the field ALLOWS on average;
#   2. rows in the cell get
#      'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj' = (player stat * field yield / EY).round(2),
#      an adjustment proportional to the opponents' deviation from the field mean (the
#      original author notes this performed better than boosted or blunted variants tried).
#
# Because the game changes over time, the field mean is taken per window: 2009 on its own
# (used only for retrospective stats accrual, not for modeling), then two-year windows.
# Window bounds are (exclusive start, inclusive end] and are compared against 'm_date'.
#
# Field yields recorded in the original notebook, one entry per window in order:
#   clay indoor : (nan, 44.59464285714285, 48.08191616766467, 47.71438596491226, 48.8462676056338, 47.314137931034516)
#   clay outdoor: (49.09524038461528, 49.52741840731064, 49.6245016949153, 49.513004484304716, 49.00275475543481, 49.01114237855953)
#   hard indoor : (47.89153658536584, 48.322034368071165, 48.63935317687462, 48.530777626193704, 49.28724041811845, 48.79020547945212)
#   hard outdoor: (48.55009154550357, 49.13092942345917, 49.2733210874356, 49.00830496283862, 48.99019270006811, 49.051302857142915)

# (t_surf, t_ind) pairs in processing order: clay indoor, clay outdoor, hard indoor, hard outdoor
_sos_surface_io = [(1, 1), (1, 0), (2, 1), (2, 0)]
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

for _surf, _ind in _sos_surface_io:
    for _after, _through in _sos_windows:
        _in_cell = (
            (df_player2["t_surf"] == _surf)
            & (df_player2["t_ind"] == _ind)
            & (df_player2["m_date"] > _after)
            & (df_player2["m_date"] <= _through)
        )
        # What the field allows on average within this cell
        _field_yield = 100 - df_player2.loc[_in_cell, "p_2nd_sv_pts_won%_l60_tw_ss_IO"].mean()
        # Right-hand side is computed for the whole frame; .loc keeps only the cell's rows
        df_player2.loc[_in_cell, "p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
            df_player2["p_2nd_ret_pts_won%_l60_tw_ss_IO"] * (_field_yield / df_player2["EY"])
        ).round(2)

# Leave no calibration temporaries behind in the notebook namespace
del _sos_surface_io, _sos_windows, _surf, _ind, _after, _through, _in_cell, _field_yield

# Remove the helper columns used to build the l60 adjustment: EY, the sum / count / mean
# aggregates, and lag columns "_50" down to "_1". Lag columns "_60" to "_51" are kept
# because the l10 version of this feature reads them next.
df_player2 = df_player2.drop(
    columns=[
        "EY",
        "SOS_tw_l60_opp_2nd_sv_pts_won%_ws",
        "SOS_tw_l60_opp_2nd_sv_pts_won%_ws_ct",
        "SOS_tw_l60_opp_2nd_sv_pts_won%",
    ]
    + [f"SOS_tw_l60_opp_2nd_sv_pts_won%_{k}" for k in range(50, 0, -1)]
)

In [173]:
# 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), indoor/outdoor (I/O)-specific SECOND RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws"] = df_player2[["SOS_tw_l60_opp_2nd_sv_pts_won%_60", "SOS_tw_l60_opp_2nd_sv_pts_won%_59", "SOS_tw_l60_opp_2nd_sv_pts_won%_58", "SOS_tw_l60_opp_2nd_sv_pts_won%_57", "SOS_tw_l60_opp_2nd_sv_pts_won%_56", "SOS_tw_l60_opp_2nd_sv_pts_won%_55", "SOS_tw_l60_opp_2nd_sv_pts_won%_54", "SOS_tw_l60_opp_2nd_sv_pts_won%_53", "SOS_tw_l60_opp_2nd_sv_pts_won%_52", "SOS_tw_l60_opp_2nd_sv_pts_won%_51"]].sum(axis=1)
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct"] = df_player2[["SOS_tw_l60_opp_2nd_sv_pts_won%_60", "SOS_tw_l60_opp_2nd_sv_pts_won%_59", "SOS_tw_l60_opp_2nd_sv_pts_won%_58", "SOS_tw_l60_opp_2nd_sv_pts_won%_57", "SOS_tw_l60_opp_2nd_sv_pts_won%_56", "SOS_tw_l60_opp_2nd_sv_pts_won%_55", "SOS_tw_l60_opp_2nd_sv_pts_won%_54", "SOS_tw_l60_opp_2nd_sv_pts_won%_53", "SOS_tw_l60_opp_2nd_sv_pts_won%_52", "SOS_tw_l60_opp_2nd_sv_pts_won%_51"]].count(axis=1)
df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%"] = (df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws"]/df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# % SECOND RETURN pts won the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface and IO status is expected to YIELD
# EY is EXPECTED YIELD
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_2nd_sv_pts_won%"]

# Mean % SECOND RETURN pts won performance (l10_tw_ss_IO) for ALL players per surface (clay, hard) and IO status. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment

mean_clay_SOS_1i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_2i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_3i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_4i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_5i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_6i = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# 

mean_clay_SOS_1o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_2o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_3o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_4o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_5o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_clay_SOS_6o = 100 - (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# 

mean_hard_SOS_1i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_2i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_3i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_4i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_5i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_6i = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
# 
                                         
mean_hard_SOS_1o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_2o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_3o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_4o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_5o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average
mean_hard_SOS_6o = 100 - (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_2nd_sv_pts_won%_l10_tw_ss_IO'].mean()) #We want in terms of pct total pts the field ALLOWS on average                                         
# 

# Puts together the above- factors the player's actual performance over the last 60 by schedule of opponents' aggregrate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_2nd_ret_pts_won%_l10_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 
                                         
del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_2nd_sv_pts_won%_ws", "SOS_tw_l10_opp_2nd_sv_pts_won%_ws_ct", "SOS_tw_l10_opp_2nd_sv_pts_won%", "SOS_tw_l60_opp_2nd_sv_pts_won%_60", "SOS_tw_l60_opp_2nd_sv_pts_won%_59", "SOS_tw_l60_opp_2nd_sv_pts_won%_58", "SOS_tw_l60_opp_2nd_sv_pts_won%_57", "SOS_tw_l60_opp_2nd_sv_pts_won%_56", "SOS_tw_l60_opp_2nd_sv_pts_won%_55", "SOS_tw_l60_opp_2nd_sv_pts_won%_54", "SOS_tw_l60_opp_2nd_sv_pts_won%_53", "SOS_tw_l60_opp_2nd_sv_pts_won%_52", "SOS_tw_l60_opp_2nd_sv_pts_won%_51"],axis=1)

In [174]:
# Summarise the frame (row count, column count, dtypes, memory) after the SOS-adjusted feature cell above
df_player2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 279 entries, t_id to p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj
dtypes: datetime64[ns](1), float64(243), int64(29), object(6)
memory usage: 124.4+ MB


In [175]:
# 'p_ace%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) ACE performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted ACED (as returners) performance for player OPPONENTS in the maximum interval (60 matches) prior to the match being predicted 
df_player2["SOS_tw_l60_opp_aced%_60"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-1)
df_player2["SOS_tw_l60_opp_aced%_59"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-2)
df_player2["SOS_tw_l60_opp_aced%_58"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-3)
df_player2["SOS_tw_l60_opp_aced%_57"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-4)
df_player2["SOS_tw_l60_opp_aced%_56"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-5)
df_player2["SOS_tw_l60_opp_aced%_55"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-6)
df_player2["SOS_tw_l60_opp_aced%_54"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-7)
df_player2["SOS_tw_l60_opp_aced%_53"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-8)
df_player2["SOS_tw_l60_opp_aced%_52"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-9)
df_player2["SOS_tw_l60_opp_aced%_51"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-10)
df_player2["SOS_tw_l60_opp_aced%_50"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-11)
df_player2["SOS_tw_l60_opp_aced%_49"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-12)
df_player2["SOS_tw_l60_opp_aced%_48"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-13)
df_player2["SOS_tw_l60_opp_aced%_47"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-14)
df_player2["SOS_tw_l60_opp_aced%_46"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-15)
df_player2["SOS_tw_l60_opp_aced%_45"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-16)
df_player2["SOS_tw_l60_opp_aced%_44"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-17)
df_player2["SOS_tw_l60_opp_aced%_43"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-18)
df_player2["SOS_tw_l60_opp_aced%_42"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-19)
df_player2["SOS_tw_l60_opp_aced%_41"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-20)
df_player2["SOS_tw_l60_opp_aced%_40"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-21)
df_player2["SOS_tw_l60_opp_aced%_39"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-22)
df_player2["SOS_tw_l60_opp_aced%_38"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-23)
df_player2["SOS_tw_l60_opp_aced%_37"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-24)
df_player2["SOS_tw_l60_opp_aced%_36"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-25)
df_player2["SOS_tw_l60_opp_aced%_35"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-26)
df_player2["SOS_tw_l60_opp_aced%_34"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-27)
df_player2["SOS_tw_l60_opp_aced%_33"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-28)
df_player2["SOS_tw_l60_opp_aced%_32"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-29)
df_player2["SOS_tw_l60_opp_aced%_31"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-30)
df_player2["SOS_tw_l60_opp_aced%_30"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-31)
df_player2["SOS_tw_l60_opp_aced%_29"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-32)
df_player2["SOS_tw_l60_opp_aced%_28"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-33)
df_player2["SOS_tw_l60_opp_aced%_27"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-34)
df_player2["SOS_tw_l60_opp_aced%_26"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-35)
df_player2["SOS_tw_l60_opp_aced%_25"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-36)
df_player2["SOS_tw_l60_opp_aced%_24"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-37)
df_player2["SOS_tw_l60_opp_aced%_23"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-38)
df_player2["SOS_tw_l60_opp_aced%_22"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-39)
df_player2["SOS_tw_l60_opp_aced%_21"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-40)
df_player2["SOS_tw_l60_opp_aced%_20"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-41)
df_player2["SOS_tw_l60_opp_aced%_19"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-42)
df_player2["SOS_tw_l60_opp_aced%_18"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-43)
df_player2["SOS_tw_l60_opp_aced%_17"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-44)
df_player2["SOS_tw_l60_opp_aced%_16"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-45)
df_player2["SOS_tw_l60_opp_aced%_15"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-46)
df_player2["SOS_tw_l60_opp_aced%_14"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-47)
df_player2["SOS_tw_l60_opp_aced%_13"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-48)
df_player2["SOS_tw_l60_opp_aced%_12"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-49)
df_player2["SOS_tw_l60_opp_aced%_11"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-50)
df_player2["SOS_tw_l60_opp_aced%_10"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-51)
df_player2["SOS_tw_l60_opp_aced%_9"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-52)
df_player2["SOS_tw_l60_opp_aced%_8"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-53)
df_player2["SOS_tw_l60_opp_aced%_7"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-54)
df_player2["SOS_tw_l60_opp_aced%_6"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-55)
df_player2["SOS_tw_l60_opp_aced%_5"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-56)
df_player2["SOS_tw_l60_opp_aced%_4"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-57)
df_player2["SOS_tw_l60_opp_aced%_3"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-58)
df_player2["SOS_tw_l60_opp_aced%_2"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-59)
df_player2["SOS_tw_l60_opp_aced%_1"] = df_player2.groupby(['p_id','t_surf'])['p_opp_aced%_l60_tw_ss'].shift(-60)

# Average the 60 lagged opponent columns row by row. sum() skips NaN and count() only counts the
# non-NaN entries, so missing lags are ignored rather than interpolated (a row with no lag available
# ends up as 0/0 = NaN). In the modeling stage, matches between players with few prior matches on a
# given surface are filtered out, so very few modeled matches will lack this feature (there is also a
# multi-year ramp-up phase where retrospective stats accrue but those matches are not modeled).
# Naming: ws = weighted sum, tw = time-weighted (labels kept from the time-weighted notebook; the
# computation below is a plain sum divided by a plain count).
_sos_l60_cols = [f"SOS_tw_l60_opp_aced%_{k}" for k in range(60, 0, -1)]  # _60, _59, ..., _1
_opp_lags = df_player2[_sos_l60_cols]
df_player2["SOS_tw_l60_opp_aced%_ws"] = _opp_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_aced%_ws_ct"] = _opp_lags.count(axis=1)
del _sos_l60_cols, _opp_lags

_opp_total = df_player2["SOS_tw_l60_opp_aced%_ws"]
_opp_n = df_player2["SOS_tw_l60_opp_aced%_ws_ct"]
df_player2["SOS_tw_l60_opp_aced%"] = (_opp_total / _opp_n).round(2)
del _opp_total, _opp_n

# EY = EXPECTED YIELD: the % ACES that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface is expected to YIELD (as returners).
df_player2["EY"] = df_player2["SOS_tw_l60_opp_aced%"]

# Field-wide mean of 'p_aced%_l60_tw_ss' (pct aces the field ALLOWS on average, as returners) for
# clay (t_surf == 1) and hard (t_surf == 2). These means are the yardstick against which the schedule
# actually faced by a player is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken per surface over 2-year windows (the first
# window, 2009-2010, is a single year). Each window excludes its start date and includes its end date.
# NOTE(review): the string bounds only compare chronologically if m_date is datetime-like - confirm.
(mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3,
 mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6,
 mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3,
 mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6) = [
    df_player2.loc[
        (df_player2['t_surf'] == surf_code)
        & (df_player2['m_date'] > window_start)
        & (df_player2['m_date'] <= window_end),
        'p_aced%_l60_tw_ss',
    ].mean()
    for surf_code in (1, 2)  # 1 = clay, 2 = hard
    for window_start, window_end in (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
]
# Reference values noted alongside the original code (presumably from an earlier run):
# clay: (4.751923076923076, 5.144352904040411, 5.170843601895729, 5.3639092664092685, 5.642685990338157, 5.498738680465719)
# hard: (7.927379603399439, 8.332086720867183, 8.067735945485508, 8.55715193223926, 8.711470338262792, 8.773131484946445)

# Combine the pieces: scale the player's own 'p_ace%_l60_tw_ss' by (field mean / EY), where EY is the
# aggregate yield of the opponents on the player's recent schedule and the field mean is the one for
# the row's surface and date window.
# Adjustment proportional to opponents' deviation from field mean performs better than various
# boosted or blunted versions attempted.
# NOTE(review): rows with EY == 0 would produce inf/NaN here - confirm that cannot occur.
_l60_adjustment_table = (
    # (t_surf, window start [exclusive], window end [inclusive], field mean)
    (1, "1/1/2009", "1/1/2010", mean_clay_SOS_1),
    (1, "1/1/2010", "1/1/2012", mean_clay_SOS_2),
    (1, "1/1/2012", "1/1/2014", mean_clay_SOS_3),
    (1, "1/1/2014", "1/1/2016", mean_clay_SOS_4),
    (1, "1/1/2016", "1/1/2018", mean_clay_SOS_5),
    (1, "1/1/2018", "1/1/2020", mean_clay_SOS_6),
    (2, "1/1/2009", "1/1/2010", mean_hard_SOS_1),
    (2, "1/1/2010", "1/1/2012", mean_hard_SOS_2),
    (2, "1/1/2012", "1/1/2014", mean_hard_SOS_3),
    (2, "1/1/2014", "1/1/2016", mean_hard_SOS_4),
    (2, "1/1/2016", "1/1/2018", mean_hard_SOS_5),
    (2, "1/1/2018", "1/1/2020", mean_hard_SOS_6),
)
for _surf_code, _window_start, _window_end, _field_mean in _l60_adjustment_table:
    _row_mask = (
        (df_player2["t_surf"] == _surf_code)
        & (df_player2['m_date'] > _window_start)
        & (df_player2['m_date'] <= _window_end)
    )
    # Full-length right-hand side; pandas aligns it on the index and writes only the masked rows.
    _adjusted = (df_player2["p_ace%_l60_tw_ss"] * (_field_mean / df_player2["EY"])).round(2)
    df_player2.loc[_row_mask, "p_ace%_l60_tw_ss_SOS_adj"] = _adjusted
del _l60_adjustment_table, _surf_code, _window_start, _window_end, _field_mean, _row_mask, _adjusted

# Release the per-window field means now that the l60 adjustment has been written.
del mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6
del mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6

# Drop the helper columns of the l60 feature. Lag columns _60 ... _51 are deliberately NOT dropped
# here: only suffixes _50 ... _1 are listed.
df_player2 = df_player2.drop(
    columns=["EY", "SOS_tw_l60_opp_aced%_ws", "SOS_tw_l60_opp_aced%_ws_ct", "SOS_tw_l60_opp_aced%"]
    + [f"SOS_tw_l60_opp_aced%_{k}" for k in range(50, 0, -1)]
)

In [176]:
# 'p_ace%_l10_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS) ACE
# performance of the PLAYER over the 10 matches PRIOR TO the match being predicted.

# Average the ten lag columns _60 ... _51 row by row. sum() skips NaN and count() only counts the
# non-NaN entries, so missing lags are ignored rather than interpolated. In the modeling stage,
# matches between players with few prior matches on a given surface are filtered out, so very few
# modeled matches will lack this feature (there is also a multi-year ramp-up phase where retrospective
# stats accrue but those matches are not modeled).
# Naming: ws = weighted sum, tw = time-weighted.
# NOTE(review): the ten columns averaged here carry the "SOS_tw_l60_opp_aced%" prefix, i.e. they appear
# to hold opponents' l60-based stat, while the field means used for this feature are l10-based -
# confirm this mix is intended.
_sos_l10_cols = [f"SOS_tw_l60_opp_aced%_{k}" for k in range(60, 50, -1)]  # _60, _59, ..., _51
_opp_lags = df_player2[_sos_l10_cols]
df_player2["SOS_tw_l10_opp_aced%_ws"] = _opp_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_aced%_ws_ct"] = _opp_lags.count(axis=1)
del _sos_l10_cols, _opp_lags

_opp_total = df_player2["SOS_tw_l10_opp_aced%_ws"]
_opp_n = df_player2["SOS_tw_l10_opp_aced%_ws_ct"]
df_player2["SOS_tw_l10_opp_aced%"] = (_opp_total / _opp_n).round(2)
del _opp_total, _opp_n

# EY = EXPECTED YIELD: the % ACES that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches on this surface is expected to YIELD (as returners).
df_player2["EY"] = df_player2["SOS_tw_l10_opp_aced%"]

# Field-wide mean of 'p_aced%_l10_tw_ss' (pct aces the field ALLOWS on average, as returners) for
# clay (t_surf == 1) and hard (t_surf == 2). These means are the yardstick against which the schedule
# actually faced by a player is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken per surface over 2-year windows (the first
# window, 2009-2010, is a single year). Each window excludes its start date and includes its end date.
# NOTE(review): the string bounds only compare chronologically if m_date is datetime-like - confirm.
(mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3,
 mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6,
 mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3,
 mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6) = [
    df_player2.loc[
        (df_player2['t_surf'] == surf_code)
        & (df_player2['m_date'] > window_start)
        & (df_player2['m_date'] <= window_end),
        'p_aced%_l10_tw_ss',
    ].mean()
    for surf_code in (1, 2)  # 1 = clay, 2 = hard
    for window_start, window_end in (
        ("1/1/2009", "1/1/2010"),
        ("1/1/2010", "1/1/2012"),
        ("1/1/2012", "1/1/2014"),
        ("1/1/2014", "1/1/2016"),
        ("1/1/2016", "1/1/2018"),
        ("1/1/2018", "1/1/2020"),
    )
]
# Reference values noted alongside the original code (presumably from an earlier run):
# clay: (4.796607142857143, 5.174665404040403, 5.247472353870454, 5.470276705276698, 5.728528180354268, 5.528971539456665)
# hard: (8.055460339943352, 8.363949864498622, 8.111393526405466, 8.858763013940369, 8.617768145504005, 9.034830753529489)

# Combine the pieces: scale the player's own 'p_ace%_l10_tw_ss' by (field mean / EY), where EY is the
# aggregate yield of the opponents faced over the player's last 10 matches and the field mean is the
# one for the row's surface and date window.
# Adjustment proportional to opponents' deviation from field mean performs better than various
# boosted or blunted versions attempted.
# NOTE(review): rows with EY == 0 would produce inf/NaN here - confirm that cannot occur.
_l10_adjustment_table = (
    # (t_surf, window start [exclusive], window end [inclusive], field mean)
    (1, "1/1/2009", "1/1/2010", mean_clay_SOS_1),
    (1, "1/1/2010", "1/1/2012", mean_clay_SOS_2),
    (1, "1/1/2012", "1/1/2014", mean_clay_SOS_3),
    (1, "1/1/2014", "1/1/2016", mean_clay_SOS_4),
    (1, "1/1/2016", "1/1/2018", mean_clay_SOS_5),
    (1, "1/1/2018", "1/1/2020", mean_clay_SOS_6),
    (2, "1/1/2009", "1/1/2010", mean_hard_SOS_1),
    (2, "1/1/2010", "1/1/2012", mean_hard_SOS_2),
    (2, "1/1/2012", "1/1/2014", mean_hard_SOS_3),
    (2, "1/1/2014", "1/1/2016", mean_hard_SOS_4),
    (2, "1/1/2016", "1/1/2018", mean_hard_SOS_5),
    (2, "1/1/2018", "1/1/2020", mean_hard_SOS_6),
)
for _surf_code, _window_start, _window_end, _field_mean in _l10_adjustment_table:
    _row_mask = (
        (df_player2["t_surf"] == _surf_code)
        & (df_player2['m_date'] > _window_start)
        & (df_player2['m_date'] <= _window_end)
    )
    # Full-length right-hand side; pandas aligns it on the index and writes only the masked rows.
    _adjusted = (df_player2["p_ace%_l10_tw_ss"] * (_field_mean / df_player2["EY"])).round(2)
    df_player2.loc[_row_mask, "p_ace%_l10_tw_ss_SOS_adj"] = _adjusted
del _l10_adjustment_table, _surf_code, _window_start, _window_end, _field_mean, _row_mask, _adjusted

# Release the per-window field means now that the l10 adjustment has been written.
del mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6
del mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6

# Drop the helper columns of the l10 feature together with the ten lag columns (_60 ... _51)
# that it averaged.
df_player2 = df_player2.drop(
    columns=["EY", "SOS_tw_l10_opp_aced%_ws", "SOS_tw_l10_opp_aced%_ws_ct", "SOS_tw_l10_opp_aced%"]
    + [f"SOS_tw_l60_opp_aced%_{k}" for k in range(60, 50, -1)]
)

In [177]:
# 'p_ace%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the mean, time-weighted, surface-specific (SS),
# indoor/outdoor (I/O) specific ACE performance of the PLAYER over the 60 matches PRIOR TO the match
# being predicted.

# Order rows descending on every key, so that inside each (player, surface, I/O) group a later row
# has an earlier (m_date, m_rd_num).
df_player2 = df_player2.sort_values(
    by=['p_id','t_surf','t_ind','m_date','m_rd_num'],
    ascending = False,
)

# Collect the OPPONENTS' mean, TW, SS, I/O-specific ACED (as returners) stat for up to 60 earlier rows
# of the same group: suffix _60 is one row back (shift(-1)), suffix _59 two rows back, ...,
# suffix _1 sixty rows back (shift(-60)). Rows with fewer earlier matches in the group get NaN.
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_aced%_{61 - _lag}"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_aced%_l60_tw_ss_IO'].shift(-_lag)
del _lag  # do not leave the loop counter behind in the notebook namespace

# Expected yield (EY) of the schedule faced over the last 60 matches on this surface / IO status.
#
# sum(axis=1) skips NaN, so no interpolation is needed: the sum of the available lag columns is divided by the
# number of non-missing lag columns (count(axis=1)), giving a plain mean over whatever opponents are available.
# In the modeling stage, matches between players with few matches previously played on a given surface will be
# filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent
# (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches
# from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted -- naming kept from the time-weighted notebook; the sum here is unweighted.)
_l60_slot_cols = ["SOS_tw_l60_opp_aced%_" + str(_n) for _n in range(60, 0, -1)]  # "_60", "_59", ..., "_1"

df_player2["SOS_tw_l60_opp_aced%_ws"] = df_player2[_l60_slot_cols].sum(axis=1)
df_player2["SOS_tw_l60_opp_aced%_ws_ct"] = df_player2[_l60_slot_cols].count(axis=1)
# Rows with no available opponent value give 0/0 -> NaN, which is the intended "feature absent" marker.
df_player2["SOS_tw_l60_opp_aced%"] = (df_player2["SOS_tw_l60_opp_aced%_ws"]/df_player2["SOS_tw_l60_opp_aced%_ws_ct"]).round(2)

# % ACES the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface and IO status is expected to YIELD (as returners)
# EY is EXPECTED YIELD
# EY is used below purely as a divisor (field mean / EY). An EY of exactly 0 would turn the adjusted feature into
# +/-inf, so it is treated as missing (NaN) instead.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_aced%"].replace(0, np.nan)

del _l60_slot_cols

# Mean % ACES YIELDED (as returners) performance (l60_tw_ss_IO) across ALL players per surface and IO status.
# We need this to evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface / IO status over 2 year intervals
# (except 2009-2010, which is a single year) and matched to the same match-date range during the SOS adjustment.
# We want it in terms of the pct aces the field ALLOWS on average, hence the 'p_aced%' column.
#
# Field means previously recorded in this notebook (windows 1-6 in chronological order):
#   clay indoor  : (nan, 3.8914285714285723, 6.172095808383229, 6.199590643274852, 5.79950704225352, 6.649425287356321)
#   clay outdoor : (4.751923076923076, 5.140159921671027, 5.129840677966083, 5.2908554674025385, 5.626606657608692, 5.47055276381909)
#   hard indoor  : (8.942207317073182, 9.00124722838137, 8.677784773898116, 8.648062755798078, 8.619017421602791, 8.727815068493152)
#   hard outdoor : (7.452326332794834, 7.880879721669983, 7.771491550330651, 8.399700311675854, 8.72438222625253, 8.652221714285707)

# (t_surf, t_ind) pairs in the order they are processed: clay indoor, clay outdoor, hard indoor, hard outdoor
_sos_contexts = [(1, 1), (1, 0), (2, 1), (2, 0)]
# (exclusive lower bound, inclusive upper bound) on m_date
# NOTE(review): m_date is compared against these string literals; this presumably relies on m_date being datetime-like -- confirm.
_sos_windows = [("1/1/2009", "1/1/2010"), ("1/1/2010", "1/1/2012"), ("1/1/2012", "1/1/2014"),
                ("1/1/2014", "1/1/2016"), ("1/1/2016", "1/1/2018"), ("1/1/2018", "1/1/2020")]

# Puts together the above: scales the player's actual performance over the last 60 by
# (field mean / EY), where EY is the schedule of opponents' aggregate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
for _sos_surf, _sos_ind in _sos_contexts:
    for _sos_after, _sos_through in _sos_windows:
        _sos_rows = ((df_player2['t_surf'] == _sos_surf) & (df_player2['t_ind'] == _sos_ind)
                     & (df_player2['m_date'] > _sos_after) & (df_player2['m_date'] <= _sos_through))
        # Field mean for exactly the rows that receive the adjustment below.
        _sos_field_mean = df_player2.loc[_sos_rows, 'p_aced%_l60_tw_ss_IO'].mean()
        df_player2.loc[_sos_rows, "p_ace%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l60_tw_ss_IO"])*(_sos_field_mean/df_player2["EY"])).round(2)

del _sos_contexts, _sos_windows, _sos_surf, _sos_ind, _sos_after, _sos_through, _sos_rows, _sos_field_mean

# Remove the l60 scratch columns. Lag columns "_60" ... "_51" are deliberately kept: the l10 version of this
# feature (next cell) is built from them.
df_player2 = df_player2.drop(
    columns=["EY", "SOS_tw_l60_opp_aced%_ws", "SOS_tw_l60_opp_aced%_ws_ct", "SOS_tw_l60_opp_aced%"]
    + ["SOS_tw_l60_opp_aced%_" + str(_n) for _n in range(50, 0, -1)]
)

In [178]:
# 'p_ace%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), I/O specific ACE performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

# Expected yield (EY) of the schedule faced over the last 10 matches on this surface / IO status.
#
# sum(axis=1) skips NaN, so no interpolation is needed: the sum of the available lag columns is divided by the
# number of non-missing lag columns (count(axis=1)), giving a plain mean over whatever opponents are available.
# In the modeling stage, matches between players with few matches previously played on a given surface will be
# filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent
# (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches
# from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted -- naming kept from the time-weighted notebook; the sum here is unweighted.)
# The inputs are the ten "SOS_tw_l60_opp_aced%_60" ... "_51" lag columns left over from the l60 feature
# (presumably the ten most recent opponents -- confirm against where those columns are created).
_l10_slot_cols = ["SOS_tw_l60_opp_aced%_" + str(_n) for _n in range(60, 50, -1)]  # "_60", "_59", ..., "_51"

df_player2["SOS_tw_l10_opp_aced%_ws"] = df_player2[_l10_slot_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_aced%_ws_ct"] = df_player2[_l10_slot_cols].count(axis=1)
# Rows with no available opponent value give 0/0 -> NaN, which is the intended "feature absent" marker.
df_player2["SOS_tw_l10_opp_aced%"] = (df_player2["SOS_tw_l10_opp_aced%_ws"]/df_player2["SOS_tw_l10_opp_aced%_ws_ct"]).round(2)

# % ACES the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface and IO status is expected to YIELD (as returners)
# EY is EXPECTED YIELD
# EY is used below purely as a divisor (field mean / EY). An EY of exactly 0 would turn the adjusted feature into
# +/-inf, so it is treated as missing (NaN) instead.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_aced%"].replace(0, np.nan)

del _l10_slot_cols

# Mean % ACES YIELDED (as returners) performance (l10_tw_ss_IO) across ALL players per surface and IO status.
# We need this to evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface / IO status over 2 year intervals
# (except 2009-2010, which is a single year) and matched to the same match-date range during the SOS adjustment.
# We want it in terms of the pct aces the field ALLOWS on average, hence the 'p_aced%' column.

# (exclusive lower bound, inclusive upper bound) on m_date, for windows 1 ... 6
# NOTE(review): m_date is compared against these string literals; this presumably relies on m_date being datetime-like -- confirm.
_l10_sos_windows = [("1/1/2009", "1/1/2010"), ("1/1/2010", "1/1/2012"), ("1/1/2012", "1/1/2014"),
                    ("1/1/2014", "1/1/2016"), ("1/1/2016", "1/1/2018"), ("1/1/2018", "1/1/2020")]


def _l10_field_means(surf, indoor):
    """Return the six per-window means of 'p_aced%_l10_tw_ss_IO' for one surface / indoor flag.

    surf   -- value matched against df_player2['t_surf'] (1 is used for clay, 2 for hard below)
    indoor -- value matched against df_player2['t_ind'] (1 for the "i" variables, 0 for the "o" variables)

    The result is a list ordered like _l10_sos_windows; a window with no matching rows yields NaN.
    """
    in_ctx = (df_player2['t_surf'] == surf) & (df_player2['t_ind'] == indoor)
    return [
        df_player2.loc[in_ctx & (df_player2['m_date'] > after) & (df_player2['m_date'] <= through), 'p_aced%_l10_tw_ss_IO'].mean()
        for after, through in _l10_sos_windows
    ]


# The individual names are kept because the SOS adjustment statements that follow read them one by one.
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = _l10_field_means(1, 1)   # clay, indoor

(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = _l10_field_means(1, 0)   # clay, outdoor

(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = _l10_field_means(2, 1)   # hard, indoor

(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = _l10_field_means(2, 0)   # hard, outdoor

del _l10_field_means, _l10_sos_windows

# Puts together the above: scales the player's actual performance over the last 10 ("p_ace%_l10_tw_ss_IO") by
# (field mean / EY), where EY is the schedule of opponents' aggregate expected yield computed above.
# Each statement handles one surface / IO status / match-date window and uses the field mean of that same window.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 

# Clay (t_surf == 1), indoor (t_ind == 1)
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2) 

# Clay (t_surf == 1), outdoor (t_ind == 0)
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

# Hard (t_surf == 2), indoor (t_ind == 1)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

# Hard (t_surf == 2), outdoor (t_ind == 0)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_ace%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_ace%_l10_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_aced%_ws", "SOS_tw_l10_opp_aced%_ws_ct", "SOS_tw_l10_opp_aced%", "SOS_tw_l60_opp_aced%_60", "SOS_tw_l60_opp_aced%_59", "SOS_tw_l60_opp_aced%_58", "SOS_tw_l60_opp_aced%_57", "SOS_tw_l60_opp_aced%_56", "SOS_tw_l60_opp_aced%_55", "SOS_tw_l60_opp_aced%_54", "SOS_tw_l60_opp_aced%_53", "SOS_tw_l60_opp_aced%_52", "SOS_tw_l60_opp_aced%_51"],axis=1)


In [179]:
# 'p_aced%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) ACED performance of PLAYER over the 60 matches PRIOR TO the match being predicted
#
# Steps performed by this cell:
#   1. For every row, collect the opponents' ACE (as servers) l60 performance from each of the (up to) 60 previous rows of the same player on the same surface.
#   2. Average those values (NaN ignored) to get the Expected Yield (EY) of the schedule the player faced.
#   3. Compute the field-wide mean ACE performance per surface and per date window.
#   4. Scale the player's ACED performance by (field mean / EY) and store it in "p_aced%_l60_tw_ss_SOS_adj".
#   5. Drop the temporary columns, except the 10 most recent lag columns (_60 ... _51), which the l10 cell that follows reads.

# Newest-first ordering within each player/surface group, so that shift(-k) below pulls the value from the row k positions further down, i.e. from an earlier match
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted ACE (as servers) performance for player OPPONENTS in the maximum interval (60 matches) prior to the match being predicted
# Column suffix _60 holds shift(-1) (the immediately preceding row in the group) and suffix _1 holds shift(-60) (the furthest one back).
# The groupby does not depend on the lag, so it is built once and reused for all 60 shifts.
opp_ace_by_player_surface = df_player2.groupby(['p_id','t_surf'])['p_opp_ace%_l60_tw_ss']
sos_lag_cols = []
for matches_back in range(1, 61):
    lag_col = f"SOS_tw_l60_opp_ace%_{61 - matches_back}"
    df_player2[lag_col] = opp_ace_by_player_surface.shift(-matches_back)
    sos_lag_cols.append(lag_col)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
#(ws = weighted sum; tw = time-weighted)
sos_lags = df_player2[sos_lag_cols]
df_player2["SOS_tw_l60_opp_ace%_ws"] = sos_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_ace%_ws_ct"] = sos_lags.count(axis=1)
# Sum divided by the number of non-NaN lags; rows with no prior matches give 0/0 = NaN
df_player2["SOS_tw_l60_opp_ace%"] = (df_player2["SOS_tw_l60_opp_ace%_ws"]/df_player2["SOS_tw_l60_opp_ace%_ws_ct"]).round(2)

# % ACES (as servers) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface is expected to achieve
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_tw_l60_opp_ace%"]

# Mean % ACES (as servers) performance (l60_tw_ss) achieved across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (except 2009-2010), and match to match date ranges during the SOS adjustment
# Each window is (exclusive start, inclusive end), compared against m_date exactly as written here.
sos_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means noted from an earlier run, in window order:
#   clay (t_surf == 1): (5.030322802197798, 5.263276515151516, 5.459172195892575, 5.533500643500631, 5.679413848631253, 5.4340135834411365)
#   hard (t_surf == 2): (8.301044617563724, 8.493678861788624, 8.18068994889265, 8.875939650608824, 8.602100968893447, 8.904152066677968)

# Puts together the above- factors the player's actual performance over the last 60 by schedule of opponents' aggregate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Rows on other surfaces or outside every date window are never assigned and therefore stay NaN in the adjusted column.
for surface_code in (1, 2):  # 1 = clay, 2 = hard
    on_surface = df_player2["t_surf"] == surface_code
    for window_start, window_end in sos_date_windows:
        in_window = on_surface & (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end)
        # We want in terms of pct aces the field achieved on average
        field_mean_ace = df_player2.loc[in_window, 'p_ace%_l60_tw_ss'].mean()
        df_player2.loc[in_window, "p_aced%_l60_tw_ss_SOS_adj"] = ((df_player2["p_aced%_l60_tw_ss"])*(field_mean_ace/df_player2["EY"])).round(2)

# Remove the scratch columns. Lag columns _60 ... _51 are deliberately kept: the l10 version of this feature in the next cell is built from them.
sos_scratch_cols = ["EY", "SOS_tw_l60_opp_ace%_ws", "SOS_tw_l60_opp_ace%_ws_ct", "SOS_tw_l60_opp_ace%"]
sos_scratch_cols += [f"SOS_tw_l60_opp_ace%_{suffix}" for suffix in range(50, 0, -1)]
df_player2 = df_player2.drop(sos_scratch_cols,axis=1)

# Keep the notebook namespace free of this cell's temporaries
del opp_ace_by_player_surface, sos_lag_cols, sos_lags, sos_date_windows, sos_scratch_cols
del matches_back, lag_col, surface_code, on_surface, window_start, window_end, in_window, field_mean_ace

In [180]:
# 'p_aced%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) ACED performance of PLAYER over the 10 matches PRIOR TO the match being predicted
#
# Reads from df_player2: the slot columns "SOS_tw_l60_opp_ace%_60" ... "SOS_tw_l60_opp_ace%_51" (expected to be left in place by an earlier cell), 'p_ace%_l10_tw_ss', "p_aced%_l10_tw_ss", 't_surf' and 'm_date'.
# Writes to df_player2: the column "p_aced%_l10_tw_ss_SOS_adj"; the ten slot columns are dropped at the end of the cell.

# The ten slot columns (_60 down to _51) that feed the l10 schedule-strength figure.
# NOTE(review): the slot columns carry an "l60" name, while the narrative below speaks of opponents' form over THEIR l10 - confirm which window the cell that built these columns actually used.
sos_l10_slot_cols = [f"SOS_tw_l60_opp_ace%_{slot}" for slot in range(60, 50, -1)]

# Using sum()/count() row-wise skips NaN slots instead of interpolating them, which is preferred in metrics of this type.
# A row with no populated slot ends up as 0/0 = NaN and therefore gets a NaN adjusted value.
# As a further note, in the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase
# of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# (In the original column names, ws = weighted sum and tw = time-weighted; in this cell the slots are combined as a plain NaN-skipping mean.)
sos_l10_sum = df_player2[sos_l10_slot_cols].sum(axis=1)
sos_l10_n = df_player2[sos_l10_slot_cols].count(axis=1)

# % ACES (as servers) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to achieve
# EY is EXPECTED YIELD
sos_l10_expected_yield = (sos_l10_sum / sos_l10_n).round(2)

# Mean % ACES (as servers) performance (l10_tw_ss) achieved across ALL players per surface. We need this to evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface over 2 year intervals (except 2009-2010), and the same date ranges select the rows that receive each adjustment.
# Each window is exclusive of its start date and inclusive of its end date.
# NOTE(review): the bounds are compared to 'm_date' exactly as written; this is only a chronological comparison if 'm_date' is a datetime column in df_player2 - confirm.
sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Field means noted in the original comments (possibly from a different run of this workbook), in period order:
#   t_surf == 1 ("clay" in the original variable names): (5.064196428571427, 5.285527146464641, 5.4279241706161, 5.54362612612613, 5.72822544283413, 5.5415006468305315)
#   t_surf == 2 ("hard" in the original variable names): (8.437960339943348, 8.449478319783204, 8.171332197615, 8.99304393859184, 8.57786673465922, 9.113129783976843)

# Puts together the above - scales the player's actual performance over the last 10 by the field mean relative to the expected yield of the schedule of opponents faced.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Rows that match none of the (surface, period) strata are not assigned.
for surface_code in (1, 2):
    for period_start, period_end in sos_periods:
        in_stratum = (df_player2["t_surf"] == surface_code) & (df_player2['m_date'] > period_start) & (df_player2['m_date'] <= period_end)

        # We want this in terms of pct aces the field achieved on average within the stratum
        field_mean = df_player2.loc[in_stratum, 'p_ace%_l10_tw_ss'].mean()

        # Same operation order as before: field mean / expected yield first, then times the player's aced%, then round
        df_player2.loc[in_stratum, "p_aced%_l10_tw_ss_SOS_adj"] = ((df_player2["p_aced%_l10_tw_ss"])*(field_mean/sos_l10_expected_yield)).round(2)

# The slot columns are no longer needed once the adjusted feature exists
df_player2 = df_player2.drop(sos_l10_slot_cols, axis=1)

# Remove the cell's temporaries from the notebook namespace
del sos_l10_slot_cols, sos_l10_sum, sos_l10_n, sos_l10_expected_yield, sos_periods, surface_code, period_start, period_end, in_stratum, field_mean

In [181]:
# 'p_aced%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), I/O specific ACED performance of PLAYER over the 60 matches PRIOR TO the match being predicted
#
# Reads from df_player2: 'p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num', 'p_opp_ace%_l60_tw_ss_IO', 'p_ace%_l60_tw_ss_IO' and "p_aced%_l60_tw_ss_IO".
# Writes to df_player2: re-sorts the rows, adds "p_aced%_l60_tw_ss_IO_SOS_adj", and leaves the slot columns "SOS_tw_l60_opp_ace%_60" ... "SOS_tw_l60_opp_ace%_51" in place
# (the following cell reads them); slot columns _50 ... _1 are dropped at the end of the cell.

# Newest row first within each player / surface / indoor-outdoor stream, so that a negative shift reaches back to earlier matches
df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, TW, SS, IO specific ACE (as servers) performance for player OPPONENTS in the maximum interval (60 matches) prior to the match being predicted
# Slot _60 holds the value from the row 1 position further down the sorted stream, slot _59 the row 2 positions down, ... slot _1 the row 60 positions down.
sos_opp_ace_by_stream = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_ace%_l60_tw_ss_IO']
sos_l60_slot_cols = []
for matches_back in range(1, 61):
    slot_col = f"SOS_tw_l60_opp_ace%_{61 - matches_back}"
    df_player2[slot_col] = sos_opp_ace_by_stream.shift(-matches_back)
    sos_l60_slot_cols.append(slot_col)

# Using sum()/count() row-wise skips NaN slots instead of interpolating them. A row with no populated slot ends up as 0/0 = NaN and therefore gets a NaN adjusted value.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out,
# so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase
# of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
# (In the original column names, ws = weighted sum and tw = time-weighted; in this cell the slots are combined as a plain NaN-skipping mean.)
sos_l60_sum = df_player2[sos_l60_slot_cols].sum(axis=1)
sos_l60_n = df_player2[sos_l60_slot_cols].count(axis=1)

# % ACES (as servers) the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface and IO status is expected to achieve
# EY is EXPECTED YIELD
sos_l60_expected_yield = (sos_l60_sum / sos_l60_n).round(2)

# Mean % ACES (as servers) performance (l60_tw_ss_IO) achieved across ALL players per surface and IO status. We need this to evaluate and calibrate the schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, the mean is calculated per surface and IO status over 2 year intervals (except 2009-2010), and the same date ranges select the rows that receive each adjustment.
# Each window is exclusive of its start date and inclusive of its end date.
# NOTE(review): the bounds are compared to 'm_date' exactly as written; this is only a chronological comparison if 'm_date' is a datetime column in df_player2 - confirm.
sos_periods = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Field means noted in the original comments (possibly from a different run of this workbook), in period order.
# Surface / IO labels follow the original variable names (t_surf 1 = "clay", 2 = "hard"; t_ind 1 = "i", 0 = "o"):
#   clay, t_ind == 1: (nan, 6.845714285714286, 5.663712574850302, 7.752105263157896, 6.169154929577465, 6.458045977011496)
#   clay, t_ind == 0: (5.030322802197798, 5.2573727154047, 5.446922033898306, 5.45049327354259, 5.679850543478258, 5.394542713567836)
#   hard, t_ind == 1: (10.158646341463418, 9.53692350332596, 9.120337721808827, 9.189208731241465, 8.666599303135897, 9.14351369863013)
#   hard, t_ind == 0: (8.009913839526126, 8.059985089463233, 7.971156012735713, 8.716876048909137, 8.615973702108372, 8.781471999999956)
# A NaN field mean (as recorded for the first clay / t_ind == 1 period) makes every adjusted value in that stratum NaN.

# Puts together the above - scales the player's actual performance over the last 60 by the field mean relative to the expected yield of the schedule of opponents faced.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# Rows that match none of the (surface, IO status, period) strata are not assigned.
for surface_code in (1, 2):
    for indoor_flag in (1, 0):
        for period_start, period_end in sos_periods:
            in_stratum = (df_player2["t_surf"] == surface_code) & (df_player2['t_ind'] == indoor_flag) & (df_player2['m_date'] > period_start) & (df_player2['m_date'] <= period_end)

            # We want this in terms of pct aces the field ACHIEVES (as servers) on average within the stratum
            field_mean = df_player2.loc[in_stratum, 'p_ace%_l60_tw_ss_IO'].mean()

            # Same operation order as before: field mean / expected yield first, then times the player's aced%, then round
            df_player2.loc[in_stratum, "p_aced%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_aced%_l60_tw_ss_IO"])*(field_mean/sos_l60_expected_yield)).round(2)

# Keep slots _60 ... _51 (the first ten entries of the list) for the l10 version of this feature; drop slots _50 ... _1
df_player2 = df_player2.drop(sos_l60_slot_cols[10:], axis=1)

# Remove the cell's temporaries from the notebook namespace
del sos_opp_ace_by_stream, sos_l60_slot_cols, slot_col, matches_back, sos_l60_sum, sos_l60_n, sos_l60_expected_yield, sos_periods, surface_code, indoor_flag, period_start, period_end, in_stratum, field_mean

In [182]:
# 'p_aced%_l10_tw_ss_IO_SOS_adj'
# Strength-of-Schedule-adjusted (SOS_adj) version of 'p_aced%_l10_tw_ss_IO': the player's surface-specific (ss),
# indoor/outdoor-specific (IO) ACED performance over the 10 matches PRIOR TO the match being predicted,
# rescaled by how the opponents actually faced compare with the field.
#
# Steps:
#   1. EY ("expected yield") = row mean of the ten lag columns SOS_tw_l60_opp_ace%_60 .. _51.
#   2. For each stratum (surface x indoor/outdoor x date window) take the field mean of 'p_ace%_l60_tw_ss_IO'.
#   3. adjusted value = raw value * (field mean / EY), rounded to 2 decimals.
# NOTE(review): the lag columns and the field-mean column are both named "l60", so the schedule is presumably
# rated on the opponents' l60 ace% (not their l10) - confirm against the cell that builds the lag columns.

# The ten lag columns (_60 down to _51) that the preceding cell leaves in place.
sos_l10_lag_cols = [f"SOS_tw_l60_opp_ace%_{lag}" for lag in range(60, 50, -1)]
opp_lags_l10 = df_player2[sos_l10_lag_cols]

# sum() and count() both skip NaN, so EY averages only the lags that exist instead of interpolating missing ones.
# Per the modeling plan, matches between players with few prior matches on a given surface are filtered out later
# (and the first years only serve as a ramp-up in which retrospective stats accrue), so very few modeled matches
# should lack this feature.
# ("_ws" = weighted sum and "tw" = time-weighted in the column-naming scheme; no per-lag weights are applied here.)
df_player2["SOS_tw_l10_opp_ace%_ws"] = opp_lags_l10.sum(axis=1)
df_player2["SOS_tw_l10_opp_ace%_ws_ct"] = opp_lags_l10.count(axis=1)
df_player2["SOS_tw_l10_opp_ace%"] = (df_player2["SOS_tw_l10_opp_ace%_ws"] / df_player2["SOS_tw_l10_opp_ace%_ws_ct"]).round(2)

# EY is EXPECTED YIELD: the % ACES (as servers) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches on this surface and IO status is expected to achieve.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_ace%"]

# Because the game changes over time, the field mean is taken per date window: two-year windows, except the
# first one (2009), which spans a single year. Each window is (exclusive start, inclusive end].
# NOTE(review): the bounds are plain strings; the comparisons are only chronological if m_date is datetime-typed
# at this point - confirm the upstream conversion.
sos_date_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)

# One pass per stratum, in the same order as the original unrolled statements:
# t_surf 1 = clay, 2 = hard; t_ind 1 = indoor, 0 = outdoor.
# Rows that fall in no stratum (other surfaces, dates outside the windows) are not assigned here.
for surf_code in (1, 2):
    for indoor_flag in (1, 0):
        for window_start, window_end in sos_date_windows:
            in_stratum = (
                (df_player2["t_surf"] == surf_code)
                & (df_player2["t_ind"] == indoor_flag)
                & (df_player2["m_date"] > window_start)
                & (df_player2["m_date"] <= window_end)
            )

            # Mean % ACES (as servers) across ALL player rows in this stratum; this is the yardstick the
            # schedule actually faced (EY) is calibrated against.
            # Original author's note: "We want in terms of pct ace the field ALLOWS on average".
            field_mean = df_player2.loc[in_stratum, "p_ace%_l60_tw_ss_IO"].mean()

            # Adjustment proportional to the opponents' deviation from the field mean; per the original author this
            # performed better than the various boosted or blunted versions attempted.
            # NOTE(review): rows with EY == 0 produce +/-inf (or NaN when the raw value is also 0); left unchanged.
            df_player2.loc[in_stratum, "p_aced%_l10_tw_ss_IO_SOS_adj"] = (
                df_player2["p_aced%_l10_tw_ss_IO"] * (field_mean / df_player2["EY"])
            ).round(2)

# Remove the scratch columns (including the last ten lag columns) now that the adjusted feature exists.
df_player2 = df_player2.drop(
    columns=["EY", "SOS_tw_l10_opp_ace%_ws", "SOS_tw_l10_opp_ace%_ws_ct", "SOS_tw_l10_opp_ace%"] + sos_l10_lag_cols
)

# Keep the notebook namespace free of this cell's temporaries.
del sos_l10_lag_cols, opp_lags_l10, sos_date_windows
del surf_code, indoor_flag, window_start, window_end, in_stratum, field_mean

In [183]:
# 'p_df%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to the mean, time-weighted, surface-specific (SS)
# Double Fault performance of PLAYER over the 60 matches PRIOR TO the match being predicted.

# Sort every key descending so that, within a (player, surface) group, a negative shift reads from rows
# further down the frame, i.e. from matches that sort earlier by m_date / m_rd_num.
df_player2 = df_player2.sort_values(['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# Build the 60 lag columns of 'p_opp_df_induce%_l60_tw_ss' (per its name, the opponent's DOUBLE FAULT
# INDUCTION, as returner, over the maximum 60-match interval).
# Naming: suffix _60 holds lag 1 (the immediately preceding row of the group) and suffix _1 holds lag 60.
# Groups with fewer prior rows get NaN in the deeper lags.
for sos_lag in range(1, 61):
    df_player2["SOS_tw_l60_opp_df_induce%_" + str(61 - sos_lag)] = (
        df_player2.groupby(['p_id', 't_surf'])['p_opp_df_induce%_l60_tw_ss'].shift(-sos_lag)
    )
del sos_lag

# Average the 60 lag columns row by row. sum() and count() both skip NaN, so missing lags are ignored rather
# than interpolated. Per the modeling plan, matches between players with few prior matches on a given surface
# are filtered out later (and the first years only serve as a ramp-up in which retrospective stats accrue),
# so very few modeled matches should lack this feature.
# ("_ws" = weighted sum and "tw" = time-weighted in the column-naming scheme; no per-lag weights are applied here.)
sos_l60_lag_cols = ["SOS_tw_l60_opp_df_induce%_" + str(suffix) for suffix in range(60, 0, -1)]
opp_lags_l60 = df_player2[sos_l60_lag_cols]

df_player2["SOS_tw_l60_opp_df_induce%_ws"] = opp_lags_l60.sum(axis=1)
df_player2["SOS_tw_l60_opp_df_induce%_ws_ct"] = opp_lags_l60.count(axis=1)
# Rows with no available lag divide 0 by 0 and end up NaN.
df_player2["SOS_tw_l60_opp_df_induce%"] = (
    df_player2["SOS_tw_l60_opp_df_induce%_ws"] / df_player2["SOS_tw_l60_opp_df_induce%_ws_ct"]
).round(2)

# EY is EXPECTED YIELD: the % DF INDUCE that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface is expected to "YIELD".
df_player2["EY"] = df_player2["SOS_tw_l60_opp_df_induce%"]

del sos_l60_lag_cols, opp_lags_l60

# Field-wide mean of 'p_df_induce%_l60_tw_ss' (% DF INDUCE "YIELDED", l60_tw_ss) across ALL player rows, per surface.
# It is the yardstick used to evaluate and calibrate the schedule a player actually faced before a given match.
# Original author's note: "We want in terms of pct df induce the field ALLOWS on average".
# Because the game changes over time, the mean is taken per date window: two-year windows, except the first
# one (2009), which spans a single year. Each window is (exclusive start, inclusive end].
# NOTE(review): the bounds are plain strings; the comparisons are only chronological if m_date is datetime-typed
# at this point - confirm the upstream conversion.
sos_date_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)

clay_field_means = []
hard_field_means = []
for window_start, window_end in sos_date_windows:
    in_window = (df_player2['m_date'] > window_start) & (df_player2['m_date'] <= window_end)
    # t_surf 1 = clay, 2 = hard
    clay_field_means.append(df_player2.loc[(df_player2['t_surf'] == 1) & in_window, 'p_df_induce%_l60_tw_ss'].mean())
    hard_field_means.append(df_player2.loc[(df_player2['t_surf'] == 2) & in_window, 'p_df_induce%_l60_tw_ss'].mean())

# Expose the results under the per-window names used by the adjustment step.
mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6 = clay_field_means
# Clay values noted by the original author (presumably from an earlier run; may not match this no-decay version):
# (3.4963530219780243, 3.2184816919191963, 3.2560410742495995, 3.2989607464607515, 3.3633590982286705, 3.4544598965071187)

mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6 = hard_field_means
# Hard values noted by the original author (same caveat):
# (3.8297379603399486, 3.6832723577235877, 3.7540664395230006, 3.878284806776084, 4.076469488356281, 4.041932301411816)

del sos_date_windows, clay_field_means, hard_field_means, window_start, window_end, in_window

# Puts together the above: scales the player's actual performance over the last 60 ('p_df%_l60_tw_ss') by the
# schedule of opponents' aggregate performance over THEIR l60 prior to when they faced the player (EY),
# relative to the field mean of the matching surface and date window.
# Adjustment proportional to the opponents' deviation from the field mean performs better than the various
# boosted or blunted versions attempted.
# Rows on other surfaces or outside the date windows are not assigned here.
# NOTE(review): rows with EY == 0 produce +/-inf (or NaN when the raw value is also 0).
sos_adj_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
sos_adj_field_means = (
    # (t_surf code, field means for windows 1-6); 1 = clay, 2 = hard
    (1, (mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6)),
    (2, (mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6)),
)

for surf_code, window_means in sos_adj_field_means:
    for (window_start, window_end), field_mean in zip(sos_adj_windows, window_means):
        in_stratum = (
            (df_player2["t_surf"] == surf_code)
            & (df_player2['m_date'] > window_start)
            & (df_player2['m_date'] <= window_end)
        )
        df_player2.loc[in_stratum, "p_df%_l60_tw_ss_SOS_adj"] = (
            df_player2["p_df%_l60_tw_ss"] * (field_mean / df_player2["EY"])
        ).round(2)

# Only this block's own temporaries are removed; the mean_*_SOS_* scalars stay defined for the rest of the cell.
del sos_adj_windows, sos_adj_field_means, surf_code, window_means, window_start, window_end, field_mean, in_stratum

del mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6, mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_df_induce%_ws", "SOS_tw_l60_opp_df_induce%_ws_ct", "SOS_tw_l60_opp_df_induce%", "SOS_tw_l60_opp_df_induce%_50", "SOS_tw_l60_opp_df_induce%_49", "SOS_tw_l60_opp_df_induce%_48", "SOS_tw_l60_opp_df_induce%_47", "SOS_tw_l60_opp_df_induce%_46", "SOS_tw_l60_opp_df_induce%_45", "SOS_tw_l60_opp_df_induce%_44", "SOS_tw_l60_opp_df_induce%_43", "SOS_tw_l60_opp_df_induce%_42", "SOS_tw_l60_opp_df_induce%_41", "SOS_tw_l60_opp_df_induce%_40", "SOS_tw_l60_opp_df_induce%_39", "SOS_tw_l60_opp_df_induce%_38", "SOS_tw_l60_opp_df_induce%_37", "SOS_tw_l60_opp_df_induce%_36", "SOS_tw_l60_opp_df_induce%_35", "SOS_tw_l60_opp_df_induce%_34", "SOS_tw_l60_opp_df_induce%_33", "SOS_tw_l60_opp_df_induce%_32", "SOS_tw_l60_opp_df_induce%_31", "SOS_tw_l60_opp_df_induce%_30", "SOS_tw_l60_opp_df_induce%_29", "SOS_tw_l60_opp_df_induce%_28", "SOS_tw_l60_opp_df_induce%_27", "SOS_tw_l60_opp_df_induce%_26", "SOS_tw_l60_opp_df_induce%_25", "SOS_tw_l60_opp_df_induce%_24", "SOS_tw_l60_opp_df_induce%_23", "SOS_tw_l60_opp_df_induce%_22", "SOS_tw_l60_opp_df_induce%_21", "SOS_tw_l60_opp_df_induce%_20", "SOS_tw_l60_opp_df_induce%_19", "SOS_tw_l60_opp_df_induce%_18", "SOS_tw_l60_opp_df_induce%_17", "SOS_tw_l60_opp_df_induce%_16", "SOS_tw_l60_opp_df_induce%_15", "SOS_tw_l60_opp_df_induce%_14", "SOS_tw_l60_opp_df_induce%_13", "SOS_tw_l60_opp_df_induce%_12", "SOS_tw_l60_opp_df_induce%_11", "SOS_tw_l60_opp_df_induce%_10", "SOS_tw_l60_opp_df_induce%_9", "SOS_tw_l60_opp_df_induce%_8", "SOS_tw_l60_opp_df_induce%_7", "SOS_tw_l60_opp_df_induce%_6", "SOS_tw_l60_opp_df_induce%_5", "SOS_tw_l60_opp_df_induce%_4", "SOS_tw_l60_opp_df_induce%_3", "SOS_tw_l60_opp_df_induce%_2", "SOS_tw_l60_opp_df_induce%_1"],axis=1)

In [184]:
# 'p_df%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) Double Fault performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# The ten lag columns _60 ... _51 are the window averaged in this cell (same column order as before, so the row sums are unchanged).
_l10_lag_cols = [f"SOS_tw_l60_opp_df_induce%_{_k}" for _k in range(60, 50, -1)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
#(ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_df_induce%_ws"] = df_player2[_l10_lag_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_df_induce%_ws_ct"] = df_player2[_l10_lag_cols].count(axis=1)
# Sum of the non-NaN lags divided by how many there were; a row with no lag values gives 0/0 = NaN.
df_player2["SOS_tw_l10_opp_df_induce%"] = (df_player2["SOS_tw_l10_opp_df_induce%_ws"]/df_player2["SOS_tw_l10_opp_df_induce%_ws_ct"]).round(2)

# % DF INDUCE the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER (averaged over the ten lag columns above) is expected to "YIELD"
# EY is EXPECTED YIELD
# NOTE(review): the lag columns carry an "l60" prefix; whether the opponents' own form behind them is an l60 or an l10 statistic cannot be confirmed from this cell.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_df_induce%"]

# Mean % DF INDUCE "YIELDED" performance (l10_tw_ss) across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, we calculate the mean per surface over 2 year intervals (except 2009-2010), and match to match date ranges during the SOS adjustment.
# Windows are (exclusive start, inclusive end).
# NOTE(review): comparing m_date against "M/D/YYYY" strings is only chronological if m_date is a datetime column here - confirm.
_SOS_DATE_WINDOWS = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
# Field means recorded from an earlier run, in window order:
# t_surf == 1: (3.4600068681318716, 3.193787878787881, 3.288164296998419, 3.2779536679536716, 3.406296296296297, 3.5305401034928745)
# t_surf == 2: (3.807368980169969, 3.716927506775058, 3.7857444633731, 3.911187577201329, 4.126868944416101, 3.972595679537344)

# Puts together the above - scales the player's actual performance over the last 10 by
# (field mean for the row's surface and date window) / (expected yield of the schedule of opponents faced).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
for _surf in (1, 2):
    for _start, _end in _SOS_DATE_WINDOWS:
        _in_window = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        # We want in terms of pct df induce the field ALLOWS on average. The mean reads a column this loop never writes,
        # so computing it per window right before use gives the same value as computing all twelve up front.
        _field_mean = df_player2.loc[_in_window, 'p_df_induce%_l10_tw_ss'].mean()
        # The right-hand side is a full-length Series; .loc aligns it on the index and writes only the masked rows.
        df_player2.loc[_in_window, "p_df%_l10_tw_ss_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# Drop the l10 scratch columns together with the ten lag columns, which are no longer needed after this cell.
df_player2 = df_player2.drop(
    ["EY", "SOS_tw_l10_opp_df_induce%_ws", "SOS_tw_l10_opp_df_induce%_ws_ct", "SOS_tw_l10_opp_df_induce%"] + _l10_lag_cols,
    axis=1,
)

del _l10_lag_cols, _SOS_DATE_WINDOWS, _surf, _start, _end, _in_window, _field_mean

In [185]:
# 'p_df%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), I/O specific DF performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, TW,SS, IO specific DF INDUCE (as returners) performance for player OPPONENTS in the maximum interval (60 matches) prior to the match being predicted 
df_player2["SOS_tw_l60_opp_df_induce%_60"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-1)
df_player2["SOS_tw_l60_opp_df_induce%_59"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-2)
df_player2["SOS_tw_l60_opp_df_induce%_58"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-3)
df_player2["SOS_tw_l60_opp_df_induce%_57"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-4)
df_player2["SOS_tw_l60_opp_df_induce%_56"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-5)
df_player2["SOS_tw_l60_opp_df_induce%_55"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-6)
df_player2["SOS_tw_l60_opp_df_induce%_54"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-7)
df_player2["SOS_tw_l60_opp_df_induce%_53"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-8)
df_player2["SOS_tw_l60_opp_df_induce%_52"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-9)
df_player2["SOS_tw_l60_opp_df_induce%_51"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-10)
df_player2["SOS_tw_l60_opp_df_induce%_50"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-11)
df_player2["SOS_tw_l60_opp_df_induce%_49"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-12)
df_player2["SOS_tw_l60_opp_df_induce%_48"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-13)
df_player2["SOS_tw_l60_opp_df_induce%_47"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-14)
df_player2["SOS_tw_l60_opp_df_induce%_46"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-15)
df_player2["SOS_tw_l60_opp_df_induce%_45"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-16)
df_player2["SOS_tw_l60_opp_df_induce%_44"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-17)
df_player2["SOS_tw_l60_opp_df_induce%_43"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-18)
df_player2["SOS_tw_l60_opp_df_induce%_42"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-19)
df_player2["SOS_tw_l60_opp_df_induce%_41"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-20)
df_player2["SOS_tw_l60_opp_df_induce%_40"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-21)
df_player2["SOS_tw_l60_opp_df_induce%_39"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-22)
df_player2["SOS_tw_l60_opp_df_induce%_38"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-23)
df_player2["SOS_tw_l60_opp_df_induce%_37"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-24)
df_player2["SOS_tw_l60_opp_df_induce%_36"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-25)
df_player2["SOS_tw_l60_opp_df_induce%_35"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-26)
df_player2["SOS_tw_l60_opp_df_induce%_34"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-27)
df_player2["SOS_tw_l60_opp_df_induce%_33"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-28)
df_player2["SOS_tw_l60_opp_df_induce%_32"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-29)
df_player2["SOS_tw_l60_opp_df_induce%_31"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-30)
df_player2["SOS_tw_l60_opp_df_induce%_30"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-31)
df_player2["SOS_tw_l60_opp_df_induce%_29"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-32)
df_player2["SOS_tw_l60_opp_df_induce%_28"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-33)
df_player2["SOS_tw_l60_opp_df_induce%_27"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-34)
df_player2["SOS_tw_l60_opp_df_induce%_26"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-35)
df_player2["SOS_tw_l60_opp_df_induce%_25"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-36)
df_player2["SOS_tw_l60_opp_df_induce%_24"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-37)
df_player2["SOS_tw_l60_opp_df_induce%_23"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-38)
df_player2["SOS_tw_l60_opp_df_induce%_22"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-39)
df_player2["SOS_tw_l60_opp_df_induce%_21"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-40)
df_player2["SOS_tw_l60_opp_df_induce%_20"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-41)
df_player2["SOS_tw_l60_opp_df_induce%_19"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-42)
df_player2["SOS_tw_l60_opp_df_induce%_18"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-43)
df_player2["SOS_tw_l60_opp_df_induce%_17"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-44)
df_player2["SOS_tw_l60_opp_df_induce%_16"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-45)
df_player2["SOS_tw_l60_opp_df_induce%_15"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-46)
df_player2["SOS_tw_l60_opp_df_induce%_14"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-47)
df_player2["SOS_tw_l60_opp_df_induce%_13"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-48)
df_player2["SOS_tw_l60_opp_df_induce%_12"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-49)
df_player2["SOS_tw_l60_opp_df_induce%_11"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-50)
df_player2["SOS_tw_l60_opp_df_induce%_10"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-51)
df_player2["SOS_tw_l60_opp_df_induce%_9"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-52)
df_player2["SOS_tw_l60_opp_df_induce%_8"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-53)
df_player2["SOS_tw_l60_opp_df_induce%_7"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-54)
df_player2["SOS_tw_l60_opp_df_induce%_6"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-55)
df_player2["SOS_tw_l60_opp_df_induce%_5"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-56)
df_player2["SOS_tw_l60_opp_df_induce%_4"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-57)
df_player2["SOS_tw_l60_opp_df_induce%_3"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-58)
df_player2["SOS_tw_l60_opp_df_induce%_2"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-59)
df_player2["SOS_tw_l60_opp_df_induce%_1"] = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df_induce%_l60_tw_ss_IO'].shift(-60)

# Average the 60 lag columns row by row. sum() skips NaN and count() counts only the non-NaN
# lags, so sum/count is the mean over whichever lags exist (no interpolation); a row with no
# lag at all becomes 0/0 = NaN. Per the author's note, matches between players with few prior
# matches on a surface are filtered out at the modeling stage, and the early ramp-up years are
# not modeled, so few modeled matches should be missing this feature.
# Naming: "ws" = weighted sum and "tw" = time-weighted; no per-lag weights are applied in this
# block, so the "weighted sum" is a plain sum here.
_sos_l60_cols = [f"SOS_tw_l60_opp_df_induce%_{slot}" for slot in range(60, 0, -1)]
_sos_l60_lags = df_player2[_sos_l60_cols]
df_player2["SOS_tw_l60_opp_df_induce%_ws"] = _sos_l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_df_induce%_ws_ct"] = _sos_l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_df_induce%"] = (
    df_player2["SOS_tw_l60_opp_df_induce%_ws"] / df_player2["SOS_tw_l60_opp_df_induce%_ws_ct"]
).round(2)
del _sos_l60_cols, _sos_l60_lags

# EY = EXPECTED YIELD: the induced double-fault % that the schedule of opponents this player
# faced over their last 60 matches (same surface and indoor/outdoor status) is expected to yield.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_df_induce%"]

# Field-wide mean of 'p_df_induce%_l60_tw_ss_IO' (induced double-fault %, not aces) across ALL
# players, per surface code, indoor/outdoor flag and date window. These means are the baseline
# against which the schedule a player actually faced is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken per window: one year for 2009-2010,
# two years for every later window. Each window is (exclusive start, inclusive end).
_SOS_DATE_WINDOWS = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)


def _field_means_l60(surf_code, indoor_flag):
    """Return the six per-window means of 'p_df_induce%_l60_tw_ss_IO' for one court type.

    surf_code is matched against 't_surf' and indoor_flag against 't_ind'; the result is
    ordered like _SOS_DATE_WINDOWS. A window with no matching rows yields NaN.
    """
    court = (df_player2['t_surf'] == surf_code) & (df_player2['t_ind'] == indoor_flag)
    return [
        df_player2.loc[
            court & (df_player2['m_date'] > after) & (df_player2['m_date'] <= through),
            'p_df_induce%_l60_tw_ss_IO',
        ].mean()
        for after, through in _SOS_DATE_WINDOWS
    ]


# t_surf == 1 with t_ind == 1 (the "clay ... i" names; presumably clay, indoor)
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = _field_means_l60(1, 1)
# author-recorded values: (nan, 3.440714285714286, 3.7144910179640713, 3.170175438596493, 3.878239436619717, 3.7686206896551724)

# t_surf == 1 with t_ind == 0 (the "clay ... o" names; presumably clay, outdoor)
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = _field_means_l60(1, 0)
# author-recorded values: (3.4963530219780243, 3.2329830287206294, 3.2363016949152463, 3.3070886512590563, 3.325740489130439, 3.4451959798994993)

# t_surf == 2 with t_ind == 1 (the "hard ... i" names; presumably hard court, indoor)
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = _field_means_l60(2, 1)
# author-recorded values: (3.664987804878052, 3.3387195121951194, 3.3843216943331456, 3.5016234652114635, 3.5332473867595824, 3.5516506849315075)

# t_surf == 2 with t_ind == 0 (the "hard ... o" names; presumably hard court, outdoor)
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = _field_means_l60(2, 0)
# author-recorded values: (3.877447495961232, 3.8238966202783304, 3.8826622581435246, 3.980954207624073, 4.229791430514608, 4.196683428571439)

del _SOS_DATE_WINDOWS, _field_means_l60

# Combine the pieces: scale the player's own DF% over the last 60 matches ('p_df%_l60_tw_ss_IO')
# by (field mean for the match's court type and date window) / (EY, the expected yield of the
# opponents the player faced). Author's note: an adjustment proportional to the opponents'
# deviation from the field mean performed better than the boosted or blunted variants tried.
# NOTE(review): a row whose EY is exactly 0 gets a non-finite result from the division --
# confirm such rows are filtered or cleaned downstream.
_sos_adj_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
# (t_surf code, t_ind flag, field means ordered like _sos_adj_windows)
_sos_adj_groups = (
    (1, 1, (mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i)),
    (1, 0, (mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o)),
    (2, 1, (mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i)),
    (2, 0, (mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o)),
)

for _surf, _ind, _means in _sos_adj_groups:
    _court = (df_player2["t_surf"] == _surf) & (df_player2['t_ind'] == _ind)
    for (_after, _through), _field_mean in zip(_sos_adj_windows, _means):
        _rows = _court & (df_player2['m_date'] > _after) & (df_player2['m_date'] <= _through)
        df_player2.loc[_rows, "p_df%_l60_tw_ss_IO_SOS_adj"] = (
            (df_player2["p_df%_l60_tw_ss_IO"]) * (_field_mean / df_player2["EY"])
        ).round(2)

del _sos_adj_windows, _sos_adj_groups, _surf, _ind, _means, _court, _after, _through, _field_mean, _rows

# The per-window field means are only needed for the adjustment above; release the names.
del (
    mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i,
    mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o,
    mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i,
    mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o,
)

# Drop the l60 scratch columns. Lag columns "_60" .. "_51" are deliberately NOT dropped here:
# the l10 feature computed next reads them.
_l60_scratch_cols = [
    "EY",
    "SOS_tw_l60_opp_df_induce%_ws",
    "SOS_tw_l60_opp_df_induce%_ws_ct",
    "SOS_tw_l60_opp_df_induce%",
] + [f"SOS_tw_l60_opp_df_induce%_{slot}" for slot in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_l60_scratch_cols)
del _l60_scratch_cols

In [186]:
# 'p_df%_l10_tw_ss_IO_SOS_adj'
# Strength-of-Schedule adjustment (SOS_adj) of the player's mean, time-weighted,
# surface-specific (SS), indoor/outdoor-specific DF performance over the 10 matches PRIOR TO
# the match being predicted.

# Average lag columns "_60" .. "_51" row by row. sum() skips NaN and count() counts only the
# non-NaN lags, so sum/count is the mean over whichever lags exist (no interpolation); a row
# with no lag at all becomes 0/0 = NaN. Per the author's note, matches between players with few
# prior matches on a surface are filtered out at the modeling stage, and the early ramp-up
# years are not modeled, so few modeled matches should be missing this feature.
# Naming: "ws" = weighted sum and "tw" = time-weighted; no per-lag weights are applied here.
# NOTE(review): the inputs are the lag columns named "SOS_tw_l60_..." (presumably shifted values
# of the opponents' l60 stat), while the field mean used later is an l10 stat -- confirm that
# mixing the two look-back lengths is intended.
_sos_l10_cols = [f"SOS_tw_l60_opp_df_induce%_{slot}" for slot in range(60, 50, -1)]
_sos_l10_lags = df_player2[_sos_l10_cols]
df_player2["SOS_tw_l10_opp_df_induce%_ws"] = _sos_l10_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_df_induce%_ws_ct"] = _sos_l10_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_df_induce%"] = (
    df_player2["SOS_tw_l10_opp_df_induce%_ws"] / df_player2["SOS_tw_l10_opp_df_induce%_ws_ct"]
).round(2)
del _sos_l10_cols, _sos_l10_lags

# EY = EXPECTED YIELD: the induced double-fault % that the schedule of opponents this player
# faced over their last 10 matches (same surface and indoor/outdoor status) is expected to yield.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_df_induce%"]

# Field-wide mean of 'p_df_induce%_l10_tw_ss_IO' (induced double-fault %, not aces) across ALL
# players, per surface code, indoor/outdoor flag and date window. These means are the baseline
# against which the schedule a player actually faced is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken per window: one year for 2009-2010,
# two years for every later window. Each window is (exclusive start, inclusive end).
_SOS_DATE_WINDOWS = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)


def _field_means_l10(surf_code, indoor_flag):
    """Return the six per-window means of 'p_df_induce%_l10_tw_ss_IO' for one court type.

    surf_code is matched against 't_surf' and indoor_flag against 't_ind'; the result is
    ordered like _SOS_DATE_WINDOWS. A window with no matching rows yields NaN.
    """
    court = (df_player2['t_surf'] == surf_code) & (df_player2['t_ind'] == indoor_flag)
    return [
        df_player2.loc[
            court & (df_player2['m_date'] > after) & (df_player2['m_date'] <= through),
            'p_df_induce%_l10_tw_ss_IO',
        ].mean()
        for after, through in _SOS_DATE_WINDOWS
    ]


# t_surf == 1 with t_ind == 1 (the "clay ... i" names; presumably clay, indoor)
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = _field_means_l10(1, 1)

# t_surf == 1 with t_ind == 0 (the "clay ... o" names; presumably clay, outdoor)
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = _field_means_l10(1, 0)

# t_surf == 2 with t_ind == 1 (the "hard ... i" names; presumably hard court, indoor)
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = _field_means_l10(2, 1)

# t_surf == 2 with t_ind == 0 (the "hard ... o" names; presumably hard court, outdoor)
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = _field_means_l10(2, 0)

del _SOS_DATE_WINDOWS, _field_means_l10

# Combine the pieces: scale the player's own DF% over the last 10 matches ('p_df%_l10_tw_ss_IO')
# by (field mean for the match's court type and date window) / (EY, the expected yield of the
# opponents the player faced, built from the opponents' aggregate performance before they met
# the player). Author's note: an adjustment proportional to the opponents' deviation from the
# field mean performed better than the boosted or blunted variants tried.
# NOTE(review): a row whose EY is exactly 0 gets a non-finite result from the division --
# confirm such rows are filtered or cleaned downstream.
_l10_adj_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)
# t_surf == 1 rows only: (t_ind flag, field means ordered like _l10_adj_windows)
_l10_adj_clay_groups = (
    (1, (mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i)),
    (0, (mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o)),
)

for _ind, _means in _l10_adj_clay_groups:
    _court = (df_player2["t_surf"] == 1) & (df_player2['t_ind'] == _ind)
    for (_after, _through), _field_mean in zip(_l10_adj_windows, _means):
        _rows = _court & (df_player2['m_date'] > _after) & (df_player2['m_date'] <= _through)
        df_player2.loc[_rows, "p_df%_l10_tw_ss_IO_SOS_adj"] = (
            (df_player2["p_df%_l10_tw_ss_IO"]) * (_field_mean / df_player2["EY"])
        ).round(2)

del _l10_adj_windows, _l10_adj_clay_groups, _ind, _means, _court, _after, _through, _field_mean, _rows

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_df%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df%_l10_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2) 

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_df_induce%_ws", "SOS_tw_l10_opp_df_induce%_ws_ct", "SOS_tw_l10_opp_df_induce%", "SOS_tw_l60_opp_df_induce%_60", "SOS_tw_l60_opp_df_induce%_59", "SOS_tw_l60_opp_df_induce%_58", "SOS_tw_l60_opp_df_induce%_57", "SOS_tw_l60_opp_df_induce%_56", "SOS_tw_l60_opp_df_induce%_55", "SOS_tw_l60_opp_df_induce%_54", "SOS_tw_l60_opp_df_induce%_53", "SOS_tw_l60_opp_df_induce%_52", "SOS_tw_l60_opp_df_induce%_51"],axis=1)

In [187]:
# 'p_df_induce%_l60_tw_ss_SOS_adj'
# Strength-of-Schedule-adjusted (SOS_adj), surface-specific (ss) Double Fault INDUCE performance of the PLAYER
# over (up to) the 60 same-surface matches PRIOR TO the match being predicted.
#
# Outline of the cell:
#   1. order rows newest-first inside each (player, surface) group
#   2. collect, for each of the 60 preceding rows, the opponent's own prior double-fault rate (as server)
#   3. average those lags -> Expected Yield (EY) of the schedule the player actually faced
#   4. compute the field-wide mean committed DF% per surface and date window
#   5. scale the player's induce rate by (field mean / EY)
#   6. drop the scratch columns, keeping lags _60.._51 for the l10 cell that follows

# 1. Descending sort, so that a negative shift within a group reaches back to earlier rows.
# NOTE(review): the string date bounds used further down assume 'm_date' compares chronologically
# (i.e. is datetime-like) in df_player2 - TODO confirm against the cell that builds df_player2.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# 2. Lag columns. Suffix 60 is the immediately preceding row (shift -1); suffix 1 is sixty rows back (shift -60).
_sos_prefix = "SOS_tw_l60_opp_df%"
_sos_lag_cols = []
_opp_df_grouped = df_player2.groupby(['p_id','t_surf'])['p_opp_df%_l60_tw_ss']
for _lag in range(1, 61):
    _lag_col = _sos_prefix + "_" + str(61 - _lag)
    df_player2[_lag_col] = _opp_df_grouped.shift(-_lag)
    _sos_lag_cols.append(_lag_col)

# 3. Mean over whichever lags are present: sum() skips NaN and count() tallies the non-NaN lags, so players with
# fewer than 60 prior matches on the surface are averaged over what exists rather than interpolated.
# (The "_ws" / "tw" naming is kept for the column names; in this cell the sum itself carries no per-lag weights.)
_lag_frame = df_player2[_sos_lag_cols]
df_player2[_sos_prefix + "_ws"] = _lag_frame.sum(axis=1)
df_player2[_sos_prefix + "_ws_ct"] = _lag_frame.count(axis=1)
df_player2[_sos_prefix] = (df_player2[_sos_prefix + "_ws"] / df_player2[_sos_prefix + "_ws_ct"]).round(2)

# EY = EXPECTED YIELD: the DF% this player's recent schedule of opponents is expected to commit.
df_player2["EY"] = df_player2[_sos_prefix]

# 4. Field-wide mean committed DF% ('p_df%_l60_tw_ss') per surface (1 = clay, 2 = hard, per the original variable
# names) and per date window. Windows are two years wide except the first (2009-2010); lower bound exclusive,
# upper bound inclusive. All means are computed before any adjusted value is written.
# Values noted in the original cell's comments:
#   clay: (3.363310439560433, 3.1464520202020188, 3.327402843601895, 3.319691119691114, 3.3560837359098126, 3.4051196636481116)
#   hard: (3.715885269121823, 3.630196476964779, 3.8174599659284585, 3.8966243162166907, 4.039814720380762, 4.055409083177421)
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
_sos_cells = []  # (row mask, field mean) pairs, clay windows first, then hard
for _surf in (1, 2):
    for _start, _end in _sos_windows:
        _in_cell = (df_player2['t_surf'] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        _field_mean = df_player2.loc[_in_cell, 'p_df%_l60_tw_ss'].mean()
        _sos_cells.append((_in_cell, _field_mean))

# 5. Proportional adjustment: the player's raw induce rate times (field mean / EY) for the row's surface and window.
# Rows outside every window are not assigned here.
for _in_cell, _field_mean in _sos_cells:
    df_player2.loc[_in_cell, "p_df_induce%_l60_tw_ss_SOS_adj"] = ((df_player2["p_df_induce%_l60_tw_ss"])*(_field_mean/df_player2["EY"])).round(2)

# 6. Remove scratch columns. Lags _60.._51 (the ten most recent) are deliberately left in place:
# the 'p_df_induce%_l10_tw_ss_SOS_adj' cell that follows reads them.
df_player2 = df_player2.drop(["EY", _sos_prefix + "_ws", _sos_prefix + "_ws_ct", _sos_prefix] + _sos_lag_cols[10:], axis=1)

# Keep the notebook namespace free of this cell's temporaries.
del _sos_prefix, _sos_lag_cols, _opp_df_grouped, _lag, _lag_col, _lag_frame, _sos_windows, _sos_cells, _surf, _start, _end, _in_cell, _field_mean

In [188]:
# 'p_df_induce%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) Double Fault INDUCE performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# Look-back columns _60 ... _51 are NOT created in this cell; they must already be present in df_player2.
# (Presumably these are the 10 most recent prior matches, _60 being the latest - TODO confirm against the cell that builds them.)
sos_l10_cols = [f"SOS_tw_l60_opp_df%_{sos_slot}" for sos_slot in range(60, 50, -1)]

# Using sum/count (rather than mean over a fixed 10) ignores NaN instead of interpolating. In the modeling stage, matches between
# players with few matches previously played on a given surface will be filtered out, so very few modeled matches will lack this
# feature (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# (ws = weighted sum; tw = time-weighted). As computed here the result is the plain mean of the non-null look-back values.
df_player2["SOS_tw_l10_opp_df%_ws"] = df_player2[sos_l10_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_df%_ws_ct"] = df_player2[sos_l10_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_df%"] = (df_player2["SOS_tw_l10_opp_df%_ws"] / df_player2["SOS_tw_l10_opp_df%_ws_ct"]).round(2)

# % DFs the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to YIELD (commit)
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_tw_l10_opp_df%"]

# Field mean of % DF committed (l10_tw_ss) across ALL players, per surface. This calibrates the schedule a player actually faced.
# Because the game changes over time, the mean is taken per surface over 2-year windows (except the first, 2009-2010),
# and each match is adjusted with the mean of the window its date falls in. Windows are (start, end] : date > start and date <= end.
sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means recorded from the author's run, in window order:
#   t_surf == 1 (clay): (3.3346497252747205, 3.137531565656569, 3.3131437598736064, 3.2649099099099077, 3.38437037037036, 3.4910866752910774)
#   t_surf == 2 (hard): (3.711157932011333, 3.669480013550149, 3.810122657580918, 3.878454208575974, 4.122228454869957, 3.999306004422511)

# Factor the player's actual performance over the last 10 by the schedule of opponents' aggregate performance over THEIR l10
# prior to when they faced the player: adjusted = induced_df% * (field mean / expected yield).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
for sos_surf in (1, 2):  # 1 = clay, 2 = hard
    for sos_start, sos_end in sos_windows:
        sos_mask = (
            (df_player2["t_surf"] == sos_surf)
            & (df_player2["m_date"] > sos_start)
            & (df_player2["m_date"] <= sos_end)
        )
        # We want this in terms of pct DF the field COMMITS on average
        sos_field_mean = df_player2.loc[sos_mask, "p_df%_l10_tw_ss"].mean()
        df_player2.loc[sos_mask, "p_df_induce%_l10_tw_ss_SOS_adj"] = (
            df_player2["p_df_induce%_l10_tw_ss"] * (sos_field_mean / df_player2["EY"])
        ).round(2)

# Remove the scratch columns (including the _60 ... _51 look-back columns) now that the adjusted feature is stored.
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_df%_ws", "SOS_tw_l10_opp_df%_ws_ct", "SOS_tw_l10_opp_df%"] + sos_l10_cols, axis=1)

# Keep the notebook namespace free of this cell's temporaries.
del sos_l10_cols, sos_windows, sos_surf, sos_start, sos_end, sos_mask, sos_field_mean

In [189]:
# 'p_df_induce%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), I/O specific DF INDUCE performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Newest-first ordering within each (player, surface, IO) group, so that shift(-k) below reaches k rows further down = k matches earlier.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, TW, SS, IO-specific DF (as servers) performance for player OPPONENTS in the maximum interval (60 matches) prior to the match being predicted.
# Column _60 holds the value from 1 match back, _59 from 2 back, ... _1 from 60 back. The grouping is identical for every lag, so it is built once.
sos_opp_grouped = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_df%_l60_tw_ss_IO']
for sos_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_df%_{61 - sos_lag}"] = sos_opp_grouped.shift(-sos_lag)

# All 60 look-back columns, in the order _60 ... _1.
sos_l60_cols = [f"SOS_tw_l60_opp_df%_{sos_slot}" for sos_slot in range(60, 0, -1)]

# Using sum/count ignores NaN instead of interpolating. In the modeling stage, matches between players with few matches previously
# played on a given surface will be filtered out, so very few modeled matches will lack this feature (there is also a multi-year
# ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# (ws = weighted sum; tw = time-weighted). As computed here the result is the plain mean of the non-null look-back values.
df_player2["SOS_tw_l60_opp_df%_ws"] = df_player2[sos_l60_cols].sum(axis=1)
df_player2["SOS_tw_l60_opp_df%_ws_ct"] = df_player2[sos_l60_cols].count(axis=1)
df_player2["SOS_tw_l60_opp_df%"] = (df_player2["SOS_tw_l60_opp_df%_ws"] / df_player2["SOS_tw_l60_opp_df%_ws_ct"]).round(2)

# % DFs the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface and IO status is expected to YIELD (commit)
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_tw_l60_opp_df%"]

# Field mean of % DF committed (l60_tw_ss_IO) across ALL players, per surface and IO status. This calibrates the schedule a player actually faced.
# Because the game changes over time, the mean is taken over 2-year windows (except the first, 2009-2010),
# and each match is adjusted with the mean of the window its date falls in. Windows are (start, end] : date > start and date <= end.
sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means recorded from the author's run, in window order ("i" = t_ind == 1, "o" = t_ind == 0):
#   t_surf == 1 (clay), i: (nan, 2.561785714285715, 3.2762275449101783, 3.1629824561403517, 3.2502816901408433, 3.448045977011494)
#   t_surf == 1 (clay), o: (3.363310439560433, 3.1579503916449085, 3.3200033898305072, 3.3252362883753035, 3.3387024456521686, 3.3930184254606286)
#   t_surf == 2 (hard), i: (3.5011219512195115, 3.2208425720620872, 3.4212306811677133, 3.4813028649386077, 3.5560905923344874, 3.5060479452054842)
#   t_surf == 2 (hard), o: (3.7313570274636545, 3.774771371769377, 3.9446975263286754, 4.035171421721409, 4.155672183178414, 4.226164571428581)
# The clay / t_ind == 1 mean for the first window was nan in that run, so any adjusted value in that slice is NaN as well.

# Factor the player's actual performance over the last 60 by the schedule of opponents' aggregate performance over THEIR l60
# prior to when they faced the player: adjusted = induced_df% * (field mean / expected yield).
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
for sos_surf in (1, 2):  # 1 = clay, 2 = hard
    for sos_ind in (1, 0):  # same order as the original statements: t_ind == 1 first, then t_ind == 0
        for sos_start, sos_end in sos_windows:
            sos_mask = (
                (df_player2["t_surf"] == sos_surf)
                & (df_player2["t_ind"] == sos_ind)
                & (df_player2["m_date"] > sos_start)
                & (df_player2["m_date"] <= sos_end)
            )
            # We want this in terms of pct DF the field COMMITS on average
            sos_field_mean = df_player2.loc[sos_mask, "p_df%_l60_tw_ss_IO"].mean()
            df_player2.loc[sos_mask, "p_df_induce%_l60_tw_ss_IO_SOS_adj"] = (
                df_player2["p_df_induce%_l60_tw_ss_IO"] * (sos_field_mean / df_player2["EY"])
            ).round(2)

# Remove the scratch columns. Look-back columns _60 ... _51 are deliberately KEPT: only _50 ... _1 are dropped here.
df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_df%_ws", "SOS_tw_l60_opp_df%_ws_ct", "SOS_tw_l60_opp_df%"] + sos_l60_cols[10:], axis=1)

# Keep the notebook namespace free of this cell's temporaries.
del sos_opp_grouped, sos_lag, sos_l60_cols, sos_windows, sos_surf, sos_ind, sos_start, sos_end, sos_mask, sos_field_mean

In [190]:
# 'p_df_induce%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), surface-specific (ss), indoor/outdoor-specific (IO) DOUBLE FAULTS INDUCED
# performance of PLAYER over the 10 matches PRIOR TO the match being predicted.
#
# Steps:
#   1. EY (expected yield): mean of the opponents' DF% values held in lag columns _60 ... _51.
#   2. Field mean of 'p_df%_l10_tw_ss_IO' per (surface, indoor flag, date window).
#   3. Adjusted feature = player's induced DF% * (field mean / EY), written only to rows of the matching segment/window.
#
# The 24 mean statements and 24 .loc assignments of the original cell differed only in surface code, indoor flag and
# date window; they are generated here from two small tables, in the same order and with the same expressions.

# Lag columns left in place by the previous cell (which dropped only _50 ... _1).
_sos_l10_cols = ["SOS_tw_l60_opp_df%_" + str(_k) for _k in range(60, 50, -1)]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few prior matches on a
# given surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year
# ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# Note: this is a plain (unweighted) sum; the "ws"/"tw" naming is kept for consistency with the other notebook versions.
df_player2["SOS_tw_l10_opp_df%_ws"] = df_player2[_sos_l10_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_df%_ws_ct"] = df_player2[_sos_l10_cols].count(axis=1)  # number of non-NaN lags
# A row with no non-NaN lag gives 0/0 -> NaN, which propagates into the adjusted feature.
df_player2["SOS_tw_l10_opp_df%"] = (df_player2["SOS_tw_l10_opp_df%_ws"] / df_player2["SOS_tw_l10_opp_df%_ws_ct"]).round(2)

# EY is EXPECTED YIELD: the DF% that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches
# (on this surface and IO status) is expected to commit.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_df%"]

# Mean % DF committed performance (l10_tw_ss_IO) across ALL players per surface and IO status. This is the yardstick
# against which the schedule actually faced by a player is calibrated.
# Because the game changes over time, the mean is taken per date window: one 1-year window (2009) and five 2-year windows.

# (t_surf, t_ind) pairs in the original statement order; the original variable names label t_surf 1 as "clay" and
# t_surf 2 as "hard", with t_ind 1 -> suffix "i" and t_ind 0 -> suffix "o".
_sos_segments = [(1, 1), (1, 0), (2, 1), (2, 0)]
# Each window is (start, end]: strictly after start, up to and including end.
# NOTE(review): the bounds are strings; this compares as dates only if m_date is a datetime column at this point
# (it is loaded from CSV as object dtype) - confirm it is converted upstream.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# One boolean row mask per (segment, window); reused for both the field mean and the assignment.
_sos_masks = [
    (df_player2['t_surf'] == _surf) & (df_player2['t_ind'] == _ind) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
    for _surf, _ind in _sos_segments
    for _start, _end in _sos_windows
]

# All field means are computed before any adjusted value is written, as in the original cell.
_sos_field_means = [df_player2.loc[_mask, 'p_df%_l10_tw_ss_IO'].mean() for _mask in _sos_masks]

# Factor the player's actual performance over the last 10 by the opponents' aggregate performance over THEIR l10 prior
# to facing the player. Opponents who double-fault more than the field (EY > field mean) scale the induced rate down.
# Adjustment proportional to opponents' deviation from field mean performed better than the boosted or blunted
# versions the author tried.
# Rows matching no mask (other surface codes, dates outside the windows) receive no value from this cell.
for _sos_mask, _sos_mean in zip(_sos_masks, _sos_field_means):
    df_player2.loc[_sos_mask, "p_df_induce%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_df_induce%_l10_tw_ss_IO"]) * (_sos_mean / df_player2["EY"])).round(2)

del _sos_segments, _sos_windows, _sos_masks, _sos_field_means, _sos_mask, _sos_mean

# Remove the scratch columns, including the last ten lag columns, which are no longer needed after this cell.
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_df%_ws", "SOS_tw_l10_opp_df%_ws_ct", "SOS_tw_l10_opp_df%"] + _sos_l10_cols, axis=1)
del _sos_l10_cols

In [191]:
# 'p_bp_save%_l60_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), surface-specific (ss) BREAK POINTS SAVED (as server) performance of PLAYER
# over the 60 matches PRIOR TO the match being predicted.
#
# Steps:
#   1. Build 60 lag columns holding the opponents' break-point-conversion values from the player's previous matches.
#   2. EY (expected yield): mean of the non-NaN lags.
#   3. Field mean of 'p_bp_conv%_l60_tw_ss' per (surface, date window).
#   4. Adjusted feature = player's BP save% * (EY / field mean), written only to rows of the matching surface/window.
#
# The original cell spelled out 60 groupby/shift lines, three copies of the 60-column list and 12 + 12 near-identical
# mean / assignment statements; they are generated here in the same order with the same expressions.

# Descending sort: within each (p_id, t_surf) group, shift(-k) then pulls the value from k rows further down,
# i.e. from the k-th earlier match by (m_date, m_rd_num).
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# OPPONENTS' surface-specific BREAK POINTS CONVERTED (as returners) performance for each of the 60 prior matches.
# The groupby is loop-invariant, so it is built once; column _60 is the most recent prior match (shift -1),
# column _1 the oldest (shift -60). Columns are created in the order _60 ... _1.
_sos_opp_conv = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_conv%_l60_tw_ss']
for _sos_lag in range(1, 61):
    df_player2["SOS_l60_opp_bp_conv%_" + str(61 - _sos_lag)] = _sos_opp_conv.shift(-_sos_lag)

_sos_l60_cols = ["SOS_l60_opp_bp_conv%_" + str(_k) for _k in range(60, 0, -1)]

# sum() skips NaN instead of interpolating, which is preferred in metrics of this type. In the modeling stage, matches
# between players with few prior matches on a given surface are filtered out, so very few modeled matches will lack
# this feature (there is also a multi-year ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# Note: this is a plain (unweighted) sum; the "ws" (weighted sum) / "tw" (time-weighted) naming is kept only for
# consistency with the other notebook versions.
df_player2["SOS_l60_opp_bp_conv%_ws"] = df_player2[_sos_l60_cols].sum(axis=1)
df_player2["SOS_l60_opp_bp_conv%_ws_ct"] = df_player2[_sos_l60_cols].count(axis=1)  # number of non-NaN lags
# A row with no non-NaN lag gives 0/0 -> NaN, which propagates into the adjusted feature.
df_player2["SOS_l60_opp_bp_conv%"] = (df_player2["SOS_l60_opp_bp_conv%_ws"] / df_player2["SOS_l60_opp_bp_conv%_ws_ct"]).round(2)

# EY is EXPECTED YIELD: the % BREAK POINTS CONVERTED that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 60 matches on this surface is expected to achieve.
df_player2["EY"] = df_player2["SOS_l60_opp_bp_conv%"]

# Mean % BREAK POINTS CONVERTED performance (l60_tw_ss) across ALL players per surface. This is the yardstick against
# which the schedule actually faced by a player is calibrated.
# Because the game changes over time, the mean is taken per date window: one 1-year window (2009) and five 2-year windows.

# t_surf codes in the original statement order; the original variable names label 1 as "clay" and 2 as "hard".
_sos_surfaces = [1, 2]
# Each window is (start, end]: strictly after start, up to and including end.
# NOTE(review): the bounds are strings; this compares as dates only if m_date is a datetime column at this point
# (it is loaded from CSV as object dtype) - confirm it is converted upstream.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# One boolean row mask per (surface, window); reused for both the field mean and the assignment.
_sos_masks = [
    (df_player2['t_surf'] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
    for _surf in _sos_surfaces
    for _start, _end in _sos_windows
]

# All field means are computed before any adjusted value is written, as in the original cell.
_sos_field_means = [df_player2.loc[_mask, 'p_bp_conv%_l60_tw_ss'].mean() for _mask in _sos_masks]
# Values recorded by the original author next to these statements (presumably from an earlier run - not re-verified):
# t_surf 1, windows 1-6: (42.53188835286013, 41.18735461441207, 40.98857911392403, 40.641860315416835, 40.234363519175105, 40.12918367346935)
# t_surf 2, windows 1-6: (41.2480021329541, 39.455725436959135, 39.470872975277246, 38.34040614515262, 38.721722789115766, 38.71676310415268)

# Factor the player's actual performance over the last 60 by the opponents' aggregate performance over THEIR l60 prior
# to facing the player. Opponents who convert more break points than the field (EY > field mean) scale the save rate up.
# Adjustment proportional to opponents' deviation from field mean performed better than the boosted or blunted
# versions the author tried.
# Rows matching no mask (other surface codes, dates outside the windows) receive no value from this cell.
for _sos_mask, _sos_mean in zip(_sos_masks, _sos_field_means):
    df_player2.loc[_sos_mask, "p_bp_save%_l60_tw_ss_SOS_adj"] = ((df_player2["p_bp_save%_l60_tw_ss"]) * (df_player2["EY"] / _sos_mean)).round(2)

# Drop the l60 scratch columns and lag columns _50 ... _1.
# Lag columns _60 ... _51 are deliberately kept: the next cell (l10 version) still reads them.
df_player2 = df_player2.drop(["EY", "SOS_l60_opp_bp_conv%_ws", "SOS_l60_opp_bp_conv%_ws_ct", "SOS_l60_opp_bp_conv%"] + _sos_l60_cols[10:], axis=1)

del _sos_opp_conv, _sos_lag, _sos_l60_cols, _sos_surfaces, _sos_windows, _sos_masks, _sos_field_means, _sos_mask, _sos_mean

In [192]:
# 'p_bp_save%_l10_tw_ss_SOS_adj'
# Strength of Schedule-adjusted (SOS_adj), surface-specific (ss) BREAK POINTS SAVED (as server) performance of PLAYER
# over the 10 matches PRIOR TO the match being predicted.
# NOT time-weighted (events too sparse for that to make sense); the "tw" in the names is kept for naming consistency only.

# The ten lag columns kept by the previous cell (_60 ... _51).
_sos_recent_cols = ["SOS_l60_opp_bp_conv%_" + str(_k) for _k in range(60, 50, -1)]
_sos_recent = df_player2[_sos_recent_cols]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few prior matches on a
# given surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year
# ramp-up phase where retrospective stats accrue but whose matches are not modeled).
# ("ws" = weighted sum in the naming scheme; here it is a plain sum because time-weighting is not implemented for this feature.)
df_player2["SOS_l10_opp_bp_conv%_ws"] = _sos_recent.sum(axis=1)
df_player2["SOS_l10_opp_bp_conv%_ws_ct"] = _sos_recent.count(axis=1)  # number of non-NaN lags
df_player2["SOS_l10_opp_bp_conv%"] = df_player2["SOS_l10_opp_bp_conv%_ws"].div(df_player2["SOS_l10_opp_bp_conv%_ws_ct"]).round(2)
del _sos_recent_cols, _sos_recent

# EY is EXPECTED YIELD: the % BREAK POINTS CONVERTED that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their
# last 10 matches on this surface is expected to achieve.
df_player2["EY"] = df_player2["SOS_l10_opp_bp_conv%"]

# Field-wide mean % BREAK POINTS CONVERTED (l10) per surface, used as the baseline against which a player's schedule is calibrated.
# Because the game changes over time, the mean is taken per surface over 2-year windows (the first window, 2009-2010, is a single year) and later matched to each row's date window.

def _field_mean_bp_conv_l10(surface, after, through):
    """Mean of 'p_bp_conv%_l10_tw_ss' over rows with t_surf == surface and after < m_date <= through.

    NOTE(review): the string bounds only compare chronologically if m_date is datetime-like here - confirm.
    """
    in_window = (
        (df_player2['t_surf'] == surface)
        & (df_player2['m_date'] > after)
        & (df_player2['m_date'] <= through)
    )
    return df_player2.loc[in_window, 'p_bp_conv%_l10_tw_ss'].mean()

# Clay (t_surf == 1)
mean_clay_SOS_1 = _field_mean_bp_conv_l10(1, "1/1/2009", "1/1/2010")
mean_clay_SOS_2 = _field_mean_bp_conv_l10(1, "1/1/2010", "1/1/2012")
mean_clay_SOS_3 = _field_mean_bp_conv_l10(1, "1/1/2012", "1/1/2014")
mean_clay_SOS_4 = _field_mean_bp_conv_l10(1, "1/1/2014", "1/1/2016")
mean_clay_SOS_5 = _field_mean_bp_conv_l10(1, "1/1/2016", "1/1/2018")
mean_clay_SOS_6 = _field_mean_bp_conv_l10(1, "1/1/2018", "1/1/2020")
# Values recorded from a previous run:
# (42.43565127498277, 41.467133375474106, 41.246424050632946, 40.702317347923994, 40.2125781501772, 40.38227405247813)

# Hard (t_surf == 2)
mean_hard_SOS_1 = _field_mean_bp_conv_l10(2, "1/1/2009", "1/1/2010")
mean_hard_SOS_2 = _field_mean_bp_conv_l10(2, "1/1/2010", "1/1/2012")
mean_hard_SOS_3 = _field_mean_bp_conv_l10(2, "1/1/2012", "1/1/2014")
mean_hard_SOS_4 = _field_mean_bp_conv_l10(2, "1/1/2014", "1/1/2016")
mean_hard_SOS_5 = _field_mean_bp_conv_l10(2, "1/1/2016", "1/1/2018")
mean_hard_SOS_6 = _field_mean_bp_conv_l10(2, "1/1/2018", "1/1/2020")
# Values recorded from a previous run:
# (40.96639175257737, 39.830010181571204, 39.39586189258311, 38.45744834893176, 39.18996938775506, 38.68535228046292)

del _field_mean_bp_conv_l10

# Puts together the above: scales the player's actual l10 break-points-saved figure by the ratio of the schedule's expected yield (EY)
# to the field mean for the same surface and date window.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
# Each surface is calibrated against ITS OWN field mean: clay rows (t_surf == 1) use mean_clay_SOS_*, hard rows (t_surf == 2) use mean_hard_SOS_*.
# Rows outside every (surface, date window) pair listed below are left without a value for the adjusted feature.
for _sos_surface, _sos_after, _sos_through, _sos_field_mean in [
    (1, "1/1/2009", "1/1/2010", mean_clay_SOS_1),
    (1, "1/1/2010", "1/1/2012", mean_clay_SOS_2),
    (1, "1/1/2012", "1/1/2014", mean_clay_SOS_3),
    (1, "1/1/2014", "1/1/2016", mean_clay_SOS_4),
    (1, "1/1/2016", "1/1/2018", mean_clay_SOS_5),
    (1, "1/1/2018", "1/1/2020", mean_clay_SOS_6),
    (2, "1/1/2009", "1/1/2010", mean_hard_SOS_1),
    (2, "1/1/2010", "1/1/2012", mean_hard_SOS_2),
    (2, "1/1/2012", "1/1/2014", mean_hard_SOS_3),
    (2, "1/1/2014", "1/1/2016", mean_hard_SOS_4),
    (2, "1/1/2016", "1/1/2018", mean_hard_SOS_5),
    (2, "1/1/2018", "1/1/2020", mean_hard_SOS_6),
]:
    # Window is open on the left and closed on the right, so a match dated exactly on a boundary belongs to the earlier window.
    _sos_rows = (df_player2["t_surf"] == _sos_surface) & (df_player2['m_date'] > _sos_after) & (df_player2['m_date'] <= _sos_through)
    # The right-hand side is computed for the whole frame; .loc keeps only the values for the masked rows (index-aligned).
    df_player2.loc[_sos_rows, "p_bp_save%_l10_tw_ss_SOS_adj"] = ((df_player2["p_bp_save%_l10_tw_ss"])*(df_player2["EY"]/_sos_field_mean)).round(2)

# Keep loop temporaries out of the notebook namespace.
del _sos_surface, _sos_after, _sos_through, _sos_field_mean, _sos_rows

# The per-surface, per-period field means are not needed past this point; remove them from the notebook namespace.
del mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3, mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6
del mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3, mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6

# Drop the l10 scratch columns together with the remaining opponent lag slots (_60 ... _51).
df_player2 = df_player2.drop(
    columns=["EY", "SOS_l10_opp_bp_conv%_ws", "SOS_l10_opp_bp_conv%_ws_ct", "SOS_l10_opp_bp_conv%"]
    + ["SOS_l60_opp_bp_conv%_" + str(slot) for slot in range(60, 50, -1)]
)

In [193]:
# 'p_bp_save%_l60_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's surface-specific (ss), indoor/outdoor-specific (IO) BREAK POINTS SAVED figure over the 60 matches PRIOR TO the match being predicted.

# Sort newest-first within each (player, surface, indoor/outdoor) group so that shift(-k) below reads the row k positions later in the sort, i.e. the k-th earlier match.
_sos_group_keys = ['p_id', 't_surf', 't_ind']
df_player2 = df_player2.sort_values(by=_sos_group_keys + ['m_date', 'm_rd_num'], ascending = False)

# First obtain the opponents' BREAK POINTS CONVERTED (as returners) figure ('p_opp_bp_conv%_l60_tw_ss_IO') for each of the up-to-60 prior matches in the same group.
# Slot _60 holds the immediately preceding match (shift -1) and slot _1 the 60th match back (shift -60); slots with no such match stay NaN.
_opp_conv_by_group = df_player2.groupby(_sos_group_keys)['p_opp_bp_conv%_l60_tw_ss_IO']
for _sos_lag in range(1, 61):
    df_player2["SOS_tw_l60_opp_bp_conv%_" + str(61 - _sos_lag)] = _opp_conv_by_group.shift(-_sos_lag)

# Keep loop temporaries out of the notebook namespace.
del _sos_group_keys, _opp_conv_by_group, _sos_lag

# sum() and count() both skip NaN, so the ratio below averages only the populated lag slots (no interpolation); a row with no populated slot yields 0/0 = NaN.
# In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out, and the multi-year ramp-up phase only accrues retrospective stats, so very few modeled matches will lack this feature.
_l60_window = df_player2[["SOS_tw_l60_opp_bp_conv%_" + str(slot) for slot in range(60, 0, -1)]]
df_player2["SOS_tw_l60_opp_bp_conv%_ws"] = _l60_window.sum(axis=1)        # ws = "weighted sum" in the notebook's naming; here every slot counts equally
df_player2["SOS_tw_l60_opp_bp_conv%_ws_ct"] = _l60_window.count(axis=1)   # number of non-NaN slots in the window
df_player2["SOS_tw_l60_opp_bp_conv%"] = df_player2["SOS_tw_l60_opp_bp_conv%_ws"].div(df_player2["SOS_tw_l60_opp_bp_conv%_ws_ct"]).round(2)
del _l60_window

# EY = EXPECTED YIELD: the % BREAK POINTS CONVERTED (l60_tw_ss_IO) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface and IO status is expected to achieve.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_bp_conv%"]

# Field-wide mean % BREAK POINTS CONVERTED (l60_tw_ss_IO) per surface and indoor/outdoor status, used as the baseline against which a player's schedule is calibrated.
# Because the game changes over time, the mean is taken over 2-year windows (the first window, 2009-2010, is a single year) and later matched to each row's date window.
# Naming: suffix "i" = indoor rows (t_ind == 1), suffix "o" = outdoor rows (t_ind == 0).

def _field_mean_bp_conv_l60_io(surface, indoor, after, through):
    """Mean of 'p_bp_conv%_l60_tw_ss_IO' over rows with t_surf == surface, t_ind == indoor and after < m_date <= through.

    NOTE(review): the string bounds only compare chronologically if m_date is datetime-like here - confirm.
    """
    in_window = (
        (df_player2['t_surf'] == surface)
        & (df_player2['t_ind'] == indoor)
        & (df_player2['m_date'] > after)
        & (df_player2['m_date'] <= through)
    )
    return df_player2.loc[in_window, 'p_bp_conv%_l60_tw_ss_IO'].mean()

# Clay, indoor
mean_clay_SOS_1i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2009", "1/1/2010")
mean_clay_SOS_2i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2010", "1/1/2012")
mean_clay_SOS_3i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2012", "1/1/2014")
mean_clay_SOS_4i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2014", "1/1/2016")
mean_clay_SOS_5i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2016", "1/1/2018")
mean_clay_SOS_6i = _field_mean_bp_conv_l60_io(1, 1, "1/1/2018", "1/1/2020")
# Values recorded from a previous run (the first window came out as nan):
# (nan, 52.721964285714286, 38.76660606060607, 44.349941520467844, 42.635985915492945, 41.8628735632184)

# Clay, outdoor
mean_clay_SOS_1o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2009", "1/1/2010")
mean_clay_SOS_2o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2010", "1/1/2012")
mean_clay_SOS_3o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2012", "1/1/2014")
mean_clay_SOS_4o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2014", "1/1/2016")
mean_clay_SOS_5o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2016", "1/1/2018")
mean_clay_SOS_6o = _field_mean_bp_conv_l60_io(1, 0, "1/1/2018", "1/1/2020")
# Values recorded from a previous run:
# (42.53188835286013, 41.08846405228759, 41.1757691001698, 40.777156659765346, 40.278640380693425, 40.251258389261785)

# Hard, indoor
mean_hard_SOS_1i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2009", "1/1/2010")
mean_hard_SOS_2i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2010", "1/1/2012")
mean_hard_SOS_3i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2012", "1/1/2014")
mean_hard_SOS_4i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2014", "1/1/2016")
mean_hard_SOS_5i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2016", "1/1/2018")
mean_hard_SOS_6i = _field_mean_bp_conv_l60_io(2, 1, "1/1/2018", "1/1/2020")
# Values recorded from a previous run:
# (41.51241421568631, 38.74388641425385, 38.85507179781736, 37.86572305593459, 38.85572727272725, 38.89202749140893)

# Hard, outdoor
mean_hard_SOS_1o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2009", "1/1/2010")
mean_hard_SOS_2o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2010", "1/1/2012")
mean_hard_SOS_3o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2012", "1/1/2014")
mean_hard_SOS_4o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2014", "1/1/2016")
mean_hard_SOS_5o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2016", "1/1/2018")
mean_hard_SOS_6o = _field_mean_bp_conv_l60_io(2, 0, "1/1/2018", "1/1/2020")
# Values recorded from a previous run:
# (41.95747157552787, 40.037822841502845, 39.96277628032334, 38.63089752819772, 38.785933318212756, 39.014160567246)

del _field_mean_bp_conv_l60_io

# Puts together the above: scales the player's actual l60 indoor/outdoor-specific break-points-saved figure by the ratio of the schedule's expected yield (EY)
# to the field mean for the same surface, indoor/outdoor status and date window.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
# Rows outside every (surface, indoor/outdoor, date window) combination listed below are left without a value for the adjusted feature.
# NOTE(review): the recorded output for mean_clay_SOS_1i is nan; if that still holds, indoor-clay rows in the 2009-2010 window end up NaN here - confirm this is acceptable.
for _sos_surface, _sos_indoor, _sos_after, _sos_through, _sos_field_mean in [
    # clay (t_surf == 1), indoor (t_ind == 1)
    (1, 1, "1/1/2009", "1/1/2010", mean_clay_SOS_1i),
    (1, 1, "1/1/2010", "1/1/2012", mean_clay_SOS_2i),
    (1, 1, "1/1/2012", "1/1/2014", mean_clay_SOS_3i),
    (1, 1, "1/1/2014", "1/1/2016", mean_clay_SOS_4i),
    (1, 1, "1/1/2016", "1/1/2018", mean_clay_SOS_5i),
    (1, 1, "1/1/2018", "1/1/2020", mean_clay_SOS_6i),
    # clay, outdoor (t_ind == 0)
    (1, 0, "1/1/2009", "1/1/2010", mean_clay_SOS_1o),
    (1, 0, "1/1/2010", "1/1/2012", mean_clay_SOS_2o),
    (1, 0, "1/1/2012", "1/1/2014", mean_clay_SOS_3o),
    (1, 0, "1/1/2014", "1/1/2016", mean_clay_SOS_4o),
    (1, 0, "1/1/2016", "1/1/2018", mean_clay_SOS_5o),
    (1, 0, "1/1/2018", "1/1/2020", mean_clay_SOS_6o),
    # hard (t_surf == 2), indoor
    (2, 1, "1/1/2009", "1/1/2010", mean_hard_SOS_1i),
    (2, 1, "1/1/2010", "1/1/2012", mean_hard_SOS_2i),
    (2, 1, "1/1/2012", "1/1/2014", mean_hard_SOS_3i),
    (2, 1, "1/1/2014", "1/1/2016", mean_hard_SOS_4i),
    (2, 1, "1/1/2016", "1/1/2018", mean_hard_SOS_5i),
    (2, 1, "1/1/2018", "1/1/2020", mean_hard_SOS_6i),
    # hard, outdoor
    (2, 0, "1/1/2009", "1/1/2010", mean_hard_SOS_1o),
    (2, 0, "1/1/2010", "1/1/2012", mean_hard_SOS_2o),
    (2, 0, "1/1/2012", "1/1/2014", mean_hard_SOS_3o),
    (2, 0, "1/1/2014", "1/1/2016", mean_hard_SOS_4o),
    (2, 0, "1/1/2016", "1/1/2018", mean_hard_SOS_5o),
    (2, 0, "1/1/2018", "1/1/2020", mean_hard_SOS_6o),
]:
    # Window is open on the left and closed on the right, so a match dated exactly on a boundary belongs to the earlier window.
    _sos_rows = (df_player2["t_surf"] == _sos_surface) & (df_player2['t_ind'] == _sos_indoor) & (df_player2['m_date'] > _sos_after) & (df_player2['m_date'] <= _sos_through)
    # The right-hand side is computed for the whole frame; .loc keeps only the values for the masked rows (index-aligned).
    df_player2.loc[_sos_rows, "p_bp_save%_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_save%_l60_tw_ss_IO"])*(df_player2["EY"]/_sos_field_mean)).round(2)

# Keep loop temporaries out of the notebook namespace.
del _sos_surface, _sos_indoor, _sos_after, _sos_through, _sos_field_mean, _sos_rows

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_bp_conv%_ws", "SOS_tw_l60_opp_bp_conv%_ws_ct", "SOS_tw_l60_opp_bp_conv%", "SOS_tw_l60_opp_bp_conv%_50", "SOS_tw_l60_opp_bp_conv%_49", "SOS_tw_l60_opp_bp_conv%_48", "SOS_tw_l60_opp_bp_conv%_47", "SOS_tw_l60_opp_bp_conv%_46", "SOS_tw_l60_opp_bp_conv%_45", "SOS_tw_l60_opp_bp_conv%_44", "SOS_tw_l60_opp_bp_conv%_43", "SOS_tw_l60_opp_bp_conv%_42", "SOS_tw_l60_opp_bp_conv%_41", "SOS_tw_l60_opp_bp_conv%_40", "SOS_tw_l60_opp_bp_conv%_39", "SOS_tw_l60_opp_bp_conv%_38", "SOS_tw_l60_opp_bp_conv%_37", "SOS_tw_l60_opp_bp_conv%_36", "SOS_tw_l60_opp_bp_conv%_35", "SOS_tw_l60_opp_bp_conv%_34", "SOS_tw_l60_opp_bp_conv%_33", "SOS_tw_l60_opp_bp_conv%_32", "SOS_tw_l60_opp_bp_conv%_31", "SOS_tw_l60_opp_bp_conv%_30", "SOS_tw_l60_opp_bp_conv%_29", "SOS_tw_l60_opp_bp_conv%_28", "SOS_tw_l60_opp_bp_conv%_27", "SOS_tw_l60_opp_bp_conv%_26", "SOS_tw_l60_opp_bp_conv%_25", "SOS_tw_l60_opp_bp_conv%_24", "SOS_tw_l60_opp_bp_conv%_23", "SOS_tw_l60_opp_bp_conv%_22", "SOS_tw_l60_opp_bp_conv%_21", "SOS_tw_l60_opp_bp_conv%_20", "SOS_tw_l60_opp_bp_conv%_19", "SOS_tw_l60_opp_bp_conv%_18", "SOS_tw_l60_opp_bp_conv%_17", "SOS_tw_l60_opp_bp_conv%_16", "SOS_tw_l60_opp_bp_conv%_15", "SOS_tw_l60_opp_bp_conv%_14", "SOS_tw_l60_opp_bp_conv%_13", "SOS_tw_l60_opp_bp_conv%_12", "SOS_tw_l60_opp_bp_conv%_11", "SOS_tw_l60_opp_bp_conv%_10", "SOS_tw_l60_opp_bp_conv%_9", "SOS_tw_l60_opp_bp_conv%_8", "SOS_tw_l60_opp_bp_conv%_7", "SOS_tw_l60_opp_bp_conv%_6", "SOS_tw_l60_opp_bp_conv%_5", "SOS_tw_l60_opp_bp_conv%_4", "SOS_tw_l60_opp_bp_conv%_3", "SOS_tw_l60_opp_bp_conv%_2", "SOS_tw_l60_opp_bp_conv%_1"],axis=1)

In [194]:
# 'p_bp_save%_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted (tw), surface-specific (ss), indoor/outdoor-specific (IO)
# BREAK POINTS SAVED performance of PLAYER over the 10 matches PRIOR TO the match being predicted.
#
# Steps performed by this cell:
#   1. Average the ten lagged opponent break-point-conversion columns into an "expected yield" (EY) per row.
#   2. For every surface x indoor/outdoor x date-interval bucket, compute the field-wide mean of 'p_bp_conv%_l10_tw_ss_IO'.
#   3. Scale the player's 'p_bp_save%_l10_tw_ss_IO' by EY / field mean and store it in 'p_bp_save%_l10_tw_ss_IO_SOS_adj'.
#   4. Drop every helper column created or consumed along the way.

# The ten lag columns used for the last-10 window (_60 down to _51). They are created by an earlier cell;
# presumably they hold the ten most recent prior matches -- confirm against the cell that builds them.
sos_l10_lag_cols = [f"SOS_tw_l60_opp_bp_conv%_{lag}" for lag in range(60, 50, -1)]

# sum() and count() both skip NaN, so the quotient below is the mean of whichever lag values are present instead of an
# interpolation. Rows with no lag values at all come out as NaN (0 / 0). In the modeling stage, matches between players with
# few matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling
# phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where
# retrospective stats accrue but matches from those years will not be used in the modeling phase).
# (ws = weighted sum; tw = time-weighted. In this cell the "weighted sum" is a plain sum divided by a plain count.)
df_player2["SOS_tw_l10_opp_bp_conv%_ws"] = df_player2[sos_l10_lag_cols].sum(axis=1)
df_player2["SOS_tw_l10_opp_bp_conv%_ws_ct"] = df_player2[sos_l10_lag_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_bp_conv%"] = (df_player2["SOS_tw_l10_opp_bp_conv%_ws"] / df_player2["SOS_tw_l10_opp_bp_conv%_ws_ct"]).round(2)

# % BREAK POINTS CONVERTED performance (l10_tw_ss) that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches
# on this surface and IO status is expected to YIELD.
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_tw_l10_opp_bp_conv%"]

# Because the game changes over time, the field mean is calculated per surface and IO status over 2 year intervals
# (except 2009-2010, which is a single year), and matched to the match date during the SOS adjustment.
# Each interval is exclusive of its start and inclusive of its end: start < m_date <= end.
# NOTE(review): the bounds are compared to 'm_date' exactly as written here. That is only a chronological comparison if
# 'm_date' is a datetime column at this point (it is loaded from CSV as object dtype) -- confirm it is converted upstream.
sos_eras = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Puts together the above - factors the player's actual performance over the last 10 by schedule of opponents' aggregate
# performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted.
# Only t_surf 1 (clay) and 2 (hard) are adjusted; rows on any other surface, or dated outside the intervals above,
# are not assigned a value by this cell.
for surf_code in (1, 2):            # 1 = clay, 2 = hard
    for indoor_flag in (1, 0):      # 1 = indoor, 0 = outdoor
        for era_start, era_end in sos_eras:
            era_mask = (
                (df_player2["t_surf"] == surf_code)
                & (df_player2["t_ind"] == indoor_flag)
                & (df_player2["m_date"] > era_start)
                & (df_player2["m_date"] <= era_end)
            )

            # Mean % BREAK POINTS CONVERTED performance (l10_tw_ss_IO) across ALL player rows in this bucket.
            # It is the baseline used to calibrate the schedule a player actually faced.
            field_mean = df_player2.loc[era_mask, "p_bp_conv%_l10_tw_ss_IO"].mean()

            # The right-hand side is computed for every row; .loc aligns on the index and writes only the masked rows.
            df_player2.loc[era_mask, "p_bp_save%_l10_tw_ss_IO_SOS_adj"] = (
                df_player2["p_bp_save%_l10_tw_ss_IO"] * (df_player2["EY"] / field_mean)
            ).round(2)

# Keep the notebook namespace clean: remove the loop temporaries.
del surf_code, indoor_flag, era_start, era_end, era_mask, field_mean, sos_eras

# Drop EY, the intermediate SOS columns and the ten lag columns; only the adjusted feature is kept.
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_bp_conv%_ws", "SOS_tw_l10_opp_bp_conv%_ws_ct", "SOS_tw_l10_opp_bp_conv%"] + sos_l10_lag_cols, axis=1)
del sos_l10_lag_cols

In [195]:
# 'p_bp_conv%_l60_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted (tw), surface-specific (SS) BREAK POINTS CONVERTED (as returner) performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain OPPONENTS' mean, surface-specific BREAK POINTS SAVED (as servers) performance in the maximum interval (60 matches) prior to the match being predicted
df_player2["SOS_l60_opp_bp_save%_60"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-1)
df_player2["SOS_l60_opp_bp_save%_59"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-2)
df_player2["SOS_l60_opp_bp_save%_58"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-3)
df_player2["SOS_l60_opp_bp_save%_57"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-4)
df_player2["SOS_l60_opp_bp_save%_56"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-5)
df_player2["SOS_l60_opp_bp_save%_55"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-6)
df_player2["SOS_l60_opp_bp_save%_54"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-7)
df_player2["SOS_l60_opp_bp_save%_53"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-8)
df_player2["SOS_l60_opp_bp_save%_52"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-9)
df_player2["SOS_l60_opp_bp_save%_51"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-10)
df_player2["SOS_l60_opp_bp_save%_50"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-11)
df_player2["SOS_l60_opp_bp_save%_49"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-12)
df_player2["SOS_l60_opp_bp_save%_48"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-13)
df_player2["SOS_l60_opp_bp_save%_47"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-14)
df_player2["SOS_l60_opp_bp_save%_46"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-15)
df_player2["SOS_l60_opp_bp_save%_45"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-16)
df_player2["SOS_l60_opp_bp_save%_44"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-17)
df_player2["SOS_l60_opp_bp_save%_43"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-18)
df_player2["SOS_l60_opp_bp_save%_42"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-19)
df_player2["SOS_l60_opp_bp_save%_41"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-20)
df_player2["SOS_l60_opp_bp_save%_40"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-21)
df_player2["SOS_l60_opp_bp_save%_39"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-22)
df_player2["SOS_l60_opp_bp_save%_38"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-23)
df_player2["SOS_l60_opp_bp_save%_37"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-24)
df_player2["SOS_l60_opp_bp_save%_36"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-25)
df_player2["SOS_l60_opp_bp_save%_35"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-26)
df_player2["SOS_l60_opp_bp_save%_34"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-27)
df_player2["SOS_l60_opp_bp_save%_33"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-28)
df_player2["SOS_l60_opp_bp_save%_32"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-29)
df_player2["SOS_l60_opp_bp_save%_31"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-30)
df_player2["SOS_l60_opp_bp_save%_30"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-31)
df_player2["SOS_l60_opp_bp_save%_29"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-32)
df_player2["SOS_l60_opp_bp_save%_28"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-33)
df_player2["SOS_l60_opp_bp_save%_27"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-34)
df_player2["SOS_l60_opp_bp_save%_26"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-35)
df_player2["SOS_l60_opp_bp_save%_25"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-36)
df_player2["SOS_l60_opp_bp_save%_24"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-37)
df_player2["SOS_l60_opp_bp_save%_23"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-38)
df_player2["SOS_l60_opp_bp_save%_22"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-39)
df_player2["SOS_l60_opp_bp_save%_21"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-40)
df_player2["SOS_l60_opp_bp_save%_20"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-41)
df_player2["SOS_l60_opp_bp_save%_19"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-42)
df_player2["SOS_l60_opp_bp_save%_18"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-43)
df_player2["SOS_l60_opp_bp_save%_17"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-44)
df_player2["SOS_l60_opp_bp_save%_16"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-45)
df_player2["SOS_l60_opp_bp_save%_15"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-46)
df_player2["SOS_l60_opp_bp_save%_14"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-47)
df_player2["SOS_l60_opp_bp_save%_13"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-48)
df_player2["SOS_l60_opp_bp_save%_12"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-49)
df_player2["SOS_l60_opp_bp_save%_11"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-50)
df_player2["SOS_l60_opp_bp_save%_10"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-51)
df_player2["SOS_l60_opp_bp_save%_9"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-52)
df_player2["SOS_l60_opp_bp_save%_8"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-53)
df_player2["SOS_l60_opp_bp_save%_7"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-54)
df_player2["SOS_l60_opp_bp_save%_6"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-55)
df_player2["SOS_l60_opp_bp_save%_5"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-56)
df_player2["SOS_l60_opp_bp_save%_4"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-57)
df_player2["SOS_l60_opp_bp_save%_3"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-58)
df_player2["SOS_l60_opp_bp_save%_2"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-59)
df_player2["SOS_l60_opp_bp_save%_1"] = df_player2.groupby(['p_id','t_surf'])['p_opp_bp_save%_l60_tw_ss'].shift(-60)

# Row-wise aggregation of the 60 per-slot opponent break-point-save% columns
# (slot _60 down to slot _1). sum() and count() both skip NaN, so a row with
# fewer than 60 populated slots is averaged over the slots it has instead of
# being interpolated. In the modeling stage, matches between players with few
# prior matches on a given surface are filtered out, so very few modeled
# matches will lack this feature (there is also a multi-year ramp-up phase in
# which retrospective stats accrue but whose matches are not modeled).
_sos_l60_slot_cols = [f"SOS_l60_opp_bp_save%_{_slot}" for _slot in range(60, 0, -1)]
_sos_l60_slots = df_player2[_sos_l60_slot_cols]
df_player2["SOS_l60_opp_bp_save%_ws"] = _sos_l60_slots.sum(axis=1)
df_player2["SOS_l60_opp_bp_save%_ws_ct"] = _sos_l60_slots.count(axis=1)
# Mean = sum / number of non-null slots; a row with no populated slot gives 0/0 -> NaN.
df_player2["SOS_l60_opp_bp_save%"] = (df_player2["SOS_l60_opp_bp_save%_ws"] / df_player2["SOS_l60_opp_bp_save%_ws_ct"]).round(2)
# (ws = weighted sum; tw = time-weighted) NOTE: time-weighting is currently NOT implemented for this feature

# % BREAK POINTS SAVED that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches on this surface is expected to achieve
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_l60_opp_bp_save%"]

# Mean % BREAK POINTS SAVED performance (l60_tw_ss) across ALL players per surface is the
# baseline used to calibrate the schedule a player actually faced before the match being predicted.
# Because the game changes over time, the mean is taken per surface over 2-year windows
# (the first window, 2009-2010, spans one year) and each match is scaled by the mean of its own window.
# Each window is (start, end]: start date exclusive, end date inclusive.
_sos_l60_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
# Field means noted by the original author, in window order:
#   clay (t_surf == 1): (59.70547143840329, 59.62532828282815, 59.802417061611344, 59.470656370656364, 59.959858247422666, 60.14120633893913)
#   hard (t_surf == 2): (60.916743690010684, 61.02165084745734, 60.88078050443089, 61.93887025595755, 61.22786054421793, 61.574113644096634)

# Scale the player's actual l60 break-point-conversion% by the ratio of the
# opponents' expected break-point-save% (EY) to the field mean of the same surface and window.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
for _sos_l60_surface in (1, 2):  # 1 = clay, 2 = hard; other surfaces are not assigned here
    for _sos_l60_start, _sos_l60_end in _sos_l60_windows:
        # One mask serves both the field-mean selection and the assignment target.
        _sos_l60_mask = (
            (df_player2["t_surf"] == _sos_l60_surface)
            & (df_player2["m_date"] > _sos_l60_start)
            & (df_player2["m_date"] <= _sos_l60_end)
        )
        _sos_l60_field_mean = df_player2.loc[_sos_l60_mask, "p_bp_save%_l60_tw_ss"].mean()
        # The right-hand side is a full-length Series; .loc aligns it on the index
        # and writes only the masked rows.
        df_player2.loc[_sos_l60_mask, "p_bp_conv%_l60_tw_ss_SOS_adj"] = (
            df_player2["p_bp_conv%_l60_tw_ss"] * (df_player2["EY"] / _sos_l60_field_mean)
        ).round(2)

# Drop the intermediates. Slots _60.._51 (the first ten list entries) are kept
# on purpose: the l10 cell that follows reads them and drops them itself.
df_player2 = df_player2.drop(
    ["EY", "SOS_l60_opp_bp_save%_ws", "SOS_l60_opp_bp_save%_ws_ct", "SOS_l60_opp_bp_save%"] + _sos_l60_slot_cols[10:],
    axis=1,
)

# Remove the helper names so nothing from this step lingers in the notebook namespace.
del _sos_l60_slot_cols, _sos_l60_slots, _sos_l60_windows, _sos_l60_surface, _sos_l60_start, _sos_l60_end, _sos_l60_mask, _sos_l60_field_mean

In [196]:
# 'p_bp_conv%_l10_tw_ss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted (tw), surface-specific (SS) BREAK POINTS CONVERTED (as returner) performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# Row-wise aggregation of the ten per-slot opponent break-point-save% columns
# _60.._51 that the l60 step left in place. sum() and count() both skip NaN, so
# a row with fewer than 10 populated slots is averaged over the slots it has
# instead of being interpolated. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, so very few
# modeled matches will lack this feature (there is also a multi-year ramp-up
# phase in which retrospective stats accrue but whose matches are not modeled).
_sos_l10_slot_cols = [f"SOS_l60_opp_bp_save%_{_slot}" for _slot in range(60, 50, -1)]
_sos_l10_slots = df_player2[_sos_l10_slot_cols]
df_player2["SOS_l10_opp_bp_save%_ws"] = _sos_l10_slots.sum(axis=1)
df_player2["SOS_l10_opp_bp_save%_ws_ct"] = _sos_l10_slots.count(axis=1)
# Mean = sum / number of non-null slots; a row with no populated slot gives 0/0 -> NaN.
df_player2["SOS_l10_opp_bp_save%"] = (df_player2["SOS_l10_opp_bp_save%_ws"] / df_player2["SOS_l10_opp_bp_save%_ws_ct"]).round(2)
# (ws = weighted sum; tw = time-weighted) NOTE: time-weighting is currently NOT implemented for this feature

# % BREAK POINTS SAVED that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to achieve
# EY is EXPECTED YIELD
df_player2["EY"] = df_player2["SOS_l10_opp_bp_save%"]

# Mean % BREAK POINTS SAVED performance (l10_tw_ss) across ALL players per surface is the
# baseline used to calibrate the schedule a player actually faced before the match being predicted.
# Because the game changes over time, the mean is taken per surface over 2-year windows
# (the first window, 2009-2010, spans one year) and each match is scaled by the mean of its own window.
# Each window is (start, end]: start date exclusive, end date inclusive.
_sos_l10_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Scale the player's actual l10 break-point-conversion% by the ratio of the
# opponents' expected break-point-save% (EY) to the field mean of the same surface and window.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
for _sos_l10_surface in (1, 2):  # 1 = clay, 2 = hard; other surfaces are not assigned here
    for _sos_l10_start, _sos_l10_end in _sos_l10_windows:
        # One mask serves both the field-mean selection and the assignment target.
        _sos_l10_mask = (
            (df_player2["t_surf"] == _sos_l10_surface)
            & (df_player2["m_date"] > _sos_l10_start)
            & (df_player2["m_date"] <= _sos_l10_end)
        )
        _sos_l10_field_mean = df_player2.loc[_sos_l10_mask, "p_bp_save%_l10_tw_ss"].mean()
        # The right-hand side is a full-length Series; .loc aligns it on the index
        # and writes only the masked rows.
        df_player2.loc[_sos_l10_mask, "p_bp_conv%_l10_tw_ss_SOS_adj"] = (
            df_player2["p_bp_conv%_l10_tw_ss"] * (df_player2["EY"] / _sos_l10_field_mean)
        ).round(2)

# Drop the intermediates, including the ten slot columns consumed above.
df_player2 = df_player2.drop(
    ["EY", "SOS_l10_opp_bp_save%_ws", "SOS_l10_opp_bp_save%_ws_ct", "SOS_l10_opp_bp_save%"] + _sos_l10_slot_cols,
    axis=1,
)

# Remove the helper names so nothing from this step lingers in the notebook namespace.
del _sos_l10_slot_cols, _sos_l10_slots, _sos_l10_windows, _sos_l10_surface, _sos_l10_start, _sos_l10_end, _sos_l10_mask, _sos_l10_field_mean

In [197]:
# 'p_bp_conv%_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS), I/O specific BP CONVERTED performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort: within each (p_id, t_surf, t_ind) group the latest m_date / m_rd_num
# comes first, so a negative shift pulls values from rows that sort lower (earlier matches).
df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending=False)

# First obtain mean, TW, SS, IO specific BREAK POINTS SAVED (as servers) performance for player OPPONENTS
# in the maximum interval (60 matches) prior to the match being predicted.
# Slot number = 61 - lag: shift(-1) fills slot _60 (nearest prior row), shift(-60) fills slot _1.
_sos_io_grouped = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_bp_save%_l60_tw_ss_IO']
for _sos_io_lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_bp_save%_{61 - _sos_io_lag}"] = _sos_io_grouped.shift(-_sos_io_lag)
# Remove the loop helpers so no extra names linger in the notebook namespace.
del _sos_io_grouped, _sos_io_lag

# Row-wise aggregate of the 60 lagged opponent break-point-save% columns.
# sum(axis=1) skips NaN and count(axis=1) counts only the non-null lags, so the ratio below is the
# mean over whichever lags exist for that row -- no interpolation of missing matches is needed.
# In the modeling stage, matches between players with few prior matches on a given surface are
# filtered out, and the first years of data only serve as a ramp-up period for the retrospective
# stats, so very few modeled matches will be missing this feature.
# (ws = weighted sum; tw = time-weighted)
_l60_lag_cols = [f"SOS_tw_l60_opp_bp_save%_{k}" for k in range(60, 0, -1)]
_l60_lags = df_player2[_l60_lag_cols]

df_player2["SOS_tw_l60_opp_bp_save%_ws"] = _l60_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_bp_save%_ws_ct"] = _l60_lags.count(axis=1)
df_player2["SOS_tw_l60_opp_bp_save%"] = (
    df_player2["SOS_tw_l60_opp_bp_save%_ws"] / df_player2["SOS_tw_l60_opp_bp_save%_ws_ct"]
).round(2)
del _l60_lag_cols, _l60_lags

# EY = EXPECTED YIELD: the % BREAK POINTS SAVED level (l60_tw_ss_IO) that the schedule of opponents
# this player faced over the lagged matches on this surface / IO status is expected to achieve.
df_player2["EY"] = df_player2["SOS_tw_l60_opp_bp_save%"]

# Strength-of-schedule adjustment of the player's l60 break-point conversion %.
#
# For every (surface, indoor/outdoor) segment and date window, take the mean % BREAK POINTS SAVED
# (l60_tw_ss_IO) across ALL players in that segment/window, then scale the player's own
# 'p_bp_conv%_l60_tw_ss_IO' by EY / field mean. Because the game changes over time the field mean is
# computed per 2-year window (except the single-year 2009-2010 window) and matched to the match date.
# An adjustment proportional to the opponents' deviation from the field mean performed better than
# the various boosted or blunted versions that were tried.
#
# Field means recorded in the original notebook's comments (windows 1..6):
#   t_surf 1, indoor : (nan, 61.26981818181819, 62.12256097560978, 63.37411764705882, 58.79845070422533, 60.37380952380952)
#   t_surf 1, outdoor: (59.70547143840329, 59.72235966057421, 59.76650847457634, 59.30916867885493, 59.90016989466528, 60.11089447236178)
#   t_surf 2, indoor : (64.66257107540171, 62.59936217415419, 62.03204701834862, 62.561057298772184, 62.03658519553068, 62.694842465753396)
#   t_surf 2, outdoor: (60.57149972929059, 60.51293708032835, 60.39245222929947, 61.486236507555645, 61.09505330006806, 61.24646776406033)

# (t_surf, t_ind) pairs, in the order the original statements processed them:
# clay indoor, clay outdoor, hard indoor, hard outdoor.
_sos_segments = [(1, 1), (1, 0), (2, 1), (2, 0)]

# (exclusive start, inclusive end) of each date window.
_sos_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

for _surf, _ind in _sos_segments:
    for _start, _end in _sos_windows:
        _rows = (
            (df_player2['t_surf'] == _surf)
            & (df_player2['t_ind'] == _ind)
            & (df_player2['m_date'] > _start)
            & (df_player2['m_date'] <= _end)
        )
        # Field mean for this segment/window; NaN when the segment has no usable rows.
        _field_mean = df_player2.loc[_rows, 'p_bp_save%_l60_tw_ss_IO'].mean()
        # Full-length right-hand side; .loc aligns it on the index and writes only the masked rows.
        df_player2.loc[_rows, "p_bp_conv%_l60_tw_ss_IO_SOS_adj"] = (
            df_player2["p_bp_conv%_l60_tw_ss_IO"] * (df_player2["EY"] / _field_mean)
        ).round(2)

del _sos_segments, _sos_windows, _surf, _ind, _start, _end, _rows, _field_mean

# Remove the l60 scratch columns. Lags _60 .. _51 are deliberately kept because the
# l10 strength-of-schedule computation in the following cell reads them.
_l60_scratch_cols = [
    "EY",
    "SOS_tw_l60_opp_bp_save%_ws",
    "SOS_tw_l60_opp_bp_save%_ws_ct",
    "SOS_tw_l60_opp_bp_save%",
] + [f"SOS_tw_l60_opp_bp_save%_{k}" for k in range(50, 0, -1)]
df_player2 = df_player2.drop(columns=_l60_scratch_cols)
del _l60_scratch_cols

In [198]:
# 'p_bp_conv%_l10_tw_ss_IO_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the PLAYER's mean, time-weighted, surface-specific (SS),
# I/O-specific BP CONVERTED performance over the 10 matches PRIOR TO the match being predicted.

# Row-wise aggregate over the ten lag columns _60 .. _51 that were kept from the l60 computation.
# sum(axis=1) skips NaN and count(axis=1) counts only non-null lags, so the ratio is the mean over
# the lags that exist for the row -- no interpolation. Matches between players with few prior matches
# on a surface are filtered out in the modeling stage, and the early years only serve as ramp-up,
# so very few modeled matches will be missing this feature.
# NOTE(review): these lags are derived from 'p_opp_bp_save%_l60_tw_ss_IO' (the opponents' l60 stat),
# not an l10 opponent stat -- confirm this is intended.
# (ws = weighted sum; tw = time-weighted)
_l10_lag_cols = [f"SOS_tw_l60_opp_bp_save%_{k}" for k in range(60, 50, -1)]
_l10_lags = df_player2[_l10_lag_cols]

df_player2["SOS_tw_l10_opp_bp_save%_ws"] = _l10_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_bp_save%_ws_ct"] = _l10_lags.count(axis=1)
df_player2["SOS_tw_l10_opp_bp_save%"] = (
    df_player2["SOS_tw_l10_opp_bp_save%_ws"] / df_player2["SOS_tw_l10_opp_bp_save%_ws_ct"]
).round(2)
del _l10_lag_cols, _l10_lags

# EY = EXPECTED YIELD: the % BREAK POINTS SAVED level that the SCHEDULE OF OPPONENTS FACED BY THIS
# PLAYER over those ten lagged matches on this surface / IO status is expected to YIELD.
df_player2["EY"] = df_player2["SOS_tw_l10_opp_bp_save%"]

# Mean % BREAK POINTS SAVED YIELDED performance (l10_tw_ss_IO) across ALL players per surface and IO status. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (except 2009-2010), and match to match date ranges during the SOS adjustment

mean_clay_SOS_1i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_2i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_3i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_4i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_5i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_6i = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
# 

mean_clay_SOS_1o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_2o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_3o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_4o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_5o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_clay_SOS_6o = (df_player2.loc[(df_player2['t_surf'] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
# 

mean_hard_SOS_1i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_2i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_3i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_4i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_5i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_6i = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
# 

mean_hard_SOS_1o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_2o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_3o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_4o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_5o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
mean_hard_SOS_6o = (df_player2.loc[(df_player2['t_surf'] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_bp_save%_l10_tw_ss_IO'].mean()) 
# 

# Puts together the above- factors the player's actual performance over the last 10 by schedule of opponents' aggregrate performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1i)).round(2)  

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_1o)).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_2o)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_3o)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_4o)).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_5o)).round(2)  
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_clay_SOS_6o)).round(2)  

# Hard-court rows (t_surf == 2): calibrate against the HARD-court field means, using the "i" means for
# t_ind == 1 rows and the "o" means for t_ind == 0 rows (presumably indoor / outdoor), mirroring how the
# clay rows pair t_ind == 0 with the mean_clay_SOS_*o values.
# NOTE(review): these statements previously divided every hard-court row by mean_clay_SOS_*i, i.e. the clay
# means, and ignored the "o" means for t_ind == 0. Confirm downstream results after this correction.
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_1i)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_2i)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_3i)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_4i)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_5i)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_6i)).round(2)

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_1o)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_2o)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_3o)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_4o)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_5o)).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2['t_ind'] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_bp_conv%_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_bp_conv%_l10_tw_ss_IO"])*(df_player2["EY"]/mean_hard_SOS_6o)).round(2)

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o

df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_bp_save%_ws", "SOS_tw_l10_opp_bp_save%_ws_ct", "SOS_tw_l10_opp_bp_save%", "SOS_tw_l60_opp_bp_save%_60", "SOS_tw_l60_opp_bp_save%_59", "SOS_tw_l60_opp_bp_save%_58", "SOS_tw_l60_opp_bp_save%_57", "SOS_tw_l60_opp_bp_save%_56", "SOS_tw_l60_opp_bp_save%_55", "SOS_tw_l60_opp_bp_save%_54", "SOS_tw_l60_opp_bp_save%_53", "SOS_tw_l60_opp_bp_save%_52", "SOS_tw_l60_opp_bp_save%_51"],axis=1)

In [199]:
# 'p_AVG_C_IP_l60_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) of the player's mean, surface-specific (ss) IMPLIED WIN PROBABILITY
# "performance" over up to 60 same-surface matches PRIOR TO the match being predicted.
# Naming legend used in the column names: ws = weighted sum; tw = time-weighted.

# Newest match first within each player/surface, so shift(-k) below reads the value from k matches earlier.
df_player2 = df_player2.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

_sos_base = "SOS_tw_l60_opp_AVG_C_IP"          # prefix shared by every helper column built in this cell
_perf_col = "p_AVG_C_IP_l60_tw_ss"             # player's own (unadjusted) performance feature
_adj_col = "p_AVG_C_IP_l60_tw_ss_SOS_adj"      # feature produced by this cell

# Step 1: opponents' own l60 implied-win-probability figure for each of the player's 60 prior same-surface matches.
# Column suffix _60 holds the most recent prior match (shift -1), suffix _1 the oldest (shift -60).
_opp_perf = df_player2.groupby(['p_id','t_surf'])['p_opp_AVG_C_IP_l60_tw_ss']
_lag_cols = []
for _lag in range(1, 61):
    _lag_name = _sos_base + "_" + str(61 - _lag)
    df_player2[_lag_name] = _opp_perf.shift(-_lag)
    _lag_cols.append(_lag_name)

# Step 2: average the available lags. sum()/count() skip NaN, so players with fewer than 60 prior matches are
# averaged over what exists rather than interpolated. Matches between players with few prior matches on a surface
# are filtered out in the modeling stage (and early years serve only as a ramp-up for retrospective stats), so
# very few modeled matches end up without this feature. The sum here is a plain, unweighted sum of the lag columns.
df_player2[_sos_base + "_ws"] = df_player2[_lag_cols].sum(axis=1)
df_player2[_sos_base + "_ws_ct"] = df_player2[_lag_cols].count(axis=1)
df_player2[_sos_base] = (df_player2[_sos_base + "_ws"]/df_player2[_sos_base + "_ws_ct"]).round(2)

# Step 3: EY (EXPECTED YIELD) = % implied win probability the SCHEDULE OF OPPONENTS faced by this player over the
# last 60 matches on this surface is expected to "yield".
df_player2["EY"] = 100-df_player2[_sos_base]

# Step 4: per surface and per date window, compute the field-wide mean "yield" (100 - mean player performance) and
# scale each player's performance by (field yield / schedule yield).
# Because the game changes over time, the field mean is taken over 2-year windows (2009 stands alone at the front;
# that year is only used for retrospective stats accrual, not for modeling).
# For this feature the field yield should be about 50%; values recorded by the original author as a QC:
#   t_surf 1 (clay): (nan, 48.524239342980025, 49.49365768621238, 50.301474090762866, 50.826475515463855, 50.62616628922686)
#   t_surf 2 (hard): (nan, 48.965471827759764, 50.01677815699663, 50.088361292601064, 50.97061564625844, 50.13756039469204)
# Adjustment proportional to opponents' deviation from the field mean performed better than the boosted or blunted
# variants that were tried.
_date_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]
for _surf in (1, 2):  # 1 = clay, 2 = hard
    for _start, _end in _date_windows:
        _in_window = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _start) & (df_player2['m_date'] <= _end)
        _field_yield = 100 - (df_player2.loc[_in_window, _perf_col].mean())
        df_player2.loc[_in_window, _adj_col] = ((df_player2[_perf_col])*(_field_yield/df_player2["EY"])).round(2)

# Step 5: cap the adjusted win probability at 99.99 (the adjustment can push the biggest favorites to 100% or more,
# which is an impossibility).
df_player2.loc[(df_player2[_adj_col] >=100), _adj_col] = 99.99

# Step 6: clean up. Lag columns _60 .. _51 (the 10 most recent prior matches) are intentionally kept because the
# l10 version of this feature reads them; everything else built here is dropped.
df_player2 = df_player2.drop(["EY", _sos_base + "_ws", _sos_base + "_ws_ct", _sos_base] + _lag_cols[10:],axis=1)

del _sos_base, _perf_col, _adj_col, _opp_perf, _lag_cols, _lag, _lag_name, _date_windows, _surf, _start, _end, _in_window, _field_yield

In [200]:
# 'p_AVG_C_IP_l10_tw_ss_SOS_adj'
# Strength of Schedule adjustment (SOS_adj) applied to the PLAYER's mean, time-weighted, surface-specific (ss) IMPLIED WIN PROBABILITY "performance" over the 10 matches PRIOR TO the match being predicted

# The opponent values for the l10 window are read from lag columns _60 down to _51 (ten columns).
# sum()/count() skip NaN, so missing lags are ignored instead of being interpolated. Author's rationale: in the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the first years are a ramp-up phase used only to accrue retrospective stats,
# so very few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
_l10_opp_lags = df_player2[[f"SOS_tw_l60_opp_AVG_C_IP_{slot}" for slot in range(60, 50, -1)]]
df_player2["SOS_tw_l10_opp_AVG_C_IP_ws"] = _l10_opp_lags.sum(axis=1)
df_player2["SOS_tw_l10_opp_AVG_C_IP_ws_ct"] = _l10_opp_lags.count(axis=1)
del _l10_opp_lags

# Mean over the lags that are present (sum / count), rounded to 2 decimals; a row with no lags present is 0/0 and stays NaN
df_player2["SOS_tw_l10_opp_AVG_C_IP"] = (
    df_player2["SOS_tw_l10_opp_AVG_C_IP_ws"] / df_player2["SOS_tw_l10_opp_AVG_C_IP_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % Implied Win Probability that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface is expected to "yield"
df_player2["EY"] = 100 - df_player2["SOS_tw_l10_opp_AVG_C_IP"]

# Field-wide mean % Implied Win Probability "performance" (l10_tw_ss) across ALL players, per surface and per date window.
# These means are the reference against which each player's actual schedule is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken over 2-year windows (2009 stands alone at the front end; per the author that year is
# only used for retrospective stats accrual [and only for clay], not for modeling). The same windows are reused when the adjustment is applied.
# In theory each mean should be 50%; computing the actual sample value doubles as a QC check.

# (exclusive lower bound, inclusive upper bound) for each window
_l10_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)


def _l10_field_yield(surface, start, end):
    """Return 100 minus the mean of 'p_AVG_C_IP_l10_tw_ss' over rows on `surface` with start < m_date <= end.

    Expressed as the percentage players "YIELD" on average.
    """
    in_window = (df_player2['t_surf'] == surface) & (df_player2['m_date'] > start) & (df_player2['m_date'] <= end)
    return 100 - (df_player2.loc[in_window, 'p_AVG_C_IP_l10_tw_ss'].mean())


# t_surf == 1 (clay)
(mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3,
 mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6) = [
    _l10_field_yield(1, _lo, _hi) for _lo, _hi in _l10_windows
]
# Values recorded by the author from a previous run:
# (nan, 48.37126710989437, 49.41195245641842, 49.638072095268704, 50.11445554123712, 50.06549336784206)

# t_surf == 2 (hard)
(mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3,
 mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6) = [
    _l10_field_yield(2, _lo, _hi) for _lo, _hi in _l10_windows
]
# Values recorded by the author from a previous run:
# (nan, 48.658002748511194, 49.737875426621095, 49.6316475366414, 50.31286224489804, 49.718419530452614)

# The helper and window table are only needed inside this step
del _l10_field_yield, _l10_windows

# Combine the pieces: scale the player's actual l10 performance by (field mean yield for the surface and window) / (expected yield of the schedule they faced).
# Per the author, an adjustment proportional to the opponents' deviation from the field mean performed better than the boosted or blunted variants tried.
# Each entry: (t_surf code, exclusive lower date bound, inclusive upper date bound, field mean yield for that surface and window)
for _surf, _lo, _hi, _field_yield in (
    (1, "1/1/2009", "1/1/2010", mean_clay_SOS_1),
    (1, "1/1/2010", "1/1/2012", mean_clay_SOS_2),
    (1, "1/1/2012", "1/1/2014", mean_clay_SOS_3),
    (1, "1/1/2014", "1/1/2016", mean_clay_SOS_4),
    (1, "1/1/2016", "1/1/2018", mean_clay_SOS_5),
    (1, "1/1/2018", "1/1/2020", mean_clay_SOS_6),
    (2, "1/1/2009", "1/1/2010", mean_hard_SOS_1),
    (2, "1/1/2010", "1/1/2012", mean_hard_SOS_2),
    (2, "1/1/2012", "1/1/2014", mean_hard_SOS_3),
    (2, "1/1/2014", "1/1/2016", mean_hard_SOS_4),
    (2, "1/1/2016", "1/1/2018", mean_hard_SOS_5),
    (2, "1/1/2018", "1/1/2020", mean_hard_SOS_6),
):
    _in_window = (df_player2["t_surf"] == _surf) & (df_player2['m_date'] > _lo) & (df_player2['m_date'] <= _hi)
    # The right-hand side is computed for every row; .loc keeps only the rows selected by the mask
    df_player2.loc[_in_window, "p_AVG_C_IP_l10_tw_ss_SOS_adj"] = (
        (df_player2["p_AVG_C_IP_l10_tw_ss"]) * (_field_yield / df_player2["EY"])
    ).round(2)

# Loop scratch variables are not needed afterwards
del _surf, _lo, _hi, _field_yield, _in_window

# Cap the adjusted past win probability: any value at or above 100 is set to 99.99
# (the adjustment can push the biggest favorites to well over 100%, which is an impossibility)
_at_or_over_100 = df_player2["p_AVG_C_IP_l10_tw_ss_SOS_adj"] >= 100
df_player2.loc[_at_or_over_100, "p_AVG_C_IP_l10_tw_ss_SOS_adj"] = 99.99
del _at_or_over_100

# Remove the twelve per-surface / per-period mean variables from the namespace
del (
    mean_clay_SOS_1, mean_clay_SOS_2, mean_clay_SOS_3,
    mean_clay_SOS_4, mean_clay_SOS_5, mean_clay_SOS_6,
    mean_hard_SOS_1, mean_hard_SOS_2, mean_hard_SOS_3,
    mean_hard_SOS_4, mean_hard_SOS_5, mean_hard_SOS_6,
)

# Drop the EY scratch column, the l10 sum / count / mean helpers, and the remaining lag columns _60 down to _51
df_player2 = df_player2.drop(
    columns=["EY", "SOS_tw_l10_opp_AVG_C_IP_ws", "SOS_tw_l10_opp_AVG_C_IP_ws_ct", "SOS_tw_l10_opp_AVG_C_IP"]
    + [f"SOS_tw_l60_opp_AVG_C_IP_{slot}" for slot in range(60, 50, -1)]
)

In [201]:
# 'p_AVG_C_IP_l60_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) indoor-outdoor-specific (IO) IMPLIED WIN PROBABILITY "performance" of PLAYER over the 60 matches PRIOR TO the match being predicted

# Sort descending on every key so that, within each player / surface / indoor-outdoor group, rows run from the latest m_date to the earliest.
# The grouped negative shifts that follow rely on this order to pull values from earlier matches.
# ('t_ind' was previously listed twice in the sort keys; the repeated key could not change the resulting order and has been removed.)
df_player2 = df_player2.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First collect, for each row, the OPPONENTS' mean, time-weighted, surface- and indoor/outdoor-specific IMPLIED WIN PROBABILITY "performance"
# ('p_opp_AVG_C_IP_l60_tw_ss_IO') from up to 60 following rows of the same player / surface / indoor-outdoor group.
# With the descending sort applied just above, "following rows" are the matches played BEFORE the match being predicted.
# Column suffix _60 is the nearest such row (shift -1) and suffix _1 the farthest (shift -60); rows without enough history get NaN.
_opp_io_by_group = df_player2.groupby(['p_id','t_surf','t_ind'])['p_opp_AVG_C_IP_l60_tw_ss_IO']
for _lag in range(1, 61):
    df_player2[f"SOS_tw_l60_opp_AVG_C_IP_IO_{61 - _lag}"] = _opp_io_by_group.shift(-_lag)

# Scratch names are only needed while building the lag columns
del _opp_io_by_group, _lag

# Aggregate the 60 lag columns (_60 down to _1) per row.
# sum()/count() skip NaN, so missing lags are ignored instead of being interpolated. Author's rationale: in the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the first years are a ramp-up phase used only to accrue retrospective stats,
# so very few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
_l60_io_opp_lags = df_player2[[f"SOS_tw_l60_opp_AVG_C_IP_IO_{slot}" for slot in range(60, 0, -1)]]
df_player2["SOS_tw_l60_opp_AVG_C_IP_IO_ws"] = _l60_io_opp_lags.sum(axis=1)
df_player2["SOS_tw_l60_opp_AVG_C_IP_IO_ws_ct"] = _l60_io_opp_lags.count(axis=1)
del _l60_io_opp_lags

# Mean over the lags that are present (sum / count), rounded to 2 decimals; a row with no lags present is 0/0 and stays NaN
df_player2["SOS_tw_l60_opp_AVG_C_IP_IO"] = (
    df_player2["SOS_tw_l60_opp_AVG_C_IP_IO_ws"] / df_player2["SOS_tw_l60_opp_AVG_C_IP_IO_ws_ct"]
).round(2)

# EY = EXPECTED YIELD: the % Implied Win Probability that the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches
# on this surface (and same indoor-outdoor status) is expected to "yield"
df_player2["EY"] = 100 - df_player2["SOS_tw_l60_opp_AVG_C_IP_IO"]

# Field-wide mean % Implied Win Probability "performance" (l60_tw_ss) across ALL players, per surface, per indoor/outdoor status and per date window.
# These means are the reference against which each player's actual schedule is calibrated in the SOS adjustment.
# Because the game changes over time, the mean is taken over 2-year windows (2009 stands alone at the front end; per the author that year is
# only used for retrospective stats accrual [and only for clay], not for modeling). The same windows are reused when the adjustment is applied.
# In theory each mean should be 50%; computing the actual sample value doubles as a QC check.
# NOTE(review): the mean is taken from 'p_AVG_C_IP_l60_tw_ss' (not the '_IO' variant that is being adjusted) - confirm this is intended.

# (exclusive lower bound, inclusive upper bound) for each window
_io_windows = (
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
)


def _io_field_yield(surface, indoor, start, end):
    """Return 100 minus the mean of 'p_AVG_C_IP_l60_tw_ss' over rows with t_surf == surface, t_ind == indoor and start < m_date <= end.

    Expressed as the percentage players "YIELD" on average.
    """
    in_window = (
        (df_player2['t_surf'] == surface)
        & (df_player2['t_ind'] == indoor)
        & (df_player2['m_date'] > start)
        & (df_player2['m_date'] <= end)
    )
    return 100 - (df_player2.loc[in_window, 'p_AVG_C_IP_l60_tw_ss'].mean())


# t_surf == 1 (clay), t_ind == 1
(mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i,
 mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i) = [
    _io_field_yield(1, 1, _lo, _hi) for _lo, _hi in _io_windows
]
# Values recorded by the author from a previous run:
# (nan, 53.048080808080805, 51.28018691588789, 53.6002463054187, 54.544779874213845, 54.700754716981145)

# t_surf == 1 (clay), t_ind == 0
(mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o,
 mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o) = [
    _io_field_yield(1, 0, _lo, _hi) for _lo, _hi in _io_windows
]
# Values recorded by the author from a previous run:
# (nan, 48.34203417412526, 49.363662019721176, 50.07087809917356, 50.62572495755512, 50.48147403685096)

# t_surf == 2 (hard), t_ind == 1
(mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i,
 mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i) = [
    _io_field_yield(2, 1, _lo, _hi) for _lo, _hi in _io_windows
]
# Values recorded by the author from a previous run:
# (nan, 49.925925170068005, 50.86580973952436, 50.14734143049937, 51.29827278958186, 49.70823529411766)

# t_surf == 2 (hard), t_ind == 0
(mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o,
 mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o) = [
    _io_field_yield(2, 0, _lo, _hi) for _lo, _hi in _io_windows
]
# Values recorded by the author from a previous run:
# (nan, 48.47794889502755, 49.650537371763605, 50.06745515426959, 50.86248360099545, 50.284130534002635)

# The helper and window table are only needed inside this step
del _io_field_yield, _io_windows

# Puts together the above- factors the player's actual performance over the last 60 by schedule of opponents' aggregrate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_1i/df_player2["EY"])).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_2i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_3i/df_player2["EY"])).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_4i/df_player2["EY"])).round(2)            
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_5i/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_6i/df_player2["EY"])).round(2)   

df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_1o/df_player2["EY"])).round(2)         
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_2o/df_player2["EY"])).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_3o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_4o/df_player2["EY"])).round(2)           
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_5o/df_player2["EY"])).round(2)  
df_player2.loc[(df_player2["t_surf"] == 1) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_clay_SOS_6o/df_player2["EY"])).round(2) 

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_1i/df_player2["EY"])).round(2)         
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_2i/df_player2["EY"])).round(2)         
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_3i/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_4i/df_player2["EY"])).round(2)         
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_5i/df_player2["EY"])).round(2)
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 1) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_6i/df_player2["EY"])).round(2)

df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_1o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_2o/df_player2["EY"])).round(2)         
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_3o/df_player2["EY"])).round(2)        
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_4o/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_5o/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] == 2) & (df_player2["t_ind"] == 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_ss_IO"])*(mean_hard_SOS_6o/df_player2["EY"])).round(2)

# Caps adjusted player past win probability at 99.99 (this adjustment can push the biggest favorites to 100% or well over, which is an impossibility)
df_player2.loc[(df_player2["p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] >=100), "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = 99.99 

del mean_clay_SOS_1i, mean_clay_SOS_2i, mean_clay_SOS_3i, mean_clay_SOS_4i, mean_clay_SOS_5i, mean_clay_SOS_6i, mean_clay_SOS_1o, mean_clay_SOS_2o, mean_clay_SOS_3o, mean_clay_SOS_4o, mean_clay_SOS_5o, mean_clay_SOS_6o, mean_hard_SOS_1i, mean_hard_SOS_2i, mean_hard_SOS_3i, mean_hard_SOS_4i, mean_hard_SOS_5i, mean_hard_SOS_6i, mean_hard_SOS_1o, mean_hard_SOS_2o, mean_hard_SOS_3o, mean_hard_SOS_4o, mean_hard_SOS_5o, mean_hard_SOS_6o 

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_AVG_C_IP_IO_ws", "SOS_tw_l60_opp_AVG_C_IP_IO_ws_ct", "SOS_tw_l60_opp_AVG_C_IP_IO", "SOS_tw_l60_opp_AVG_C_IP_IO_50", "SOS_tw_l60_opp_AVG_C_IP_IO_49", "SOS_tw_l60_opp_AVG_C_IP_IO_48", "SOS_tw_l60_opp_AVG_C_IP_IO_47", "SOS_tw_l60_opp_AVG_C_IP_IO_46", "SOS_tw_l60_opp_AVG_C_IP_IO_45", "SOS_tw_l60_opp_AVG_C_IP_IO_44", "SOS_tw_l60_opp_AVG_C_IP_IO_43", "SOS_tw_l60_opp_AVG_C_IP_IO_42", "SOS_tw_l60_opp_AVG_C_IP_IO_41", "SOS_tw_l60_opp_AVG_C_IP_IO_40", "SOS_tw_l60_opp_AVG_C_IP_IO_39", "SOS_tw_l60_opp_AVG_C_IP_IO_38", "SOS_tw_l60_opp_AVG_C_IP_IO_37", "SOS_tw_l60_opp_AVG_C_IP_IO_36", "SOS_tw_l60_opp_AVG_C_IP_IO_35", "SOS_tw_l60_opp_AVG_C_IP_IO_34", "SOS_tw_l60_opp_AVG_C_IP_IO_33", "SOS_tw_l60_opp_AVG_C_IP_IO_32", "SOS_tw_l60_opp_AVG_C_IP_IO_31", "SOS_tw_l60_opp_AVG_C_IP_IO_30", "SOS_tw_l60_opp_AVG_C_IP_IO_29", "SOS_tw_l60_opp_AVG_C_IP_IO_28", "SOS_tw_l60_opp_AVG_C_IP_IO_27", "SOS_tw_l60_opp_AVG_C_IP_IO_26", "SOS_tw_l60_opp_AVG_C_IP_IO_25", "SOS_tw_l60_opp_AVG_C_IP_IO_24", "SOS_tw_l60_opp_AVG_C_IP_IO_23", "SOS_tw_l60_opp_AVG_C_IP_IO_22", "SOS_tw_l60_opp_AVG_C_IP_IO_21", "SOS_tw_l60_opp_AVG_C_IP_IO_20", "SOS_tw_l60_opp_AVG_C_IP_IO_19", "SOS_tw_l60_opp_AVG_C_IP_IO_18", "SOS_tw_l60_opp_AVG_C_IP_IO_17", "SOS_tw_l60_opp_AVG_C_IP_IO_16", "SOS_tw_l60_opp_AVG_C_IP_IO_15", "SOS_tw_l60_opp_AVG_C_IP_IO_14", "SOS_tw_l60_opp_AVG_C_IP_IO_13", "SOS_tw_l60_opp_AVG_C_IP_IO_12", "SOS_tw_l60_opp_AVG_C_IP_IO_11", "SOS_tw_l60_opp_AVG_C_IP_IO_10", "SOS_tw_l60_opp_AVG_C_IP_IO_9", "SOS_tw_l60_opp_AVG_C_IP_IO_8", "SOS_tw_l60_opp_AVG_C_IP_IO_7", "SOS_tw_l60_opp_AVG_C_IP_IO_6", "SOS_tw_l60_opp_AVG_C_IP_IO_5", "SOS_tw_l60_opp_AVG_C_IP_IO_4", "SOS_tw_l60_opp_AVG_C_IP_IO_3", "SOS_tw_l60_opp_AVG_C_IP_IO_2", "SOS_tw_l60_opp_AVG_C_IP_IO_1"],axis=1)

In [202]:
# 'p_AVG_C_IP_l10_tw_ss_IO_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, surface-specific (SS) indoor-outdoor-specific IMPLIED WIN PROBABILITY "performance" of PLAYER over the 10 matches PRIOR TO the match being predicted

# The 10 most recent opponent-performance lag columns (slots 60 down to 51). They are expected to already exist on df_player2 (they are not created in this cell).
sos_l10_io_lag_cols = ["SOS_tw_l60_opp_AVG_C_IP_IO_" + str(sos_l10_io_slot) for sos_l10_io_slot in range(60, 50, -1)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
#(ws = weighted sum; tw = time-weighted)
df_player2["SOS_tw_l10_opp_AVG_C_IP_IO_ws"] = df_player2[sos_l10_io_lag_cols].sum(axis=1)
# Count of non-NaN lag slots, so the average below is taken over the matches actually available
df_player2["SOS_tw_l10_opp_AVG_C_IP_IO_ws_ct"] = df_player2[sos_l10_io_lag_cols].count(axis=1)
df_player2["SOS_tw_l10_opp_AVG_C_IP_IO"] = (df_player2["SOS_tw_l10_opp_AVG_C_IP_IO_ws"]/df_player2["SOS_tw_l10_opp_AVG_C_IP_IO_ws_ct"]).round(2)

# % Implied Win Probability the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 10 matches on this surface (and same indoor-outdoor status) is expected to "yield"
# EY is EXPECTED YIELD
df_player2["EY"] = 100-df_player2["SOS_tw_l10_opp_AVG_C_IP_IO"]

# Mean % Implied Win Probability "performance" (l10_tw_ss) across ALL players per surface. We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted.
# Because the game changes over time, we will calculate mean per surface over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for clay] not for modeling), and match to match date ranges during the SOS adjustment
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)
# Each window is (exclusive lower bound, inclusive upper bound) on m_date.
sos_l10_io_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

# Field-mean values noted by the original author from an earlier run (one per window, in order):
#   clay indoor : (nan, 52.85353535353534, 51.965467289719655, 52.70137931034481, 52.74408805031446, 53.81009433962266)
#   clay outdoor: (nan, 48.1907363710333, 49.22614756885416, 49.42393595041323, 49.97248217317496, 49.93251926298155)
#   hard indoor : (nan, 49.393673469387785, 50.47847678369195, 49.614622132253686, 50.69502398903361, 49.69479946524064)
#   hard outdoor: previously noted as (nan, 48.284578729281705, 49.41840742550099, 49.63768237263814, 50.186742818366845, 49.72648334094011),
#                 but those were computed from the raw 'p_AVG_C_IP' column rather than 'p_AVG_C_IP_l10_tw_ss' and need regenerating.

# Puts together the above: scales the player's actual performance over the last 10 by the strength of schedule, i.e. the opponents' aggregate performance over THEIR l10 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted
# FIX: every surface / indoor-outdoor / window combination now takes its field mean from the same
# 'p_AVG_C_IP_l10_tw_ss' column (hard outdoor used 'p_AVG_C_IP' before, unlike the other three groups).
# Iteration order (clay indoor, clay outdoor, hard indoor, hard outdoor; windows oldest first) matches the original statements.
for sos_l10_io_surf in (1, 2):  # t_surf codes: 1 was handled as clay and 2 as hard in the original variable names
    for sos_l10_io_ind in (1, 0):  # t_ind codes: 1 carried the "i" suffix and 0 the "o" suffix (presumably indoor / outdoor)
        for sos_l10_io_start, sos_l10_io_end in sos_l10_io_windows:
            sos_l10_io_mask = (df_player2["t_surf"] == sos_l10_io_surf) & (df_player2["t_ind"] == sos_l10_io_ind) & (df_player2['m_date'] > sos_l10_io_start) & (df_player2['m_date'] <= sos_l10_io_end)
            # we want in terms of pct players "YIELD" on average; NaN when no rows fall in the window
            sos_l10_io_field_yield = 100 - (df_player2.loc[sos_l10_io_mask, 'p_AVG_C_IP_l10_tw_ss'].mean())
            df_player2.loc[sos_l10_io_mask, "p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"] = ((df_player2["p_AVG_C_IP_l10_tw_ss_IO"])*(sos_l10_io_field_yield/df_player2["EY"])).round(2)

# Caps adjusted player past win probability at 99.99 (this adjustment can push the biggest favorites to 100% or well over, which is an impossibility)
df_player2.loc[(df_player2["p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"] >=100), "p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"] = 99.99

# Remove the intermediate columns (expected yield, weighted sum / count / average, and the ten lag slots consumed here)
df_player2 = df_player2.drop(["EY", "SOS_tw_l10_opp_AVG_C_IP_IO_ws", "SOS_tw_l10_opp_AVG_C_IP_IO_ws_ct", "SOS_tw_l10_opp_AVG_C_IP_IO"] + sos_l10_io_lag_cols, axis=1)

# Clean up the helper names so they do not linger in the kernel namespace
del sos_l10_io_lag_cols, sos_l10_io_windows, sos_l10_io_surf, sos_l10_io_ind, sos_l10_io_start, sos_l10_io_end, sos_l10_io_mask, sos_l10_io_field_yield

In [203]:
# 'p_AVG_C_IP_l60_tw_nss_SOS_adj'
# Provides Strength of Schedule-adjustment (SOS_adj) to mean, time-weighted, NON-surface-specific (NSS) IMPLIED WIN PROBABILITY "performance" of PLAYER over the 60 matches PRIOR TO the match being predicted  

# Sort descending within each player so that a negative shift reads from rows further down the frame
# (presumably the player's earlier matches; verify m_date sorts chronologically).
df_player2 = df_player2.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, NON-surface-specific IMPLIED WIN PROBABILITY "performance" of player OPPONENTS over the maximum interval of interest (60 matches) prior to the match being predicted
# Slot 60 holds the value one row back (shift -1) and slot 1 the value sixty rows back (shift -60), all within the same p_id group.
for sos_nss_rows_back in range(1, 61):
    df_player2["SOS_tw_l60_opp_AVG_C_IP_" + str(61 - sos_nss_rows_back)] = df_player2.groupby(['p_id'])['p_opp_AVG_C_IP_l60_tw_nss'].shift(-sos_nss_rows_back)
del sos_nss_rows_back

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player2["SOS_tw_l60_opp_AVG_C_IP_ws"] = df_player2[["SOS_tw_l60_opp_AVG_C_IP_60", "SOS_tw_l60_opp_AVG_C_IP_59", "SOS_tw_l60_opp_AVG_C_IP_58", "SOS_tw_l60_opp_AVG_C_IP_57", "SOS_tw_l60_opp_AVG_C_IP_56", "SOS_tw_l60_opp_AVG_C_IP_55", "SOS_tw_l60_opp_AVG_C_IP_54", "SOS_tw_l60_opp_AVG_C_IP_53", "SOS_tw_l60_opp_AVG_C_IP_52", "SOS_tw_l60_opp_AVG_C_IP_51", "SOS_tw_l60_opp_AVG_C_IP_50", "SOS_tw_l60_opp_AVG_C_IP_49", "SOS_tw_l60_opp_AVG_C_IP_48", "SOS_tw_l60_opp_AVG_C_IP_47", "SOS_tw_l60_opp_AVG_C_IP_46", "SOS_tw_l60_opp_AVG_C_IP_45", "SOS_tw_l60_opp_AVG_C_IP_44", "SOS_tw_l60_opp_AVG_C_IP_43", "SOS_tw_l60_opp_AVG_C_IP_42", "SOS_tw_l60_opp_AVG_C_IP_41", "SOS_tw_l60_opp_AVG_C_IP_40", "SOS_tw_l60_opp_AVG_C_IP_39", "SOS_tw_l60_opp_AVG_C_IP_38", "SOS_tw_l60_opp_AVG_C_IP_37", "SOS_tw_l60_opp_AVG_C_IP_36", "SOS_tw_l60_opp_AVG_C_IP_35", "SOS_tw_l60_opp_AVG_C_IP_34", "SOS_tw_l60_opp_AVG_C_IP_33", "SOS_tw_l60_opp_AVG_C_IP_32", "SOS_tw_l60_opp_AVG_C_IP_31", "SOS_tw_l60_opp_AVG_C_IP_30", "SOS_tw_l60_opp_AVG_C_IP_29", "SOS_tw_l60_opp_AVG_C_IP_28", "SOS_tw_l60_opp_AVG_C_IP_27", "SOS_tw_l60_opp_AVG_C_IP_26", "SOS_tw_l60_opp_AVG_C_IP_25", "SOS_tw_l60_opp_AVG_C_IP_24", "SOS_tw_l60_opp_AVG_C_IP_23", "SOS_tw_l60_opp_AVG_C_IP_22", "SOS_tw_l60_opp_AVG_C_IP_21", "SOS_tw_l60_opp_AVG_C_IP_20", "SOS_tw_l60_opp_AVG_C_IP_19", "SOS_tw_l60_opp_AVG_C_IP_18", "SOS_tw_l60_opp_AVG_C_IP_17", "SOS_tw_l60_opp_AVG_C_IP_16", "SOS_tw_l60_opp_AVG_C_IP_15", "SOS_tw_l60_opp_AVG_C_IP_14", "SOS_tw_l60_opp_AVG_C_IP_13", "SOS_tw_l60_opp_AVG_C_IP_12", "SOS_tw_l60_opp_AVG_C_IP_11", "SOS_tw_l60_opp_AVG_C_IP_10", "SOS_tw_l60_opp_AVG_C_IP_9", "SOS_tw_l60_opp_AVG_C_IP_8", "SOS_tw_l60_opp_AVG_C_IP_7", "SOS_tw_l60_opp_AVG_C_IP_6", "SOS_tw_l60_opp_AVG_C_IP_5", "SOS_tw_l60_opp_AVG_C_IP_4", "SOS_tw_l60_opp_AVG_C_IP_3", "SOS_tw_l60_opp_AVG_C_IP_2", "SOS_tw_l60_opp_AVG_C_IP_1"]].sum(axis=1)
df_player2["SOS_tw_l60_opp_AVG_C_IP_ws_ct"] = df_player2[["SOS_tw_l60_opp_AVG_C_IP_60", "SOS_tw_l60_opp_AVG_C_IP_59", "SOS_tw_l60_opp_AVG_C_IP_58", "SOS_tw_l60_opp_AVG_C_IP_57", "SOS_tw_l60_opp_AVG_C_IP_56", "SOS_tw_l60_opp_AVG_C_IP_55", "SOS_tw_l60_opp_AVG_C_IP_54", "SOS_tw_l60_opp_AVG_C_IP_53", "SOS_tw_l60_opp_AVG_C_IP_52", "SOS_tw_l60_opp_AVG_C_IP_51", "SOS_tw_l60_opp_AVG_C_IP_50", "SOS_tw_l60_opp_AVG_C_IP_49", "SOS_tw_l60_opp_AVG_C_IP_48", "SOS_tw_l60_opp_AVG_C_IP_47", "SOS_tw_l60_opp_AVG_C_IP_46", "SOS_tw_l60_opp_AVG_C_IP_45", "SOS_tw_l60_opp_AVG_C_IP_44", "SOS_tw_l60_opp_AVG_C_IP_43", "SOS_tw_l60_opp_AVG_C_IP_42", "SOS_tw_l60_opp_AVG_C_IP_41", "SOS_tw_l60_opp_AVG_C_IP_40", "SOS_tw_l60_opp_AVG_C_IP_39", "SOS_tw_l60_opp_AVG_C_IP_38", "SOS_tw_l60_opp_AVG_C_IP_37", "SOS_tw_l60_opp_AVG_C_IP_36", "SOS_tw_l60_opp_AVG_C_IP_35", "SOS_tw_l60_opp_AVG_C_IP_34", "SOS_tw_l60_opp_AVG_C_IP_33", "SOS_tw_l60_opp_AVG_C_IP_32", "SOS_tw_l60_opp_AVG_C_IP_31", "SOS_tw_l60_opp_AVG_C_IP_30", "SOS_tw_l60_opp_AVG_C_IP_29", "SOS_tw_l60_opp_AVG_C_IP_28", "SOS_tw_l60_opp_AVG_C_IP_27", "SOS_tw_l60_opp_AVG_C_IP_26", "SOS_tw_l60_opp_AVG_C_IP_25", "SOS_tw_l60_opp_AVG_C_IP_24", "SOS_tw_l60_opp_AVG_C_IP_23", "SOS_tw_l60_opp_AVG_C_IP_22", "SOS_tw_l60_opp_AVG_C_IP_21", "SOS_tw_l60_opp_AVG_C_IP_20", "SOS_tw_l60_opp_AVG_C_IP_19", "SOS_tw_l60_opp_AVG_C_IP_18", "SOS_tw_l60_opp_AVG_C_IP_17", "SOS_tw_l60_opp_AVG_C_IP_16", "SOS_tw_l60_opp_AVG_C_IP_15", "SOS_tw_l60_opp_AVG_C_IP_14", "SOS_tw_l60_opp_AVG_C_IP_13", "SOS_tw_l60_opp_AVG_C_IP_12", "SOS_tw_l60_opp_AVG_C_IP_11", "SOS_tw_l60_opp_AVG_C_IP_10", "SOS_tw_l60_opp_AVG_C_IP_9", "SOS_tw_l60_opp_AVG_C_IP_8", "SOS_tw_l60_opp_AVG_C_IP_7", "SOS_tw_l60_opp_AVG_C_IP_6", "SOS_tw_l60_opp_AVG_C_IP_5", "SOS_tw_l60_opp_AVG_C_IP_4", "SOS_tw_l60_opp_AVG_C_IP_3", "SOS_tw_l60_opp_AVG_C_IP_2", "SOS_tw_l60_opp_AVG_C_IP_1"]].count(axis=1)
df_player2["SOS_tw_l60_opp_AVG_C_IP"] = (df_player2["SOS_tw_l60_opp_AVG_C_IP_ws"]/df_player2["SOS_tw_l60_opp_AVG_C_IP_ws_ct"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# % Implied Win Probability the SCHEDULE OF OPPONENTS FACED BY THIS PLAYER over their last 60 matches ACROSS SURFACES is expected to "yield"
# EY is EXPECTED YIELD
df_player2["EY"] = 100-df_player2["SOS_tw_l60_opp_AVG_C_IP"]

# Mean % Implied Win Probability "performance" (l60_tw_nss) across ALL players ACROSS ALL surfaces (allsurface, hard, grass). We need this to evaluate and calibrate schedule actually faced by a player prior to a given match being predicted. 
# Because the game changes over time, we will calculate mean ACROSS surfaces over 2 year intervals (2009 by itself at the front end; that year is only used for retrospective stats accrual [and only for allsurface] not for modeling), and match to match date ranges during the SOS adjustment
# For this feature, this should be 50%. But let's just calculate what it actually is in this sample as a QC :)

mean_allsurface_SOS_1 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
mean_allsurface_SOS_2 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
mean_allsurface_SOS_3 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
mean_allsurface_SOS_4 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
mean_allsurface_SOS_5 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
mean_allsurface_SOS_6 = 100 - (df_player2.loc[(df_player2['t_surf'] != 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), 'p_AVG_C_IP_l60_tw_nss'].mean()) #we want in terms of pct players "YIELD" on average
# (nan, 49.25915491259812, 49.848284625159, 50.0612709163348, 50.64474446171781, 50.293127428127356)

# Puts together the above- factors the player's actual performance over the last 60 by schedule of opponents' aggregrate performance over THEIR l60 prior to when they faced the player.
# Adjustment proportional to opponents' deviation from field mean performs better than various boosted or blunted versions attempted 
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2009") & (df_player2['m_date'] <= "1/1/2010"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_1/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2010") & (df_player2['m_date'] <= "1/1/2012"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_2/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2012") & (df_player2['m_date'] <= "1/1/2014"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_3/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2014") & (df_player2['m_date'] <= "1/1/2016"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_4/df_player2["EY"])).round(2)          
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2016") & (df_player2['m_date'] <= "1/1/2018"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_5/df_player2["EY"])).round(2) 
df_player2.loc[(df_player2["t_surf"] != 0) & (df_player2['m_date'] > "1/1/2018") & (df_player2['m_date'] <= "1/1/2020"), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = ((df_player2["p_AVG_C_IP_l60_tw_nss"])*(mean_allsurface_SOS_6/df_player2["EY"])).round(2) 

# Caps highest adjusted player past win probability at 99.9% (this adjustment can push the biggest favorites to well over 100%, which is an impossibility)
df_player2.loc[(df_player2["p_AVG_C_IP_l60_tw_nss_SOS_adj"] >=100), "p_AVG_C_IP_l60_tw_nss_SOS_adj"] = 99.99 

del mean_allsurface_SOS_1, mean_allsurface_SOS_2, mean_allsurface_SOS_3, mean_allsurface_SOS_4, mean_allsurface_SOS_5, mean_allsurface_SOS_6

df_player2 = df_player2.drop(["EY", "SOS_tw_l60_opp_AVG_C_IP_ws", "SOS_tw_l60_opp_AVG_C_IP_ws_ct", "SOS_tw_l60_opp_AVG_C_IP", "SOS_tw_l60_opp_AVG_C_IP_50", "SOS_tw_l60_opp_AVG_C_IP_49", "SOS_tw_l60_opp_AVG_C_IP_48", "SOS_tw_l60_opp_AVG_C_IP_47", "SOS_tw_l60_opp_AVG_C_IP_46", "SOS_tw_l60_opp_AVG_C_IP_45", "SOS_tw_l60_opp_AVG_C_IP_44", "SOS_tw_l60_opp_AVG_C_IP_43", "SOS_tw_l60_opp_AVG_C_IP_42", "SOS_tw_l60_opp_AVG_C_IP_41", "SOS_tw_l60_opp_AVG_C_IP_40", "SOS_tw_l60_opp_AVG_C_IP_39", "SOS_tw_l60_opp_AVG_C_IP_38", "SOS_tw_l60_opp_AVG_C_IP_37", "SOS_tw_l60_opp_AVG_C_IP_36", "SOS_tw_l60_opp_AVG_C_IP_35", "SOS_tw_l60_opp_AVG_C_IP_34", "SOS_tw_l60_opp_AVG_C_IP_33", "SOS_tw_l60_opp_AVG_C_IP_32", "SOS_tw_l60_opp_AVG_C_IP_31", "SOS_tw_l60_opp_AVG_C_IP_30", "SOS_tw_l60_opp_AVG_C_IP_29", "SOS_tw_l60_opp_AVG_C_IP_28", "SOS_tw_l60_opp_AVG_C_IP_27", "SOS_tw_l60_opp_AVG_C_IP_26", "SOS_tw_l60_opp_AVG_C_IP_25", "SOS_tw_l60_opp_AVG_C_IP_24", "SOS_tw_l60_opp_AVG_C_IP_23", "SOS_tw_l60_opp_AVG_C_IP_22", "SOS_tw_l60_opp_AVG_C_IP_21", "SOS_tw_l60_opp_AVG_C_IP_20", "SOS_tw_l60_opp_AVG_C_IP_19", "SOS_tw_l60_opp_AVG_C_IP_18", "SOS_tw_l60_opp_AVG_C_IP_17", "SOS_tw_l60_opp_AVG_C_IP_16", "SOS_tw_l60_opp_AVG_C_IP_15", "SOS_tw_l60_opp_AVG_C_IP_14", "SOS_tw_l60_opp_AVG_C_IP_13", "SOS_tw_l60_opp_AVG_C_IP_12", "SOS_tw_l60_opp_AVG_C_IP_11", "SOS_tw_l60_opp_AVG_C_IP_10", "SOS_tw_l60_opp_AVG_C_IP_9", "SOS_tw_l60_opp_AVG_C_IP_8", "SOS_tw_l60_opp_AVG_C_IP_7", "SOS_tw_l60_opp_AVG_C_IP_6", "SOS_tw_l60_opp_AVG_C_IP_5", "SOS_tw_l60_opp_AVG_C_IP_4", "SOS_tw_l60_opp_AVG_C_IP_3", "SOS_tw_l60_opp_AVG_C_IP_2", "SOS_tw_l60_opp_AVG_C_IP_1"],axis=1)

In [204]:
# 'p_AVG_C_IP_l10_tw_nss_SOS_adj'
# Strength-of-Schedule-adjusted (SOS_adj) version of the PLAYER's mean, time-weighted, NON-surface-specific (nss)
# IMPLIED WIN PROBABILITY "performance" over the 10 matches PRIOR TO the match being predicted.

# Lag columns SOS_tw_l60_opp_AVG_C_IP_60 ... _51 are the ones left in place by the l60 cell for reuse here.
# NOTE(review): these lags hold the opponents' *l60* nss value ('p_opp_AVG_C_IP_l60_tw_nss'), not an l10 value,
# and by the naming pattern of the other lags they look like shift(-1) ... shift(-10) -- confirm this is intended.
_sos_l10_lag_cols = ["SOS_tw_l60_opp_AVG_C_IP_{}".format(_k) for _k in range(60, 50, -1)]

# Row-wise sum and count both ignore NaN, so no interpolation is needed: a player with fewer than 10 lagged values
# is averaged over whatever exists (and gets NaN when nothing exists, since 0/0 -> NaN).
_sos_l10_opp_mean = (
    df_player2[_sos_l10_lag_cols].sum(axis=1) / df_player2[_sos_l10_lag_cols].count(axis=1)
).round(2)

# EY is EXPECTED YIELD: the % implied win probability the schedule of opponents faced by this player is expected to "yield"
_sos_l10_ey = 100 - _sos_l10_opp_mean

# The field mean is computed per date window (2009 alone, then 2-year windows; lower bound exclusive, upper bound
# inclusive) and each row is adjusted with the mean of its own window. Rows with t_surf == 0 are excluded throughout.
_sos_l10_windows = [
    ("1/1/2009", "1/1/2010"),
    ("1/1/2010", "1/1/2012"),
    ("1/1/2012", "1/1/2014"),
    ("1/1/2014", "1/1/2016"),
    ("1/1/2016", "1/1/2018"),
    ("1/1/2018", "1/1/2020"),
]

for _sos_l10_start, _sos_l10_end in _sos_l10_windows:
    _sos_l10_rows = (
        (df_player2["t_surf"] != 0)
        & (df_player2["m_date"] > _sos_l10_start)
        & (df_player2["m_date"] <= _sos_l10_end)
    )

    # Field mean expressed as the % players "YIELD" on average. It is taken from the SAME nss column that is being
    # adjusted (the earlier version averaged 'p_AVG_C_IP_l10_tw_ss' here, which did not match the adjusted feature
    # nor the parallel l60 nss calculation). Expected to sit close to 50; useful as a QC value.
    _sos_l10_field_yield = 100 - df_player2.loc[_sos_l10_rows, "p_AVG_C_IP_l10_tw_nss"].mean()

    # Scale the player's own l10 nss performance by (field yield / expected yield of the schedule actually faced)
    df_player2.loc[_sos_l10_rows, "p_AVG_C_IP_l10_tw_nss_SOS_adj"] = (
        df_player2["p_AVG_C_IP_l10_tw_nss"] * (_sos_l10_field_yield / _sos_l10_ey)
    ).round(2)

# Caps the adjusted value: anything >= 100 is replaced with 99.99 (the adjustment can push the biggest favorites
# over 100%, which is an impossibility)
df_player2.loc[(df_player2["p_AVG_C_IP_l10_tw_nss_SOS_adj"] >= 100), "p_AVG_C_IP_l10_tw_nss_SOS_adj"] = 99.99

# The ten lag columns are no longer needed once the l10 feature exists
df_player2 = df_player2.drop(_sos_l10_lag_cols, axis=1)

# Keep the notebook namespace free of this cell's scratch names
del _sos_l10_lag_cols, _sos_l10_opp_mean, _sos_l10_ey, _sos_l10_windows
del _sos_l10_start, _sos_l10_end, _sos_l10_rows, _sos_l10_field_yield

In [205]:
# Drops the opponent-level "p_opp_*" time-weighted columns from df_player2.
# Every (statistic, scope) pair listed below is removed in both its l60 and its l10 variant,
# i.e. the labels are "p_opp_<statistic>_<l60|l10>_tw_<scope>".
df_player2 = df_player2.drop(
    columns=[
        "p_opp_{}_{}_tw_{}".format(stat, span, scope)
        for stat, scope in [
            ("tot_pts_won%", "ss"),
            ("tot_pts_won%", "ss_IO"),
            ("tot_pts_won%", "nss"),
            ("tot_pts_won%", "ss_comp"),
            ("sv_pts_won%", "ss"),
            ("ret_pts_won%", "ss"),
            ("ace%", "ss"),
            ("aced%", "ss"),
            ("bp_save%", "ss"),
            ("bp_conv%", "ss"),
            ("AVG_C_IP", "ss"),
            ("AVG_C_IP", "ss_IO"),
            ("AVG_C_IP", "nss"),
            ("1st_sv%", "ss"),
            ("1st_sv%_yielded", "ss"),
            ("1st_sv_pts_won%", "ss"),
            ("2nd_sv_pts_won%", "ss"),
            ("1st_ret_pts_won%", "ss"),
            ("2nd_ret_pts_won%", "ss"),
            ("df%", "ss"),
            ("df_induce%", "ss"),
            ("tot_pts_won%", "ss_IO_comp"),
            ("1st_sv%", "ss_IO"),
            ("1st_sv%_yielded", "ss_IO"),
            ("sv_pts_won%", "ss_IO"),
            ("1st_sv_pts_won%", "ss_IO"),
            ("2nd_sv_pts_won%", "ss_IO"),
            ("ret_pts_won%", "ss_IO"),
            ("1st_ret_pts_won%", "ss_IO"),
            ("2nd_ret_pts_won%", "ss_IO"),
            ("ace%", "ss_IO"),
            ("aced%", "ss_IO"),
            ("df%", "ss_IO"),
            ("df_induce%", "ss_IO"),
            ("bp_save%", "ss_IO"),
            ("bp_conv%", "ss_IO"),
        ]
        for span in ("l60", "l10")
    ]
)

In [206]:
#Save to review
#df_player2.to_csv('../data/df_player2.csv', index=False)

Below, a few serving-related "efficiency" ratios are computed that could be useful as predictive features. Another variant of these ratios, with the court speed proxy already baked in, is computed later on.

In [207]:
# 'p_ace_df%_ratio_l60_tw_ss_SOS_adj'
# Ratio of % aces to % double faults for PLAYER, built from the SOS-adjusted l60 surface-specific columns
# (last 60 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_df%_ratio_l60_tw_ss_SOS_adj"] = df_player2["p_ace%_l60_tw_ss_SOS_adj"].div(df_player2["p_df%_l60_tw_ss_SOS_adj"]).round(2)

In [208]:
# 'p_ace_df%_ratio_l10_tw_ss_SOS_adj'
# Ratio of % aces to % double faults for PLAYER, built from the SOS-adjusted l10 surface-specific columns
# (last 10 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_df%_ratio_l10_tw_ss_SOS_adj"] = df_player2["p_ace%_l10_tw_ss_SOS_adj"].div(df_player2["p_df%_l10_tw_ss_SOS_adj"]).round(2)

In [209]:
# 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj'
# Ratio of % aces to % double faults for PLAYER, built from the SOS-adjusted l60 surface-specific, IO-specific columns
# (last 60 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"] = df_player2["p_ace%_l60_tw_ss_IO_SOS_adj"].div(df_player2["p_df%_l60_tw_ss_IO_SOS_adj"]).round(2)

In [210]:
# 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj'
# Ratio of % aces to % double faults for PLAYER, built from the SOS-adjusted l10 surface-specific, IO-specific columns
# (last 10 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"] = df_player2["p_ace%_l10_tw_ss_IO_SOS_adj"].div(df_player2["p_df%_l10_tw_ss_IO_SOS_adj"]).round(2)

In [211]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj'
# Ratio of % first serve points won (numerator) to % first serves in (denominator) for PLAYER, built from the
# SOS-adjusted l60 surface-specific columns (last 60 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"] = df_player2["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"].div(df_player2["p_1st_sv%_l60_tw_ss_SOS_adj"]).round(2)

In [212]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj'
# Ratio of % first serve points won (numerator) to % first serves in (denominator) for PLAYER, built from the
# SOS-adjusted l10 surface-specific columns (last 10 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"] = df_player2["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"].div(df_player2["p_1st_sv%_l10_tw_ss_SOS_adj"]).round(2)

In [213]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj'
# Ratio of % first serve points won (numerator) to % first serves in (denominator) for PLAYER, built from the
# SOS-adjusted l60 surface-specific, IO-specific columns (last 60 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"] = df_player2["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"].div(df_player2["p_1st_sv%_l60_tw_ss_IO_SOS_adj"]).round(2)

In [214]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj'
# Ratio of % first serve points won (numerator) to % first serves in (denominator) for PLAYER, built from the
# SOS-adjusted l10 surface-specific, IO-specific columns (last 10 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"] = df_player2["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].div(df_player2["p_1st_sv%_l10_tw_ss_IO_SOS_adj"]).round(2)

In [215]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj'
# Ratio of % aces to % first serves in for PLAYER, built from the SOS-adjusted l60 surface-specific columns
# (last 60 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"] = df_player2["p_ace%_l60_tw_ss_SOS_adj"].div(df_player2["p_1st_sv%_l60_tw_ss_SOS_adj"]).round(2)

In [216]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj'
# Ratio of % aces to % first serves in for PLAYER, built from the SOS-adjusted l10 surface-specific columns
# (last 10 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"] = df_player2["p_ace%_l10_tw_ss_SOS_adj"].div(df_player2["p_1st_sv%_l10_tw_ss_SOS_adj"]).round(2)

In [217]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj'
# Ratio of % aces to % first serves in for PLAYER, built from the SOS-adjusted l60 surface-specific, IO-specific columns
# (last 60 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"] = df_player2["p_ace%_l60_tw_ss_IO_SOS_adj"].div(df_player2["p_1st_sv%_l60_tw_ss_IO_SOS_adj"]).round(2)

In [218]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj'
# Ratio of % aces to % first serves in for PLAYER, built from the SOS-adjusted l10 surface-specific, IO-specific columns
# (last 10 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"] = df_player2["p_ace%_l10_tw_ss_IO_SOS_adj"].div(df_player2["p_1st_sv%_l10_tw_ss_IO_SOS_adj"]).round(2)

In [219]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj'
# Ratio of % double faults to % serve points won for PLAYER, built from the SOS-adjusted l60 surface-specific columns
# (last 60 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"] = df_player2["p_df%_l60_tw_ss_SOS_adj"].div(df_player2["p_sv_pts_won%_l60_tw_ss_SOS_adj"]).round(2)

In [220]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj'
# Ratio of % double faults to % serve points won for PLAYER, built from the SOS-adjusted l10 surface-specific columns
# (last 10 surface-specific matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"] = df_player2["p_df%_l10_tw_ss_SOS_adj"].div(df_player2["p_sv_pts_won%_l10_tw_ss_SOS_adj"]).round(2)

In [221]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj'
# Ratio of % double faults to % serve points won for PLAYER, built from the SOS-adjusted l60 surface-specific, IO-specific columns
# (last 60 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"] = df_player2["p_df%_l60_tw_ss_IO_SOS_adj"].div(df_player2["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]).round(2)

In [222]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj'
# Ratio of % double faults to % serve points won for PLAYER, built from the SOS-adjusted l10 surface-specific, IO-specific columns
# (last 10 such matches prior to the match being predicted); rounded to 2 decimals

df_player2["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"] = df_player2["p_df%_l10_tw_ss_IO_SOS_adj"].div(df_player2["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]).round(2)

In [223]:
# Print a structural summary of df_player2 (index range, column count, dtypes, memory usage) as a sanity check
df_player2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 56533 to 40644
Columns: 253 entries, t_id to p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj
dtypes: datetime64[ns](1), float64(217), int64(29), object(6)
memory usage: 113.1+ MB


In [224]:
# Write df_player2 to CSV for review; the DataFrame index is not written to the file
df_player2.to_csv(path_or_buf='../data/df_player2_no_dtw.csv', index=False)

### 6. Predictive Features: "Court Speed Proxy" Per Serve and Return Feature, and Adjustment of Associated Short and Long-Term Predictive Features With These Proxies

While we don't have a direct way to measure or predict court speed at any given tournament, one approach is to use the mean % of service points won by players in that tournament relative to % of service points won by players on the same surface (also respecting indoor/outdoor status). We can also create an analogous proxy for % return points won, % of points resulting in an ace (or returner being aced), and % of break points saved (as a server) or converted (as a returner).

In practice, as we have an eye on making this a viable wagering model, what we can do is calculate this quantity from the first round of a specific tournament and utilize it as a predictive feature in the second round of the tournament, then use the first and second rounds to generate a feature for the third round, and so on and so forth. There are multiple options to generate a quantity specifically for the first round of a tournament. What we will do here is set court speed proxy per serving or returning feature for first round matches to the final court speed proxy for the same tournament from the prior year. If the tournament is new or switched surfaces we will assign a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field) for first round matches.   

In [225]:
# Bind df_player3 to the SAME DataFrame object as df_player2 (plain assignment, no copy is made).
# NOTE(review): any in-place change made through df_player3 before it is rebound would also be visible via df_player2 -- use .copy() if independence is required.
df_player3 = df_player2

In [226]:
# 't_1st_sv_in%_ratio'
# 't_1st_sv_in%_yielded_ratio'
# Provides ratios, to pass back to main dataframe, of % first serves made (on serve) and % first serves "yielded" (on return), respectively by the entire field in a tournament's prior rounds (cumulative, weighted by number of matches per previous round) and 
# These ratios, in turn will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics
# Court speed proxy for first round matches is set to the final court speed proxy for the same tournament from the prior year. If the tournament is new or switched surfaces we will assign a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field) for first round matches.   
# Provides mean player service pts won% on a per-round basis for each tournament

df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)
ct_speed_proxy  = df_player3.groupby(['t_id', 'm_rd_num'])['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_1st_sv_in%', 'p_1st_sv%_l60_tw_ss_IO_SOS_adj'].mean().round(2)
ct_speed_proxy.rename(columns = {"p_1st_sv_in%": "t_rd_1st_sv_in%", "p_1st_sv%_l60_tw_ss_IO_SOS_adj": "t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj"}, inplace = True)
ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# # Weights each round mean to reflect the number of matches that went into generating that mean (yes, some walkover and shortened matches have already been removed prior to computing the means but we don't need to correct for this because the effect on per-round variance of these matches is minimal)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*64
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*128

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_1st_sv_in%_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_in%"]*16
# Remaining draw-size groups for the per-round weighting of "t_rd_1st_sv_in%":
# each round's field mean is multiplied by a weight that doubles for every round
# further from the final (final = 2).
# Layout: (draw sizes sharing a bracket, {m_rd_num: weight}).
_in_pct_weights = (
    ((64, 56, 48), {2: 32, 1: 64}),
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination):
    # the three round-robin rounds all carry the same weight.
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _sizes, _rd_weights in _in_pct_weights:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _weight in _rd_weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_1st_sv_in%_mw_tot"] = ct_speed_proxy["t_rd_1st_sv_in%"] * _weight

# Running total of the weighted round means over all PREVIOUS rounds of a tournament.
# shift(1) excludes the current round, so the first row of each t_id becomes NaN.
# Assumes rows within each t_id are ordered by ascending m_rd_num - TODO confirm upstream.
# The 100-row window is presumably just "wide enough" to span every round of a tournament.
ct_speed_proxy.reset_index(drop=True, inplace=True)
_in_pct_by_tourney = ct_speed_proxy.groupby(['t_id'])["t_rd_1st_sv_in%_mw_tot"]
ct_speed_proxy["t_rd_1st_sv_in%_mw_tot2"] = _in_pct_by_tourney.transform(
    lambda rd_totals: rd_totals.rolling(window=100, min_periods=1).sum().round(2).shift(1)
)

# Turn the running totals into weighted averages by dividing by the summed weights
# of the rounds already played.
# Layout: (draw sizes, {m_rd_num: summed weight of earlier rounds}, round the round-1 copy?).
# Round 1 has no earlier rounds, so its running total is carried over undivided;
# only the 128/96 group rounds that copy.
_in_pct_divisors = (
    ((128, 96), {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128}, True),
    ((64, 56, 48), {6: 124, 5: 120, 4: 112, 3: 96, 2: 64}, False),
    ((32, 28), {5: 60, 4: 56, 3: 48, 2: 32}, False),
    ((16,), {4: 28, 3: 24, 2: 16}, False),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    ((8,), {5: 28, 4: 24, 3: 16, 2: 8}, False),
)
for _sizes, _prior_weight, _round_opening in _in_pct_divisors:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _denom in _prior_weight.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_1st_sv_in%_mw"] = (ct_speed_proxy["t_rd_1st_sv_in%_mw_tot2"] / _denom).round(2)
    _opening = ct_speed_proxy["t_rd_1st_sv_in%_mw_tot2"]
    if _round_opening:
        _opening = _opening.round(2)
    ct_speed_proxy.loc[_in_draw & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_1st_sv_in%_mw"] = _opening



# Apply the same per-round weighting to the comparator mean (l60_tw_ss_IO_SOS_adj)
# that calibrates the tournament court speed proxy.
# Layout: (draw sizes sharing a bracket, {m_rd_num: weight}); the weight doubles for
# every round further from the final (final = 2).
_comparator_weights = (
    ((128, 96), {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ((64, 56, 48), {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination):
    # the three round-robin rounds all carry the same weight.
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _sizes, _rd_weights in _comparator_weights:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _weight in _rd_weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw_tot"] = ct_speed_proxy["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj"] * _weight

# Running total of the weighted comparator means over all PREVIOUS rounds of a tournament.
# shift(1) excludes the current round, so the first row of each t_id becomes NaN.
# Assumes rows within each t_id are ordered by ascending m_rd_num - TODO confirm upstream.
ct_speed_proxy.reset_index(drop=True, inplace=True)
_comparator_by_tourney = ct_speed_proxy.groupby(['t_id'])["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw_tot"]
ct_speed_proxy["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw_tot2"] = _comparator_by_tourney.transform(
    lambda rd_totals: rd_totals.rolling(window=100, min_periods=1).sum().round(2).shift(1)
)

# Turn the running totals into weighted averages by dividing by the summed weights
# of the rounds already played.
# Layout: (draw sizes, {m_rd_num: summed weight of earlier rounds}, round the round-1 copy?).
# Round 1 has no earlier rounds, so its running total is carried over undivided;
# only the 128/96 group rounds that copy.
_comparator_divisors = (
    ((128, 96), {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128}, True),
    ((64, 56, 48), {6: 124, 5: 120, 4: 112, 3: 96, 2: 64}, False),
    ((32, 28), {5: 60, 4: 56, 3: 48, 2: 32}, False),
    ((16,), {4: 28, 3: 24, 2: 16}, False),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    ((8,), {5: 28, 4: 24, 3: 16, 2: 8}, False),
)
for _sizes, _prior_weight, _round_opening in _comparator_divisors:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _denom in _prior_weight.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw"] = (ct_speed_proxy["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw_tot2"] / _denom).round(2)
    _opening = ct_speed_proxy["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw_tot2"]
    if _round_opening:
        _opening = _opening.round(2)
    ct_speed_proxy.loc[_in_draw & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw"] = _opening

# Ratio of the cumulative tournament first-serve-in % to the cumulative (previously
# adjusted) weighted mean for the players in the field.
# Rows without a ratio (first-round matches) get a neutral 1, i.e. the mean for the court type is assumed.
ct_speed_proxy["t_1st_sv_in%_ratio"] = (
    ct_speed_proxy["t_rd_1st_sv_in%_mw"]
    .div(ct_speed_proxy["t_rd_1st_sv%_l60_tw_ss_IO_SOS_adj_mw"])
    .round(2)
    .fillna(1)
)

# Same quantity seen from the returner's side (first-serve-in % yielded):
# numerically identical to the ratio above, with the same neutral fill.
ct_speed_proxy["t_1st_sv_in%_yielded_ratio"] = (
    ct_speed_proxy["t_1st_sv_in%_ratio"]
    .round(2)
    .fillna(1)
)

# Optional debug export of the per-round proxy table (disabled):
#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Attach both first-serve-in ratios to every df_player3 row of the same tournament round.
# A left join keeps all df_player3 rows, including those with no matching proxy row.
_ratio_cols = ['t_id', 'm_rd_num', 't_1st_sv_in%_ratio', 't_1st_sv_in%_yielded_ratio']
df_player3 = df_player3.merge(
    ct_speed_proxy[_ratio_cols],
    how='left',
    on=['t_id', 'm_rd_num'],
)
df_player3 = df_player3.sort_values(['p_id', 'm_date', 'm_rd_num'], ascending=False)
#df_player3.to_csv('../data/df_player3.csv', index=False)

# The helper table is no longer needed once its ratios are merged.
del ct_speed_proxy
# #del df_player2

In [227]:
# 't_sv_pts_won%_ratio'
# 't_ret_pts_won%_ratio'
# Provides ratios, to pass back to the main dataframe, of the % of points won on serve and on return, respectively, by the entire field in a tournament's prior rounds (cumulative, weighted by number of matches per previous round) relative to the correspondingly weighted comparator mean for the field (l60_tw_ss_IO_SOS_adj).
# These ratios, in turn, will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics.
# For first round matches the court speed proxy is assigned a neutral prior (assumes sample mean performance for the field given surface/indoor-outdoor status).
# Provides mean player service pts won% on a per-round basis for each tournament
#df_player3 = df_player2
# Order the core frame newest-first.
df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)

# Field means per tournament round: one row per (t_id, m_rd_num).
# Columns are selected with a list: tuple-style selection on a GroupBy
# (groupby(...)['a', 'b']) was deprecated in pandas 1.0 and raises in pandas 2.0+.
# The grouping keys are selected too so they survive the index drop below as ordinary columns.
_sv_pts_cols = ['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_sv_pts_won%', 'p_sv_pts_won%_l60_tw_ss_IO_SOS_adj']
ct_speed_proxy  = df_player3.groupby(['t_id', 'm_rd_num'])[_sv_pts_cols].mean().round(2)

# Player-level names become tournament-round names once averaged over the field.
ct_speed_proxy.rename(columns = {"p_sv_pts_won%": "t_rd_sv_pts_won%", "p_sv_pts_won%_l60_tw_ss_IO_SOS_adj": "t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"}, inplace = True)
ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# Weight each round mean by the number of matches behind it. Some walkovers and
# shortened matches were removed before the means were computed; that is not corrected
# for because their effect on per-round variance is minimal.
# Layout: (draw sizes sharing a bracket, {m_rd_num: weight}); the weight doubles for
# every round further from the final (final = 2).
_sv_pts_weights = (
    ((128, 96), {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ((64, 56, 48), {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination):
    # the three round-robin rounds all carry the same weight.
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _sizes, _rd_weights in _sv_pts_weights:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _weight in _rd_weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_sv_pts_won%_mw_tot"] = ct_speed_proxy["t_rd_sv_pts_won%"] * _weight

# Running total of the weighted round means over all PREVIOUS rounds of a tournament.
# shift(1) excludes the current round, so the first row of each t_id becomes NaN.
# Assumes rows within each t_id are ordered by ascending m_rd_num - TODO confirm.
ct_speed_proxy.reset_index(drop=True, inplace=True)
_sv_pts_by_tourney = ct_speed_proxy.groupby(['t_id'])["t_rd_sv_pts_won%_mw_tot"]
ct_speed_proxy["t_rd_sv_pts_won%_mw_tot2"] = _sv_pts_by_tourney.transform(
    lambda rd_totals: rd_totals.rolling(window=100, min_periods=1).sum().round(2).shift(1)
)

# Turn the running totals into weighted averages by dividing by the summed weights
# of the rounds already played.
# Layout: (draw sizes, {m_rd_num: summed weight of earlier rounds}, round the round-1 copy?).
# Round 1 has no earlier rounds, so its running total is carried over undivided;
# only the 128/96 group rounds that copy.
_sv_pts_divisors = (
    ((128, 96), {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128}, True),
    ((64, 56, 48), {6: 124, 5: 120, 4: 112, 3: 96, 2: 64}, False),
    ((32, 28), {5: 60, 4: 56, 3: 48, 2: 32}, False),
    ((16,), {4: 28, 3: 24, 2: 16}, False),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    ((8,), {5: 28, 4: 24, 3: 16, 2: 8}, False),
)
for _sizes, _prior_weight, _round_opening in _sv_pts_divisors:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _denom in _prior_weight.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_sv_pts_won%_mw"] = (ct_speed_proxy["t_rd_sv_pts_won%_mw_tot2"] / _denom).round(2)
    _opening = ct_speed_proxy["t_rd_sv_pts_won%_mw_tot2"]
    if _round_opening:
        _opening = _opening.round(2)
    ct_speed_proxy.loc[_in_draw & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_sv_pts_won%_mw"] = _opening


# Build the same prior-round weighted mean for the comparator metric (the field's l60_tw_ss_IO_SOS_adj
# serve-points-won mean), so the court speed proxy can compare observed vs. comparator like for like.

_csp_src_col = "t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"
_csp_tot_col = "t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"
_csp_cum_col = "t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot2"
_csp_avg_col = "t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw"

# Weight applied to each round's mean, keyed by draw-size family and then round number.
# Weights halve every round down to 2 for the last round of each draw.
# NOTE(review): draws with byes (96/56/48/28) reuse the full-bracket weights - confirm this approximation is intended.
_csp_round_weights = {
    (128, 96): {1: 128, 2: 64, 3: 32, 4: 16, 5: 8, 6: 4, 7: 2},
    (64, 56, 48): {1: 64, 2: 32, 3: 16, 4: 8, 5: 4, 6: 2},
    (32, 28): {1: 32, 2: 16, 3: 8, 4: 4, 5: 2},
    (16,): {1: 16, 2: 8, 3: 4, 4: 2},
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    (8,): {1: 8, 2: 8, 3: 8, 4: 4, 5: 2},
}

# Step 1: scale every per-round mean by its round weight (rows matching no draw/round combo stay NaN)
for _csp_draws, _csp_by_rd in _csp_round_weights.items():
    _csp_in_draw = ct_speed_proxy["t_draw_sz"].isin(list(_csp_draws))
    for _csp_rd, _csp_wt in _csp_by_rd.items():
        _csp_rows = _csp_in_draw & (ct_speed_proxy["m_rd_num"] == _csp_rd)
        ct_speed_proxy.loc[_csp_rows, _csp_tot_col] = ct_speed_proxy[_csp_src_col] * _csp_wt

# Step 2: running total over all PREVIOUS rows of the same tournament.
# The 100-row window is far wider than the (at most 7) rounds handled here, so the rolling sum acts as a
# cumulative sum; shift(1) then excludes the current round from its own total.
ct_speed_proxy.reset_index(drop=True, inplace=True)
ct_speed_proxy[_csp_cum_col] = (
    ct_speed_proxy.groupby(['t_id'])[_csp_tot_col]
    .transform(lambda rd_totals: rd_totals.rolling(window=100, min_periods=1).sum().round(2).shift(1))
)

# Step 3: divide each running total by the summed weights of the rounds that precede it
ct_speed_proxy.reset_index(drop=True, inplace=True)
for _csp_draws, _csp_by_rd in _csp_round_weights.items():
    _csp_in_draw = ct_speed_proxy["t_draw_sz"].isin(list(_csp_draws))
    _csp_prior_wt = 0
    for _csp_rd in sorted(_csp_by_rd):
        _csp_rows = _csp_in_draw & (ct_speed_proxy["m_rd_num"] == _csp_rd)
        if _csp_prior_wt == 0:
            # First round: nothing precedes it, so the running total is carried over as is
            # (it was already rounded to 2 decimals in step 2).
            ct_speed_proxy.loc[_csp_rows, _csp_avg_col] = ct_speed_proxy[_csp_cum_col]
        else:
            ct_speed_proxy.loc[_csp_rows, _csp_avg_col] = (ct_speed_proxy[_csp_cum_col] / _csp_prior_wt).round(2)
        _csp_prior_wt += _csp_by_rd[_csp_rd]

# Keep the notebook namespace free of the helper names
del _csp_src_col, _csp_tot_col, _csp_cum_col, _csp_avg_col, _csp_round_weights
del _csp_draws, _csp_by_rd, _csp_in_draw, _csp_rd, _csp_wt, _csp_rows, _csp_prior_wt

# Court speed proxy (serve): ratio of the field's prior-round weighted % serve points won in this tournament
# to the matching prior-round weighted mean of the (previously adjusted) l60 comparator metric.
_csp_observed = ct_speed_proxy['t_rd_sv_pts_won%_mw']
_csp_comparator = ct_speed_proxy['t_rd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw']
# Rows without a usable ratio (first-round matches, which have no prior rounds) get a neutral ratio of 1,
# i.e. they assume the mean for the court type.
ct_speed_proxy['t_sv_pts_won%_ratio'] = (_csp_observed / _csp_comparator).round(2).fillna(1)

# Court speed proxy (return): simply the reciprocal of the serve ratio above
ct_speed_proxy["t_ret_pts_won%_ratio"] = (1 / ct_speed_proxy["t_sv_pts_won%_ratio"]).round(2)

#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Attach both ratios to the core player dataframe (one proxy row per tournament round), then restore the
# player/date/round descending order
_csp_keys = ['t_id', 'm_rd_num']
_csp_ratios = ct_speed_proxy[['t_id', 'm_rd_num', 't_sv_pts_won%_ratio', 't_ret_pts_won%_ratio']]
df_player3 = (
    pd.merge(df_player3, _csp_ratios, how='left', on=_csp_keys)
    .sort_values(by=['p_id', 'm_date', 'm_rd_num'], ascending=False)
)
#df_player3.to_csv('../data/df_player3.csv', index=False)

# Drop the scratch frame and the helper names
del ct_speed_proxy, _csp_observed, _csp_comparator, _csp_keys, _csp_ratios

In [228]:
# 't_1st_sv_pts_won%_ratio'
# 't_1st_ret_pts_won%_ratio'
# Provides ratios, to pass back to the main dataframe, of % points won on 1st serve and on 1st-serve return, respectively, by the entire field in a tournament's prior rounds (cumulative, weighted by number of matches per previous round), relative to the field's l60_tw_ss_IO_SOS_adj comparator mean built the same way.
# These ratios, in turn, will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics.
# Court speed proxy for first round matches is assigned a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field).
# This step provides the mean player 1st-serve pts won% (and its l60 comparator) on a per-round basis for each tournament.
#df_player3 = df_player2
df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)
# Columns are selected with a list (double brackets): selecting GroupBy columns with a bare tuple was
# deprecated in pandas 1.0 and raises in pandas 2.0. The grouping keys are selected explicitly so they
# survive as ordinary columns once the group index is dropped below.
ct_speed_proxy  = df_player3.groupby(['t_id', 'm_rd_num'])[['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_1st_sv_pts_won%', 'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj']].mean().round(2)
# Player-level ("p_") names become tournament-round ("t_rd_") names now that they hold per-round field means
ct_speed_proxy.rename(columns = {"p_1st_sv_pts_won%": "t_rd_1st_sv_pts_won%", "p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj": "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"}, inplace = True)
ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# Prior-round weighted mean of the field's % 1st-serve points won.
# Each round mean is weighted to reflect the number of matches that produced it. Some walkovers and
# shortened matches were removed before the means were computed; this is left uncorrected because their
# effect on per-round variance is minimal.

_csp_src_col = "t_rd_1st_sv_pts_won%"
_csp_tot_col = "t_rd_1st_sv_pts_won%_mw_tot"
_csp_cum_col = "t_rd_1st_sv_pts_won%_mw_tot2"
_csp_avg_col = "t_rd_1st_sv_pts_won%_mw"

# Weight applied to each round's mean, keyed by draw-size family and then round number.
# Weights halve every round down to 2 for the last round of each draw.
# NOTE(review): draws with byes (96/56/48/28) reuse the full-bracket weights - confirm this approximation is intended.
_csp_round_weights = {
    (128, 96): {1: 128, 2: 64, 3: 32, 4: 16, 5: 8, 6: 4, 7: 2},
    (64, 56, 48): {1: 64, 2: 32, 3: 16, 4: 8, 5: 4, 6: 2},
    (32, 28): {1: 32, 2: 16, 3: 8, 4: 4, 5: 2},
    (16,): {1: 16, 2: 8, 3: 4, 4: 2},
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    (8,): {1: 8, 2: 8, 3: 8, 4: 4, 5: 2},
}

# Step 1: scale every per-round mean by its round weight (rows matching no draw/round combo stay NaN)
for _csp_draws, _csp_by_rd in _csp_round_weights.items():
    _csp_in_draw = ct_speed_proxy["t_draw_sz"].isin(list(_csp_draws))
    for _csp_rd, _csp_wt in _csp_by_rd.items():
        _csp_rows = _csp_in_draw & (ct_speed_proxy["m_rd_num"] == _csp_rd)
        ct_speed_proxy.loc[_csp_rows, _csp_tot_col] = ct_speed_proxy[_csp_src_col] * _csp_wt

# Step 2: running total over all PREVIOUS rows of the same tournament.
# The 100-row window is far wider than the (at most 7) rounds handled here, so the rolling sum acts as a
# cumulative sum; shift(1) then excludes the current round from its own total.
ct_speed_proxy.reset_index(drop=True, inplace=True)
ct_speed_proxy[_csp_cum_col] = (
    ct_speed_proxy.groupby(['t_id'])[_csp_tot_col]
    .transform(lambda rd_totals: rd_totals.rolling(window=100, min_periods=1).sum().round(2).shift(1))
)

# Step 3: divide each running total by the summed weights of the rounds that precede it
ct_speed_proxy.reset_index(drop=True, inplace=True)
for _csp_draws, _csp_by_rd in _csp_round_weights.items():
    _csp_in_draw = ct_speed_proxy["t_draw_sz"].isin(list(_csp_draws))
    _csp_prior_wt = 0
    for _csp_rd in sorted(_csp_by_rd):
        _csp_rows = _csp_in_draw & (ct_speed_proxy["m_rd_num"] == _csp_rd)
        if _csp_prior_wt == 0:
            # First round: nothing precedes it, so the running total is carried over as is
            # (it was already rounded to 2 decimals in step 2).
            ct_speed_proxy.loc[_csp_rows, _csp_avg_col] = ct_speed_proxy[_csp_cum_col]
        else:
            ct_speed_proxy.loc[_csp_rows, _csp_avg_col] = (ct_speed_proxy[_csp_cum_col] / _csp_prior_wt).round(2)
        _csp_prior_wt += _csp_by_rd[_csp_rd]

# Keep the notebook namespace free of the helper names
del _csp_src_col, _csp_tot_col, _csp_cum_col, _csp_avg_col, _csp_round_weights
del _csp_draws, _csp_by_rd, _csp_in_draw, _csp_rd, _csp_wt, _csp_rows, _csp_prior_wt


# We also want the exact same weighted metric for the comparator mean for creating the tournament court speed proxy calibration (l60_tw_ss_SOS_adj)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*64
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*128

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*64

# Round weighting of the l60 comparator mean (1st-serve pts won %) for the 32/28, 16 and 8 draws.
# Each per-round field mean is multiplied by a fixed weight for its (draw size, round) pair; round 1
# carries the largest weight and the weight halves with every later round of an elimination draw.
_cmp_mean_col = "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"
_cmp_tot_col = "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"
_round_weights_by_draw = (
    # (draw sizes sharing one weight schedule, {m_rd_num: weight})
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination): rounds 1-3 all get weight 8
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _draw_sizes, _weights in _round_weights_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_draw_sizes)
    for _rd, _weight in _weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, _cmp_tot_col] = ct_speed_proxy[_cmp_mean_col] * _weight

# # Running total of the weighted comparator means over the rows that PRECEDE each row within its tournament
# # (rolling sum over up to 100 rows, then shift(1) so that a round never contributes to its own total)
ct_speed_proxy.reset_index(drop = True, inplace = True)
_cmp_tot_by_tournament = ct_speed_proxy.groupby(['t_id'])["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"]
ct_speed_proxy["t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot2"] = _cmp_tot_by_tournament.transform(
    lambda rd_totals: rd_totals.rolling(window=100, min_periods = 1).sum().round(2).shift(1)
)

# # Turns the running totals into weighted averages by dividing by the summed weights of all earlier rounds
ct_speed_proxy.reset_index(drop = True, inplace = True)

_cmp_prior_col = "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot2"
_cmp_avg_col = "t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw"
_prior_weight_sum_by_draw = (
    # (draw sizes, {m_rd_num: sum of the weights of all earlier rounds}, round the round-1 copy to 2 dp?)
    ((128, 96), {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128}, True),
    ((64, 56, 48), {6: 124, 5: 120, 4: 112, 3: 96, 2: 64}, False),
    ((32, 28), {5: 60, 4: 56, 3: 48, 2: 32}, False),
    ((16,), {4: 28, 3: 24, 2: 16}, False),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    ((8,), {5: 28, 4: 24, 3: 16, 2: 8}, False),
)
for _draw_sizes, _divisors, _round_first_rd in _prior_weight_sum_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_draw_sizes)
    for _rd, _divisor in _divisors.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, _cmp_avg_col] = (ct_speed_proxy[_cmp_prior_col] / _divisor).round(2)
    # Round-1 rows are not divided: the prior-round total is copied through as-is
    _first_rd_rows = _in_draw & (ct_speed_proxy["m_rd_num"] == 1)
    _first_rd_vals = ct_speed_proxy[_cmp_prior_col]
    if _round_first_rd:
        _first_rd_vals = _first_rd_vals.round(2)
    ct_speed_proxy.loc[_first_rd_rows, _cmp_avg_col] = _first_rd_vals

# # Court speed proxy (1st serve): ratio of the cumulative tournament % 1st serve pts won to the cumulative (and previously adjusted) weighted comparator mean for players in the field
_sv_to_comparator = ct_speed_proxy['t_rd_1st_sv_pts_won%_mw'] / ct_speed_proxy['t_rd_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw']
# Missing ratios become 1: sets first round matches to a ratio of 1 (assumes mean for court type)
ct_speed_proxy['t_1st_sv_pts_won%_ratio'] = _sv_to_comparator.round(2).fillna(1)

# Return-side proxy is simply the reciprocal of the serve-side ratio computed above
_ret_ratio = 1 / ct_speed_proxy["t_1st_sv_pts_won%_ratio"]
ct_speed_proxy["t_1st_ret_pts_won%_ratio"] = _ret_ratio.round(2)

# Optional debugging export of the per-round proxy table (left disabled)
#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Attach the two 1st-serve court speed ratios to every player-match row of the same tournament round,
# then restore the player / date / round descending ordering of the core dataframe
_ratio_cols = ['t_id', 'm_rd_num', 't_1st_sv_pts_won%_ratio', 't_1st_ret_pts_won%_ratio']
df_player3 = (
    df_player3
    .merge(ct_speed_proxy[_ratio_cols], on=['t_id', 'm_rd_num'], how='left')
    .sort_values(by=['p_id', 'm_date', 'm_rd_num'], ascending=False)
)
#df_player3.to_csv('../data/df_player3.csv', index=False)

# The per-round table is rebuilt from scratch for the next metric, so release it here
del ct_speed_proxy

In [229]:
# 't_2nd_sv_pts_won%_ratio'
# 't_2nd_ret_pts_won%_ratio'
# Provides ratios, to pass back to main dataframe, of % points won on 2nd serve and return, respectively, by the entire field in a tournament's prior rounds
# (cumulative, weighted by the size of each previous round) relative to the same field's l60_tw_ss_IO_SOS_adj comparator mean, weighted in the same way.
# These ratios, in turn, will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics
# Court speed proxy for first round matches is assigned a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field).
#df_player3 = df_player2
df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)

# Mean of every selected column per (tournament, round): the actual 2nd-serve pts won % and its l60 comparator.
# Columns are selected with a LIST. Selecting groupby columns with a bare tuple (groupby(...)['a', 'b'])
# has raised a FutureWarning since pandas 1.0 and is an error from pandas 2.0.
# 't_id' and 'm_rd_num' are part of the selection on purpose: the group index is discarded by
# reset_index(drop=True) below, so these (group-constant) columns are what keeps the keys in the table.
_per_round_cols = ['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_2nd_sv_pts_won%', 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj']
ct_speed_proxy  = df_player3.groupby(['t_id', 'm_rd_num'])[_per_round_cols].mean().round(2)
# Player-level ("p_") names become tournament-round-level ("t_rd_") names
ct_speed_proxy.rename(columns = {"p_2nd_sv_pts_won%": "t_rd_2nd_sv_pts_won%", "p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj": "t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"}, inplace = True)
ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# # Weights each round mean by a fixed (draw size, round) weight that halves with every later elimination round, i.e. in proportion to the number of matches that went into generating that mean
# # (yes, some walkover and shortened matches have already been removed prior to computing the means but we don't need to correct for this because the effect on per-round variance of these matches is minimal)
_rd_mean_col = "t_rd_2nd_sv_pts_won%"
_rd_tot_col = "t_rd_2nd_sv_pts_won%_mw_tot"
_round_weights_by_draw = (
    # (draw sizes sharing one weight schedule, {m_rd_num: weight})
    ((128, 96), {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ((64, 56, 48), {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination): rounds 1-3 all get weight 8
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _draw_sizes, _weights in _round_weights_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_draw_sizes)
    for _rd, _weight in _weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, _rd_tot_col] = ct_speed_proxy[_rd_mean_col] * _weight

# # Running total of the weighted round means over the rows that PRECEDE each row within its tournament
# # (rolling sum over up to 100 rows, then shift(1) so that a round never contributes to its own total)
ct_speed_proxy.reset_index(drop = True, inplace = True)
_tot_by_tournament = ct_speed_proxy.groupby(['t_id'])["t_rd_2nd_sv_pts_won%_mw_tot"]
ct_speed_proxy["t_rd_2nd_sv_pts_won%_mw_tot2"] = _tot_by_tournament.transform(
    lambda rd_totals: rd_totals.rolling(window=100, min_periods = 1).sum().round(2).shift(1)
)

# # Turns the running totals into weighted averages by dividing by the summed weights of all earlier rounds
ct_speed_proxy.reset_index(drop = True, inplace = True)

_prior_col = "t_rd_2nd_sv_pts_won%_mw_tot2"
_avg_col = "t_rd_2nd_sv_pts_won%_mw"
_prior_weight_sum_by_draw = (
    # (draw sizes, {m_rd_num: sum of the weights of all earlier rounds}, round the round-1 copy to 2 dp?)
    ((128, 96), {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128}, True),
    ((64, 56, 48), {6: 124, 5: 120, 4: 112, 3: 96, 2: 64}, False),
    ((32, 28), {5: 60, 4: 56, 3: 48, 2: 32}, False),
    ((16,), {4: 28, 3: 24, 2: 16}, False),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
    ((8,), {5: 28, 4: 24, 3: 16, 2: 8}, False),
)
for _draw_sizes, _divisors, _round_first_rd in _prior_weight_sum_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_draw_sizes)
    for _rd, _divisor in _divisors.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, _avg_col] = (ct_speed_proxy[_prior_col] / _divisor).round(2)
    # Round-1 rows are not divided: the prior-round total is copied through as-is
    _first_rd_rows = _in_draw & (ct_speed_proxy["m_rd_num"] == 1)
    _first_rd_vals = ct_speed_proxy[_prior_col]
    if _round_first_rd:
        _first_rd_vals = _first_rd_vals.round(2)
    ct_speed_proxy.loc[_first_rd_rows, _avg_col] = _first_rd_vals


# We also want the exact same round weighting applied to the comparator mean used for the tournament court speed proxy calibration (l60_tw_ss_IO_SOS_adj)
_cmp_mean_col = "t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"
_cmp_tot_col = "t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"
_round_weights_by_draw = (
    # (draw sizes sharing one weight schedule, {m_rd_num: weight})
    ((128, 96), {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ((64, 56, 48), {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ((32, 28), {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ((16,), {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals - RR format (SF & F elimination): rounds 1-3 all get weight 8
    ((8,), {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
)
for _draw_sizes, _weights in _round_weights_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_draw_sizes)
    for _rd, _weight in _weights.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, _cmp_tot_col] = ct_speed_proxy[_cmp_mean_col] * _weight

# # Sums the weighted totals from all PREVIOUS rounds within a tournament
def _sum_of_earlier_rounds(round_totals):
    """Running sum of a tournament's weighted round totals, shifted so each row only sees earlier rows."""
    # window=100 with min_periods=1 acts as a running total for any group of up to 100 rows
    running_total = round_totals.rolling(window=100, min_periods=1).sum().round(2)
    return running_total.shift(1)

ct_speed_proxy.reset_index(drop=True, inplace=True)
_totals_by_tournament = ct_speed_proxy.groupby(['t_id'])["t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot"]
ct_speed_proxy["t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot2"] = _totals_by_tournament.transform(_sum_of_earlier_rounds)

# # Divides the weighted totals appropriately to get weighted averages
# Each divisor is the sum of the weights of all rounds before the given round.
# Round 1 has no divisor: the shifted total is copied as-is (it is already rounded to 2 decimals above).
_divisors_by_draw = [
    ([128, 96], {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128, 1: None}),
    ([64, 56, 48], {6: 124, 5: 120, 4: 112, 3: 96, 2: 64, 1: None}),
    ([32, 28], {5: 60, 4: 56, 3: 48, 2: 32, 1: None}),
    ([16], {4: 28, 3: 24, 2: 16, 1: None}),
    # Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
    ([8], {5: 28, 4: 24, 3: 16, 2: 8, 1: None}),
]
for _sizes, _divisor_by_rd in _divisors_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _divisor in _divisor_by_rd.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        _earlier_total = ct_speed_proxy["t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw_tot2"]
        if _divisor is None:
            ct_speed_proxy.loc[_rows, "t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw"] = _earlier_total
        else:
            ct_speed_proxy.loc[_rows, "t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw"] = (_earlier_total / _divisor).round(2)

# # Ratio of the field's cumulative tournament % 2nd serve pts won to the cumulative (and previously adjusted) weighted mean for players in the field
_sv_actual = ct_speed_proxy['t_rd_2nd_sv_pts_won%_mw']
_sv_expected = ct_speed_proxy['t_rd_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_mw']
_sv_ratio = _sv_actual.div(_sv_expected).round(2)
# Missing ratios (e.g. first round matches, which have no previous rounds) get a neutral ratio of 1 (assumes mean for court type)
# NOTE(review): a zero denominator would give +/-inf rather than NaN and is not caught by fillna - confirm this cannot occur
ct_speed_proxy["t_2nd_sv_pts_won%_ratio"] = _sv_ratio.fillna(1)

# Ratio for % 2nd (serve) return pts won: simply the inverse of the 2nd serve ratio above
ct_speed_proxy["t_2nd_ret_pts_won%_ratio"] = ct_speed_proxy["t_2nd_sv_pts_won%_ratio"].rdiv(1).round(2)

#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Merge the relevant ratios back into core dataframe (one ratio pair per tournament round), then restore the per-player ordering
_ratio_cols = ['t_id', 'm_rd_num', 't_2nd_sv_pts_won%_ratio', 't_2nd_ret_pts_won%_ratio']
df_player3 = (
    df_player3
    .merge(ct_speed_proxy[_ratio_cols], on=['t_id', 'm_rd_num'], how='left')
    .sort_values(by=['p_id', 'm_date', 'm_rd_num'], ascending=False)
)
#df_player3.to_csv('../data/df_player3.csv', index=False)

del ct_speed_proxy

In [230]:
# 't_ace%_ratio'
# 't_aced%_ratio'
# Provides ratios, to pass back to main dataframe, of % points resulting in an ace on both serve and return, respectively, by the entire field in a tournament's prior rounds
# (cumulative, weighted by number of matches per previous round) relative to the same field's l60 / surface / indoor-outdoor / SOS-adjusted ace % computed the same way.
# These ratios, in turn, will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics
# Court speed proxy for first round matches is assigned a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field).

#df_player3 = df_player2
df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)

# One row per tournament round holding the field mean of each selected column.
# The columns are selected with a LIST: selecting with a bare tuple (groupby(...)['a', 'b']) was deprecated in pandas 1.0 and raises in pandas >= 2.0.
# The grouping keys are selected on purpose so they survive reset_index(drop=True) below as ordinary columns.
_ace_round_cols = ['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_ace%', 'p_ace%_l60_tw_ss_IO_SOS_adj']
ct_speed_proxy = df_player3.groupby(['t_id', 'm_rd_num'])[_ace_round_cols].mean().round(2)
ct_speed_proxy.rename(columns = {"p_ace%": "t_rd_ace%", "p_ace%_l60_tw_ss_IO_SOS_adj": "t_rd_ace%_l60_tw_ss_IO_SOS_adj"}, inplace = True)
ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# # Weights each round mean to reflect the number of matches that went into generating that mean
# (yes, some walkover and shortened matches have already been removed prior to computing the means, but we don't
# need to correct for this because the effect of these matches on per-round variance is minimal)
# Table layout: (draw sizes sharing a bracket shape, {round number: weight}); the last round weighs 2.
_weights_by_draw = [
    ([128, 96], {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ([64, 56, 48], {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ([32, 28], {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ([16], {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
    ([8], {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
]
for _sizes, _weight_by_rd in _weights_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _weight in _weight_by_rd.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_ace%_mw_tot"] = ct_speed_proxy["t_rd_ace%"] * _weight

# # Sums the weighted totals from all PREVIOUS rounds within a tournament
def _sum_of_earlier_rounds(round_totals):
    """Running sum of a tournament's weighted round totals, shifted so each row only sees earlier rows."""
    # window=100 with min_periods=1 acts as a running total for any group of up to 100 rows
    running_total = round_totals.rolling(window=100, min_periods=1).sum().round(2)
    return running_total.shift(1)

ct_speed_proxy.reset_index(drop=True, inplace=True)
_totals_by_tournament = ct_speed_proxy.groupby(['t_id'])["t_rd_ace%_mw_tot"]
ct_speed_proxy["t_rd_ace%_mw_tot2"] = _totals_by_tournament.transform(_sum_of_earlier_rounds)

# # Divides the weighted totals appropriately to get weighted averages
# Each divisor is the sum of the weights of all rounds before the given round.
# Round 1 has no divisor: the shifted total is copied as-is (it is already rounded to 2 decimals above).
_divisors_by_draw = [
    ([128, 96], {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128, 1: None}),
    ([64, 56, 48], {6: 124, 5: 120, 4: 112, 3: 96, 2: 64, 1: None}),
    ([32, 28], {5: 60, 4: 56, 3: 48, 2: 32, 1: None}),
    ([16], {4: 28, 3: 24, 2: 16, 1: None}),
    # Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
    ([8], {5: 28, 4: 24, 3: 16, 2: 8, 1: None}),
]
for _sizes, _divisor_by_rd in _divisors_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _divisor in _divisor_by_rd.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        _earlier_total = ct_speed_proxy["t_rd_ace%_mw_tot2"]
        if _divisor is None:
            ct_speed_proxy.loc[_rows, "t_rd_ace%_mw"] = _earlier_total
        else:
            ct_speed_proxy.loc[_rows, "t_rd_ace%_mw"] = (_earlier_total / _divisor).round(2)


# We also want the exact same weighted metric for the comparator mean used to create the tournament court speed proxy calibration (l60_tw_ss_IO_SOS_adj)
# Table layout: (draw sizes sharing a bracket shape, {round number: weight}); the last round weighs 2.
_weights_by_draw = [
    ([128, 96], {7: 2, 6: 4, 5: 8, 4: 16, 3: 32, 2: 64, 1: 128}),
    ([64, 56, 48], {6: 2, 5: 4, 4: 8, 3: 16, 2: 32, 1: 64}),
    ([32, 28], {5: 2, 4: 4, 3: 8, 2: 16, 1: 32}),
    ([16], {4: 2, 3: 4, 2: 8, 1: 16}),
    # Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
    ([8], {5: 2, 4: 4, 3: 8, 2: 8, 1: 8}),
]
for _sizes, _weight_by_rd in _weights_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _weight in _weight_by_rd.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        ct_speed_proxy.loc[_rows, "t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw_tot"] = ct_speed_proxy["t_rd_ace%_l60_tw_ss_IO_SOS_adj"] * _weight

# # Sums the weighted totals from all PREVIOUS rounds within a tournament
def _sum_of_earlier_rounds(round_totals):
    """Running sum of a tournament's weighted round totals, shifted so each row only sees earlier rows."""
    # window=100 with min_periods=1 acts as a running total for any group of up to 100 rows
    running_total = round_totals.rolling(window=100, min_periods=1).sum().round(2)
    return running_total.shift(1)

ct_speed_proxy.reset_index(drop=True, inplace=True)
_totals_by_tournament = ct_speed_proxy.groupby(['t_id'])["t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw_tot"]
ct_speed_proxy["t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw_tot2"] = _totals_by_tournament.transform(_sum_of_earlier_rounds)

# # Divides the weighted totals appropriately to get weighted averages
# Each divisor is the sum of the weights of all rounds before the given round.
# Round 1 has no divisor: the shifted total is copied as-is (it is already rounded to 2 decimals above).
_divisors_by_draw = [
    ([128, 96], {7: 252, 6: 248, 5: 240, 4: 224, 3: 192, 2: 128, 1: None}),
    ([64, 56, 48], {6: 124, 5: 120, 4: 112, 3: 96, 2: 64, 1: None}),
    ([32, 28], {5: 60, 4: 56, 3: 48, 2: 32, 1: None}),
    ([16], {4: 28, 3: 24, 2: 16, 1: None}),
    # Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
    ([8], {5: 28, 4: 24, 3: 16, 2: 8, 1: None}),
]
for _sizes, _divisor_by_rd in _divisors_by_draw:
    _in_draw = ct_speed_proxy["t_draw_sz"].isin(_sizes)
    for _rd, _divisor in _divisor_by_rd.items():
        _rows = _in_draw & (ct_speed_proxy["m_rd_num"] == _rd)
        _earlier_total = ct_speed_proxy["t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw_tot2"]
        if _divisor is None:
            ct_speed_proxy.loc[_rows, "t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw"] = _earlier_total
        else:
            ct_speed_proxy.loc[_rows, "t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw"] = (_earlier_total / _divisor).round(2)

# Provides ratio of the cumulative tournament ace % (prior rounds) to the cumulative (and previously adjusted) weighted mean ace % for players in the field
ct_speed_proxy['t_ace%_ratio'] = (ct_speed_proxy['t_rd_ace%_mw']/ct_speed_proxy['t_rd_ace%_l60_tw_ss_IO_SOS_adj_mw']).round(2)
ct_speed_proxy["t_ace%_ratio"] = ct_speed_proxy["t_ace%_ratio"].fillna(1) # Sets first round matches to ratio of 1 (assumes mean for court type)

# Provides ratio of cumulative tournament % aced to the matched field l60_ss_IO mean across tournaments (same as % aces, but from returner perspective)
ct_speed_proxy["t_aced%_ratio"] = ct_speed_proxy["t_ace%_ratio"].round(2)

#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Merge the relevant ratios back into core dataframe
df_player3 = df_player3.merge(ct_speed_proxy[['t_id', 'm_rd_num', 't_ace%_ratio', 't_aced%_ratio']], on= ['t_id', 'm_rd_num'], how = 'left')
df_player3 = df_player3.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)
#df_player3.to_csv('../data/df_player3.csv', index=False)

del ct_speed_proxy

In [231]:
# 't_df%_ratio'
# 't_df_induce%_ratio'
# Double-fault court speed proxy. For each tournament round, the field's observed double-fault % over all
# PREVIOUS rounds of that tournament is divided by the same field's l60 SOS-adjusted double-fault % mean over
# those same rounds (both cumulative and weighted per round); the ratio is passed back to the main dataframe.
# These ratios, in turn, will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics.
# First round matches have no previous rounds, so they are assigned a neutral ratio of 1 (assume sample mean performance for surface/indoor-outdoor status for the field).

def _prior_round_field_ratio(player_df, metric_col, comparator_col):
    """Ratio of the field's observed metric to its comparator mean over a tournament's prior rounds.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Player-level rows; must contain 't_id', 'm_rd_num', 't_draw_sz', `metric_col` and `comparator_col`.
    metric_col : str
        Column holding the value observed in each match (e.g. 'p_df%').
    comparator_col : str
        Column holding the matching pre-match expectation (e.g. 'p_df%_l60_tw_ss_IO_SOS_adj').

    Returns
    -------
    pandas.DataFrame
        One row per (t_id, m_rd_num) with columns 't_id', 'm_rd_num' and 'ratio'. 'ratio' is 1 wherever it
        cannot be computed (first available round of a tournament, or a draw size / round not in the table).
    """
    # Per-round weights, keyed by the draw sizes that share a bracket layout. The values are the original
    # hard-coded weights: the nominal number of players in each round (2 per match), so draws with byes
    # (e.g. 96) reuse the weights of the next full bracket (128).
    round_weights_by_draw = {
        (128, 96): {1: 128, 2: 64, 3: 32, 4: 16, 5: 8, 6: 4, 7: 2},
        (64, 56, 48): {1: 64, 2: 32, 3: 16, 4: 8, 5: 4, 6: 2},
        (32, 28): {1: 32, 2: 16, 3: 8, 4: 4, 5: 2},
        (16,): {1: 16, 2: 8, 3: 4, 4: 2},
        # Tour Finals and Next Gen Finals - RR format (SF & F elimination)
        (8,): {1: 8, 2: 8, 3: 8, 4: 4, 5: 2},
    }

    # Field means per tournament round. The columns are selected with a LIST: indexing a GroupBy with a bare
    # tuple of names was deprecated in pandas 1.0 and raises in pandas 2.0. groupby() sorts its keys, so rows
    # come out ordered by t_id then m_rd_num ascending, which the cumulative sum below relies on.
    field = (
        player_df
        .groupby(['t_id', 'm_rd_num'])[['t_id', 't_draw_sz', 'm_rd_num', metric_col, comparator_col]]
        .mean()
        .round(2)
        .reset_index(drop=True)
    )

    # For every row: the weight of its own round, and the summed weight of all earlier rounds (the divisor).
    # Rows whose draw size / round is not in the table keep NaN and therefore end up with the neutral ratio.
    field['rd_weight'] = np.nan
    field['prior_weight'] = np.nan
    for draw_sizes, weights in round_weights_by_draw.items():
        in_family = field['t_draw_sz'].isin(list(draw_sizes))
        weight_so_far = 0
        for rd_num, weight in sorted(weights.items()):
            rows = in_family & (field['m_rd_num'] == rd_num)
            field.loc[rows, 'rd_weight'] = weight
            # Round 1 has no earlier rounds; a divisor of 1 passes its shifted total through unchanged.
            field.loc[rows, 'prior_weight'] = max(weight_so_far, 1)
            weight_so_far += weight

    weighted_means = {}
    for col in (metric_col, comparator_col):
        weighted_total = field[col] * field['rd_weight']
        # Sums the weighted totals from all PREVIOUS rounds within a tournament: running sum, then shift(1)
        # so the current round is excluded (the first row of each tournament becomes NaN).
        prior_total = weighted_total.groupby(field['t_id']).transform(
            lambda x: x.expanding(min_periods=1).sum().round(2).shift(1)
        )
        # Divides the weighted totals appropriately to get weighted averages
        weighted_means[col] = (prior_total / field['prior_weight']).round(2)

    ratio = (weighted_means[metric_col] / weighted_means[comparator_col]).round(2)
    # NOTE(review): a comparator mean of exactly 0 gives inf, which fillna(1) does not replace - confirm
    # whether that can occur for this metric.
    result = field[['t_id', 'm_rd_num']].copy()
    result['ratio'] = ratio.fillna(1)  # Sets first round matches to ratio of 1 (assumes mean for court type)
    return result


df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)

# Ratio of cumulative tournament double-fault % to the cumulative (and previously adjusted) weighted mean for players in the field
ct_speed_proxy = _prior_round_field_ratio(df_player3, 'p_df%', 'p_df%_l60_tw_ss_IO_SOS_adj')
ct_speed_proxy = ct_speed_proxy.rename(columns={'ratio': 't_df%_ratio'})

# Ratio of cumulative tournament % df induced to the matched field mean (same as % df, but from returner perspective)
ct_speed_proxy['t_df_induce%_ratio'] = ct_speed_proxy['t_df%_ratio']

# Merge the relevant ratios back into core dataframe. Ratio columns left by a previous run of this cell are
# dropped first so that re-running it does not create '_x'/'_y' duplicates.
df_player3 = df_player3.drop(columns=['t_df%_ratio', 't_df_induce%_ratio'], errors='ignore')
df_player3 = df_player3.merge(ct_speed_proxy[['t_id', 'm_rd_num', 't_df%_ratio', 't_df_induce%_ratio']], on=['t_id', 'm_rd_num'], how='left')
df_player3 = df_player3.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)

del ct_speed_proxy

In [232]:
# 't_bp_save%_ratio'
# 't_bp_conv%_ratio'
# Provides ratios, to pass back to main dataframe, of % points resulting in a break point saved or converted (conv) on serve and return, respectively, by the entire field in a tournament's prior rounds (cumulative, weighted by number of matches per previous round) relative to the same field's l60 SOS-adjusted mean for that metric
# These ratios, in turn will serve as stand-alone 'court speed proxy' predictive features, as well as calibration metrics to make one further adjustment to time-weighted and SOS-adjusted serve and return metrics
# Court speed proxy for first round matches is assigned a neutral prior (assume sample mean performance for surface/indoor-outdoor status for the field).

#df_player3 = df_player2
df_player3 = df_player3.sort_values(by=['m_yr','t_id','m_rd_num','m_date'], ascending = False)
# Field means per (tournament, round) for break points saved and the matching l60 SOS-adjusted comparator.
# The columns are selected with a LIST: indexing a GroupBy with a bare tuple of names was deprecated in
# pandas 1.0 and raises in pandas 2.0. The group keys are also selected as columns, so they survive the
# reset_index(drop=True) below; groupby() sorts its keys, so rows are ordered by t_id then m_rd_num ascending.
ct_speed_proxy = (
    df_player3
    .groupby(['t_id', 'm_rd_num'])[['t_id', 't_ident', 't_ind', 't_surf', 't_draw_sz', 'm_yr', 'm_rd_num', 'p_bp_save%', 'p_bp_save%_l60_tw_ss_IO_SOS_adj']]
    .mean()
    .round(2)
    .rename(columns={"p_bp_save%": "t_rd_bp_save%", "p_bp_save%_l60_tw_ss_IO_SOS_adj": "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"})
    .reset_index(drop=True)
)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)


# # Weights each round mean to reflect the number of matches that went into generating that mean (yes, some walkover and shortened matches have already been removed prior to computing the means but we don't need to correct for this because the effect on per-round variance of these matches is minimal)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*64
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*128

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*64

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*32

ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*2
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*4
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*16

#Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*2
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*4
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%"]*8

# # Sums the total percentages from all PREVIOUS rounds within a tournament
ct_speed_proxy.reset_index(drop = True, inplace = True)
ct_speed_proxy["t_rd_bp_save%_mw_tot2"] = ct_speed_proxy.groupby(['t_id'])["t_rd_bp_save%_mw_tot"].transform(lambda x: x.rolling(window=100, min_periods = 1).sum().round(2).shift(1))

# # Divides the weighted totals appropriately to get weighted averages
ct_speed_proxy.reset_index(drop = True, inplace = True)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/252).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/248).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/240).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/224).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/192).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/128).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]).round(2)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/124).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/120).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/112).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/96).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/64).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw"] =  ct_speed_proxy["t_rd_bp_save%_mw_tot2"]

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/60).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/56).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/48).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/32).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw"] =  ct_speed_proxy["t_rd_bp_save%_mw_tot2"]

ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/28).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/24).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/16).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw"] =  ct_speed_proxy["t_rd_bp_save%_mw_tot2"]

#Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/28).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/24).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/16).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_mw"] =  (ct_speed_proxy["t_rd_bp_save%_mw_tot2"]/8).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_mw"] =  ct_speed_proxy["t_rd_bp_save%_mw_tot2"]


# We also want the exact same weighted metric for the comparator mean for creating the tournament court speed proxy calibration (l60_tw_ss_SOS_adj)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*64
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*128

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*32
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*64

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*16
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*32

ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*16

#Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*2
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*4
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj"]*8

# # Sums the total percentages from all PREVIOUS rounds within a tournament
ct_speed_proxy.reset_index(drop = True, inplace = True)
ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"] = ct_speed_proxy.groupby(['t_id'])["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot"].transform(lambda x: x.rolling(window=100, min_periods = 1).sum().round(2).shift(1))

# # Divides the weighted totals appropriately to get weighted averages
ct_speed_proxy.reset_index(drop = True, inplace = True)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 7), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/252).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/248).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/240).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/224).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/192).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/128).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 128) | (ct_speed_proxy["t_draw_sz"] == 96)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]).round(2)

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 6), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/124).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/120).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/112).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/96).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/64).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 64) | (ct_speed_proxy["t_draw_sz"] == 56) | (ct_speed_proxy["t_draw_sz"] == 48)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]

ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/60).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/56).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/48).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/32).round(2)
ct_speed_proxy.loc[((ct_speed_proxy["t_draw_sz"] == 32) | (ct_speed_proxy["t_draw_sz"] == 28)) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]

ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/28).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/24).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/16).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 16) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]

#Tour Finals and Next Gen Finals -RR Format (SF & F elimination)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 5), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/28).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 4), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/24).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 3), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/16).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 2), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  (ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]/8).round(2)
ct_speed_proxy.loc[(ct_speed_proxy["t_draw_sz"] == 8) & (ct_speed_proxy["m_rd_num"] == 1), "t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw"] =  ct_speed_proxy["t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw_tot2"]

# # Provides ratio of cumulative tournament % serve pts won to the cumulative (and previously adjusted) weighted mean for players in the field
ct_speed_proxy['t_bp_save%_ratio'] = (ct_speed_proxy['t_rd_bp_save%_mw']/ct_speed_proxy['t_rd_bp_save%_l60_tw_ss_IO_SOS_adj_mw']).round(2)
ct_speed_proxy["t_bp_save%_ratio"] = ct_speed_proxy["t_bp_save%_ratio"].fillna(1) # Sets first round matches to ratio of 1 (assumes mean for court type)

# Provides ratio of cumulative tournament % df induce to the matched field l60_ss_IO mean across tournaments (same as % df, but from returner perspective)
ct_speed_proxy["t_bp_conv%_ratio"] = ct_speed_proxy["t_bp_save%_ratio"].round(2)

# Provides ratio of cumulative tournament % break points CONVERTED (inverse of break points saved ratio) to the matched field l60_ss_IO mean across tournaments
ct_speed_proxy["t_bp_conv%_ratio"] = (1/ct_speed_proxy["t_bp_save%_ratio"]).round(2)

#ct_speed_proxy.reset_index(drop = True, inplace = True)
#ct_speed_proxy = ct_speed_proxy.sort_values(by=['t_ident','m_yr','t_id','m_rd_num'], ascending = False)
#ct_speed_proxy.to_csv('../data/ct_speed_proxy.csv', index=False)

# Merge the relevant ratios back into core dataframe
df_player3 = df_player3.merge(ct_speed_proxy[['t_id', 'm_rd_num', 't_bp_save%_ratio', 't_bp_conv%_ratio']], on= ['t_id', 'm_rd_num'], how = 'left')
df_player3 = df_player3.sort_values(by=['p_id','m_date','m_rd_num'], ascending = False)
#df_player3.to_csv('../data/df_player3.csv', index=False)

del ct_speed_proxy

Now we will create "court speed proxy-adjusted" versions of the already decay time-weighted and strength of schedule-adjusted predictive features related to serving and returning on a per-player, per-match basis (for both long-term performance [last 60 matches] and short-term performance [last 10 matches]). The following CSP-adjusted feature variants are computed on a surface-specific basis (clay or hard), and separately for versions that do and do not respect an indoor/outdoor (IO) separation: 
% first serves in, % first serves "induced", % service pts won (overall, 1st and 2nd separately), % return pts won (overall, 1st and 2nd separately), % aces, % aced, % double faults, % double faults "induced", % break points saved, % break points converted.

In [233]:
# 'p_1st_sv%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), FIRST SERVE IN performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-42)*19
df_player3["ct_speed_proxy_18"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-43)*18
df_player3["ct_speed_proxy_17"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-44)*17
df_player3["ct_speed_proxy_16"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-45)*16
df_player3["ct_speed_proxy_15"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-46)*15
df_player3["ct_speed_proxy_14"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-47)*14
df_player3["ct_speed_proxy_13"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-48)*13
df_player3["ct_speed_proxy_12"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-49)*12
df_player3["ct_speed_proxy_11"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-50)*11
df_player3["ct_speed_proxy_10"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-51)*10
df_player3["ct_speed_proxy_9"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-52)*9
df_player3["ct_speed_proxy_8"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-53)*8
df_player3["ct_speed_proxy_7"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-54)*7
df_player3["ct_speed_proxy_6"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-55)*6
df_player3["ct_speed_proxy_5"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-56)*5
df_player3["ct_speed_proxy_4"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-57)*4
df_player3["ct_speed_proxy_3"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-58)*3
df_player3["ct_speed_proxy_2"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-59)*2
df_player3["ct_speed_proxy_1"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_ratio'].shift(-60)*1

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [234]:
# 'p_1st_sv%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), FIRST SERVE IN performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# NOTE: this cell does not create the weighted "ct_speed_proxy_60" ... "ct_speed_proxy_51" columns; it expects them to already be present in df_player3 and drops them at the end, so it cannot be re-run on its own.

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Column "ct_speed_proxy_<w>" holds a court speed proxy ratio already multiplied by its weight w (w = 60 ... 51 for the 10-match window)
csp_weight_by_col = {"ct_speed_proxy_" + str(w): w for w in range(60, 50, -1)}
csp_cols = list(csp_weight_by_col)
csp_weighted = df_player3[csp_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_weighted_sum = csp_weighted.sum(axis=1)

# Denominator of the weighted mean = total weight of the entries that are actually present.
# This replaces triangular_number(60) - triangular_number(60 - count), which equals the same value only when the
# missing entries are the lowest-weighted (oldest) ones; with a NaN inside the window it over-states the denominator.
csp_weight_total = csp_weighted.notna().astype(int).mul(pd.Series(csp_weight_by_col), axis="columns").sum(axis=1)
csp_l10 = (csp_weighted_sum/csp_weight_total).round(2) # 0/0 -> NaN when no prior match is available
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv%_l10_tw_ss_SOS_adj"]*csp_l10).round(2)

# The weighted helper columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(csp_cols, axis=1)

In [235]:
# 'p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS), FIRST SERVE IN performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort puts the most recent match first within each player/surface/IO group, so shift(-k) below reads the k-th match before the current row
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Column "ct_speed_proxy_<w>" = ratio of the (61 - w)-th prior match multiplied by weight w (most recent prior match -> 60, 60th prior match -> 1)
csp_weight_by_col = {"ct_speed_proxy_" + str(w): w for w in range(60, 0, -1)}
csp_cols = list(csp_weight_by_col)
csp_ratio_grouped = df_player3.groupby(['p_id','t_surf','t_ind'])['t_1st_sv_in%_ratio']
for csp_col, csp_weight in csp_weight_by_col.items():
    df_player3[csp_col] = csp_ratio_grouped.shift(-(61 - csp_weight))*csp_weight

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_weighted = df_player3[csp_cols]
csp_weighted_sum = csp_weighted.sum(axis=1)

# Denominator of the weighted mean = total weight of the entries that are actually present.
# This replaces triangular_number(60) - triangular_number(60 - count), which equals the same value only when the
# missing entries are the lowest-weighted (oldest) ones; with a NaN inside the window it over-states the denominator.
csp_weight_total = csp_weighted.notna().astype(int).mul(pd.Series(csp_weight_by_col), axis="columns").sum(axis=1)
csp_l60 = (csp_weighted_sum/csp_weight_total).round(2) # 0/0 -> NaN when no prior match is available
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv%_l60_tw_ss_IO_SOS_adj"]*csp_l60).round(2)

# Keep "ct_speed_proxy_60" ... "ct_speed_proxy_51" in df_player3 (the 10-match version of this feature reads them); drop weights 50 ... 1
df_player3 = df_player3.drop([csp_col for csp_col, csp_weight in csp_weight_by_col.items() if csp_weight <= 50], axis=1)

In [236]:
# 'p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS), FIRST SERVE IN performance of PLAYER over the 10 matches PRIOR TO the match being predicted
# NOTE: this cell does not create the weighted "ct_speed_proxy_60" ... "ct_speed_proxy_51" columns; it expects them to already be present in df_player3 and drops them at the end, so it cannot be re-run on its own.

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Column "ct_speed_proxy_<w>" holds a court speed proxy ratio already multiplied by its weight w (w = 60 ... 51 for the 10-match window)
csp_weight_by_col = {"ct_speed_proxy_" + str(w): w for w in range(60, 50, -1)}
csp_cols = list(csp_weight_by_col)
csp_weighted = df_player3[csp_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_weighted_sum = csp_weighted.sum(axis=1)

# Denominator of the weighted mean = total weight of the entries that are actually present.
# This replaces triangular_number(60) - triangular_number(60 - count), which equals the same value only when the
# missing entries are the lowest-weighted (oldest) ones; with a NaN inside the window it over-states the denominator.
csp_weight_total = csp_weighted.notna().astype(int).mul(pd.Series(csp_weight_by_col), axis="columns").sum(axis=1)
csp_l10 = (csp_weighted_sum/csp_weight_total).round(2) # 0/0 -> NaN when no prior match is available
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv%_l10_tw_ss_IO_SOS_adj"]*csp_l10).round(2)

# The weighted helper columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(csp_cols, axis=1)

In [237]:
# 'p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), FIRST SERVE IN YIELDED (as returner) performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-42)*19
df_player3["ct_speed_proxy_18"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-43)*18
df_player3["ct_speed_proxy_17"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-44)*17
df_player3["ct_speed_proxy_16"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-45)*16
df_player3["ct_speed_proxy_15"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-46)*15
df_player3["ct_speed_proxy_14"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-47)*14
df_player3["ct_speed_proxy_13"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-48)*13
df_player3["ct_speed_proxy_12"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-49)*12
df_player3["ct_speed_proxy_11"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-50)*11
df_player3["ct_speed_proxy_10"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-51)*10
df_player3["ct_speed_proxy_9"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-52)*9
df_player3["ct_speed_proxy_8"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-53)*8
df_player3["ct_speed_proxy_7"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-54)*7
df_player3["ct_speed_proxy_6"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-55)*6
df_player3["ct_speed_proxy_5"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-56)*5
df_player3["ct_speed_proxy_4"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-57)*4
df_player3["ct_speed_proxy_3"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-58)*3
df_player3["ct_speed_proxy_2"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-59)*2
df_player3["ct_speed_proxy_1"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_in%_yielded_ratio'].shift(-60)*1

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv%_yielded_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [238]:
# 'p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), FIRST SERVE IN YIELDED (as returner) performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# NOTE: this cell does not build its own lag columns. It expects ct_speed_proxy_60 ... ct_speed_proxy_51 to still be
# present in df_player3 from the preceding l60 cell (that cell's final drop leaves them in place). They are assumed to
# hold the surface-specific ratio at lags 1..10 already multiplied by the weights 60..51 -- confirm against that cell.
df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Weight carried by each of the 10 most recent lag columns (column suffix == weight)
csp_l10_weights = pd.Series({f"ct_speed_proxy_{weight}": weight for weight in range(60, 50, -1)})
csp_l10_cols = list(csp_l10_weights.index)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l10_cols].sum(axis=1)

# Denominator = sum of the weights of the lags that actually contributed to the weighted sum.
# This equals triangular_number(60) - triangular_number(60 - n_present) whenever the only missing lags are the oldest
# ones (fewer than 10 prior matches on the surface), and it also stays correct if a ratio is missing inside the window.
# With no prior matches both numerator and denominator are 0, so the result is NaN.
df_player3["denom"] = df_player3[csp_l10_cols].notna().mul(csp_l10_weights, axis=1).sum(axis=1)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv%_yielded_l10_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the 10 lag columns inherited from the l60 cell
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l10"] + csp_l10_cols, axis=1)

In [239]:
# 'p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS), FIRST SERVE IN YIELDED (as returner) performance of PLAYER over the 60 matches PRIOR TO the match being predicted

# Descending sort: within each (player, surface, indoor/outdoor) group the rows run newest -> oldest,
# so a negative shift pulls values from chronologically EARLIER matches.
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Column ct_speed_proxy_<w> holds the ratio from the (61 - w)-th most recent prior match multiplied by weight w,
# i.e. the most recent prior match gets weight 60 and the 60th most recent gets weight 1.
csp_io_ratio_grouped = df_player3.groupby(['p_id','t_surf','t_ind'])['t_1st_sv_in%_yielded_ratio']
for weight in range(60, 0, -1):
    df_player3[f"ct_speed_proxy_{weight}"] = csp_io_ratio_grouped.shift(-(61 - weight))*weight

# Weight carried by each lag column (column suffix == weight), in the same 60 -> 1 order the columns were created
csp_l60_weights = pd.Series({f"ct_speed_proxy_{weight}": weight for weight in range(60, 0, -1)})
csp_l60_cols = list(csp_l60_weights.index)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l60_cols].sum(axis=1)

# Denominator = sum of the weights of the lags that actually contributed to the weighted sum.
# This equals triangular_number(60) - triangular_number(60 - n_present) whenever the only missing lags are the oldest
# ones (fewer than 60 prior matches in the group), and it also stays correct if a ratio is missing inside the window.
# With no prior matches both numerator and denominator are 0, so the result is NaN.
df_player3["denom"] = df_player3[csp_l60_cols].notna().mul(csp_l60_weights, axis=1).sum(axis=1)
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

# Drop the scratch columns EXCEPT ct_speed_proxy_60 ... ct_speed_proxy_51: the following l10 cell reuses those ten
# columns and drops them itself.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l60"] + [f"ct_speed_proxy_{weight}" for weight in range(50, 0, -1)], axis=1)

In [240]:
# 'p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS), FIRST SERVE IN YIELDED (as returner) performance of PLAYER over the 10 matches PRIOR TO the match being predicted

# NOTE: this cell does not build its own lag columns. It expects ct_speed_proxy_60 ... ct_speed_proxy_51 to still be
# present in df_player3 from the preceding IO-specific l60 cell, which creates them as the group-wise ratio at
# lags 1..10 multiplied by the weights 60..51 and leaves them in place when it drops its other scratch columns.
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Weight carried by each of the 10 most recent lag columns (column suffix == weight)
csp_io_l10_weights = pd.Series({f"ct_speed_proxy_{weight}": weight for weight in range(60, 50, -1)})
csp_io_l10_cols = list(csp_io_l10_weights.index)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[csp_io_l10_cols].sum(axis=1)

# Denominator = sum of the weights of the lags that actually contributed to the weighted sum.
# This equals triangular_number(60) - triangular_number(60 - n_present) whenever the only missing lags are the oldest
# ones (fewer than 10 prior matches in the group), and it also stays correct if a ratio is missing inside the window.
# With no prior matches both numerator and denominator are 0, so the result is NaN.
df_player3["denom"] = df_player3[csp_io_l10_cols].notna().mul(csp_io_l10_weights, axis=1).sum(axis=1)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the 10 lag columns inherited from the l60 cell
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l10"] + csp_io_l10_cols, axis=1)

In [241]:
# 'p_sv_pts_won%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-42)*19
df_player3["ct_speed_proxy_18"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-43)*18
df_player3["ct_speed_proxy_17"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-44)*17
df_player3["ct_speed_proxy_16"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-45)*16
df_player3["ct_speed_proxy_15"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-46)*15
df_player3["ct_speed_proxy_14"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-47)*14
df_player3["ct_speed_proxy_13"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-48)*13
df_player3["ct_speed_proxy_12"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-49)*12
df_player3["ct_speed_proxy_11"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-50)*11
df_player3["ct_speed_proxy_10"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-51)*10
df_player3["ct_speed_proxy_9"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-52)*9
df_player3["ct_speed_proxy_8"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-53)*8
df_player3["ct_speed_proxy_7"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-54)*7
df_player3["ct_speed_proxy_6"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-55)*6
df_player3["ct_speed_proxy_5"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-56)*5
df_player3["ct_speed_proxy_4"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-57)*4
df_player3["ct_speed_proxy_3"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-58)*3
df_player3["ct_speed_proxy_2"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-59)*2
df_player3["ct_speed_proxy_1"] = df_player3.groupby(['p_id','t_surf'])['t_sv_pts_won%_ratio'].shift(-60)*1

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_sv_pts_won%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [242]:
# 'p_sv_pts_won%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten most recent weighted court speed proxy columns (weights 60..51) are still present in df_player3:
# the preceding l60 cell only drops "ct_speed_proxy_50" .. "ct_speed_proxy_1".
csp_weights = list(range(60, 50, -1))
csp_cols = [f"ct_speed_proxy_{w}" for w in csp_weights]
csp_values = df_player3[csp_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = csp_values.sum(axis=1)

# Denominator = sum of the weights of the lags that actually hold a value.
# When only the oldest lags are missing (fewer than 10 prior matches on the surface) this equals
# triangular_number(60) - triangular_number(60 - count); unlike that formula it also stays correct when a
# value is missing in the middle of the window, because a skipped lag no longer contributes its weight.
# Rows with no prior values get 0/0 = NaN, as before.
csp_denom = csp_values.notna().astype(int).dot(pd.Series(csp_weights, index=csp_cols))
csp_l10 = (csp_ws/csp_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_sv_pts_won%_l10_tw_ss_SOS_adj"]*csp_l10).round(2)

# The weighted proxy columns are no longer needed once the feature exists.
df_player3 = df_player3.drop(columns=csp_cols)

In [243]:
# 'p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
# Weight 60 is applied to the most recent prior match (lag 1), decreasing by one per lag down to weight 1 for lag 60.
csp_weights = list(range(60, 0, -1))
csp_cols = [f"ct_speed_proxy_{w}" for w in csp_weights]

# Rows are sorted newest-first, so within each (player, surface, indoor/outdoor) group shift(-lag)
# pulls the tournament ratio from the lag-th row below, i.e. the lag-th earlier match in that group.
csp_by_group = df_player3.groupby(['p_id','t_surf','t_ind'])['t_sv_pts_won%_ratio']
for lag, (weight, col) in enumerate(zip(csp_weights, csp_cols), start=1):
    df_player3[col] = csp_by_group.shift(-lag)*weight

csp_values = df_player3[csp_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = csp_values.sum(axis=1)

# Denominator = sum of the weights of the lags that actually hold a value.
# When only the oldest lags are missing (fewer than 60 prior matches in the group) this equals
# triangular_number(60) - triangular_number(60 - count); unlike that formula it also stays correct when a
# value is missing in the middle of the window, because a skipped lag no longer contributes its weight.
# Rows with no prior values get 0/0 = NaN, as before.
csp_denom = csp_values.notna().astype(int).dot(pd.Series(csp_weights, index=csp_cols))
csp_l60 = (csp_ws/csp_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*csp_l60).round(2)

# Keep "ct_speed_proxy_60" .. "ct_speed_proxy_51": the following l10 cell reuses them. Drop only weights 50..1.
df_player3 = df_player3.drop(columns=csp_cols[10:])

In [244]:
# 'p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten most recent weighted, IO-specific court speed proxy columns (weights 60..51) are still present in
# df_player3: the preceding l60 IO cell only drops "ct_speed_proxy_50" .. "ct_speed_proxy_1".
csp_weights = list(range(60, 50, -1))
csp_cols = [f"ct_speed_proxy_{w}" for w in csp_weights]
csp_values = df_player3[csp_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = csp_values.sum(axis=1)

# Denominator = sum of the weights of the lags that actually hold a value.
# When only the oldest lags are missing (fewer than 10 prior matches in the group) this equals
# triangular_number(60) - triangular_number(60 - count); unlike that formula it also stays correct when a
# value is missing in the middle of the window, because a skipped lag no longer contributes its weight.
# Rows with no prior values get 0/0 = NaN, as before.
csp_denom = csp_values.notna().astype(int).dot(pd.Series(csp_weights, index=csp_cols))
csp_l10 = (csp_ws/csp_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]*csp_l10).round(2)

# The weighted proxy columns are no longer needed once the feature exists.
df_player3 = df_player3.drop(columns=csp_cols)

In [245]:
# 'p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 1st SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio'].shift(-42)*19
# Remaining weighted lag columns for this feature: weight w (18..1) is paired with the row
# (61 - w) positions further down each (player, surface) group, i.e. shift(w - 61).
# NOTE(review): the sort that makes "further down" mean "earlier match" sits above this span -
# presumably descending by date/round as in the neighbouring cells; confirm.
csp_by_player_surf = df_player3.groupby(['p_id','t_surf'])['t_1st_sv_pts_won%_ratio']
for csp_weight in range(18, 0, -1):
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_by_player_surf.shift(csp_weight - 61)*csp_weight

# Weight attached to each lag column (ct_speed_proxy_60 -> 60, ..., ct_speed_proxy_1 -> 1).
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 0, -1)})
csp_lags = df_player3[list(csp_weights.index)]

# sum(axis=1) skips NaN rather than interpolating. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the early ramp-up years are not
# modeled, so few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
csp_ws = csp_lags.sum(axis=1)

# Denominator = sum of the weights that actually contributed to csp_ws. A count-based shortcut
# (triangular_number(60) - triangular_number(60 - count)) is only right when the non-null lags are
# exactly the most recent ones; a NaN ratio inside the window would then be divided by a weight
# that was never summed. With no gaps both forms give the same value. Rows with no prior data
# evaluate 0/0 and stay NaN.
csp_denom = csp_lags.notna().astype(int).mul(csp_weights, axis=1).sum(axis=1)
csp_l60 = (csp_ws/csp_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"]*csp_l60).round(2)

# Drop lag columns 50..1 only: ct_speed_proxy_60..51 are deliberately left in place because the
# following l10 cell reads them.
df_player3 = df_player3.drop([f"ct_speed_proxy_{w}" for w in range(50, 0, -1)],axis=1)

In [246]:
# 'p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 1st SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# This cell does not build the lag columns: ct_speed_proxy_60..51 must already be present in
# df_player3 (the preceding l60 cell's drop leaves exactly these ten in place).
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_lags = df_player3[list(csp_weights.index)]

# sum(axis=1) skips NaN rather than interpolating. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the early ramp-up years are not
# modeled, so few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
csp_ws = csp_lags.sum(axis=1)

# Denominator = sum of the weights that actually contributed to csp_ws. A count-based shortcut
# (triangular_number(60) - triangular_number(60 - count)) is only right when the non-null lags are
# exactly the most recent ones; a NaN ratio inside the window would then be divided by a weight
# that was never summed. With no gaps both forms give the same value (555 for a full 10-match
# window). Rows with no prior data evaluate 0/0 and stay NaN.
csp_denom = csp_lags.notna().astype(int).mul(csp_weights, axis=1).sum(axis=1)
csp_l10 = (csp_ws/csp_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"]*csp_l10).round(2)

# The ten lag columns are no longer needed once the l10 feature exists.
df_player3 = df_player3.drop(list(csp_weights.index),axis=1)

In [247]:
# 'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 1st SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are sorted newest-first within each (player, surface, t_ind) group, so shift(-k) pulls the
# ratio from the k-th row below, i.e. the k-th earlier match. Lag k gets weight 61 - k, so the
# most recent prior match is weighted 60 and the 60th prior match is weighted 1.
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 0, -1)})
csp_by_player_surf_io = df_player3.groupby(['p_id','t_surf','t_ind'])['t_1st_sv_pts_won%_ratio']
for csp_col, csp_weight in csp_weights.items():
    df_player3[csp_col] = csp_by_player_surf_io.shift(csp_weight - 61)*csp_weight

csp_lags = df_player3[list(csp_weights.index)]

# sum(axis=1) skips NaN rather than interpolating. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the early ramp-up years are not
# modeled, so few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
csp_ws = csp_lags.sum(axis=1)

# Denominator = sum of the weights that actually contributed to csp_ws. A count-based shortcut
# (triangular_number(60) - triangular_number(60 - count)) is only right when the non-null lags are
# exactly the most recent ones; a NaN ratio inside the window would then be divided by a weight
# that was never summed. With no gaps both forms give the same value. Rows with no prior data
# evaluate 0/0 and stay NaN.
csp_denom = csp_lags.notna().astype(int).mul(csp_weights, axis=1).sum(axis=1)
csp_l60 = (csp_ws/csp_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]*csp_l60).round(2)

# Drop lag columns 50..1 only: ct_speed_proxy_60..51 are deliberately left in place because the
# following l10 cell reads them.
df_player3 = df_player3.drop([f"ct_speed_proxy_{w}" for w in range(50, 0, -1)],axis=1)

In [248]:
# 'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 1st SERVE POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# This cell does not build the lag columns: ct_speed_proxy_60..51 must already be present in
# df_player3 (the preceding l60 cell's drop leaves exactly these ten in place).
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_lags = df_player3[list(csp_weights.index)]

# sum(axis=1) skips NaN rather than interpolating. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the early ramp-up years are not
# modeled, so few modeled matches will lack this feature.
# (ws = weighted sum; tw = time-weighted)
csp_ws = csp_lags.sum(axis=1)

# Denominator = sum of the weights that actually contributed to csp_ws. A count-based shortcut
# (triangular_number(60) - triangular_number(60 - count)) is only right when the non-null lags are
# exactly the most recent ones; a NaN ratio inside the window would then be divided by a weight
# that was never summed. With no gaps both forms give the same value (555 for a full 10-match
# window). Rows with no prior data evaluate 0/0 and stay NaN.
csp_denom = csp_lags.notna().astype(int).mul(csp_weights, axis=1).sum(axis=1)
csp_l10 = (csp_ws/csp_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]*csp_l10).round(2)

# The ten lag columns are no longer needed once the l10 feature exists.
df_player3 = df_player3.drop(list(csp_weights.index),axis=1)

In [249]:
# 'p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 2nd SERVE POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are sorted newest-first within each (player, surface) group, so shift(-k) pulls the ratio
# from the k-th row below, i.e. the k-th earlier match. Lag k gets weight 61 - k, so the most
# recent prior match is weighted 60 and the 60th prior match is weighted 1.
csp_by_player_surf = df_player3.groupby(['p_id','t_surf'])['t_2nd_sv_pts_won%_ratio']
for csp_weight in range(60, 0, -1):
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_by_player_surf.shift(csp_weight - 61)*csp_weight

# Names of the 60 weighted lag columns, ordered ct_speed_proxy_60 ... ct_speed_proxy_1.
csp_lag_cols = [f"ct_speed_proxy_{w}" for w in range(60, 0, -1)]

# sum(axis=1) skips NaN rather than interpolating. In the modeling stage, matches between players
# with few prior matches on a given surface are filtered out, and the early ramp-up years are not
# modeled, so few modeled matches will lack this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_lag_cols].sum(axis=1)
# Number of non-null lag columns per row.
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_lag_cols].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [250]:
# 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) 2nd SERVE POINTS WON performance over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending=False)

# The l10 window reuses the ten highest-weighted proxy columns (ct_speed_proxy_60 ... ct_speed_proxy_51); they must already be present on df_player3 when this cell runs
csp_cols = [f"ct_speed_proxy_{csp_weight}" for csp_weight in range(60, 50, -1)]

# Row-wise sum skips NaN, so lags without a prior match contribute nothing and no interpolation is needed; the row-wise count records how many of the ten lags were populated.
# Author's rationale: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few modeled matches lack this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_cols].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_cols].count(axis=1)

# Denominator = sum of the n largest weights (60 + 59 + ... + (61 - n)), which corrects for rows with fewer than 10 prior matches on the surface
# NOTE(review): this is exact only when the populated lags are the n most recent ones; confirm the ratio column is never NaN for an interior lag
csp_n_obs = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_n_obs)
df_player3["ct_speed_proxy_l10"] = df_player3["ct_speed_proxy_ws"].div(df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the tw/SOS-adjusted feature by the weighted court speed proxy ratio computed over the same 10-match interval
df_player3["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = df_player3["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"].mul(df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the ten proxy columns this cell consumed
csp_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + csp_cols
df_player3 = df_player3.drop(columns=csp_scratch)

In [251]:
# 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS) 2nd SERVE POINTS WON performance over the 60 matches PRIOR TO the match being predicted

# Descending sort, so shift(-k) inside a (player, surface, t_ind) group reads the row k positions later, i.e. the k-th earlier match in this ordering
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending=False)

# Build the weighted tournament court speed proxy columns for the maximum interval (60 matches): lag k is scaled by weight 61 - k, so the match immediately prior carries weight 60 and the 60th prior match carries weight 1
csp_by_surface_io = df_player3.groupby(['p_id','t_surf','t_ind'])['t_2nd_sv_pts_won%_ratio']
for csp_lag in range(1, 61):
    csp_weight = 61 - csp_lag
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_by_surface_io.shift(-csp_lag)*csp_weight

# All sixty weighted proxy columns, highest weight first (ct_speed_proxy_60 ... ct_speed_proxy_1)
csp_cols = [f"ct_speed_proxy_{csp_weight}" for csp_weight in range(60, 0, -1)]

# Row-wise sum skips NaN, so lags without a prior match contribute nothing and no interpolation is needed; the row-wise count records how many lags were populated.
# Author's rationale: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few modeled matches lack this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_cols].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_cols].count(axis=1)

# Denominator = sum of the n largest weights (60 + 59 + ... + (61 - n)), which corrects for rows with fewer than 60 prior matches in the group
# NOTE(review): this is exact only when the populated lags are the n most recent ones; confirm the ratio column is never NaN for an interior lag
csp_n_obs = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_n_obs)
df_player3["ct_speed_proxy_l60"] = df_player3["ct_speed_proxy_ws"].div(df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the tw/SOS-adjusted feature by the weighted court speed proxy ratio computed over the same 60-match interval
df_player3["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = df_player3["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"].mul(df_player3["ct_speed_proxy_l60"]).round(2)

# Remove scratch columns; ct_speed_proxy_60 ... ct_speed_proxy_51 are deliberately kept because the l10 cell that follows reads them
csp_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"]
csp_scratch += [f"ct_speed_proxy_{csp_weight}" for csp_weight in range(50, 0, -1)]
df_player3 = df_player3.drop(columns=csp_scratch)

In [252]:
# 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS) 2nd SERVE POINTS WON performance over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending=False)

# The l10 window reuses the ten highest-weighted proxy columns (ct_speed_proxy_60 ... ct_speed_proxy_51); they must already be present on df_player3 when this cell runs
csp_cols = [f"ct_speed_proxy_{csp_weight}" for csp_weight in range(60, 50, -1)]

# Row-wise sum skips NaN, so lags without a prior match contribute nothing and no interpolation is needed; the row-wise count records how many of the ten lags were populated.
# Author's rationale: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few modeled matches lack this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_cols].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_cols].count(axis=1)

# Denominator = sum of the n largest weights (60 + 59 + ... + (61 - n)), which corrects for rows with fewer than 10 prior matches in the group
# NOTE(review): this is exact only when the populated lags are the n most recent ones; confirm the ratio column is never NaN for an interior lag
csp_n_obs = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_n_obs)
df_player3["ct_speed_proxy_l10"] = df_player3["ct_speed_proxy_ws"].div(df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the tw/SOS-adjusted feature by the weighted court speed proxy ratio computed over the same 10-match interval
df_player3["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = df_player3["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].mul(df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the ten proxy columns this cell consumed
csp_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + csp_cols
df_player3 = df_player3.drop(columns=csp_scratch)

In [253]:
# 'p_ret_pts_won%_l60_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) RETURN POINTS WON performance over the 60 matches PRIOR TO the match being predicted

# Descending sort, so shift(-k) inside a (player, surface) group reads the row k positions later, i.e. the k-th earlier match in this ordering
df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending=False)

# Build the weighted tournament court speed proxy columns for the maximum interval (60 matches): lag k is scaled by weight 61 - k, so the match immediately prior carries weight 60 and the 60th prior match carries weight 1
csp_by_surface = df_player3.groupby(['p_id','t_surf'])['t_ret_pts_won%_ratio']
for csp_lag in range(1, 61):
    csp_weight = 61 - csp_lag
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_by_surface.shift(-csp_lag)*csp_weight

# All sixty weighted proxy columns, highest weight first (ct_speed_proxy_60 ... ct_speed_proxy_1)
csp_cols = [f"ct_speed_proxy_{csp_weight}" for csp_weight in range(60, 0, -1)]

# Row-wise sum skips NaN, so lags without a prior match contribute nothing and no interpolation is needed; the row-wise count records how many lags were populated.
# Author's rationale: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the multi-year ramp-up period is not modeled, so very few modeled matches lack this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_cols].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_cols].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_ret_pts_won%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [254]:
# 'p_ret_pts_won%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten highest-weight lag columns (weights 60..51) are reused from the preceding l60 cell, which drops only weights 50..1.
csp_l10_cols = [f"ct_speed_proxy_{w}" for w in range(60, 50, -1)]  # ct_speed_proxy_60 ... ct_speed_proxy_51
csp_l10_block = df_player3[csp_l10_cols]

# Row-wise sum()/count() skip NaN lags, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which retrospective stats accrue but whose matches are not modeled).
df_player3["ct_speed_proxy_ws"] = csp_l10_block.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = csp_l10_block.count(axis=1)

# Denominator = sum of the `count` largest weights starting at 60 (all ten present -> 60 + ... + 51 = 555); corrects for players with fewer than 10 prior matches on the surface
csp_lags_missing = 60 - df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(csp_lags_missing)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the tw/SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_ret_pts_won%_l10_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the ten lag columns that are no longer needed
csp_l10_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"]
df_player3 = df_player3.drop(columns=csp_l10_scratch + csp_l10_cols)

In [255]:
# 'p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted.
# The frame is sorted descending by m_date/m_rd_num, so within a (p_id, t_surf, t_ind) group shift(-k) reads the ratio k rows further down (the k-th preceding row in that ordering).
# That lag gets weight 61 - k: 60 for the closest prior match, 1 for the 60th one back.
csp_weights = range(60, 0, -1)
csp_l60_cols = [f"ct_speed_proxy_{w}" for w in csp_weights]  # ct_speed_proxy_60 ... ct_speed_proxy_1
# Build the grouped ratio series once instead of re-grouping the frame for each of the 60 lags
csp_ratio_by_group = df_player3.groupby(['p_id','t_surf','t_ind'])['t_ret_pts_won%_ratio']
for csp_wt, csp_col in zip(csp_weights, csp_l60_cols):
    df_player3[csp_col] = csp_ratio_by_group.shift(csp_wt - 61)*csp_wt

# Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_l60_block = df_player3[csp_l60_cols]
df_player3["ct_speed_proxy_ws"] = csp_l60_block.sum(axis=1)       # weighted sum over the lags that exist
df_player3["ct_speed_proxy_ws_ct"] = csp_l60_block.count(axis=1)  # number of non-NaN lags

# Denominator = sum of the `count` largest weights (60 + 59 + ...); needed to correct for players with fewer than 60 prior matches in the same surface / indoor-outdoor group.
# A row with no prior match in its group gives 0/0, i.e. NaN for the feature.
# NOTE(review): this count-based denominator matches the weights actually summed only if missing lags are always the oldest ones; confirm that 't_ret_pts_won%_ratio' is never NaN in the middle of a player's history.
df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"]))
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

# Drop the scratch columns and weights 50..1 only; ct_speed_proxy_60..51 are kept because the IO-specific l10 cell that follows reads them
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"] + csp_l60_cols[10:], axis=1)

In [256]:
# 'p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten highest-weight lag columns (weights 60..51) are reused from the preceding IO-specific l60 cell, which drops only weights 50..1.
csp_l10_cols = [f"ct_speed_proxy_{w}" for w in range(60, 50, -1)]  # ct_speed_proxy_60 ... ct_speed_proxy_51
csp_l10_block = df_player3[csp_l10_cols]

# Row-wise sum()/count() skip NaN lags, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which retrospective stats accrue but whose matches are not modeled).
df_player3["ct_speed_proxy_ws"] = csp_l10_block.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = csp_l10_block.count(axis=1)

# Denominator = sum of the `count` largest weights starting at 60 (all ten present -> 60 + ... + 51 = 555); corrects for players with fewer than 10 prior matches available
csp_lags_missing = 60 - df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(csp_lags_missing)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the tw/SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every scratch column, including the ten lag columns that are no longer needed
csp_l10_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"]
df_player3 = df_player3.drop(columns=csp_l10_scratch + csp_l10_cols)

In [257]:
# 'p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 1st RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Mean, time-weighted, surface-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted.
# Weight w (60 down to 1) multiplies the 1st-return ratio found (61 - w) rows later in the same player/surface group, i.e. shift(w - 61).
for csp_wt in range(60, 0, -1):
    df_player3[f"ct_speed_proxy_{csp_wt}"] = df_player3.groupby(['p_id','t_surf'])['t_1st_ret_pts_won%_ratio'].shift(csp_wt - 61)*csp_wt

# Row-wise sum()/count() skip NaN lags, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase in which retrospective stats accrue but whose matches are not modeled).
csp_l60_cols = [f"ct_speed_proxy_{w}" for w in range(60, 0, -1)]  # ct_speed_proxy_60 ... ct_speed_proxy_1
csp_l60_block = df_player3[csp_l60_cols]
df_player3["ct_speed_proxy_ws"] = csp_l60_block.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = csp_l60_block.count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [258]:
# 'p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 1st RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted
#
# Inputs : the weighted lag columns ct_speed_proxy_60 ... ct_speed_proxy_51 left in df_player3 by the preceding l60 cell
#          (each is a prior match's tournament ratio already multiplied by its weight), and 'p_1st_ret_pts_won%_l10_tw_ss_SOS_adj'.
# Output : new column 'p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj'; the ten lag columns are removed afterwards.

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Column name -> weight for the ten most recent prior matches (60 = most recent ... 51 = tenth most recent)
csp_l10_weight_by_col = {f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)}
csp_l10_cols = list(csp_l10_weight_by_col)
csp_l10_lags = df_player3[csp_l10_cols]

# Weighted sum (ws); DataFrame.sum skips NaN, so missing lags simply contribute nothing (no interpolation).
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out,
# so the feature is rarely absent where it is actually used.
csp_l10_ws = csp_l10_lags.sum(axis=1)

# Denominator = sum of the weights of the lags that are actually present. This handles both fewer than 10 prior
# matches (gaps at the oldest end) and a missing ratio anywhere else in the window. When the only gaps are at the
# oldest end it equals triangular_number(60) - triangular_number(60 - <number of lags present>).
csp_l10_denom = csp_l10_lags.notna().mul(pd.Series(csp_l10_weight_by_col), axis=1).sum(axis=1)

# Time-weighted (tw) mean court speed proxy ratio; 0/0 (no prior matches at all) yields NaN
ct_speed_proxy_l10 = (csp_l10_ws / csp_l10_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"]*ct_speed_proxy_l10).round(2)

# The lag columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(csp_l10_cols, axis=1)

In [259]:
# 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 1st RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted
#
# Inputs : 't_1st_ret_pts_won%_ratio' and 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj'.
# Output : new column 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'. The weighted lag columns ct_speed_proxy_60 ... ct_speed_proxy_51
#          are intentionally left in df_player3 because the following l10 cell reuses them; ct_speed_proxy_50 ... 1 are removed.

# Descending sort: within a (p_id, t_surf, t_ind) group, shift(-k) then reads the row k positions further down, i.e. the k-th prior match
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Column "ct_speed_proxy_<w>" = ratio of the (61 - w)-th prior match multiplied by its weight w (60 = most recent ... 1 = oldest)
csp_l60_weight_by_col = {f"ct_speed_proxy_{w}": w for w in range(60, 0, -1)}
csp_l60_cols = list(csp_l60_weight_by_col)
csp_l60_grouped_ratio = df_player3.groupby(['p_id','t_surf','t_ind'])['t_1st_ret_pts_won%_ratio']  # grouped once, reused for every lag
for csp_col, csp_weight in csp_l60_weight_by_col.items():
    df_player3[csp_col] = csp_l60_grouped_ratio.shift(-(61 - csp_weight))*csp_weight

csp_l60_lags = df_player3[csp_l60_cols]

# Weighted sum (ws); DataFrame.sum skips NaN, so missing lags simply contribute nothing (no interpolation).
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out,
# so the feature is rarely absent where it is actually used (there is also a multi-year ramp-up phase where
# retrospective stats accrue but whose matches are not used for modeling).
csp_l60_ws = csp_l60_lags.sum(axis=1)

# Denominator = sum of the weights of the lags that are actually present. This handles both fewer than 60 prior
# matches (gaps at the oldest end) and a missing ratio anywhere else in the window. When the only gaps are at the
# oldest end it equals triangular_number(60) - triangular_number(60 - <number of lags present>).
csp_l60_denom = csp_l60_lags.notna().mul(pd.Series(csp_l60_weight_by_col), axis=1).sum(axis=1)

# Time-weighted (tw) mean court speed proxy ratio; 0/0 (no prior matches at all) yields NaN
ct_speed_proxy_l60 = (csp_l60_ws / csp_l60_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]*ct_speed_proxy_l60).round(2)

# Keep the ten most recent lags (ct_speed_proxy_60 ... 51) for the l10 cell that follows; drop ct_speed_proxy_50 ... 1
df_player3 = df_player3.drop(csp_l60_cols[10:], axis=1)

In [260]:
# 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 1st RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted
#
# Inputs : the weighted lag columns ct_speed_proxy_60 ... ct_speed_proxy_51 left in df_player3 by the preceding IO-specific l60 cell
#          (each is a prior match's tournament ratio already multiplied by its weight), and 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj'.
# Output : new column 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'; the ten lag columns are removed afterwards.

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Column name -> weight for the ten most recent prior matches (60 = most recent ... 51 = tenth most recent)
csp_io_l10_weight_by_col = {f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)}
csp_io_l10_cols = list(csp_io_l10_weight_by_col)
csp_io_l10_lags = df_player3[csp_io_l10_cols]

# Weighted sum (ws); DataFrame.sum skips NaN, so missing lags simply contribute nothing (no interpolation).
# In the modeling stage, matches between players with few prior matches on a given surface are filtered out,
# so the feature is rarely absent where it is actually used.
csp_io_l10_ws = csp_io_l10_lags.sum(axis=1)

# Denominator = sum of the weights of the lags that are actually present. This handles both fewer than 10 prior
# matches (gaps at the oldest end) and a missing ratio anywhere else in the window. When the only gaps are at the
# oldest end it equals triangular_number(60) - triangular_number(60 - <number of lags present>).
csp_io_l10_denom = csp_io_l10_lags.notna().mul(pd.Series(csp_io_l10_weight_by_col), axis=1).sum(axis=1)

# Time-weighted (tw) mean court speed proxy ratio; 0/0 (no prior matches at all) yields NaN
ct_speed_proxy_l10 = (csp_io_l10_ws / csp_io_l10_denom).round(2)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]*ct_speed_proxy_l10).round(2)

# The lag columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(csp_io_l10_cols, axis=1)

In [261]:
# 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 2nd RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-42)*19
df_player3["ct_speed_proxy_18"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-43)*18
df_player3["ct_speed_proxy_17"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-44)*17
df_player3["ct_speed_proxy_16"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-45)*16
df_player3["ct_speed_proxy_15"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-46)*15
df_player3["ct_speed_proxy_14"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-47)*14
df_player3["ct_speed_proxy_13"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-48)*13
df_player3["ct_speed_proxy_12"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-49)*12
df_player3["ct_speed_proxy_11"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-50)*11
df_player3["ct_speed_proxy_10"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-51)*10
df_player3["ct_speed_proxy_9"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-52)*9
df_player3["ct_speed_proxy_8"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-53)*8
df_player3["ct_speed_proxy_7"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-54)*7
df_player3["ct_speed_proxy_6"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-55)*6
df_player3["ct_speed_proxy_5"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-56)*5
df_player3["ct_speed_proxy_4"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-57)*4
df_player3["ct_speed_proxy_3"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-58)*3
df_player3["ct_speed_proxy_2"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-59)*2
df_player3["ct_speed_proxy_1"] = df_player3.groupby(['p_id','t_surf'])['t_2nd_ret_pts_won%_ratio'].shift(-60)*1

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [262]:
# 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), 2nd RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# NOTE: this cell does not build the weighted lag columns itself. It reuses ct_speed_proxy_60 ... ct_speed_proxy_51
# (the ten most recent lags, carrying weights 60 ... 51), which must still be present on df_player3 when it runs.
csp_weights_l10 = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_recent_l10 = df_player3[list(csp_weights_l10.index)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = csp_recent_l10.sum(axis=1)

# Normalise by the sum of the weights of the lags that are actually present. When only the oldest lags are missing
# (fewer than 10 prior matches) this equals triangular_number(60) - triangular_number(60 - count); unlike that
# closed form it also stays correct if a lag in the middle of the window is NaN.
df_player3["denom"] = csp_recent_l10.notna().astype("int64").mul(csp_weights_l10, axis=1).sum(axis=1)
# With no usable lag at all both the weighted sum and the denominator are 0, so the ratio is NaN (feature absent).
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every helper column, including the ten lag columns consumed above.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l10"] + list(csp_weights_l10.index), axis=1)

In [263]:
# 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 2nd RETURN POINTS WON performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
# Column ct_speed_proxy_<w> holds the tournament ratio from (61 - w) rows further down the descending sort within the same
# (player, surface, indoor/outdoor) group, multiplied by its weight w: the nearest lag gets weight 60, the farthest weight 1.
csp_max_lag = 60
csp_weights_l60 = pd.Series({f"ct_speed_proxy_{w}": w for w in range(csp_max_lag, 0, -1)})
csp_cols_l60 = list(csp_weights_l60.index)

# The grouping is identical for every lag, so build it once instead of once per column.
csp_ratio_grouped = df_player3.groupby(['p_id','t_surf','t_ind'])['t_2nd_ret_pts_won%_ratio']
for csp_col, csp_weight in csp_weights_l60.items():
    df_player3[csp_col] = csp_ratio_grouped.shift(-(csp_max_lag + 1 - csp_weight)) * csp_weight

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_lags_l60 = df_player3[csp_cols_l60]
df_player3["ct_speed_proxy_ws"] = csp_lags_l60.sum(axis=1)

# Normalise by the sum of the weights of the lags that are actually present. When only the oldest lags are missing
# (fewer than 60 prior matches) this equals triangular_number(60) - triangular_number(60 - count); unlike that
# closed form it also stays correct if a lag in the middle of the window is NaN.
df_player3["denom"] = csp_lags_l60.notna().astype("int64").mul(csp_weights_l60, axis=1).sum(axis=1)
# With no usable lag at all both the weighted sum and the denominator are 0, so the ratio is NaN (feature absent).
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

# Drop the helpers and lags 50 ... 1 only: ct_speed_proxy_60 ... ct_speed_proxy_51 are intentionally kept
# because the following l10 cell reads them.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l60"] + csp_cols_l60[10:], axis=1)

In [264]:
# 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), 2nd RETURN POINTS WON performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# NOTE: this cell does not build the weighted lag columns itself. It reuses ct_speed_proxy_60 ... ct_speed_proxy_51
# (the ten most recent lags, carrying weights 60 ... 51), which must still be present on df_player3 when it runs.
csp_weights_l10 = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_recent_l10 = df_player3[list(csp_weights_l10.index)]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = csp_recent_l10.sum(axis=1)

# Normalise by the sum of the weights of the lags that are actually present. When only the oldest lags are missing
# (fewer than 10 prior matches) this equals triangular_number(60) - triangular_number(60 - count); unlike that
# closed form it also stays correct if a lag in the middle of the window is NaN.
df_player3["denom"] = csp_recent_l10.notna().astype("int64").mul(csp_weights_l10, axis=1).sum(axis=1)
# With no usable lag at all both the weighted sum and the denominator are 0, so the ratio is NaN (feature absent).
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove every helper column, including the ten lag columns consumed above.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_l10"] + list(csp_weights_l10.index), axis=1)

In [265]:
# 'p_ace%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), ACE performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
df_player3["ct_speed_proxy_60"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-1)*60
df_player3["ct_speed_proxy_59"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-2)*59
df_player3["ct_speed_proxy_58"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-3)*58
df_player3["ct_speed_proxy_57"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-4)*57
df_player3["ct_speed_proxy_56"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-5)*56
df_player3["ct_speed_proxy_55"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-6)*55
df_player3["ct_speed_proxy_54"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-7)*54
df_player3["ct_speed_proxy_53"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-8)*53
df_player3["ct_speed_proxy_52"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-9)*52
df_player3["ct_speed_proxy_51"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-10)*51
df_player3["ct_speed_proxy_50"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-11)*50
df_player3["ct_speed_proxy_49"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-12)*49
df_player3["ct_speed_proxy_48"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-13)*48
df_player3["ct_speed_proxy_47"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-14)*47
df_player3["ct_speed_proxy_46"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-15)*46
df_player3["ct_speed_proxy_45"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-16)*45
df_player3["ct_speed_proxy_44"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-17)*44
df_player3["ct_speed_proxy_43"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-18)*43
df_player3["ct_speed_proxy_42"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-19)*42
df_player3["ct_speed_proxy_41"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-20)*41
df_player3["ct_speed_proxy_40"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-21)*40
df_player3["ct_speed_proxy_39"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-22)*39
df_player3["ct_speed_proxy_38"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-23)*38
df_player3["ct_speed_proxy_37"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-24)*37
df_player3["ct_speed_proxy_36"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-25)*36
df_player3["ct_speed_proxy_35"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-26)*35
df_player3["ct_speed_proxy_34"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-27)*34
df_player3["ct_speed_proxy_33"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-28)*33
df_player3["ct_speed_proxy_32"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-29)*32
df_player3["ct_speed_proxy_31"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-30)*31
df_player3["ct_speed_proxy_30"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-31)*30
df_player3["ct_speed_proxy_29"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-32)*29
df_player3["ct_speed_proxy_28"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-33)*28
df_player3["ct_speed_proxy_27"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-34)*27
df_player3["ct_speed_proxy_26"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-35)*26
df_player3["ct_speed_proxy_25"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-36)*25
df_player3["ct_speed_proxy_24"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-37)*24
df_player3["ct_speed_proxy_23"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-38)*23
df_player3["ct_speed_proxy_22"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-39)*22
df_player3["ct_speed_proxy_21"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-40)*21
df_player3["ct_speed_proxy_20"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-41)*20
df_player3["ct_speed_proxy_19"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-42)*19
df_player3["ct_speed_proxy_18"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-43)*18
df_player3["ct_speed_proxy_17"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-44)*17
df_player3["ct_speed_proxy_16"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-45)*16
df_player3["ct_speed_proxy_15"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-46)*15
df_player3["ct_speed_proxy_14"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-47)*14
df_player3["ct_speed_proxy_13"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-48)*13
df_player3["ct_speed_proxy_12"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-49)*12
df_player3["ct_speed_proxy_11"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-50)*11
df_player3["ct_speed_proxy_10"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-51)*10
df_player3["ct_speed_proxy_9"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-52)*9
df_player3["ct_speed_proxy_8"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-53)*8
df_player3["ct_speed_proxy_7"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-54)*7
df_player3["ct_speed_proxy_6"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-55)*6
df_player3["ct_speed_proxy_5"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-56)*5
df_player3["ct_speed_proxy_4"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-57)*4
df_player3["ct_speed_proxy_3"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-58)*3
df_player3["ct_speed_proxy_2"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-59)*2
df_player3["ct_speed_proxy_1"] = df_player3.groupby(['p_id','t_surf'])['t_ace%_ratio'].shift(-60)*1

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
df_player3["ct_speed_proxy_ws"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = df_player3[["ct_speed_proxy_60", "ct_speed_proxy_59", "ct_speed_proxy_58", "ct_speed_proxy_57", "ct_speed_proxy_56", "ct_speed_proxy_55", "ct_speed_proxy_54", "ct_speed_proxy_53", "ct_speed_proxy_52", "ct_speed_proxy_51", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"]].count(axis=1)

df_player3["denom"] = triangular_number(60) - (triangular_number(60 - df_player3["ct_speed_proxy_ws_ct"])) #needed to track and correct for when the number of prior matches on a given surface prior to a given match is fewer than 60
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2) #see note on prior line
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ace%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_ace%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60", "ct_speed_proxy_50", "ct_speed_proxy_49", "ct_speed_proxy_48", "ct_speed_proxy_47", "ct_speed_proxy_46", "ct_speed_proxy_45", "ct_speed_proxy_44", "ct_speed_proxy_43", "ct_speed_proxy_42", "ct_speed_proxy_41", "ct_speed_proxy_40", "ct_speed_proxy_39", "ct_speed_proxy_38", "ct_speed_proxy_37", "ct_speed_proxy_36", "ct_speed_proxy_35", "ct_speed_proxy_34", "ct_speed_proxy_33", "ct_speed_proxy_32", "ct_speed_proxy_31", "ct_speed_proxy_30", "ct_speed_proxy_29", "ct_speed_proxy_28", "ct_speed_proxy_27", "ct_speed_proxy_26", "ct_speed_proxy_25", "ct_speed_proxy_24", "ct_speed_proxy_23", "ct_speed_proxy_22", "ct_speed_proxy_21", "ct_speed_proxy_20", "ct_speed_proxy_19", "ct_speed_proxy_18", "ct_speed_proxy_17", "ct_speed_proxy_16", "ct_speed_proxy_15", "ct_speed_proxy_14", "ct_speed_proxy_13", "ct_speed_proxy_12", "ct_speed_proxy_11", "ct_speed_proxy_10", "ct_speed_proxy_9", "ct_speed_proxy_8", "ct_speed_proxy_7", "ct_speed_proxy_6", "ct_speed_proxy_5", "ct_speed_proxy_4", "ct_speed_proxy_3", "ct_speed_proxy_2", "ct_speed_proxy_1"],axis=1)

In [266]:
# 'p_ace%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) ACE performance over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten highest-weighted lag columns (weights 60..51) are not created here: they are expected to still be on the frame,
# because the preceding 60-match cell drops only the weight 50..1 columns.
csp_l10_cols = [f"ct_speed_proxy_{w}" for w in range(60, 50, -1)]
csp_l10_lagged = df_player3[csp_l10_cols]

# sum() skips NaN, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given
# surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase where
# retrospective stats accrue but whose matches are not used for modeling).
csp_l10_ws = csp_l10_lagged.sum(axis=1)       # ws = weighted sum
csp_l10_ct = csp_l10_lagged.count(axis=1)     # number of non-null lags (0..10)

# Denominator = sum of the `csp_l10_ct` largest weights (60, 59, ...); corrects for rows with fewer than 10 prior matches on the surface.
# NOTE: this assumes the non-null lags are the most recent ones (NaN only where history runs out).
csp_l10_denom = triangular_number(60) - triangular_number(60 - csp_l10_ct)
csp_l10_ratio = (csp_l10_ws / csp_l10_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ace%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_ace%_l10_tw_ss_SOS_adj"] * csp_l10_ratio).round(2)

# The remaining lag columns are no longer needed once the 10-match feature exists
df_player3 = df_player3.drop(columns=csp_l10_cols)

In [267]:
# 'p_ace%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), ACE performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are in descending m_date / m_rd_num order within each (p_id, t_surf, t_ind) group, so shift(-k) reads the ratio k rows further
# down the group, i.e. from the k-th earlier match. Lag 1 gets weight 60, lag 60 gets weight 1; the weight doubles as the column suffix.
csp_ratio_grouped = df_player3.groupby(['p_id','t_surf','t_ind'])['t_ace%_ratio']  # grouping built once and reused for every lag
for csp_lag in range(1, 61):
    csp_weight = 61 - csp_lag
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_ratio_grouped.shift(-csp_lag) * csp_weight

csp_l60_cols = [f"ct_speed_proxy_{w}" for w in range(60, 0, -1)]
csp_l60_lagged = df_player3[csp_l60_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_l60_ws = csp_l60_lagged.sum(axis=1)       # ws = weighted sum
csp_l60_ct = csp_l60_lagged.count(axis=1)     # number of non-null lags (0..60)

# Denominator = sum of the `csp_l60_ct` largest weights (60, 59, ...); needed to correct for rows with fewer than 60 prior matches in the group.
# NOTE: this assumes the non-null lags are the most recent ones (NaN only where history runs out); a NaN t_ace%_ratio inside the window
# would make this denominator larger than the sum of the weights actually used.
csp_l60_denom = triangular_number(60) - triangular_number(60 - csp_l60_ct)
csp_l60_ratio = (csp_l60_ws / csp_l60_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ace%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_ace%_l60_tw_ss_IO_SOS_adj"] * csp_l60_ratio).round(2)

# Drop only the weight 50..1 columns: the weight 60..51 columns are deliberately left on the frame for the 10-match version in the next cell
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{w}" for w in range(50, 0, -1)])

In [268]:
# 'p_ace%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS) ACE performance over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten highest-weighted lag columns (weights 60..51) are not created here: they are expected to still be on the frame,
# because the preceding 60-match IO cell drops only the weight 50..1 columns.
csp_io_l10_cols = [f"ct_speed_proxy_{w}" for w in range(60, 50, -1)]
csp_io_l10_lagged = df_player3[csp_io_l10_cols]

# sum() skips NaN, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given
# surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase where
# retrospective stats accrue but whose matches are not used for modeling).
csp_io_l10_ws = csp_io_l10_lagged.sum(axis=1)       # ws = weighted sum
csp_io_l10_ct = csp_io_l10_lagged.count(axis=1)     # number of non-null lags (0..10)

# Denominator = sum of the `csp_io_l10_ct` largest weights (60, 59, ...); corrects for rows with fewer than 10 prior matches in the group.
# NOTE: this assumes the non-null lags are the most recent ones (NaN only where history runs out).
csp_io_l10_denom = triangular_number(60) - triangular_number(60 - csp_io_l10_ct)
csp_io_l10_ratio = (csp_io_l10_ws / csp_io_l10_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_ace%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_ace%_l10_tw_ss_IO_SOS_adj"] * csp_io_l10_ratio).round(2)

# The remaining lag columns are no longer needed once the 10-match feature exists
df_player3 = df_player3.drop(columns=csp_io_l10_cols)

In [269]:
# 'p_aced%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), ACED performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are in descending m_date / m_rd_num order within each (p_id, t_surf) group, so shift(-k) reads the ratio k rows further
# down the group, i.e. from the k-th earlier match. Lag 1 gets weight 60, lag 60 gets weight 1; the weight doubles as the column suffix.
csp_aced_ratio_grouped = df_player3.groupby(['p_id','t_surf'])['t_aced%_ratio']  # grouping built once and reused for every lag
for csp_lag in range(1, 61):
    csp_weight = 61 - csp_lag
    df_player3[f"ct_speed_proxy_{csp_weight}"] = csp_aced_ratio_grouped.shift(-csp_lag) * csp_weight

csp_aced_l60_cols = [f"ct_speed_proxy_{w}" for w in range(60, 0, -1)]
csp_aced_l60_lagged = df_player3[csp_aced_l60_cols]

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_aced_l60_ws = csp_aced_l60_lagged.sum(axis=1)       # ws = weighted sum
csp_aced_l60_ct = csp_aced_l60_lagged.count(axis=1)     # number of non-null lags (0..60)

# Denominator = sum of the `csp_aced_l60_ct` largest weights (60, 59, ...); needed to correct for rows with fewer than 60 prior matches on the surface.
# NOTE: this assumes the non-null lags are the most recent ones (NaN only where history runs out); a NaN t_aced%_ratio inside the window
# would make this denominator larger than the sum of the weights actually used.
csp_aced_l60_denom = triangular_number(60) - triangular_number(60 - csp_aced_l60_ct)
csp_aced_l60_ratio = (csp_aced_l60_ws / csp_aced_l60_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_aced%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_aced%_l60_tw_ss_SOS_adj"] * csp_aced_l60_ratio).round(2)

# Drop only the weight 50..1 columns: the weight 60..51 columns are deliberately left on the frame for the 10-match version in the next cell
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{w}" for w in range(50, 0, -1)])

In [270]:
# 'p_aced%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the PLAYER's mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) ACED performance over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten highest-weighted lag columns (weights 60..51) are not created here: they are expected to still be on the frame,
# because the preceding 60-match ACED cell drops only the weight 50..1 columns.
csp_aced_l10_cols = [f"ct_speed_proxy_{w}" for w in range(60, 50, -1)]
csp_aced_l10_lagged = df_player3[csp_aced_l10_cols]

# sum() skips NaN, so no interpolation is needed. In the modeling stage, matches between players with few prior matches on a given
# surface are filtered out, so very few modeled matches will lack this feature (there is also a multi-year ramp-up phase where
# retrospective stats accrue but whose matches are not used for modeling).
csp_aced_l10_ws = csp_aced_l10_lagged.sum(axis=1)       # ws = weighted sum
csp_aced_l10_ct = csp_aced_l10_lagged.count(axis=1)     # number of non-null lags (0..10)

# Denominator = sum of the `csp_aced_l10_ct` largest weights (60, 59, ...); corrects for rows with fewer than 10 prior matches on the surface.
# NOTE: this assumes the non-null lags are the most recent ones (NaN only where history runs out).
csp_aced_l10_denom = triangular_number(60) - triangular_number(60 - csp_aced_l10_ct)
csp_aced_l10_ratio = (csp_aced_l10_ws / csp_aced_l10_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Scale the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_aced%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_aced%_l10_tw_ss_SOS_adj"] * csp_aced_l10_ratio).round(2)

# The remaining lag columns are no longer needed once the 10-match feature exists
df_player3 = df_player3.drop(columns=csp_aced_l10_cols)

In [271]:
# 'p_aced%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), ACED performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

# Descending sort, so that within each player/surface/IO group shift(-k) reaches the k-th row further down,
# i.e. the k-th match PRIOR TO the current one
df_player3 = df_player3.sort_values(by=['p_id','t_surf', 't_ind', 'm_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
# The k-th most recent prior match gets weight 61 - k: 60 for the latest one down to 1 for the 60th.
csp_weights = pd.Series({f"ct_speed_proxy_{61 - lag}": 61 - lag for lag in range(1, 61)})
csp_cols = list(csp_weights.index)

# The grouping depends only on the key columns, so it is built once and reused for every lag
ratio_by_group = df_player3.groupby(['p_id','t_surf','t_ind'])['t_aced%_ratio']
for lag in range(1, 61):
    df_player3[f"ct_speed_proxy_{61 - lag}"] = ratio_by_group.shift(-lag)*(61 - lag)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)

# Normalise by the total weight of the entries that are actually present (needed when fewer than 60 prior matches exist).
# triangular_number(60) - triangular_number(60 - count) gives this same total only when the missing entries are the
# oldest ones in the window; summing the weights of the non-null columns is also correct when a ratio is missing mid-window.
csp_weight_total = df_player3[csp_cols].notna().mul(csp_weights, axis=1).sum(axis=1)
csp_l60 = (csp_ws/csp_weight_total).round(2)  # 0/0 -> NaN when there is no usable prior match
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_aced%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_aced%_l60_tw_ss_IO_SOS_adj"]*csp_l60).round(2)

# Drop only ct_speed_proxy_50..1; ct_speed_proxy_60..51 stay because the l10 cell that follows reads them and drops them itself
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{w}" for w in range(50, 0, -1)])

In [272]:
# 'p_aced%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), ACED performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

# Descending sort within player/surface/IO; the row order itself does not affect the row-wise arithmetic below
df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten most recent prior matches are ct_speed_proxy_60..51, left in place by the preceding l60 IO cell,
# where each column was built as (t_aced%_ratio of one prior match) * (the weight in its name).
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_cols = list(csp_weights.index)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)

# Normalise by the total weight of the entries that are actually present (needed when fewer than 10 prior matches exist).
# triangular_number(60) - triangular_number(60 - count) gives this same total only when the missing entries are the
# oldest ones in the window; summing the weights of the non-null columns is also correct when a ratio is missing mid-window.
csp_weight_total = df_player3[csp_cols].notna().mul(csp_weights, axis=1).sum(axis=1)
csp_l10 = (csp_ws/csp_weight_total).round(2)  # 0/0 -> NaN when there is no usable prior match
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_aced%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_aced%_l10_tw_ss_IO_SOS_adj"]*csp_l10).round(2)

# The weighted lag columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(columns=csp_cols)

In [273]:
# 'p_df%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), double fault performance of PLAYER over the 60 matches PRIOR TO the match being predicted 

# Descending sort, so that within each player/surface group shift(-k) reaches the k-th row further down,
# i.e. the k-th match PRIOR TO the current one
df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted 
# The k-th most recent prior match gets weight 61 - k: 60 for the latest one down to 1 for the 60th.
csp_weights = pd.Series({f"ct_speed_proxy_{61 - lag}": 61 - lag for lag in range(1, 61)})
csp_cols = list(csp_weights.index)

# The grouping depends only on the key columns, so it is built once and reused for every lag
ratio_by_group = df_player3.groupby(['p_id','t_surf'])['t_df%_ratio']
for lag in range(1, 61):
    df_player3[f"ct_speed_proxy_{61 - lag}"] = ratio_by_group.shift(-lag)*(61 - lag)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)

# Normalise by the total weight of the entries that are actually present (needed when fewer than 60 prior matches exist).
# triangular_number(60) - triangular_number(60 - count) gives this same total only when the missing entries are the
# oldest ones in the window; summing the weights of the non-null columns is also correct when a ratio is missing mid-window.
csp_weight_total = df_player3[csp_cols].notna().mul(csp_weights, axis=1).sum(axis=1)
csp_l60 = (csp_ws/csp_weight_total).round(2)  # 0/0 -> NaN when there is no usable prior match
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_df%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_df%_l60_tw_ss_SOS_adj"]*csp_l60).round(2)

# Drop only ct_speed_proxy_50..1; ct_speed_proxy_60..51 stay because the l10 cell that follows reads them and drops them itself
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{w}" for w in range(50, 0, -1)])

In [274]:
# 'p_df%_l10_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), double fault performance of PLAYER over the 10 matches PRIOR TO the match being predicted 

# Descending sort within player/surface; the row order itself does not affect the row-wise arithmetic below
df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten most recent prior matches are ct_speed_proxy_60..51, left in place by the preceding l60 cell,
# where each column was built as (t_df%_ratio of one prior match) * (the weight in its name).
csp_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_cols = list(csp_weights.index)

#Using sum function allows ignoring NaN instead of interpolation. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out. So there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)

# Normalise by the total weight of the entries that are actually present (needed when fewer than 10 prior matches exist).
# triangular_number(60) - triangular_number(60 - count) gives this same total only when the missing entries are the
# oldest ones in the window; summing the weights of the non-null columns is also correct when a ratio is missing mid-window.
csp_weight_total = df_player3[csp_cols].notna().mul(csp_weights, axis=1).sum(axis=1)
csp_l10 = (csp_ws/csp_weight_total).round(2)  # 0/0 -> NaN when there is no usable prior match
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_df%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_df%_l10_tw_ss_SOS_adj"]*csp_l10).round(2)

# The weighted lag columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(columns=csp_cols)

In [275]:
# 'p_df%_l60_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS) double fault performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# Window length and linear weights: the k-th prior match gets weight (61 - k), i.e. 60 for the most recent prior match down to 1 for the 60th.
_csp_window = 60
_csp_weights = range(_csp_window, 0, -1)
_csp_cols = [f"ct_speed_proxy_{_wt}" for _wt in _csp_weights]

# First obtain the weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted.
# Rows are sorted descending, so shift(-k) within a (player, surface, IO) group pulls the ratio from the k-th earlier match.
# The grouped selection is built once: adding columns below does not change the grouping keys or the ratio column.
_csp_ratio_by_group = df_player3.groupby(['p_id','t_surf','t_ind'])['t_df%_ratio']
for _csp_w in _csp_weights:
    df_player3[f"ct_speed_proxy_{_csp_w}"] = _csp_ratio_by_group.shift(-(_csp_window + 1 - _csp_w))*_csp_w

# Row-wise sum skips NaN, so a missing lag simply contributes nothing (no interpolation); count records how many lags were actually present.
# Author's note: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so this feature should rarely be absent there.
_csp_lags = df_player3[_csp_cols]
df_player3["ct_speed_proxy_ws"] = _csp_lags.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = _csp_lags.count(axis=1)
#(ws = weighted sum; tw = time-weighted)

# T(60) - T(60 - count) is the sum of the `count` largest weights (60, 59, ...), which keeps the quotient a weighted mean when fewer than 60 prior matches on the surface are available.
# NOTE(review): this is exact only if the non-null lags are the most recent ones; a gap in the middle of the window would overstate the denominator - confirm 't_df%_ratio' has no such gaps.
df_player3["denom"] = triangular_number(_csp_window) - (triangular_number(_csp_window - df_player3["ct_speed_proxy_ws_ct"]))
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)

# Scale the feature already adjusted for tw and SOS by the weighted court speed proxy ratio over the same interval
df_player3["p_df%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_df%_l60_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

# Remove the intermediates and lag columns 50..1; the ten columns weighted 60..51 are deliberately left in df_player3.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"] + _csp_cols[10:], axis=1)

In [276]:
# 'p_df%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS) double fault performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten lag columns carrying weights 60 down to 51 are read here but not created in this cell,
# so they must already be present in df_player3 when it runs.
_csp_l10_cols = [f"ct_speed_proxy_{_wt}" for _wt in range(60, 50, -1)]
_csp_l10_lags = df_player3[_csp_l10_cols]

# Row-wise sum skips NaN, so a missing lag simply contributes nothing (no interpolation); count records how many lags were actually present.
# Author's note: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so this feature should rarely be absent there.
df_player3["ct_speed_proxy_ws"] = _csp_l10_lags.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = _csp_l10_lags.count(axis=1)
#(ws = weighted sum; tw = time-weighted)

# T(60) - T(60 - count) is the sum of the `count` largest weights (60, 59, ...), which keeps the quotient a weighted mean when fewer than 10 prior matches on the surface are available.
# NOTE(review): this is exact only if the non-null lags are the most recent ones; a gap in the middle of the window would overstate the denominator - confirm the ratio column has no such gaps.
_csp_l10_present = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - (triangular_number(60 - _csp_l10_present))
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)

# Scale the feature already adjusted for tw and SOS by the weighted court speed proxy ratio over the same interval
df_player3["p_df%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_df%_l10_tw_ss_IO_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove the intermediates together with the ten lag columns that were kept for this l10 step
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + _csp_l10_cols, axis=1)

In [277]:
# 'p_df_induce%_l60_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) double fault induce (as returner) performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# Window length and linear weights: the k-th prior match gets weight (61 - k), i.e. 60 for the most recent prior match down to 1 for the 60th.
_csp_window = 60
_csp_weights = range(_csp_window, 0, -1)
_csp_cols = [f"ct_speed_proxy_{_wt}" for _wt in _csp_weights]

# First obtain the weighted, surface-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted.
# Rows are sorted descending, so shift(-k) within a (player, surface) group pulls the ratio from the k-th earlier match.
# The grouped selection is built once: adding columns below does not change the grouping keys or the ratio column.
_csp_ratio_by_group = df_player3.groupby(['p_id','t_surf'])['t_df_induce%_ratio']
for _csp_w in _csp_weights:
    df_player3[f"ct_speed_proxy_{_csp_w}"] = _csp_ratio_by_group.shift(-(_csp_window + 1 - _csp_w))*_csp_w

# Row-wise sum skips NaN, so a missing lag simply contributes nothing (no interpolation); count records how many lags were actually present.
# Author's note: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so this feature should rarely be absent there.
_csp_lags = df_player3[_csp_cols]
df_player3["ct_speed_proxy_ws"] = _csp_lags.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = _csp_lags.count(axis=1)
#(ws = weighted sum; tw = time-weighted)

# T(60) - T(60 - count) is the sum of the `count` largest weights (60, 59, ...), which keeps the quotient a weighted mean when fewer than 60 prior matches on the surface are available.
# NOTE(review): this is exact only if the non-null lags are the most recent ones; a gap in the middle of the window would overstate the denominator - confirm 't_df_induce%_ratio' has no such gaps.
df_player3["denom"] = triangular_number(_csp_window) - (triangular_number(_csp_window - df_player3["ct_speed_proxy_ws_ct"]))
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)

# Scale the feature already adjusted for tw and SOS by the weighted court speed proxy ratio over the same interval
df_player3["p_df_induce%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_df_induce%_l60_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l60"]).round(2)

# Remove the intermediates and lag columns 50..1; the ten columns weighted 60..51 are deliberately left in df_player3.
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"] + _csp_cols[10:], axis=1)

In [278]:
# 'p_df_induce%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS) double fault induce (as a returner) performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten lag columns carrying weights 60 down to 51 are read here but not created in this cell,
# so they must already be present in df_player3 when it runs.
_csp_l10_cols = [f"ct_speed_proxy_{_wt}" for _wt in range(60, 50, -1)]
_csp_l10_lags = df_player3[_csp_l10_cols]

# Row-wise sum skips NaN, so a missing lag simply contributes nothing (no interpolation); count records how many lags were actually present.
# Author's note: matches between players with few prior matches on a given surface are filtered out at the modeling stage, and the early ramp-up years are not modeled, so this feature should rarely be absent there.
df_player3["ct_speed_proxy_ws"] = _csp_l10_lags.sum(axis=1)
df_player3["ct_speed_proxy_ws_ct"] = _csp_l10_lags.count(axis=1)
#(ws = weighted sum; tw = time-weighted)

# T(60) - T(60 - count) is the sum of the `count` largest weights (60, 59, ...), which keeps the quotient a weighted mean when fewer than 10 prior matches on the surface are available.
# NOTE(review): this is exact only if the non-null lags are the most recent ones; a gap in the middle of the window would overstate the denominator - confirm the ratio column has no such gaps.
_csp_l10_present = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - (triangular_number(60 - _csp_l10_present))
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"]/df_player3["denom"]).round(2)

# Scale the feature already adjusted for tw and SOS by the weighted court speed proxy ratio over the same interval
df_player3["p_df_induce%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_df_induce%_l10_tw_ss_SOS_adj"]*df_player3["ct_speed_proxy_l10"]).round(2)

# Remove the intermediates together with the ten lag columns that were kept for this l10 step
df_player3 = df_player3.drop(["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + _csp_l10_cols, axis=1)

In [279]:
# 'p_df_induce%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), double fault induce (as returner) performance of PLAYER over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are sorted in descending m_date / m_rd_num order, so within a (player, surface, indoor/outdoor) group shift(-k) takes the ratio from k rows further down, i.e. the k-th preceding match under that ordering.
# The k-th preceding match is stored in column ct_speed_proxy_<61-k>, already multiplied by its weight 61-k (60 for the most recent prior match down to 1 for the 60th).
# The grouped selection is built once and reused for all 60 lags; columns are created in the order 60, 59, ..., 1.
csp_ratio_grouped = df_player3.groupby(['p_id','t_surf','t_ind'])['t_df_induce%_ratio']
for csp_wt in range(60, 0, -1):
    df_player3[f"ct_speed_proxy_{csp_wt}"] = csp_ratio_grouped.shift(-(61 - csp_wt)) * csp_wt

csp_cols = [f"ct_speed_proxy_{csp_wt}" for csp_wt in range(60, 0, -1)]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)    # ws = weighted sum
csp_ct = df_player3[csp_cols].count(axis=1)  # number of non-missing weighted entries in the window

# Denominator = sum of the csp_ct largest weights (60, 59, ...), which corrects for rows with fewer than 60 prior matches in the group.
# NOTE(review): this assumes missing values only occur at the oldest end of the window; an interior NaN ratio would overstate the weight total -- confirm the ratio column has no gaps.
csp_denom = triangular_number(60) - triangular_number(60 - csp_ct)
csp_l60 = (csp_ws / csp_denom).round(2)  # tw = time-weighted court speed proxy ratio over the last 60 matches

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_df_induce%_l60_tw_ss_IO_SOS_adj"] * csp_l60).round(2)

# Drop the weighted columns 50..1 only; columns 60..51 stay in df_player3 because the l10 cell that follows reads them.
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{csp_wt}" for csp_wt in range(50, 0, -1)])

In [280]:
# 'p_df_induce%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific, strength of schedule-adjusted (SOS), double fault induce (as a returner) performance of PLAYER over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten highest-weighted court speed proxy columns (weights 60 down to 51). They are read here, not created here: they must already exist in df_player3 (the preceding l60 cell leaves them in place when it drops columns 50..1).
csp_l10_cols = [f"ct_speed_proxy_{csp_wt}" for csp_wt in range(60, 50, -1)]
csp_l10_block = df_player3[csp_l10_cols]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = csp_l10_block.sum(axis=1)    # ws = weighted sum
csp_ct = csp_l10_block.count(axis=1)  # number of non-missing entries among the ten columns

# Denominator = sum of the csp_ct largest weights (60, 59, ...), which corrects for rows with fewer than 10 prior matches in the group.
# NOTE(review): this assumes missing values only occur at the oldest end of the window; an interior NaN ratio would overstate the weight total -- confirm the ratio column has no gaps.
csp_denom = triangular_number(60) - triangular_number(60 - csp_ct)
csp_l10 = (csp_ws / csp_denom).round(2)  # tw = time-weighted court speed proxy ratio over the last 10 matches

# Factor the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_df_induce%_l10_tw_ss_IO_SOS_adj"] * csp_l10).round(2)

# The weighted helper columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(columns=csp_l10_cols)

In [281]:
# 'p_bp_save%_l60_tw_ss_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), BREAK POINTS SAVED performance of PLAYER (as a server) over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are sorted in descending m_date / m_rd_num order, so within a (player, surface) group shift(-k) takes the ratio from k rows further down, i.e. the k-th preceding match under that ordering.
# The k-th preceding match is stored in column ct_speed_proxy_<61-k>, already multiplied by its weight 61-k (60 for the most recent prior match down to 1 for the 60th).
# The grouped selection is built once and reused for all 60 lags; columns are created in the order 60, 59, ..., 1.
csp_ratio_grouped = df_player3.groupby(['p_id','t_surf'])['t_bp_save%_ratio']
for csp_wt in range(60, 0, -1):
    df_player3[f"ct_speed_proxy_{csp_wt}"] = csp_ratio_grouped.shift(-(61 - csp_wt)) * csp_wt

csp_cols = [f"ct_speed_proxy_{csp_wt}" for csp_wt in range(60, 0, -1)]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = df_player3[csp_cols].sum(axis=1)    # ws = weighted sum
csp_ct = df_player3[csp_cols].count(axis=1)  # number of non-missing weighted entries in the window

# Denominator = sum of the csp_ct largest weights (60, 59, ...), which corrects for rows with fewer than 60 prior matches on the surface.
# NOTE(review): this assumes missing values only occur at the oldest end of the window; an interior NaN ratio would overstate the weight total -- confirm the ratio column has no gaps.
csp_denom = triangular_number(60) - triangular_number(60 - csp_ct)
csp_l60 = (csp_ws / csp_denom).round(2)  # tw = time-weighted court speed proxy ratio over the last 60 matches

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_save%_l60_tw_ss_SOS_csp_adj"] = (df_player3["p_bp_save%_l60_tw_ss_SOS_adj"] * csp_l60).round(2)

# Drop the weighted columns 50..1 only; columns 60..51 stay in df_player3 because the l10 cell that follows reads them.
df_player3 = df_player3.drop(columns=[f"ct_speed_proxy_{csp_wt}" for csp_wt in range(50, 0, -1)])

In [282]:
# 'p_bp_save%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of schedule-adjusted (SOS), BREAK POINTS SAVED performance of PLAYER (as a server) over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','m_date','m_rd_num'], ascending = False)

# The ten highest-weighted court speed proxy columns (weights 60 down to 51). They are read here, not created here: they must already exist in df_player3 (the preceding l60 cell leaves them in place when it drops columns 50..1).
csp_l10_cols = [f"ct_speed_proxy_{csp_wt}" for csp_wt in range(60, 50, -1)]
csp_l10_block = df_player3[csp_l10_cols]

# sum() skips NaN instead of interpolating. In the modeling stage, matches between players with few matches previously played on a given surface will be filtered out, so there will be very few matches in the modeling phase where this predictive feature is absent (remember also that there is a ramp-up phase of multiple years where retrospective stats accrue but matches from those years will not be used in the modeling phase.)
csp_ws = csp_l10_block.sum(axis=1)    # ws = weighted sum
csp_ct = csp_l10_block.count(axis=1)  # number of non-missing entries among the ten columns

# Denominator = sum of the csp_ct largest weights (60, 59, ...), which corrects for rows with fewer than 10 prior matches on the surface.
# NOTE(review): this assumes missing values only occur at the oldest end of the window; an interior NaN ratio would overstate the weight total -- confirm the ratio column has no gaps.
csp_denom = triangular_number(60) - triangular_number(60 - csp_ct)
csp_l10 = (csp_ws / csp_denom).round(2)  # tw = time-weighted court speed proxy ratio over the last 10 matches

# Factor the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_save%_l10_tw_ss_SOS_csp_adj"] = (df_player3["p_bp_save%_l10_tw_ss_SOS_adj"] * csp_l10).round(2)

# The weighted helper columns are no longer needed once the l10 feature exists
df_player3 = df_player3.drop(columns=csp_l10_cols)

In [283]:
# 'p_bp_save%_l60_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific, strength of
# schedule-adjusted (SOS) BREAK POINTS SAVED performance of PLAYER (as a server) over the 60 matches PRIOR TO the
# match being predicted

df_player3 = df_player3.sort_values(by=['p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num'], ascending=False)

# First obtain the weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED
# over the maximum interval (60 matches) PRIOR TO the match being predicted.
# Rows are sorted newest-first, so within each (player, surface, indoor/outdoor) group shift(-k) pulls the value
# from k rows further down, i.e. the k-th prior match. The most recent prior match gets weight 60, the 60th gets 1.
csp_io_save_ratio = df_player3.groupby(['p_id', 't_surf', 't_ind'])['t_bp_save%_ratio']
csp_l60_cols = []
for weight in range(60, 0, -1):
    weighted_col = f"ct_speed_proxy_{weight}"
    df_player3[weighted_col] = csp_io_save_ratio.shift(weight - 61) * weight  # weight 60 -> shift(-1), ..., weight 1 -> shift(-60)
    csp_l60_cols.append(weighted_col)

# Row-wise sum skips NaN rather than interpolating. Per the project notes, matches between players with few prior
# matches on a given surface are filtered out at the modeling stage (and the early ramp-up years are not modeled),
# so very few modeled matches end up without this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l60_cols].sum(axis=1)  # ws = weighted sum
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_l60_cols].count(axis=1)  # number of non-NaN slots (0-60)

# T(60) - T(60 - k) = 60 + 59 + ... + (61 - k): total weight of the k most recent slots, which corrects the
# average when fewer than 60 prior matches are available.
# NOTE(review): this presumes the non-NaN slots are always the k highest-weighted ones; a NaN ratio in the middle
# of a player's history would make the denominator disagree with the weights actually summed — confirm.
csp_l60_available = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_l60_available)
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"] / df_player3["denom"]).round(2)

# Scale the already tw- and SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_bp_save%_l60_tw_ss_IO_SOS_adj"] * df_player3["ct_speed_proxy_l60"]
).round(2)

# Drop the intermediates and the weighted columns for weights 50..1 only.
# ct_speed_proxy_60 ... ct_speed_proxy_51 are intentionally kept: the following l10 cell reads them.
csp_l60_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"] + csp_l60_cols[10:]
df_player3 = df_player3.drop(columns=csp_l60_scratch)

In [284]:
# 'p_bp_save%_l10_tw_ss_IO_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), IO-specific, strength of
# schedule-adjusted (SOS) BREAK POINTS SAVED performance of PLAYER (as a server) over the 10 matches PRIOR TO the
# match being predicted

df_player3 = df_player3.sort_values(by=['p_id', 't_surf', 't_ind', 'm_date', 'm_rd_num'], ascending=False)

# Reuses ct_speed_proxy_60 ... ct_speed_proxy_51 (the ten most recent prior matches, weights 60 down to 51),
# which the preceding l60 IO cell computes and leaves in the frame. This cell must run right after that one.
csp_l10_cols = [f"ct_speed_proxy_{weight}" for weight in range(60, 50, -1)]

# Row-wise sum skips NaN rather than interpolating. Per the project notes, matches between players with few prior
# matches on a given surface are filtered out at the modeling stage (and the early ramp-up years are not modeled),
# so very few modeled matches end up without this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l10_cols].sum(axis=1)  # ws = weighted sum
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_l10_cols].count(axis=1)  # number of non-NaN slots (0-10)

# T(60) - T(60 - k) = 60 + 59 + ... + (61 - k): total weight of the k most recent slots, which corrects the
# average when fewer than 10 prior matches are available.
# NOTE(review): this presumes the non-NaN slots are always the k highest-weighted ones — confirm.
csp_l10_available = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_l10_available)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"] / df_player3["denom"]).round(2)

# Scale the already tw- and SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_bp_save%_l10_tw_ss_IO_SOS_adj"] * df_player3["ct_speed_proxy_l10"]
).round(2)

# Remove every intermediate column, including the ten weighted columns inherited from the l60 cell
csp_l10_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + csp_l10_cols
df_player3 = df_player3.drop(columns=csp_l10_scratch)

In [285]:
# 'p_bp_conv%_l60_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of
# schedule-adjusted (SOS) BREAK POINTS CONVERTED performance of PLAYER (as a returner) over the 60 matches
# PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# First obtain the weighted, surface-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the
# maximum interval (60 matches) PRIOR TO the match being predicted.
# Rows are sorted newest-first, so within each (player, surface) group shift(-k) pulls the value from k rows
# further down, i.e. the k-th prior match. The most recent prior match gets weight 60, the 60th gets weight 1.
csp_conv_ratio = df_player3.groupby(['p_id', 't_surf'])['t_bp_conv%_ratio']
csp_l60_cols = []
for weight in range(60, 0, -1):
    weighted_col = f"ct_speed_proxy_{weight}"
    df_player3[weighted_col] = csp_conv_ratio.shift(weight - 61) * weight  # weight 60 -> shift(-1), ..., weight 1 -> shift(-60)
    csp_l60_cols.append(weighted_col)

# Row-wise sum skips NaN rather than interpolating. Per the project notes, matches between players with few prior
# matches on a given surface are filtered out at the modeling stage (and the early ramp-up years are not modeled),
# so very few modeled matches end up without this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l60_cols].sum(axis=1)  # ws = weighted sum
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_l60_cols].count(axis=1)  # number of non-NaN slots (0-60)

# T(60) - T(60 - k) = 60 + 59 + ... + (61 - k): total weight of the k most recent slots, which corrects the
# average when fewer than 60 prior matches are available on the surface.
# NOTE(review): this presumes the non-NaN slots are always the k highest-weighted ones; a NaN ratio in the middle
# of a player's history would make the denominator disagree with the weights actually summed — confirm.
csp_l60_available = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_l60_available)
df_player3["ct_speed_proxy_l60"] = (df_player3["ct_speed_proxy_ws"] / df_player3["denom"]).round(2)

# Scale the already tw- and SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_conv%_l60_tw_ss_SOS_csp_adj"] = (
    df_player3["p_bp_conv%_l60_tw_ss_SOS_adj"] * df_player3["ct_speed_proxy_l60"]
).round(2)

# Drop the intermediates and the weighted columns for weights 50..1 only.
# ct_speed_proxy_60 ... ct_speed_proxy_51 are intentionally kept: the following l10 cell reads them.
csp_l60_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l60"] + csp_l60_cols[10:]
df_player3 = df_player3.drop(columns=csp_l60_scratch)

In [286]:
# 'p_bp_conv%_l10_tw_ss_SOS_csp_adj'
# Court speed proxy (csp) adjustment of the mean, time-weighted, surface-specific (SS), strength of
# schedule-adjusted (SOS) BREAK POINTS CONVERTED performance of PLAYER (as a returner) over the 10 matches
# PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id', 't_surf', 'm_date', 'm_rd_num'], ascending=False)

# Reuses ct_speed_proxy_60 ... ct_speed_proxy_51 (the ten most recent prior matches, weights 60 down to 51),
# which the preceding l60 cell computes and leaves in the frame. This cell must run right after that one.
csp_l10_cols = [f"ct_speed_proxy_{weight}" for weight in range(60, 50, -1)]

# Row-wise sum skips NaN rather than interpolating. Per the project notes, matches between players with few prior
# matches on a given surface are filtered out at the modeling stage (and the early ramp-up years are not modeled),
# so very few modeled matches end up without this feature.
df_player3["ct_speed_proxy_ws"] = df_player3[csp_l10_cols].sum(axis=1)  # ws = weighted sum
df_player3["ct_speed_proxy_ws_ct"] = df_player3[csp_l10_cols].count(axis=1)  # number of non-NaN slots (0-10)

# T(60) - T(60 - k) = 60 + 59 + ... + (61 - k): total weight of the k most recent slots, which corrects the
# average when fewer than 10 prior matches are available on the surface.
# NOTE(review): this presumes the non-NaN slots are always the k highest-weighted ones — confirm.
csp_l10_available = df_player3["ct_speed_proxy_ws_ct"]
df_player3["denom"] = triangular_number(60) - triangular_number(60 - csp_l10_available)
df_player3["ct_speed_proxy_l10"] = (df_player3["ct_speed_proxy_ws"] / df_player3["denom"]).round(2)

# Scale the already tw- and SOS-adjusted feature by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_conv%_l10_tw_ss_SOS_csp_adj"] = (
    df_player3["p_bp_conv%_l10_tw_ss_SOS_adj"] * df_player3["ct_speed_proxy_l10"]
).round(2)

# Remove every intermediate column, including the ten weighted columns inherited from the l60 cell
csp_l10_scratch = ["denom", "ct_speed_proxy_ws", "ct_speed_proxy_ws_ct", "ct_speed_proxy_l10"] + csp_l10_cols
df_player3 = df_player3.drop(columns=csp_l10_scratch)

In [287]:
# 'p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), BREAK POINTS CONVERTED performance of PLAYER (as a returner) over the 60 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# First obtain mean, time-weighted, surface-specific, IO-specific court speed proxy ratio for TOURNAMENTS PER MATCH PLAYED over the maximum interval (60 matches) PRIOR TO the match being predicted
# Rows are sorted newest-first, so shift(-k) within a (player, surface, indoor/outdoor) group picks up the k-th prior match.
# The most recent prior match gets weight 60, the next 59, ... down to weight 1 for the 60th prior match.
csp_io_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 0, -1)})
csp_io_ratio_by_group = df_player3.groupby(['p_id','t_surf','t_ind'])['t_bp_conv%_ratio']
for csp_col, csp_weight in csp_io_weights.items():
    df_player3[csp_col] = csp_io_ratio_by_group.shift(-(61 - csp_weight)) * csp_weight

csp_io_weighted = df_player3[list(csp_io_weights.index)]

# Weighted sum (ws). sum() skips NaN rather than interpolating. In the modeling stage, matches between players with few
# prior matches on a given surface are filtered out, so few modeled matches should lack this feature (there is also a
# multi-year ramp-up phase where retrospective stats accrue but whose matches are not modeled).
csp_io_ws = csp_io_weighted.sum(axis=1)

# Denominator = total weight of the lags that are actually present for the row. When the only missing lags are the
# oldest ones (fewer than 60 prior matches) this equals triangular_number(60) - triangular_number(60 - count); unlike
# that count-based formula it also stays correct when a NaN ratio falls in the middle of the window.
csp_io_denom = csp_io_weighted.notna().astype(int).mul(csp_io_weights, axis="columns").sum(axis=1)

# Rows with no prior matches give 0/0 -> NaN.
csp_io_l60_ratio = (csp_io_ws / csp_io_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_bp_conv%_l60_tw_ss_IO_SOS_adj"] * csp_io_l60_ratio).round(2)

# Drop only the lags with weights 50..1; the ten most recent (weights 60..51) stay in the frame for the l10 variant.
df_player3 = df_player3.drop(columns=[c for c, w in csp_io_weights.items() if w <= 50])

In [288]:
# 'p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj'
# Provides court speed proxy-adjustment (csp) to mean, time-weighted, surface-specific (SS), IO-specific strength of schedule-adjusted (SOS), BREAK POINTS CONVERTED performance of PLAYER (as a returner) over the 10 matches PRIOR TO the match being predicted

df_player3 = df_player3.sort_values(by=['p_id','t_surf','t_ind','m_date','m_rd_num'], ascending = False)

# The ten most recent weighted lag columns (weights 60..51) must already exist in df_player3 when this cell runs.
csp_io_l10_weights = pd.Series({f"ct_speed_proxy_{w}": w for w in range(60, 50, -1)})
csp_io_l10_weighted = df_player3[list(csp_io_l10_weights.index)]

# Weighted sum (ws). sum() skips NaN rather than interpolating. In the modeling stage, matches between players with few
# prior matches on a given surface are filtered out, so few modeled matches should lack this feature.
csp_io_l10_ws = csp_io_l10_weighted.sum(axis=1)

# Denominator = total weight of the lags that are actually present for the row. When the only missing lags are the
# oldest ones (fewer than 10 prior matches) this equals triangular_number(60) - triangular_number(60 - count); unlike
# that count-based formula it also stays correct when a NaN ratio falls in the middle of the window.
csp_io_l10_denom = csp_io_l10_weighted.notna().astype(int).mul(csp_io_l10_weights, axis="columns").sum(axis=1)

# Rows with no prior matches give 0/0 -> NaN.
csp_io_l10_ratio = (csp_io_l10_ws / csp_io_l10_denom).round(2)
#(ws = weighted sum; tw = time-weighted)

# Factors the feature already adjusted for tw and SOS by the time-weighted court speed proxy ratio over the same interval
df_player3["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"] = (df_player3["p_bp_conv%_l10_tw_ss_IO_SOS_adj"] * csp_io_l10_ratio).round(2)

# The weighted lag columns are scratch data; remove them once the feature is built.
df_player3 = df_player3.drop(columns=list(csp_io_l10_weights.index))

Below, a few serving "efficiency" ratios are computed with the court speed proxy adjustment baked in (on top of the other adjustments); these could potentially be useful as predictive features. We have previously computed two other variants of these ratios, with progressively more adjustment baked in (first time-weighting alone, then time-weighting plus SOS adjustment).

In [289]:
# 'p_ace_df%_ratio_l60_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % aces to % double faults, built from the csp-adjusted features over the last 60 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player3["p_ace%_l60_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_df%_l60_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [290]:
# 'p_ace_df%_ratio_l10_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % aces to % double faults, built from the csp-adjusted features over the last 10 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player3["p_ace%_l10_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_df%_l10_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [291]:
# 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % aces to % double faults, built from the csp-adjusted features over the last 60 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_ace%_l60_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_df%_l60_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [292]:
# 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % aces to % double faults, built from the csp-adjusted features over the last 10 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_ace%_l10_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_df%_l10_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [293]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % first-serve points won to % first serves in (numerator / denominator as coded), built from the csp-adjusted features over the last 60 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player3["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l60_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [294]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % first-serve points won to % first serves in (numerator / denominator as coded), built from the csp-adjusted features over the last 10 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player3["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l10_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [295]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % first-serve points won to % first serves in (numerator / denominator as coded), built from the csp-adjusted features over the last 60 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [296]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % first-serve points won to % first serves in (numerator / denominator as coded), built from the csp-adjusted features over the last 10 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [297]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % aces to % first serves in, built from the csp-adjusted features over the last 60 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player3["p_ace%_l60_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l60_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [298]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % aces to % first serves in, built from the csp-adjusted features over the last 10 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player3["p_ace%_l10_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l10_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [299]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % aces to % first serves in, built from the csp-adjusted features over the last 60 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_ace%_l60_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [300]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % aces to % first serves in, built from the csp-adjusted features over the last 10 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_ace%_l10_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [301]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % double faults to % serve points won, built from the csp-adjusted features over the last 60 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player3["p_df%_l60_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [302]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj'
# Ratio of PLAYER's % double faults to % serve points won, built from the csp-adjusted features over the last 10 surface-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player3["p_df%_l10_tw_ss_SOS_csp_adj"]
    .div(df_player3["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])
    .round(2)
)

In [303]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % double faults to % serve points won, built from the csp-adjusted features over the last 60 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_df%_l60_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [304]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj'
# Ratio of PLAYER's % double faults to % serve points won, built from the csp-adjusted features over the last 10 surface-specific, IO-specific matches prior to the match being predicted
# NOTE(review): a zero denominator yields inf (NaN for 0/0) - confirm this is handled downstream

df_player3["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player3["p_df%_l10_tw_ss_IO_SOS_csp_adj"]
    .div(df_player3["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])
    .round(2)
)

In [305]:
# Sanity check: print the row count, column count, dtype breakdown and memory footprint of the player-level feature frame
df_player3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57068 entries, 4606 to 34541
Columns: 339 entries, t_id to p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj
dtypes: datetime64[ns](1), float64(303), int64(29), object(6)
memory usage: 148.0+ MB


In [306]:
#Save to review
#df_player3.to_csv('../data/df_player3.csv', index=False)

### 7. Differential Feature Generation: Player vs Player in Match to be Predicted

For all predictive features that are player-specific, we want to express them in a differential format prior to EDA and modeling. For example, if Player A in a match to be predicted is ranked 5 and Player B in that match is ranked 10, Player A would have a 'ranking differential' predictive feature value of 5 and Player B would have a 'ranking differential' predictive feature value of -5. Previous iterations of the model have demonstrated that differential features are MUCH more predictive of the target feature (% pts won by a given player in a given match) than the concomitant non-differential feature value (in the case of our example, just the ranking of each player without reference to the ranking of their opponent).  

Along the way, we will add a few new predictive features (eg, log of player ranking), that will also appear as differentials in their ultimate form. 

In [307]:
# Build the frame used for differential features: same rows as df_player3, reordered so rows sharing an m_num are adjacent
# (sort_values returns a new frame, so df_player3 itself is not reordered)
df_player4 = df_player3.sort_values(by=['m_num','m_outcome'], ascending = False)

In [308]:
# ATP Ranking Differential

# Unranked players (NaN) are assigned the highest (worst) ranking in the sample plus one
max_ranking = df_player4['p_rk'].max() # for fill NA purposes

df_player4 = (
    df_player4
    .assign(
        p_rk=lambda d: d["p_rk"].fillna(max_ranking + 1),
        # Opponent's ranking: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_rk=lambda d: d.groupby("m_num")["p_rk"].shift(-1).fillna(d.groupby("m_num")["p_rk"].shift(1)),
        # Sign is flipped so that a POSITIVE differential means the player holds the better (numerically lower) ranking
        p_rk_diff=lambda d: -(d["p_rk"] - d["p_opp_rk"]),
    )
    .drop(columns="p_opp_rk")
)

In [309]:
# Generate Log of ATP Ranking (Per Player) and Differential of Log Ranking Between Players

df_player4 = (
    df_player4
    .assign(
        # Natural log of each player's ATP ranking, rounded to 2 decimals
        p_log_rk=lambda d: np.log(d["p_rk"]).round(2),
        # Opponent's log ranking: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_log_rk=lambda d: d.groupby("m_num")["p_log_rk"].shift(-1).fillna(d.groupby("m_num")["p_log_rk"].shift(1)),
        # Sign is flipped so that a POSITIVE differential means the player holds the better (lower) log ranking
        p_log_rk_diff=lambda d: -(d["p_log_rk"] - d["p_opp_log_rk"]),
    )
    .drop(columns="p_opp_log_rk")
)

# Move the non-differential log-ranking column to position 20 in the dataframe
col = df_player4.pop("p_log_rk")
df_player4.insert(20, "p_log_rk", col)

In [310]:
# ATP Ranking Points Differential Between Players

# NAs are filled with the smallest ranking-points value observed in the sample
# NOTE(review): the original note describes this as "1 point (lowest possible amount)" - confirm the sample minimum is 1
min_ranking_pts = df_player4['p_rk_pts'].min() # for fill NA purposes

df_player4 = (
    df_player4
    .assign(
        p_rk_pts=lambda d: d["p_rk_pts"].fillna(min_ranking_pts),
        # Opponent's ranking points: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_rk_pts=lambda d: d.groupby("m_num")["p_rk_pts"].shift(-1).fillna(d.groupby("m_num")["p_rk_pts"].shift(1)),
        # Player minus opponent
        p_rk_pts_diff=lambda d: d["p_rk_pts"] - d["p_opp_rk_pts"],
    )
    .drop(columns="p_opp_rk_pts")
)

In [311]:
# Entry Type Differential
# Entry type was encoded in workbook 1 as: 5- Ranking-Based Direct Entry; 4- Non-Ranking Based Direct Entry; 2.5 - Qualifier; 2 - Lucky Loser in Qualifying

df_player4 = (
    df_player4
    .assign(
        # Opponent's entry code: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_ent=lambda d: d.groupby("m_num")["p_ent"].shift(-1).fillna(d.groupby("m_num")["p_ent"].shift(1)),
        # Player minus opponent
        p_ent_diff=lambda d: d["p_ent"] - d["p_opp_ent"],
    )
    .drop(columns="p_opp_ent")
)

In [312]:
# Handedness Differential
# Handedness type was encoded in workbook 1 as: R- 0; L- 1; Unknown- 0 (statistically way more likely to be righty)

df_player4 = (
    df_player4
    .assign(
        # Opponent's handedness: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_hd=lambda d: d.groupby("m_num")["p_hd"].shift(-1).fillna(d.groupby("m_num")["p_hd"].shift(1)),
        # Player minus opponent
        p_hd_diff=lambda d: d["p_hd"] - d["p_opp_hd"],
    )
    .drop(columns="p_opp_hd")
)

In [313]:
# Height Differential
# Height in cm

df_player4 = (
    df_player4
    .assign(
        # Opponent's height: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_ht=lambda d: d.groupby("m_num")["p_ht"].shift(-1).fillna(d.groupby("m_num")["p_ht"].shift(1)),
        # Player minus opponent
        p_ht_diff=lambda d: d["p_ht"] - d["p_opp_ht"],
    )
    .drop(columns="p_opp_ht")
)

In [314]:
# Age Differential
# Age in yrs

df_player4 = (
    df_player4
    .assign(
        # Opponent's age: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_age=lambda d: d.groupby("m_num")["p_age"].shift(-1).fillna(d.groupby("m_num")["p_age"].shift(1)),
        # Player minus opponent
        p_age_diff=lambda d: d["p_age"] - d["p_opp_age"],
    )
    .drop(columns="p_opp_age")
)

In [315]:
# Home Court Advantage (HCA) Differential

df_player4 = (
    df_player4
    .assign(
        # Flag whether the player's country matches the tournament's country (1=YES, 0=NO)
        p_HCA=lambda d: np.where((d['t_co'] == d['p_co']), 1, 0),
        # Opponent's flag: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_HCA=lambda d: d.groupby("m_num")["p_HCA"].shift(-1).fillna(d.groupby("m_num")["p_HCA"].shift(1)),
        # Differential: 1 = player has HCA and opponent does NOT; -1 = vice versa; 0 = both or neither have HCA
        p_HCA_diff=lambda d: d["p_HCA"] - d["p_opp_HCA"],
    )
    .drop(columns="p_opp_HCA")
)

In [316]:
# Total decay-weighted time spent on court in past 7 days Differential
# Time in min

df_player4 = (
    df_player4
    .assign(
        # Opponent's value: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_tot_time_l7d_tw=lambda d: d.groupby("m_num")["p_tot_time_l7d_tw"].shift(-1).fillna(d.groupby("m_num")["p_tot_time_l7d_tw"].shift(1)),
        # Player minus opponent
        p_tot_time_l7d_tw_diff=lambda d: d["p_tot_time_l7d_tw"] - d["p_opp_tot_time_l7d_tw"],
    )
    .drop(columns="p_opp_tot_time_l7d_tw")
)

In [317]:
# Total decay-weighted pts spent on court in past 7 days Differential

df_player4 = (
    df_player4
    .assign(
        # Opponent's value: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_tot_pts_l7d_tw=lambda d: d.groupby("m_num")["p_tot_pts_l7d_tw"].shift(-1).fillna(d.groupby("m_num")["p_tot_pts_l7d_tw"].shift(1)),
        # Player minus opponent
        p_tot_pts_l7d_tw_diff=lambda d: d["p_tot_pts_l7d_tw"] - d["p_opp_tot_pts_l7d_tw"],
    )
    .drop(columns="p_opp_tot_pts_l7d_tw")
)

In [318]:
# Body Battery (Time-Based) Differential
# Metric integrating decay-weighted time spent on court over past 7 days with player stamina (derived from number of previous matches played across surfaces)

df_player4 = (
    df_player4
    .assign(
        # Opponent's value: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_body_battery_t_tw=lambda d: d.groupby("m_num")["p_body_battery_t_tw"].shift(-1).fillna(d.groupby("m_num")["p_body_battery_t_tw"].shift(1)),
        # Player minus opponent
        p_body_battery_t_tw_diff=lambda d: d["p_body_battery_t_tw"] - d["p_opp_body_battery_t_tw"],
    )
    .drop(columns="p_opp_body_battery_t_tw")
)

In [319]:
# Body Battery (Points-Based) Differential
# Metric integrating decay-weighted points spent on court over past 7 days with player stamina (derived from number of previous matches played across surfaces)

df_player4 = (
    df_player4
    .assign(
        # Opponent's value: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_body_battery_pts_tw=lambda d: d.groupby("m_num")["p_body_battery_pts_tw"].shift(-1).fillna(d.groupby("m_num")["p_body_battery_pts_tw"].shift(1)),
        # Player minus opponent
        p_body_battery_pts_tw_diff=lambda d: d["p_body_battery_pts_tw"] - d["p_opp_body_battery_pts_tw"],
    )
    .drop(columns="p_opp_body_battery_pts_tw")
)

In [320]:
# Player Matches (Surface-Specific) Differential
# Previous matches played in entire match sample on a SPECIFIC surface

df_player4 = (
    df_player4
    .assign(
        # Opponent's count: the other row sharing this m_num (next row in the group, otherwise the previous one)
        # NOTE(review): assumes each m_num holds exactly the two player rows of one match - confirm
        p_opp_matches_ss=lambda d: d.groupby("m_num")["p_matches_ss"].shift(-1).fillna(d.groupby("m_num")["p_matches_ss"].shift(1)),
        # Player minus opponent
        p_matches_ss_diff=lambda d: d["p_matches_ss"] - d["p_opp_matches_ss"],
    )
    .drop(columns="p_opp_matches_ss")
)

In [321]:
# Player Matches (NON-Surface-Specific) Differential ('p_matches_nss')
# Previous matches played in the entire match sample ACROSS surfaces

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_matches_nss"]
df_player4["p_opp_matches_nss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_matches_nss_diff"] = df_player4["p_matches_nss"] - df_player4["p_opp_matches_nss"]
df_player4 = df_player4.drop(columns=["p_opp_matches_nss"])

In [322]:
# Player Surface Change Differential ('p_surf_chg')
# For each player, 1 means the player has changed (chg) surfaces AT LEAST ONCE within their past 3 matches (before the one at hand); 0 means short-term surface 'continuity'. An indoor/outdoor switch on the same surface does not count as a surface switch.
# So a differential of 1 means the player changed surfaces AT LEAST ONCE within their past 3 matches AND the opponent DID NOT (-1 means vice versa)

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_surf_chg"]
df_player4["p_opp_surf_chg"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_surf_chg_diff"] = df_player4["p_surf_chg"] - df_player4["p_opp_surf_chg"]
df_player4 = df_player4.drop(columns=["p_opp_surf_chg"])

In [323]:
# Player Time Zone Change Differential ('p_tz_chg')
# The non-differential version gives how many time zones (tz) a player traveled from the site of their last (main tour level) match, provided that last match was within the previous 4 days.
# This differential version reports the difference between the two players in the number of time zones traveled within this timeframe. Negative indicates fewer time zones traveled relative to the opponent.

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tz_chg"]
df_player4["p_opp_tz_chg"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tz_chg_diff"] = df_player4["p_tz_chg"] - df_player4["p_opp_tz_chg"]
df_player4 = df_player4.drop(columns=["p_opp_tz_chg"])

In [324]:
# Player Head-to-Head Matches Won (Surface-Specific) Differential
# 'p_H2H_w_ss'

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_H2H_w_ss"]
df_player4["p_opp_H2H_w_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_H2H_w_ss_diff"] = df_player4["p_H2H_w_ss"] - df_player4["p_opp_H2H_w_ss"]
df_player4 = df_player4.drop(columns=["p_opp_H2H_w_ss"])

In [325]:
# Player Head-to-Head Matches Won (NON-Surface-Specific) Differential
# 'p_H2H_w_nss'

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_H2H_w_nss"]
df_player4["p_opp_H2H_w_nss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_H2H_w_nss_diff"] = df_player4["p_H2H_w_nss"] - df_player4["p_opp_H2H_w_nss"]
df_player4 = df_player4.drop(columns=["p_opp_H2H_w_nss"])

In [326]:
# Player Head-to-Head Points Won (Surface-Specific) Differential
# 'p_H2H_tot_pts_won%_ss'

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_H2H_tot_pts_won%_ss"]
df_player4["p_opp_H2H_tot_pts_won%_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_H2H_tot_pts_won%_ss_diff"] = df_player4["p_H2H_tot_pts_won%_ss"] - df_player4["p_opp_H2H_tot_pts_won%_ss"]
df_player4 = df_player4.drop(columns=["p_opp_H2H_tot_pts_won%_ss"])

In [327]:
# Player Head-to-Head Points Won (NON-Surface-Specific) Differential
# 'p_H2H_tot_pts_won%_nss'

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_H2H_tot_pts_won%_nss"]
df_player4["p_opp_H2H_tot_pts_won%_nss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_H2H_tot_pts_won%_nss_diff"] = df_player4["p_H2H_tot_pts_won%_nss"] - df_player4["p_opp_H2H_tot_pts_won%_nss"]
df_player4 = df_player4.drop(columns=["p_opp_H2H_tot_pts_won%_nss"])

In [328]:
# Player Total Pts Won% over Last 60 Matches Differential
# Surface-Specific, Decay Time-Weighted ('p_tot_pts_won%_l60_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss"])

In [329]:
# Player Total Pts Won% over Last 10 Matches Differential
# Surface-Specific, Decay Time-Weighted ('p_tot_pts_won%_l10_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss"])

In [330]:
# Player Total Pts Won% over Last 60 Matches Differential
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_tot_pts_won%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss_SOS_adj"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss_SOS_adj"])

In [331]:
# Player Total Pts Won% over Last 10 Matches Differential
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_tot_pts_won%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss_SOS_adj"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss_SOS_adj"])

In [332]:
# Player Total Pts Won% over Last 60 Matches Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific ('p_tot_pts_won%_l60_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss_IO"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss_IO"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss_IO"])

In [333]:
# Player Total Pts Won% over Last 10 Matches Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific ('p_tot_pts_won%_l10_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss_IO"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss_IO"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss_IO"])

In [334]:
# Player Total Pts Won% over Last 60 Matches Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific, Strength-of-Schedule Adjusted ('p_tot_pts_won%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [335]:
# Player Total Pts Won% over Last 10 Matches Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific, Strength-of-Schedule Adjusted ('p_tot_pts_won%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [336]:
# Player Total Pts Won% over Last 60 Matches, Composite Version, Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific ('p_tot_pts_won%_l60_tw_ss_comp')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss_comp"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss_comp"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_comp_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss_comp"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss_comp"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss_comp"])

In [337]:
# Player Total Pts Won% over Last 10 Matches, Composite Version, Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific ('p_tot_pts_won%_l10_tw_ss_comp')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss_comp"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss_comp"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_comp_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss_comp"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss_comp"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss_comp"])

In [338]:
# Player Total Pts Won% over Last 60 Matches, Composite Version, Differential
# Surface-Specific, Decay Time-Weighted, Indoor-Outdoor Specific, Strength-of-Schedule Adjusted ('p_tot_pts_won%_l60_tw_ss_SOS_comp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l60_tw_ss_SOS_comp_adj"]
df_player4["p_opp_tot_pts_won%_l60_tw_ss_SOS_comp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l60_tw_ss_SOS_comp_adj_diff"] = df_player4["p_tot_pts_won%_l60_tw_ss_SOS_comp_adj"] - df_player4["p_opp_tot_pts_won%_l60_tw_ss_SOS_comp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l60_tw_ss_SOS_comp_adj"])

In [339]:
# Player Total Pts Won% over Last 10 Matches, Composite Version, Differential
# Strength-of-Schedule Adjusted counterpart of the l60 composite feature, per the column name ('p_tot_pts_won%_l10_tw_ss_SOS_comp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_tot_pts_won%_l10_tw_ss_SOS_comp_adj"]
df_player4["p_opp_tot_pts_won%_l10_tw_ss_SOS_comp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_tot_pts_won%_l10_tw_ss_SOS_comp_adj_diff"] = df_player4["p_tot_pts_won%_l10_tw_ss_SOS_comp_adj"] - df_player4["p_opp_tot_pts_won%_l10_tw_ss_SOS_comp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_tot_pts_won%_l10_tw_ss_SOS_comp_adj"])

Next, per-match player differentials are calculated for the "Offense vs Offense" and "Defense vs Defense" features.

In [340]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted ('p_sv_pts_won%_l60_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss"])

In [341]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, Decay Time-Weighted ('p_sv_pts_won%_l10_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss"])

In [342]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_sv_pts_won%_l60_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss_IO"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_IO"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_IO"])

In [343]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_sv_pts_won%_l10_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss_IO"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_IO"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_IO"])

In [344]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_sv_pts_won%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss_SOS_adj"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [345]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_sv_pts_won%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss_SOS_adj"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [346]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_sv_pts_won%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [347]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_sv_pts_won%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [348]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_sv_pts_won%_l60_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [349]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_sv_pts_won%_l10_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [350]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 60 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [351]:
# "OFFENSE VS OFFENSE" differential: player's % SERVE points won minus opponent's % SERVE points won, last 10 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [352]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted ('p_1st_sv_pts_won%_l60_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l60_tw_ss"]
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l60_tw_ss_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss"])

In [353]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 10 matches
# Surface-Specific, Decay Time-Weighted ('p_1st_sv_pts_won%_l10_tw_ss')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l10_tw_ss"]
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l10_tw_ss_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss"])

In [354]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 60 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv_pts_won%_l60_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l60_tw_ss_IO"]
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_IO"])

In [355]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 10 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv_pts_won%_l10_tw_ss_IO')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l10_tw_ss_IO"]
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_IO"])

In [356]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv_pts_won%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"]
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [357]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 10 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv_pts_won%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"]
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [358]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 60 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [359]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 10 matches
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [360]:
# "OFFENSE VS OFFENSE" differential: player's % FIRST SERVE points won minus opponent's % FIRST SERVE points won, last 60 matches
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same 'm_num' group, else the previous row (assumes two rows per match - TODO confirm)
match_vals = df_player4.groupby("m_num")["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = match_vals.shift(-1).fillna(match_vals.shift(1))

# Differential = player's value minus opponent's value; the temporary opponent column is then removed
df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [361]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [362]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [363]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [364]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted ('p_2nd_sv_pts_won%_l60_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss"])

In [365]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted ('p_2nd_sv_pts_won%_l10_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss"])

In [366]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, IO-specific, Decay Time-Weighted ('p_2nd_sv_pts_won%_l60_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO"])

In [367]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_2nd_sv_pts_won%_l10_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO"])

In [368]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [369]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [370]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [371]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [372]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [373]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [374]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [375]:
# "OFFENSE VS OFFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [376]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, Decay Time-Weighted ('p_1st_sv%_l60_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_diff"] = df_player4["p_1st_sv%_l60_tw_ss"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss"])

In [377]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, Decay Time-Weighted ('p_1st_sv%_l10_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_diff"] = df_player4["p_1st_sv%_l10_tw_ss"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss"])

In [378]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv%_l60_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_IO_diff"] = df_player4["p_1st_sv%_l60_tw_ss_IO"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss_IO"])

In [379]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv%_l10_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_IO_diff"] = df_player4["p_1st_sv%_l10_tw_ss_IO"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss_IO"])

In [380]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_l60_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss_SOS_adj"])

In [381]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_l10_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv%_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss_SOS_adj"])

In [382]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_l60_tw_ss_IO_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv%_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss_IO_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss_IO_SOS_adj"])

In [383]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_l10_tw_ss_IO_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv%_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss_IO_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss_IO_SOS_adj"])

In [384]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_l60_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss_SOS_csp_adj"])

In [385]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_l10_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss_SOS_csp_adj"])

In [386]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l60_tw_ss_IO_SOS_csp_adj"])

In [387]:
# "OFFENSE VS OFFENSE": PLAYER % FIRST SERVES IN VS OPPONENT % FIRST SERVES IN in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_l10_tw_ss_IO_SOS_csp_adj"])

In [388]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 60 matches 
# Surface-Specific, Decay Time-Weighted ('p_1st_sv%_yielded_l60_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l60_tw_ss_diff"] = df_player4["p_1st_sv%_yielded_l60_tw_ss"].sub(df_player4["p_opp_1st_sv%_yielded_l60_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l60_tw_ss"])

In [389]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 10 matches 
# Surface-Specific, Decay Time-Weighted ('p_1st_sv%_yielded_l10_tw_ss')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l10_tw_ss_diff"] = df_player4["p_1st_sv%_yielded_l10_tw_ss"].sub(df_player4["p_opp_1st_sv%_yielded_l10_tw_ss"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l10_tw_ss"])

In [390]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv%_yielded_l60_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l60_tw_ss_IO_diff"] = df_player4["p_1st_sv%_yielded_l60_tw_ss_IO"].sub(df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l60_tw_ss_IO"])

In [391]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted ('p_1st_sv%_yielded_l10_tw_ss_IO')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_IO"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_IO"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l10_tw_ss_IO_diff"] = df_player4["p_1st_sv%_yielded_l10_tw_ss_IO"].sub(df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l10_tw_ss_IO"])

In [392]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_yielded_l60_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv%_yielded_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_adj"])

In [393]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_yielded_l10_tw_ss_SOS_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_SOS_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv%_yielded_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_adj"])

In [394]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj')

# Populating the opponent column: the opponent's value is the same stat on the
# other row sharing this m_num (next row in the match group, else the previous one).
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = df_player4.groupby(['m_num'])['p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj'].shift(-1)
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] = df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"].fillna(df_player4.groupby(['m_num'])['p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj'].shift(1))

# Calculating differential column (player minus opponent).
# The target name carries the `_IO_` infix so the IO-specific differential gets
# its own column and does not overwrite the non-IO
# 'p_1st_sv%_yielded_l60_tw_ss_SOS_adj_diff' column.
df_player4["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj_diff"] = (df_player4["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"])

# The opponent helper column is only needed for the subtraction.
df_player4 = df_player4.drop(["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_adj"],axis=1)

In [395]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj')

# Populating the opponent column: the opponent's value is the same stat on the
# other row sharing this m_num (next row in the match group, else the previous one).
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"] = df_player4.groupby(['m_num'])['p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj'].shift(-1)
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"] = df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"].fillna(df_player4.groupby(['m_num'])['p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj'].shift(1))

# Calculating differential column (player minus opponent).
# The target name carries the `_IO_` infix so the IO-specific differential gets
# its own column and does not overwrite the non-IO
# 'p_1st_sv%_yielded_l10_tw_ss_SOS_adj_diff' column.
df_player4["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj_diff"] = (df_player4["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"])

# The opponent helper column is only needed for the subtraction.
df_player4 = df_player4.drop(["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_adj"],axis=1)

In [396]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj"])

In [397]:
# "DEFENSE VS DEFENSE": PLAYER % FIRST SERVES IN YIELDED VS OPPONENT % FIRST SERVES IN YIELDED in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj')

# Opponent's value for each row = the same stat on the other row sharing this
# m_num: the next row in the match group, else the previous one.
# NOTE(review): presumes exactly two rows per m_num (player / opponent) - confirm.
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"]
    .shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player minus opponent; the helper column is then discarded.
df_player4["p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"])
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj"])

In [398]:
# "DEFENSE VS DEFENSE" differential: player's % first serves in yielded minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj"])

In [399]:
# "DEFENSE VS DEFENSE" differential: player's % first serves in yielded minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj"])

In [400]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_ret_pts_won%_l60_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss"])

In [401]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_ret_pts_won%_l10_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss"])

In [402]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_ret_pts_won%_l60_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_IO"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_IO"])

In [403]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_ret_pts_won%_l10_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_IO"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_IO"])

In [404]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_ret_pts_won%_l60_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [405]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_ret_pts_won%_l10_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [406]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_ret_pts_won%_l60_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [407]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_ret_pts_won%_l10_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [408]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_ret_pts_won%_l60_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [409]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_ret_pts_won%_l10_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [410]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [411]:
# "DEFENSE VS DEFENSE" differential: player's % return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [412]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss"])

In [413]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's % first return points won, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss"])

In [414]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_IO"])

In [415]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_IO"])

In [416]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [417]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [418]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [419]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [420]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [421]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [422]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [423]:
# "DEFENSE VS DEFENSE" differential: player's % first return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [424]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss"])

In [425]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted
# Built from 'p_2nd_ret_pts_won%_l10_tw_ss'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss"])

In [426]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO"])

In [427]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted
# Built from 'p_2nd_ret_pts_won%_l10_tw_ss_IO'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_IO"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO"])

In [428]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [429]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [430]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [431]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted
# Built from 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [432]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [433]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 10 matches
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [434]:
# "DEFENSE VS DEFENSE" differential: player's % second return points won minus the opponent's, last 60 matches
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted
# Built from 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'

# Opponent value: next row within the same m_num, falling back to the previous row's value wherever that is missing
# (presumably each m_num holds exactly two rows, one per player -- TODO confirm)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential = player value minus opponent value
df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only a scratch input for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [435]:
# "DEFENSE VS DEFENSE": PLAYER % SECOND RETURN points won VS OPPONENT % SECOND RETURN points won in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted ('p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [436]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_ace%_l60_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_diff"] = df_player4["p_ace%_l60_tw_ss"].sub(df_player4["p_opp_ace%_l60_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss"])

In [437]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_ace%_l10_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_diff"] = df_player4["p_ace%_l10_tw_ss"].sub(df_player4["p_opp_ace%_l10_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss"])

In [438]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_ace%_l60_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_IO_diff"] = df_player4["p_ace%_l60_tw_ss_IO"].sub(df_player4["p_opp_ace%_l60_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss_IO"])

In [439]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_ace%_l10_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_IO_diff"] = df_player4["p_ace%_l10_tw_ss_IO"].sub(df_player4["p_opp_ace%_l10_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss_IO"])

In [440]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_ace%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ace%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_ace%_l60_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss_SOS_adj"])

In [441]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_ace%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ace%_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_ace%_l10_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss_SOS_adj"])

In [442]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_ace%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace%_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss_IO_SOS_adj"])

In [443]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_ace%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace%_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss_IO_SOS_adj"])

In [444]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_ace%_l60_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace%_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_ace%_l60_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss_SOS_csp_adj"])

In [445]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_ace%_l10_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_ace%_l10_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss_SOS_csp_adj"])

In [446]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_ace%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l60_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj"])

In [447]:
# "OFFENSE VS OFFENSE": PLAYER ACE % VS OPPONENT ACE % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_ace%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_ace%_l10_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_ace%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj"])

In [448]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_aced%_l60_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_diff"] = df_player4["p_aced%_l60_tw_ss"].sub(df_player4["p_opp_aced%_l60_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss"])

In [449]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_aced%_l10_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_diff"] = df_player4["p_aced%_l10_tw_ss"].sub(df_player4["p_opp_aced%_l10_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss"])

In [450]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_aced%_l60_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_IO_diff"] = df_player4["p_aced%_l60_tw_ss_IO"].sub(df_player4["p_opp_aced%_l60_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss_IO"])

In [451]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_aced%_l10_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_IO_diff"] = df_player4["p_aced%_l10_tw_ss_IO"].sub(df_player4["p_opp_aced%_l10_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss_IO"])

In [452]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_aced%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_aced%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_aced%_l60_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss_SOS_adj"])

In [453]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_aced%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_aced%_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_aced%_l10_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss_SOS_adj"])

In [454]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_aced%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_aced%_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss_IO_SOS_adj"])

In [455]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_aced%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_aced%_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss_IO_SOS_adj"])

In [456]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_aced%_l60_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_aced%_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_aced%_l60_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss_SOS_csp_adj"])

In [457]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_aced%_l10_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_aced%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_aced%_l10_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss_SOS_csp_adj"])

In [458]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_aced%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l60_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_aced%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj"])

In [459]:
# "DEFENSE VS DEFENSE": PLAYER ACED % VS OPPONENT ACED % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_aced%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_aced%_l10_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_aced%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_aced%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj"])

In [460]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_df%_l60_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_diff"] = df_player4["p_df%_l60_tw_ss"].sub(df_player4["p_opp_df%_l60_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss"])

In [461]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_df%_l10_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_diff"] = df_player4["p_df%_l10_tw_ss"].sub(df_player4["p_opp_df%_l10_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss"])

In [462]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_df%_l60_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_IO_diff"] = df_player4["p_df%_l60_tw_ss_IO"].sub(df_player4["p_opp_df%_l60_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss_IO"])

In [463]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_df%_l10_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_IO_diff"] = df_player4["p_df%_l10_tw_ss_IO"].sub(df_player4["p_opp_df%_l10_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss_IO"])

In [464]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_df%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_df%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_df%_l60_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss_SOS_adj"])

In [465]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_df%_l10_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_df%_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_df%_l10_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss_SOS_adj"])

In [466]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_df%_l60_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df%_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_df%_l60_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss_IO_SOS_adj"])

In [467]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_df%_l10_tw_ss_IO_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df%_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_df%_l10_tw_ss_IO_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss_IO_SOS_adj"])

In [468]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_df%_l60_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df%_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_df%_l60_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss_SOS_csp_adj"])

In [469]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_df%_l10_tw_ss_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df%_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_df%_l10_tw_ss_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss_SOS_csp_adj"])

In [470]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_df%_l60_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l60_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df%_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_df%_l60_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l60_tw_ss_IO_SOS_csp_adj"])

In [471]:
# "OFFENSE VS OFFENSE": PLAYER DOUBLE FAULT % VS OPPONENT DOUBLE FAULT % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule, Court Speed Proxy Adjusted ('p_df%_l10_tw_ss_IO_SOS_csp_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df%_l10_tw_ss_IO_SOS_csp_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df%_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_df%_l10_tw_ss_IO_SOS_csp_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df%_l10_tw_ss_IO_SOS_csp_adj"])

In [472]:
# "DEFENSE VS DEFENSE": PLAYER DOUBLE FAULT INDUCE % VS OPPONENT DOUBLE FAULT INDUCE % in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_df_induce%_l60_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df_induce%_l60_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df_induce%_l60_tw_ss_diff"] = df_player4["p_df_induce%_l60_tw_ss"].sub(df_player4["p_opp_df_induce%_l60_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss"])

In [473]:
# "DEFENSE VS DEFENSE": PLAYER DOUBLE FAULT INDUCE % VS OPPONENT DOUBLE FAULT INDUCE % in last 10 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted ('p_df_induce%_l10_tw_ss')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df_induce%_l10_tw_ss"] = (
    df_player4.groupby(['m_num'])['p_df_induce%_l10_tw_ss'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df_induce%_l10_tw_ss'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df_induce%_l10_tw_ss_diff"] = df_player4["p_df_induce%_l10_tw_ss"].sub(df_player4["p_opp_df_induce%_l10_tw_ss"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss"])

In [474]:
# "DEFENSE VS DEFENSE": PLAYER DOUBLE FAULT INDUCE % VS OPPONENT DOUBLE FAULT INDUCE % in last 60 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_df_induce%_l60_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df_induce%_l60_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df_induce%_l60_tw_ss_IO_diff"] = df_player4["p_df_induce%_l60_tw_ss_IO"].sub(df_player4["p_opp_df_induce%_l60_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss_IO"])

In [475]:
# "DEFENSE VS DEFENSE": PLAYER DOUBLE FAULT INDUCE % VS OPPONENT DOUBLE FAULT INDUCE % in last 10 matches 
# Feature variant: Surface-Specific, IO-Specific, Decay Time-Weighted ('p_df_induce%_l10_tw_ss_IO')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df_induce%_l10_tw_ss_IO"] = (
    df_player4.groupby(['m_num'])['p_df_induce%_l10_tw_ss_IO'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df_induce%_l10_tw_ss_IO'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df_induce%_l10_tw_ss_IO_diff"] = df_player4["p_df_induce%_l10_tw_ss_IO"].sub(df_player4["p_opp_df_induce%_l10_tw_ss_IO"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss_IO"])

In [476]:
# "DEFENSE VS DEFENSE": PLAYER DOUBLE FAULT INDUCE % VS OPPONENT DOUBLE FAULT INDUCE% in last 60 matches 
# Feature variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted ('p_df_induce%_l60_tw_ss_SOS_adj')

# Opponent's value: next row within the same m_num group, falling back to the previous row
df_player4["p_opp_df_induce%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss_SOS_adj'].shift(-1)
    .fillna(df_player4.groupby(['m_num'])['p_df_induce%_l60_tw_ss_SOS_adj'].shift(1))
)

# Differential: player's value minus the opponent's value
df_player4["p_df_induce%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_df_induce%_l60_tw_ss_SOS_adj"])

# Opponent column is only a scratch column; remove it once the differential exists
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss_SOS_adj"])

In [477]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 10 matches
# Feature: 'p_df_induce%_l10_tw_ss_SOS_adj' (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss_SOS_adj"])

In [478]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 60 matches
# Feature: 'p_df_induce%_l60_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss_IO_SOS_adj"])

In [479]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 10 matches
# Feature: 'p_df_induce%_l10_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss_IO_SOS_adj"])

In [480]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 60 matches
# Feature: 'p_df_induce%_l60_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss_SOS_csp_adj"])

In [481]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 10 matches
# Feature: 'p_df_induce%_l10_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss_SOS_csp_adj"])

In [482]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 60 matches
# Feature: 'p_df_induce%_l60_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj"])

In [483]:
# "DEFENSE VS DEFENSE": player double fault induce % vs opponent double fault induce %, last 10 matches
# Feature: 'p_df_induce%_l10_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj"])

In [484]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss' (Surface-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_diff"] = df_player4["p_bp_save%_l60_tw_ss"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss"])

In [485]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss' (Surface-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_diff"] = df_player4["p_bp_save%_l10_tw_ss"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss"])

In [486]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss_IO' (Surface-Specific, IO-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_IO_diff"] = df_player4["p_bp_save%_l60_tw_ss_IO"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_IO"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_IO"])

In [487]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss_IO' (Surface-Specific, IO-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_IO_diff"] = df_player4["p_bp_save%_l10_tw_ss_IO"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_IO"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_IO"])

In [488]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss_SOS_adj' (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_SOS_adj"])

In [489]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss_SOS_adj' (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_SOS_adj"])

In [490]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"])

In [491]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"])

In [492]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"])

In [493]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"])

In [494]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 60 matches
# Feature: 'p_bp_save%_l60_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"])

In [495]:
# "OFFENSE VS OFFENSE": player break point save % vs opponent break point save %, last 10 matches
# Feature: 'p_bp_save%_l10_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"])

In [496]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss' (Surface-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_diff"] = df_player4["p_bp_conv%_l60_tw_ss"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss"])

In [497]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss' (Surface-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_diff"] = df_player4["p_bp_conv%_l10_tw_ss"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss"])

In [498]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss_IO' (Surface-Specific, IO-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_IO_diff"] = df_player4["p_bp_conv%_l60_tw_ss_IO"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_IO"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_IO"])

In [499]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss_IO' (Surface-Specific, IO-Specific, Decay Time-Weighted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_IO_diff"] = df_player4["p_bp_conv%_l10_tw_ss_IO"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_IO"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_IO"])

In [500]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss_SOS_adj' (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_SOS_adj"])

In [501]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss_SOS_adj' (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_SOS_adj"])

In [502]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"])

In [503]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"])

In [504]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"])

In [505]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"])

In [506]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 60 matches
# Feature: 'p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"])

In [507]:
# "DEFENSE VS DEFENSE": player break point convert % vs opponent break point convert %, last 10 matches
# Feature: 'p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule and Court Speed Proxy Adjusted)

# Opponent column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player value minus opponent value
df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"])

Now the per-match player differentials are calculated for the cross-category matchups, "Offense vs Defense" and "Defense vs Offense".

In [508]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 60 matches
# Features: 'p_sv_pts_won%_l60_tw_ss_SOS_adj' against the opponent's 'p_ret_pts_won%_l60_tw_ss_SOS_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [509]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 10 matches
# Features: 'p_sv_pts_won%_l10_tw_ss_SOS_adj' against the opponent's 'p_ret_pts_won%_l10_tw_ss_SOS_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [510]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 60 matches
# Features: 'p_sv_pts_won%_l60_tw_ss_IO_SOS_adj' against the opponent's 'p_ret_pts_won%_l60_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [511]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 10 matches
# Features: 'p_sv_pts_won%_l10_tw_ss_IO_SOS_adj' against the opponent's 'p_ret_pts_won%_l10_tw_ss_IO_SOS_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [512]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 60 matches
# Features: 'p_sv_pts_won%_l60_tw_ss_SOS_csp_adj' against the opponent's 'p_ret_pts_won%_l60_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [513]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 10 matches
# Features: 'p_sv_pts_won%_l10_tw_ss_SOS_csp_adj' against the opponent's 'p_ret_pts_won%_l10_tw_ss_SOS_csp_adj'
# (Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [514]:
# "OFFENSE VS DEFENSE": player % serve points won vs opponent % return points won, last 60 matches
# Features: 'p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj' against the opponent's 'p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'
# (Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted)

# Opponent return column: within each m_num group take the next row's value and, where that is NaN,
# the previous row's value (presumably two rows per match, one per player -- verify upstream)
df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential column: player serve value minus opponent return value
df_player4["p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Remove the temporary opponent column
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [515]:
# "OFFENSE VS DEFENSE": PLAYER % SERVE points won VS OPPONENT % RETURN points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [516]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [517]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [518]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [519]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [520]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [521]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] - df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [522]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [523]:
# "OFFENSE VS DEFENSE": PLAYER % FIRST SERVE points won VS OPPONENT % FIRST RETURN points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [524]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"])

In [525]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"])

In [526]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [527]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [528]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [529]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [530]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [531]:
# "OFFENSE VS DEFENSE": PLAYER % SECOND SERVE points won VS OPPONENT % SECOND RETURN points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [532]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [533]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [534]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [535]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [536]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [537]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [538]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [539]:
# "DEFENSE VS OFFENSE": PLAYER % RETURN points won VS OPPONENT % SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [540]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [541]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [542]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [543]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj"])

In [544]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj"])

In [545]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj"])

In [546]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"])

In [547]:
# "DEFENSE VS OFFENSE": PLAYER % FIRST RETURN points won VS OPPONENT % FIRST SERVE points won in last 10 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted 

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj'].shift(1)
df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] - df_player4["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"])

In [548]:
# "DEFENSE VS OFFENSE": PLAYER % SECOND RETURN points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj"] - df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj"])

In [549]:
# "DEFENSE VS OFFENSE": PLAYER % SECOND RETURN points won VS OPPONENT % SECOND SERVE points won in last 10 matches 
# Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj'].shift(1)
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj"] - df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj"])

In [550]:
# "DEFENSE VS OFFENSE": PLAYER % SECOND RETURN points won VS OPPONENT % SECOND SERVE points won in last 60 matches 
# Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Opponent's value: within each m_num group take the stat from the next row; where that is NaN,
# fall back to the previous row (presumably the two rows per m_num are the two players - TODO confirm)
from_next_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(-1)
from_prev_row = df_player4.groupby(['m_num'])['p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj'].shift(1)
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"] = from_next_row.fillna(from_prev_row)

# Differential feature: player's own stat minus the opponent's stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"]

# The opponent column was only a scratch input for the differential; remove it again
df_player4 = df_player4.drop(columns=["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj"])

In [551]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's % of 2nd-serve RETURN points won minus the opponent's % of 2nd-SERVE points won.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj")

In [552]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's % of 2nd-serve RETURN points won minus the opponent's % of 2nd-SERVE points won.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj")

In [553]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's % of 2nd-serve RETURN points won minus the opponent's % of 2nd-SERVE points won.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj")

In [554]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's % of 2nd-serve RETURN points won minus the opponent's % of 2nd-SERVE points won.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj")

In [555]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's % of 2nd-serve RETURN points won minus the opponent's % of 2nd-SERVE points won.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj")

In [556]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: take the value from the next row of the same m_num group,
# and where that is missing fall back to the previous row of the group.
# (Presumably each m_num group holds the two player rows of one match - not checked here.)
df_player4["p_opp_aced%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ace%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_aced%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l60_tw_ss_SOS_adj")

In [557]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ace%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_aced%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l10_tw_ss_SOS_adj")

In [558]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l60_tw_ss_IO_SOS_adj")

In [559]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l10_tw_ss_IO_SOS_adj")

In [560]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_aced%_l60_tw_ss_SOS_csp_adj"]
)

# Snapshot of the identifying columns, both inputs, the temporary opponent column and the result,
# taken while the temporary column still exists
df_diff_test = df_player4[[
    "p_nm",
    "opp_nm",
    "m_num",
    "p_ace%_l60_tw_ss_SOS_csp_adj",
    "p_aced%_l60_tw_ss_SOS_csp_adj",
    "p_opp_aced%_l60_tw_ss_SOS_csp_adj",
    "p_ace_opp_aced%_l60_tw_ss_SOS_csp_adj_diff",
]]

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l60_tw_ss_SOS_csp_adj")

In [561]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_aced%_l10_tw_ss_SOS_csp_adj"]
)

# Snapshot of the identifying columns, both inputs, the temporary opponent column and the result,
# taken while the temporary column still exists
df_diff_test = df_player4[[
    "p_nm",
    "opp_nm",
    "m_num",
    "p_ace%_l10_tw_ss_SOS_csp_adj",
    "p_aced%_l10_tw_ss_SOS_csp_adj",
    "p_opp_aced%_l10_tw_ss_SOS_csp_adj",
    "p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff",
]]

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l10_tw_ss_SOS_csp_adj")

In [562]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Snapshot of the identifying columns, both inputs, the temporary opponent column and the result,
# taken while the temporary column still exists
df_diff_test = df_player4[[
    "p_nm",
    "opp_nm",
    "m_num",
    "p_ace%_l60_tw_ss_IO_SOS_csp_adj",
    "p_aced%_l60_tw_ss_IO_SOS_csp_adj",
    "p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj",
    "p_ace_opp_aced%_l60_tw_ss_IO_SOS_csp_adj_diff",
]]

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l60_tw_ss_IO_SOS_csp_adj")

In [563]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's ACE % minus the opponent's ACED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_aced%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_aced%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_ace_opp_aced%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Snapshot of the identifying columns, both inputs, the temporary opponent column and the result,
# taken while the temporary column still exists
df_diff_test = df_player4[[
    "p_nm",
    "opp_nm",
    "m_num",
    "p_ace%_l10_tw_ss_IO_SOS_csp_adj",
    "p_aced%_l10_tw_ss_IO_SOS_csp_adj",
    "p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj",
    "p_ace_opp_aced%_l10_tw_ss_IO_SOS_csp_adj_diff",
]]

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_aced%_l10_tw_ss_IO_SOS_csp_adj")

In [564]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: take the value from the next row of the same m_num group,
# and where that is missing fall back to the previous row of the group.
# (Presumably each m_num group holds the two player rows of one match - not checked here.)
df_player4["p_opp_ace%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_aced%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ace%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l60_tw_ss_SOS_adj")

In [565]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_aced%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ace%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l10_tw_ss_SOS_adj")

In [566]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_aced%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l60_tw_ss_IO_SOS_adj")

In [567]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_aced%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l10_tw_ss_IO_SOS_adj")

In [568]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_aced%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ace%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l60_tw_ss_SOS_csp_adj")

In [569]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_aced%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ace%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l10_tw_ss_SOS_csp_adj")

In [570]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_aced%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l60_tw_ss_IO_SOS_csp_adj")

In [571]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's ACED % minus the opponent's ACE %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_aced_opp_ace%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_aced%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_ace%_l10_tw_ss_IO_SOS_csp_adj")

In [572]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: take the value from the next row of the same m_num group,
# and where that is missing fall back to the previous row of the group.
# (Presumably each m_num group holds the two player rows of one match - not checked here.)
df_player4["p_opp_df_induce%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_df%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l60_tw_ss_SOS_adj")

In [573]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_df%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l10_tw_ss_SOS_adj")

In [574]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l60_tw_ss_IO_SOS_adj")

In [575]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l10_tw_ss_IO_SOS_adj")

In [576]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l60_tw_ss_SOS_csp_adj")

In [577]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l10_tw_ss_SOS_csp_adj")


In [578]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj")

In [579]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULT % minus the opponent's DOUBLE FAULTS INDUCED %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_df_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj")

In [580]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: take the value from the next row of the same m_num group,
# and where that is missing fall back to the previous row of the group.
# (Presumably each m_num group holds the two player rows of one match - not checked here.)
df_player4["p_opp_df%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_df%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l60_tw_ss_SOS_adj")

In [581]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_df%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l10_tw_ss_SOS_adj")

In [582]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df%_l60_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l60_tw_ss_IO_SOS_adj")

In [583]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_df%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_df%_l10_tw_ss_IO_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l10_tw_ss_IO_SOS_adj")

In [584]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df%_l60_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l60_tw_ss_SOS_csp_adj")

In [585]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_df%_l10_tw_ss_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l10_tw_ss_SOS_csp_adj")

In [586]:
# "DEFENSE VS OFFENSE" matchup feature, last 60 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df%_l60_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l60_tw_ss_IO_SOS_csp_adj")

In [587]:
# "DEFENSE VS OFFENSE" matchup feature, last 10 matches:
# player's DOUBLE FAULTS INDUCED % minus the opponent's DOUBLE FAULT %.
# Variant: Surface-Specific, IO-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted, Court Speed Proxy Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_df%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_df%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_df%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_df_induce%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_df%_l10_tw_ss_IO_SOS_csp_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_df%_l10_tw_ss_IO_SOS_csp_adj")

In [588]:
# "OFFENSE VS DEFENSE" matchup feature, last 60 matches:
# player's BREAK POINTS SAVED % minus the opponent's BREAK POINTS CONVERTED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: take the value from the next row of the same m_num group,
# and where that is missing fall back to the previous row of the group.
# (Presumably each m_num group holds the two player rows of one match - not checked here.)
df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_bpsave_opp_bpconv%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_bp_conv%_l60_tw_ss_SOS_adj")

In [589]:
# "OFFENSE VS DEFENSE" matchup feature, last 10 matches:
# player's BREAK POINTS SAVED % minus the opponent's BREAK POINTS CONVERTED %.
# Variant: Surface-Specific, Decay Time-Weighted, Strength-of-Schedule Adjusted

# Temporary opponent column: next row within the same m_num group, else the previous row
df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's own stat minus the opponent's counterpart stat
df_player4["p_bpsave_opp_bpconv%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_adj"]
)

# The opponent column was only needed to compute the differential
df_player4 = df_player4.drop(columns="p_opp_bp_conv%_l10_tw_ss_SOS_adj")

In [590]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 60 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_IO_SOS_adj"])

In [591]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 10 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_IO_SOS_adj"])

In [592]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 60 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_SOS_csp_adj"])

In [593]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 10 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_SOS_csp_adj"])

In [594]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 60 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"])

In [595]:
# "OFFENSE VS DEFENSE": PLAYER % BREAK POINTS SAVED VS OPPONENT % BREAK POINTS CONVERTED in last 10 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points saved minus opponent's % break points converted.
df_player4["p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"])

In [596]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 60 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l60_tw_ss_SOS_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_SOS_adj"])

In [597]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 10 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l10_tw_ss_SOS_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_SOS_adj"])

In [598]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 60 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_IO_SOS_adj"])

In [599]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 10 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_IO_SOS_adj"])

In [600]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 60 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_SOS_csp_adj"])

In [601]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 10 matches 
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_SOS_csp_adj"])

In [602]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 60 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l60_tw_ss_IO_SOS_csp_adj"])

In [603]:
# "DEFENSE VS OFFENSE": PLAYER % BREAK POINTS CONVERTED VS OPPONENT % BREAK POINTS SAVED in last 10 matches 
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_bp_save%_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's % break points converted minus opponent's % break points saved.
df_player4["p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_bp_save%_l10_tw_ss_IO_SOS_csp_adj"])

Player differentials for implied win probabilities derived from historical wagering lines.

In [604]:
# Differential of averaged implied odds (AVG_C_IP) over the last 60 matches.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l60_tw_ss_SOS_adj_diff"] = df_player4["p_AVG_C_IP_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_AVG_C_IP_l60_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l60_tw_ss_SOS_adj"])

In [605]:
# Differential of averaged implied odds (AVG_C_IP) over the last 10 matches.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l10_tw_ss_SOS_adj_diff"] = df_player4["p_AVG_C_IP_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_AVG_C_IP_l10_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l10_tw_ss_SOS_adj"])

In [606]:
# Differential of averaged implied odds (AVG_C_IP) over the last 60 matches.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# Both lookups must read the IO-specific column; falling back to the non-IO
# 'p_AVG_C_IP_l60_tw_ss_SOS_adj' column would mix two different features in one differential.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l60_tw_ss_IO_SOS_adj_diff"] = (
    df_player4["p_AVG_C_IP_l60_tw_ss_IO_SOS_adj"] - df_player4["p_opp_AVG_C_IP_l60_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l60_tw_ss_IO_SOS_adj"])

In [607]:
# Differential of averaged implied odds (AVG_C_IP) over the last 10 matches.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# Both lookups must read the IO-specific column; falling back to the non-IO
# 'p_AVG_C_IP_l10_tw_ss_SOS_adj' column would mix two different features in one differential.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l10_tw_ss_IO_SOS_adj_diff"] = (
    df_player4["p_AVG_C_IP_l10_tw_ss_IO_SOS_adj"] - df_player4["p_opp_AVG_C_IP_l10_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l10_tw_ss_IO_SOS_adj"])

In [608]:
# Differential of averaged implied odds (AVG_C_IP) over the last 60 matches.
# Variant: NON-surface-specific (nss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l60_tw_nss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_nss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l60_tw_nss_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l60_tw_nss_SOS_adj_diff"] = df_player4["p_AVG_C_IP_l60_tw_nss_SOS_adj"].sub(
    df_player4["p_opp_AVG_C_IP_l60_tw_nss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l60_tw_nss_SOS_adj"])

In [609]:
# Differential of averaged implied odds (AVG_C_IP) over the last 10 matches.
# Variant: NON-surface-specific (nss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_AVG_C_IP_l10_tw_nss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_nss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_AVG_C_IP_l10_tw_nss_SOS_adj"].shift(1))
)

# Differential: player's value minus opponent's value.
df_player4["p_AVG_C_IP_l10_tw_nss_SOS_adj_diff"] = df_player4["p_AVG_C_IP_l10_tw_nss_SOS_adj"].sub(
    df_player4["p_opp_AVG_C_IP_l10_tw_nss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_AVG_C_IP_l10_tw_nss_SOS_adj"])

Now we compute player-vs-opponent differentials, within each match, for the serve efficiency ratios created earlier.

In [610]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss"])

In [611]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss"])

In [612]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_IO_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_IO_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss_IO"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss_IO"])

In [613]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_IO_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_IO_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss_IO"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss_IO"])

In [614]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss_SOS_adj"])

In [615]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss_SOS_adj"])

In [616]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_adj"])

In [617]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_adj"])

In [618]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.
# NOTE: the output name orders its suffix as "SOS_adj_csp" while the input column uses
# "SOS_csp_adj"; the output name is kept unchanged because later code may select it by name.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_SOS_adj_csp_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss_SOS_csp_adj"])

In [619]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) and
# court-speed-proxy (csp) adjusted.
# NOTE: the output name orders its suffix as "SOS_adj_csp" while the input column uses
# "SOS_csp_adj"; the output name is kept unchanged because later code may select it by name.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_SOS_adj_csp_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss_SOS_csp_adj"])

In [620]:
# Feature: 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.
# NOTE: the output name orders its suffix as "SOS_adj_csp" while the input column uses
# "SOS_csp_adj"; the output name is kept unchanged because later code may select it by name.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

In [621]:
# Feature: 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of the % aces to % double faults ratio, taken over the
# last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw),
# strength-of-schedule (SOS) and court-speed-proxy (csp) adjusted.
# NOTE: the output name orders its suffix as "SOS_adj_csp" while the input column uses
# "SOS_csp_adj"; the output name is kept unchanged because later code may select it by name.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"].sub(
    df_player4["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_ace_df%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

In [622]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss"])

In [623]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss"])

In [624]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO"])

In [625]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), indoor/outdoor-specific (IO), time-weighted (tw).

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO"])

In [626]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 60 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj"])

In [627]:
# Feature: 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of the first-serve ratio column (relating % first-serve
# points won and % first serves in), taken over the last 10 matches prior to the match being predicted.
# Variant: surface-specific (ss), time-weighted (tw), strength-of-schedule (SOS) adjusted.

# Opponent value, held in a temporary column: within each m_num group use the next row's
# value, and where no next row exists fall back to the previous row's value.
# NOTE(review): presumes each m_num group holds exactly the two player rows of one match - confirm upstream.
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby("m_num")["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: player's ratio minus opponent's ratio.
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"].sub(
    df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"]
)

# Discard the temporary opponent column.
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj"])

In [628]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % first serves in to % first serve points won over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"])

In [629]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % first serves in to % first serve points won over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"])

In [630]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % first serves in to % first serve points won over the last 60 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_csp_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"])

In [631]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % first serves in to % first serve points won over the last 10 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_csp_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"])

In [632]:
# 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % first serves in to % first serve points won over the last 60 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

In [633]:
# 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % first serves in to % first serve points won over the last 10 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

In [634]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss' (ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss"])

In [635]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss' (ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss"])

In [636]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss_IO' (IO variant of the ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO"])

In [637]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss_IO' (IO variant of the ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO"])

In [638]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj' (SOS_adj variant of the ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_adj"])

In [639]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj' (SOS_adj variant of the ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_adj"])

In [640]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj"])

In [641]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj"])

In [642]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj"])

In [643]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj"])

In [644]:
# 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % aces to % first serves in over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

In [645]:
# 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_diff'
# Player-minus-opponent differential of 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % aces to % first serves in over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_diff"] = df_player4["p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

In [646]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss' (ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss"])

In [647]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss' (ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss"])

In [648]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO' (IO variant of the ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO"])

In [649]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO' (IO variant of the ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO"])

In [650]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj' (SOS_adj variant of the ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj"])

In [651]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj' (SOS_adj variant of the ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj"])

In [652]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj"])

In [653]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj' (IO_SOS_adj variant of the ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj"])

In [654]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_csp_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_SOS_csp_adj"])

In [655]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj' (SOS_csp_adj variant of the ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_csp_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_SOS_csp_adj"])

In [656]:
# 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % double faults to % serve points won over the last 60 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_csp_adj"])

In [657]:
# 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff'
# Player-minus-opponent differential of 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj' (IO_SOS_csp_adj variant of the ratio of % double faults to % serve points won over the last 10 surface-specific matches prior to the match being predicted)
# NOTE(review): the column created here ends in '_SOS_adj_csp_diff', whereas other differentials in this notebook (e.g. 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_diff') end in '_SOS_csp_adj_diff' -- confirm which spelling downstream code expects before renaming

# Opponent value: the other row of the same match (m_num) -- the next row in the group, else the previous row
# NOTE(review): assumes exactly two rows per m_num (one per player) -- confirm upstream
df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"] = (
    df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(-1)
    .fillna(df_player4.groupby(["m_num"])["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"].shift(1))
)

# Differential: own value minus opponent value
df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff"] = df_player4["p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"].sub(df_player4["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

# The opponent column was only needed for the subtraction
df_player4 = df_player4.drop(columns=["p_opp_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_csp_adj"])

### 8. Save Data for EDA Stage

In [658]:
# Save All Features
# NOTE(review): the export below is commented out, so this cell currently writes nothing.
# Uncomment both lines to write the full df_player4 frame to ../data/df_player_all_for_EDA.csv.
#df_player_all_for_EDA = df_player4
#df_player_all_for_EDA.to_csv('../data/df_player_all_for_EDA.csv', index=False)

In [659]:
# Save Differential Predictive Features & Target Feature Only (Plus tournament and match-level 'metadata' Predictive Features and metadata useful for filtering and benchmarking)
#df_player_differential_EDA = df_player4[["p_tot_pts_won%", "m_outcome", "p_matches_ss", "p_matches_nss", "p_AVG_C_IP", "p_PS_C_IP", "p_PS_O_IP", "t_surf", "t_ind", "t_alt", "t_draw_sz", "t_lvl", "t_1st_sv_in%_ratio", "t_1st_sv_in%_yielded_ratio", "t_sv_pts_won%_ratio", "t_ret_pts_won%_ratio", "t_1st_sv_pts_won%_ratio", "t_1st_ret_pts_won%_ratio", "t_2nd_sv_pts_won%_ratio", "t_2nd_ret_pts_won%_ratio", "t_ace%_ratio", "t_aced%_ratio", "t_df%_ratio", "t_df_induce%_ratio", "t_bp_save%_ratio", "t_bp_conv%_ratio", "m_bestof", "m_num", "m_date", "m_yr", "m_rd_num", "p_rk_diff", "p_log_rk_diff", "p_rk_pts_diff", "p_ent_diff", "p_hd_diff", "p_ht_diff", "p_age_diff", "p_HCA_diff", "p_tot_time_l7d_tw_diff", "p_tot_pts_l7d_tw_diff", "p_body_battery_t_tw_diff", "p_body_battery_pts_tw_diff", "p_matches_ss_diff", "p_matches_nss_diff", "p_surf_chg_diff", "p_tz_chg_diff", "p_H2H_w_ss_diff", "p_H2H_w_nss_diff", "p_H2H_tot_pts_won%_ss_diff", "p_H2H_tot_pts_won%_nss_diff", "p_tot_pts_won%_l60_tw_ss_diff", "p_tot_pts_won%_l10_tw_ss_diff", "p_tot_pts_won%_l60_tw_ss_SOS_adj_diff", "p_tot_pts_won%_l10_tw_ss_SOS_adj_diff", "p_tot_pts_won%_l60_tw_ss_IO_diff", "p_tot_pts_won%_l10_tw_ss_IO_diff", "p_tot_pts_won%_l60_tw_ss_IO_SOS_adj_diff", "p_tot_pts_won%_l10_tw_ss_IO_SOS_adj_diff", "p_sv_pts_won%_l60_tw_ss_diff", "p_sv_pts_won%_l10_tw_ss_diff", "p_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_sv_pts_won%_l60_tw_ss_diff", "p_1st_sv_pts_won%_l10_tw_ss_diff", "p_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_2nd_sv_pts_won%_l60_tw_ss_diff", "p_2nd_sv_pts_won%_l10_tw_ss_diff", "p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", 
"p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_sv%_l60_tw_ss_diff", "p_1st_sv%_l10_tw_ss_diff", "p_1st_sv%_l60_tw_ss_SOS_adj_diff", "p_1st_sv%_l10_tw_ss_SOS_adj_diff", "p_1st_sv%_l60_tw_ss_SOS_csp_adj_diff", "p_1st_sv%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_sv%_yielded_l60_tw_ss_diff", "p_1st_sv%_yielded_l10_tw_ss_diff", "p_1st_sv%_yielded_l60_tw_ss_SOS_adj_diff", "p_1st_sv%_yielded_l10_tw_ss_SOS_adj_diff", "p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj_diff", "p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj_diff", "p_ret_pts_won%_l60_tw_ss_diff", "p_ret_pts_won%_l10_tw_ss_diff", "p_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_ret_pts_won%_l60_tw_ss_diff", "p_1st_ret_pts_won%_l10_tw_ss_diff", "p_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_2nd_ret_pts_won%_l60_tw_ss_diff", "p_2nd_ret_pts_won%_l10_tw_ss_diff", "p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_ace%_l60_tw_ss_diff", "p_ace%_l10_tw_ss_diff", "p_ace%_l60_tw_ss_SOS_adj_diff", "p_ace%_l10_tw_ss_SOS_adj_diff", "p_ace%_l60_tw_ss_SOS_csp_adj_diff", "p_ace%_l10_tw_ss_SOS_csp_adj_diff", "p_aced%_l60_tw_ss_diff", "p_aced%_l10_tw_ss_diff", "p_aced%_l60_tw_ss_SOS_adj_diff", "p_aced%_l10_tw_ss_SOS_adj_diff", "p_aced%_l60_tw_ss_SOS_csp_adj_diff", "p_aced%_l10_tw_ss_SOS_csp_adj_diff", "p_df%_l60_tw_ss_diff", "p_df%_l10_tw_ss_diff", "p_df%_l60_tw_ss_SOS_adj_diff", "p_df%_l10_tw_ss_SOS_adj_diff", "p_df%_l60_tw_ss_SOS_csp_adj_diff", "p_df%_l10_tw_ss_SOS_csp_adj_diff", "p_df_induce%_l60_tw_ss_diff", "p_df_induce%_l10_tw_ss_diff", "p_df_induce%_l60_tw_ss_SOS_adj_diff", "p_df_induce%_l10_tw_ss_SOS_adj_diff", 
"p_df_induce%_l60_tw_ss_SOS_csp_adj_diff", "p_df_induce%_l10_tw_ss_SOS_csp_adj_diff", "p_bp_save%_l60_tw_ss_diff", "p_bp_save%_l10_tw_ss_diff", "p_bp_save%_l60_tw_ss_SOS_adj_diff", "p_bp_save%_l10_tw_ss_SOS_adj_diff", "p_bp_save%_l60_tw_ss_SOS_csp_adj_diff", "p_bp_save%_l10_tw_ss_SOS_csp_adj_diff", "p_bp_conv%_l60_tw_ss_diff", "p_bp_conv%_l10_tw_ss_diff", "p_bp_conv%_l60_tw_ss_SOS_adj_diff", "p_bp_conv%_l10_tw_ss_SOS_adj_diff", "p_bp_conv%_l60_tw_ss_SOS_csp_adj_diff", "p_bp_conv%_l10_tw_ss_SOS_csp_adj_diff", "p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff", "p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff", "p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff", "p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff", "p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff", "p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff", "p_ace_opp_aced%_l60_tw_ss_SOS_adj_diff", "p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff", "p_ace_opp_aced%_l60_tw_ss_SOS_csp_adj_diff", 
"p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff", "p_aced_opp_ace%_l60_tw_ss_SOS_adj_diff", "p_aced_opp_ace%_l10_tw_ss_SOS_adj_diff", "p_aced_opp_ace%_l60_tw_ss_SOS_csp_adj_diff", "p_aced_opp_ace%_l10_tw_ss_SOS_csp_adj_diff", "p_df_opp_df_induce%_l60_tw_ss_SOS_adj_diff", "p_df_opp_df_induce%_l10_tw_ss_SOS_adj_diff", "p_df_opp_df_induce%_l60_tw_ss_SOS_csp_adj_diff", "p_df_opp_df_induce%_l10_tw_ss_SOS_csp_adj_diff", "p_dfinduce_opp_df%_l60_tw_ss_SOS_adj_diff", "p_dfinduce_opp_df%_l10_tw_ss_SOS_adj_diff", "p_dfinduce_opp_df%_l60_tw_ss_SOS_csp_adj_diff", "p_dfinduce_opp_df%_l10_tw_ss_SOS_csp_adj_diff", "p_bpsave_opp_bpconv%_l60_tw_ss_SOS_adj_diff", "p_bpsave_opp_bpconv%_l10_tw_ss_SOS_adj_diff", "p_bpsave_opp_bpconv%_l60_tw_ss_SOS_csp_adj_diff", "p_bpsave_opp_bpconv%_l10_tw_ss_SOS_csp_adj_diff", "p_bpconv_opp_bpsave%_l60_tw_ss_SOS_adj_diff", "p_bpconv_opp_bpsave%_l10_tw_ss_SOS_adj_diff", "p_bpconv_opp_bpsave%_l60_tw_ss_SOS_csp_adj_diff", "p_bpconv_opp_bpsave%_l10_tw_ss_SOS_csp_adj_diff", "p_AVG_C_IP_l60_tw_ss_SOS_adj_diff", "p_AVG_C_IP_l10_tw_ss_SOS_adj_diff", "p_AVG_C_IP_l60_tw_ss_IO_SOS_adj_diff", "p_AVG_C_IP_l10_tw_ss_IO_SOS_adj_diff", "p_AVG_C_IP_l60_tw_nss_SOS_adj_diff", "p_AVG_C_IP_l10_tw_nss_SOS_adj_diff", "p_ace_df%_ratio_l60_tw_ss_diff", "p_ace_df%_ratio_l10_tw_ss_diff", "p_ace_df%_ratio_l60_tw_ss_SOS_adj_diff", "p_ace_df%_ratio_l10_tw_ss_SOS_adj_diff", "p_ace_df%_ratio_l60_tw_ss_SOS_adj_csp_diff", "p_ace_df%_ratio_l10_tw_ss_SOS_adj_csp_diff", "p_1stSvWon_1stSv%_ratio_l60_tw_ss_diff", "p_1stSvWon_1stSv%_ratio_l10_tw_ss_diff", "p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_diff", "p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_diff", "p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_csp_diff", "p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_csp_diff", "p_ace_1stSv%_ratio_l60_tw_ss_diff", "p_ace_1stSv%_ratio_l10_tw_ss_diff", "p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj_diff", "p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj_diff", "p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff", 
"p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff", "p_df_SvPtsWon%_ratio_l60_tw_ss_diff", "p_df_SvPtsWon%_ratio_l10_tw_ss_diff", "p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_diff", "p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_diff", "p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_csp_diff", "p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_csp_diff"]]
#df_player_differential_EDA.to_csv('../data/df_player_differential_EDA.csv', index=False)

In [660]:
# Creates a dataframe containing ranking-related features (and the target feature) for dummy and benchmark model testing.
# Also contains the features necessary for thresholding the minimum number of matches played by a player prior to the match being predicted, and
# for restricting modeling to 2012 and onward (2009-2012 used to accrue retrospective stats/features)
#df_player_benchmark = df_player6[["p_pts_won%", "p_rank", "p_log_rank", "p_rank_pts", "p_opp_rank_diff", "p_opp_log_rank_diff","p_opp_rank_pts_diff", "p_matches_surf", "t_indoor", "m_num", "tour_wk"]]

### 9. Dataframes for Predictive Modeling 

What we want in the main dataframe(s) for the modeling stage is: 1) the target feature ('p_tot_pts_won%'), 2) metadata features, and 3) all (putatively) PREDICTIVE features. What we DO NOT want are features pertinent to the match being predicted that would constitute data leakage. For example, we do not want the closing wagering lines (or the implied win percentages derived from them) for the match being predicted, nor the raw match stats from the match being predicted. We DO, of course, want the features we created from statistics of past matches (relative to each match being predicted). 

We will also drop "raw" player-level predictive features and keep only the "differential" forms, which is to say the difference between the two players in a given match to be predicted. A few exceptions include player entry type, where it's not clear how meaningful a differential version is (though a differential version is also included, with numerical assignments based on trial and error). 

Some metadata features we do want to keep for now for potential additional upfront filtering in the modeling stage, but might jettison before we really get down to business (e.g., player and tournament names and identifiers, match dates and unique match number identifier). 

In [661]:
#df_for_modeling = df_player4[['p_tot_pts_won%', 't_ident', 't_surf', 't_ind', 't_alt', 't_draw_sz', 't_lvl', 'm_bestof', 'm_num', 'm_date', 'm_yr', 'm_rd_num', 'p_id', 'p_ent', 'p_age', 't_1st_sv_in%_ratio', 't_1st_sv_in%_yielded_ratio', 't_sv_pts_won%_ratio', 't_ret_pts_won%_ratio', 't_1st_sv_pts_won%_ratio', 't_1st_ret_pts_won%_ratio', 't_2nd_sv_pts_won%_ratio', 't_2nd_ret_pts_won%_ratio', 't_ace%_ratio', 't_aced%_ratio', 't_df%_ratio', 't_df_induce%_ratio', 't_bp_save%_ratio', 't_bp_conv%_ratio', 'p_rk_diff', 'p_log_rk_diff', 'p_rk_pts_diff', 'p_ent_diff', 'p_hd_diff', 'p_ht_diff', 'p_age_diff', 'p_HCA', 'p_HCA_diff', 'p_tot_time_l7d_tw_diff', 'p_tot_pts_l7d_tw_diff', 'p_body_battery_t_tw_diff', 'p_body_battery_pts_tw_diff', 'p_matches_ss_diff', 'p_matches_nss_diff', 'p_surf_chg_diff', 'p_tz_chg_diff', 'p_H2H_w_ss_diff', 'p_H2H_w_nss_diff', 'p_H2H_tot_pts_won%_ss_diff', 'p_H2H_tot_pts_won%_nss_diff', 'p_tot_pts_won%_l60_tw_ss_diff', 'p_tot_pts_won%_l10_tw_ss_diff', 'p_tot_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_tot_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_tot_pts_won%_l60_tw_ss_IO_diff', 'p_tot_pts_won%_l10_tw_ss_IO_diff', 'p_tot_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_tot_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_sv_pts_won%_l60_tw_ss_diff', 'p_sv_pts_won%_l10_tw_ss_diff', 'p_sv_pts_won%_l60_tw_ss_IO_diff', 'p_sv_pts_won%_l10_tw_ss_IO_diff', 'p_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l60_tw_ss_diff', 'p_1st_sv_pts_won%_l10_tw_ss_diff', 'p_1st_sv_pts_won%_l60_tw_ss_IO_diff', 'p_1st_sv_pts_won%_l10_tw_ss_IO_diff', 'p_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 
'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_IO_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_l60_tw_ss_diff', 'p_1st_sv%_l10_tw_ss_diff', 'p_1st_sv%_l60_tw_ss_IO_diff', 'p_1st_sv%_l10_tw_ss_IO_diff', 'p_1st_sv%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv%_l10_tw_ss_SOS_adj_diff', 'p_1st_sv%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_sv%_l10_tw_ss_IO_SOS_adj_diff', 'p_1st_sv%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l60_tw_ss_diff', 'p_1st_sv%_yielded_l10_tw_ss_diff', 'p_1st_sv%_yielded_l60_tw_ss_IO_diff', 'p_1st_sv%_yielded_l10_tw_ss_IO_diff', 'p_1st_sv%_yielded_l60_tw_ss_SOS_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_SOS_adj_diff', 'p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_pts_won%_l60_tw_ss_diff', 'p_ret_pts_won%_l10_tw_ss_diff', 'p_ret_pts_won%_l60_tw_ss_IO_diff', 'p_ret_pts_won%_l10_tw_ss_IO_diff', 'p_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 
'p_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l60_tw_ss_diff', 'p_1st_ret_pts_won%_l10_tw_ss_diff', 'p_1st_ret_pts_won%_l60_tw_ss_IO_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_diff', 'p_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_IO_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_ace%_l60_tw_ss_diff', 'p_ace%_l10_tw_ss_diff', 'p_ace%_l60_tw_ss_IO_diff', 'p_ace%_l10_tw_ss_IO_diff', 'p_ace%_l60_tw_ss_SOS_adj_diff', 'p_ace%_l10_tw_ss_SOS_adj_diff', 'p_ace%_l60_tw_ss_IO_SOS_adj_diff', 'p_ace%_l10_tw_ss_IO_SOS_adj_diff', 'p_ace%_l60_tw_ss_SOS_csp_adj_diff', 'p_ace%_l10_tw_ss_SOS_csp_adj_diff', 'p_ace%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_aced%_l60_tw_ss_diff', 'p_aced%_l10_tw_ss_diff', 'p_aced%_l60_tw_ss_IO_diff', 'p_aced%_l10_tw_ss_IO_diff', 'p_aced%_l60_tw_ss_SOS_adj_diff', 'p_aced%_l10_tw_ss_SOS_adj_diff', 'p_aced%_l60_tw_ss_IO_SOS_adj_diff', 'p_aced%_l10_tw_ss_IO_SOS_adj_diff', 'p_aced%_l60_tw_ss_SOS_csp_adj_diff', 'p_aced%_l10_tw_ss_SOS_csp_adj_diff', 'p_aced%_l60_tw_ss_IO_SOS_csp_adj_diff', 
'p_aced%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_df%_l60_tw_ss_diff', 'p_df%_l10_tw_ss_diff', 'p_df%_l60_tw_ss_IO_diff', 'p_df%_l10_tw_ss_IO_diff', 'p_df%_l60_tw_ss_SOS_adj_diff', 'p_df%_l10_tw_ss_SOS_adj_diff', 'p_df%_l60_tw_ss_IO_SOS_adj_diff', 'p_df%_l10_tw_ss_IO_SOS_adj_diff', 'p_df%_l60_tw_ss_SOS_csp_adj_diff', 'p_df%_l10_tw_ss_SOS_csp_adj_diff', 'p_df%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_df_induce%_l60_tw_ss_diff', 'p_df_induce%_l10_tw_ss_diff', 'p_df_induce%_l60_tw_ss_IO_diff', 'p_df_induce%_l10_tw_ss_IO_diff', 'p_df_induce%_l60_tw_ss_SOS_adj_diff', 'p_df_induce%_l10_tw_ss_SOS_adj_diff', 'p_df_induce%_l60_tw_ss_IO_SOS_adj_diff', 'p_df_induce%_l10_tw_ss_IO_SOS_adj_diff', 'p_df_induce%_l60_tw_ss_SOS_csp_adj_diff', 'p_df_induce%_l10_tw_ss_SOS_csp_adj_diff', 'p_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_save%_l60_tw_ss_diff', 'p_bp_save%_l10_tw_ss_diff', 'p_bp_save%_l60_tw_ss_IO_diff', 'p_bp_save%_l10_tw_ss_IO_diff', 'p_bp_save%_l60_tw_ss_SOS_adj_diff', 'p_bp_save%_l10_tw_ss_SOS_adj_diff', 'p_bp_save%_l60_tw_ss_IO_SOS_adj_diff', 'p_bp_save%_l10_tw_ss_IO_SOS_adj_diff', 'p_bp_save%_l60_tw_ss_SOS_csp_adj_diff', 'p_bp_save%_l10_tw_ss_SOS_csp_adj_diff', 'p_bp_save%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_save%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_conv%_l60_tw_ss_diff', 'p_bp_conv%_l10_tw_ss_diff', 'p_bp_conv%_l60_tw_ss_IO_diff', 'p_bp_conv%_l10_tw_ss_IO_diff', 'p_bp_conv%_l60_tw_ss_SOS_adj_diff', 'p_bp_conv%_l10_tw_ss_SOS_adj_diff', 'p_bp_conv%_l60_tw_ss_IO_SOS_adj_diff', 'p_bp_conv%_l10_tw_ss_IO_SOS_adj_diff', 'p_bp_conv%_l60_tw_ss_SOS_csp_adj_diff', 'p_bp_conv%_l10_tw_ss_SOS_csp_adj_diff', 'p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 
'p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 
'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_ace_opp_aced%_l60_tw_ss_SOS_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_SOS_adj_diff', 'p_ace_opp_aced%_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_IO_SOS_adj_diff', 'p_ace_opp_aced%_l60_tw_ss_SOS_csp_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff', 'p_ace_opp_aced%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_aced_opp_ace%_l60_tw_ss_SOS_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_SOS_adj_diff', 'p_aced_opp_ace%_l60_tw_ss_IO_SOS_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_IO_SOS_adj_diff', 'p_aced_opp_ace%_l60_tw_ss_SOS_csp_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_SOS_csp_adj_diff', 'p_aced_opp_ace%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l60_tw_ss_SOS_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_SOS_adj_diff', 'p_df_opp_df_induce%_l60_tw_ss_IO_SOS_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_IO_SOS_adj_diff', 'p_df_opp_df_induce%_l60_tw_ss_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_dfinduce_opp_df%_l60_tw_ss_SOS_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_SOS_adj_diff', 'p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_adj_diff', 'p_dfinduce_opp_df%_l60_tw_ss_SOS_csp_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_SOS_csp_adj_diff', 'p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_csp_adj_diff', 
'p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l60_tw_ss_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l60_tw_ss_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l60_tw_ss_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l60_tw_ss_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_csp_adj_diff', 'p_AVG_C_IP_l60_tw_ss_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_ss_SOS_adj_diff', 'p_AVG_C_IP_l60_tw_ss_IO_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_ss_IO_SOS_adj_diff', 'p_AVG_C_IP_l60_tw_nss_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_nss_SOS_adj_diff', 'p_ace_df%_ratio_l60_tw_ss_diff', 'p_ace_df%_ratio_l10_tw_ss_diff', 'p_ace_df%_ratio_l60_tw_ss_IO_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_diff', 'p_ace_df%_ratio_l60_tw_ss_SOS_adj_diff', 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_diff', 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_diff', 'p_ace_df%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_csp_diff', 'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_diff', 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_diff', 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_diff', 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff', 
'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_IO_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff', 'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_d', 'p_df_SvPtsWon%_ratio_l60_tw_ss_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_diff', 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_diff', 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_csp_diff', 'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff']]

In [662]:
# Assemble the modeling dataframe by selecting, in a fixed column order, the target
# feature, the metadata columns, and the "_diff" (player-vs-opponent differential)
# features from df_player4. Columns are laid out one l60/l10 pair per line and grouped
# by column-name prefix so that a missing or misspelled name is easy to spot.
df_for_modeling = df_player4[[
    # --- target feature, then tournament / match / player metadata ---
    'p_tot_pts_won%',
    't_ident', 't_surf', 't_ind', 't_alt', 't_draw_sz', 't_lvl',
    'm_bestof', 'm_num', 'm_date', 'm_yr', 'm_rd_num',
    'p_id', 'p_ent', 'p_age', 'p_matches_ss', 'p_matches_nss',
    # --- 't_*_ratio' columns ---
    't_1st_sv_in%_ratio', 't_1st_sv_in%_yielded_ratio',
    't_sv_pts_won%_ratio', 't_ret_pts_won%_ratio',
    't_1st_sv_pts_won%_ratio', 't_1st_ret_pts_won%_ratio',
    't_2nd_sv_pts_won%_ratio', 't_2nd_ret_pts_won%_ratio',
    't_ace%_ratio', 't_aced%_ratio',
    't_df%_ratio', 't_df_induce%_ratio',
    't_bp_save%_ratio', 't_bp_conv%_ratio',
    # --- ranking / entry / physical / home-court columns ---
    'p_rk_diff', 'p_log_rk_diff', 'p_rk_pts_diff',
    'p_ent_diff', 'p_hd_diff', 'p_ht_diff', 'p_age_diff',
    'p_HCA', 'p_HCA_diff',
    # --- recent workload, match-count and change columns ---
    'p_tot_time_l7d_tw_diff', 'p_tot_pts_l7d_tw_diff',
    'p_body_battery_t_tw_diff', 'p_body_battery_pts_tw_diff',
    'p_matches_ss_diff', 'p_matches_nss_diff',
    'p_surf_chg_diff', 'p_tz_chg_diff',
    # --- head-to-head columns ---
    'p_H2H_w_ss_diff', 'p_H2H_w_nss_diff',
    'p_H2H_tot_pts_won%_ss_diff', 'p_H2H_tot_pts_won%_nss_diff',
    # --- 'p_tot_pts_won%' family (note: SOS_adj precedes IO here; includes 'comp' variants) ---
    'p_tot_pts_won%_l60_tw_ss_diff', 'p_tot_pts_won%_l10_tw_ss_diff',
    'p_tot_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_tot_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_tot_pts_won%_l60_tw_ss_IO_diff', 'p_tot_pts_won%_l10_tw_ss_IO_diff',
    'p_tot_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_tot_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_tot_pts_won%_l60_tw_ss_comp_diff', 'p_tot_pts_won%_l10_tw_ss_comp_diff',
    'p_tot_pts_won%_l60_tw_ss_SOS_comp_adj_diff', 'p_tot_pts_won%_l10_tw_ss_SOS_comp_adj_diff',
    # --- 'p_sv_pts_won%' family ---
    'p_sv_pts_won%_l60_tw_ss_diff', 'p_sv_pts_won%_l10_tw_ss_diff',
    'p_sv_pts_won%_l60_tw_ss_IO_diff', 'p_sv_pts_won%_l10_tw_ss_IO_diff',
    'p_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_sv_pts_won%' family ---
    'p_1st_sv_pts_won%_l60_tw_ss_diff', 'p_1st_sv_pts_won%_l10_tw_ss_diff',
    'p_1st_sv_pts_won%_l60_tw_ss_IO_diff', 'p_1st_sv_pts_won%_l10_tw_ss_IO_diff',
    'p_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_2nd_sv_pts_won%' family ---
    'p_2nd_sv_pts_won%_l60_tw_ss_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_diff',
    'p_2nd_sv_pts_won%_l60_tw_ss_IO_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_diff',
    'p_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_sv%' family ---
    'p_1st_sv%_l60_tw_ss_diff', 'p_1st_sv%_l10_tw_ss_diff',
    'p_1st_sv%_l60_tw_ss_IO_diff', 'p_1st_sv%_l10_tw_ss_IO_diff',
    'p_1st_sv%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv%_l10_tw_ss_SOS_adj_diff',
    'p_1st_sv%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_sv%_l10_tw_ss_IO_SOS_adj_diff',
    'p_1st_sv%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_sv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_sv%_yielded' family (no IO_SOS_adj pair is selected for this family) ---
    'p_1st_sv%_yielded_l60_tw_ss_diff', 'p_1st_sv%_yielded_l10_tw_ss_diff',
    'p_1st_sv%_yielded_l60_tw_ss_IO_diff', 'p_1st_sv%_yielded_l10_tw_ss_IO_diff',
    'p_1st_sv%_yielded_l60_tw_ss_SOS_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_SOS_adj_diff',
    'p_1st_sv%_yielded_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_sv%_yielded_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv%_yielded_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_ret_pts_won%' family ---
    'p_ret_pts_won%_l60_tw_ss_diff', 'p_ret_pts_won%_l10_tw_ss_diff',
    'p_ret_pts_won%_l60_tw_ss_IO_diff', 'p_ret_pts_won%_l10_tw_ss_IO_diff',
    'p_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_ret_pts_won%' family ---
    'p_1st_ret_pts_won%_l60_tw_ss_diff', 'p_1st_ret_pts_won%_l10_tw_ss_diff',
    'p_1st_ret_pts_won%_l60_tw_ss_IO_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_diff',
    'p_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_2nd_ret_pts_won%' family ---
    'p_2nd_ret_pts_won%_l60_tw_ss_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_diff',
    'p_2nd_ret_pts_won%_l60_tw_ss_IO_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_diff',
    'p_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_ace%' family ---
    'p_ace%_l60_tw_ss_diff', 'p_ace%_l10_tw_ss_diff',
    'p_ace%_l60_tw_ss_IO_diff', 'p_ace%_l10_tw_ss_IO_diff',
    'p_ace%_l60_tw_ss_SOS_adj_diff', 'p_ace%_l10_tw_ss_SOS_adj_diff',
    'p_ace%_l60_tw_ss_IO_SOS_adj_diff', 'p_ace%_l10_tw_ss_IO_SOS_adj_diff',
    'p_ace%_l60_tw_ss_SOS_csp_adj_diff', 'p_ace%_l10_tw_ss_SOS_csp_adj_diff',
    'p_ace%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_aced%' family ---
    'p_aced%_l60_tw_ss_diff', 'p_aced%_l10_tw_ss_diff',
    'p_aced%_l60_tw_ss_IO_diff', 'p_aced%_l10_tw_ss_IO_diff',
    'p_aced%_l60_tw_ss_SOS_adj_diff', 'p_aced%_l10_tw_ss_SOS_adj_diff',
    'p_aced%_l60_tw_ss_IO_SOS_adj_diff', 'p_aced%_l10_tw_ss_IO_SOS_adj_diff',
    'p_aced%_l60_tw_ss_SOS_csp_adj_diff', 'p_aced%_l10_tw_ss_SOS_csp_adj_diff',
    'p_aced%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_aced%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_df%' family ---
    'p_df%_l60_tw_ss_diff', 'p_df%_l10_tw_ss_diff',
    'p_df%_l60_tw_ss_IO_diff', 'p_df%_l10_tw_ss_IO_diff',
    'p_df%_l60_tw_ss_SOS_adj_diff', 'p_df%_l10_tw_ss_SOS_adj_diff',
    'p_df%_l60_tw_ss_IO_SOS_adj_diff', 'p_df%_l10_tw_ss_IO_SOS_adj_diff',
    'p_df%_l60_tw_ss_SOS_csp_adj_diff', 'p_df%_l10_tw_ss_SOS_csp_adj_diff',
    'p_df%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_df_induce%' family ---
    'p_df_induce%_l60_tw_ss_diff', 'p_df_induce%_l10_tw_ss_diff',
    'p_df_induce%_l60_tw_ss_IO_diff', 'p_df_induce%_l10_tw_ss_IO_diff',
    'p_df_induce%_l60_tw_ss_SOS_adj_diff', 'p_df_induce%_l10_tw_ss_SOS_adj_diff',
    'p_df_induce%_l60_tw_ss_IO_SOS_adj_diff', 'p_df_induce%_l10_tw_ss_IO_SOS_adj_diff',
    'p_df_induce%_l60_tw_ss_SOS_csp_adj_diff', 'p_df_induce%_l10_tw_ss_SOS_csp_adj_diff',
    'p_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_bp_save%' family ---
    'p_bp_save%_l60_tw_ss_diff', 'p_bp_save%_l10_tw_ss_diff',
    'p_bp_save%_l60_tw_ss_IO_diff', 'p_bp_save%_l10_tw_ss_IO_diff',
    'p_bp_save%_l60_tw_ss_SOS_adj_diff', 'p_bp_save%_l10_tw_ss_SOS_adj_diff',
    'p_bp_save%_l60_tw_ss_IO_SOS_adj_diff', 'p_bp_save%_l10_tw_ss_IO_SOS_adj_diff',
    'p_bp_save%_l60_tw_ss_SOS_csp_adj_diff', 'p_bp_save%_l10_tw_ss_SOS_csp_adj_diff',
    'p_bp_save%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_save%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_bp_conv%' family ---
    'p_bp_conv%_l60_tw_ss_diff', 'p_bp_conv%_l10_tw_ss_diff',
    'p_bp_conv%_l60_tw_ss_IO_diff', 'p_bp_conv%_l10_tw_ss_IO_diff',
    'p_bp_conv%_l60_tw_ss_SOS_adj_diff', 'p_bp_conv%_l10_tw_ss_SOS_adj_diff',
    'p_bp_conv%_l60_tw_ss_IO_SOS_adj_diff', 'p_bp_conv%_l10_tw_ss_IO_SOS_adj_diff',
    'p_bp_conv%_l60_tw_ss_SOS_csp_adj_diff', 'p_bp_conv%_l10_tw_ss_SOS_csp_adj_diff',
    'p_bp_conv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bp_conv%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_sv_opp_ret_pts_won%' family ---
    'p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_sv_opp_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_sv_opp_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_sv_opp_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_sv_opp_1st_ret_pts_won%' family ---
    'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_sv_opp_1st_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_sv_opp_1st_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_2nd_sv_opp_2nd_ret_pts_won%' family ---
    'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_2nd_sv_opp_2nd_ret_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_sv_opp_2nd_ret_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_ret_opp_sv_pts_won%' family ---
    'p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_ret_opp_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_ret_opp_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ret_opp_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_1st_ret_opp_1st_sv_pts_won%' family ---
    'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_1st_ret_opp_1st_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_1st_ret_opp_1st_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_2nd_ret_opp_2nd_sv_pts_won%' family ---
    'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_adj_diff',
    'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_adj_diff',
    'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_SOS_csp_adj_diff',
    'p_2nd_ret_opp_2nd_sv_pts_won%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_2nd_ret_opp_2nd_sv_pts_won%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_ace_opp_aced%' family ---
    'p_ace_opp_aced%_l60_tw_ss_SOS_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_SOS_adj_diff',
    'p_ace_opp_aced%_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_IO_SOS_adj_diff',
    'p_ace_opp_aced%_l60_tw_ss_SOS_csp_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_SOS_csp_adj_diff',
    'p_ace_opp_aced%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace_opp_aced%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_aced_opp_ace%' family ---
    'p_aced_opp_ace%_l60_tw_ss_SOS_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_SOS_adj_diff',
    'p_aced_opp_ace%_l60_tw_ss_IO_SOS_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_IO_SOS_adj_diff',
    'p_aced_opp_ace%_l60_tw_ss_SOS_csp_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_SOS_csp_adj_diff',
    'p_aced_opp_ace%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_aced_opp_ace%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_df_opp_df_induce%' family ---
    'p_df_opp_df_induce%_l60_tw_ss_SOS_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_SOS_adj_diff',
    'p_df_opp_df_induce%_l60_tw_ss_IO_SOS_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_IO_SOS_adj_diff',
    'p_df_opp_df_induce%_l60_tw_ss_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_SOS_csp_adj_diff',
    'p_df_opp_df_induce%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_df_opp_df_induce%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_dfinduce_opp_df%' family ---
    'p_dfinduce_opp_df%_l60_tw_ss_SOS_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_SOS_adj_diff',
    'p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_adj_diff',
    'p_dfinduce_opp_df%_l60_tw_ss_SOS_csp_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_SOS_csp_adj_diff',
    'p_dfinduce_opp_df%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_dfinduce_opp_df%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_bpsave_opp_bpconv%' family ---
    'p_bpsave_opp_bpconv%_l60_tw_ss_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_SOS_adj_diff',
    'p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_adj_diff',
    'p_bpsave_opp_bpconv%_l60_tw_ss_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_SOS_csp_adj_diff',
    'p_bpsave_opp_bpconv%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bpsave_opp_bpconv%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_bpconv_opp_bpsave%' family ---
    'p_bpconv_opp_bpsave%_l60_tw_ss_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_SOS_adj_diff',
    'p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_adj_diff',
    'p_bpconv_opp_bpsave%_l60_tw_ss_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_SOS_csp_adj_diff',
    'p_bpconv_opp_bpsave%_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_bpconv_opp_bpsave%_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_AVG_C_IP' family (includes 'nss' variants) ---
    'p_AVG_C_IP_l60_tw_ss_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_ss_SOS_adj_diff',
    'p_AVG_C_IP_l60_tw_ss_IO_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_ss_IO_SOS_adj_diff',
    'p_AVG_C_IP_l60_tw_nss_SOS_adj_diff', 'p_AVG_C_IP_l10_tw_nss_SOS_adj_diff',
    # --- 'p_ace_df%_ratio' family (suffix spelled 'SOS_adj_csp' in these column names) ---
    'p_ace_df%_ratio_l60_tw_ss_diff', 'p_ace_df%_ratio_l10_tw_ss_diff',
    'p_ace_df%_ratio_l60_tw_ss_IO_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_diff',
    'p_ace_df%_ratio_l60_tw_ss_SOS_adj_diff', 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_diff',
    'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_diff',
    'p_ace_df%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_ace_df%_ratio_l10_tw_ss_SOS_adj_csp_diff',
    'p_ace_df%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_ace_df%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff',
    # --- 'p_1stSvWon_1stSv%_ratio' family (suffix spelled 'SOS_adj_csp') ---
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_diff',
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_diff',
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_diff',
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff',
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_SOS_adj_csp_diff',
    'p_1stSvWon_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_1stSvWon_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff',
    # --- 'p_ace_1stSv%_ratio' family (suffix spelled 'SOS_csp_adj') ---
    'p_ace_1stSv%_ratio_l60_tw_ss_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_diff',
    'p_ace_1stSv%_ratio_l60_tw_ss_IO_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_diff',
    'p_ace_1stSv%_ratio_l60_tw_ss_SOS_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_adj_diff',
    'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_adj_diff',
    'p_ace_1stSv%_ratio_l60_tw_ss_SOS_csp_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_SOS_csp_adj_diff',
    'p_ace_1stSv%_ratio_l60_tw_ss_IO_SOS_csp_adj_diff', 'p_ace_1stSv%_ratio_l10_tw_ss_IO_SOS_csp_adj_diff',
    # --- 'p_df_SvPtsWon%_ratio' family (suffix spelled 'SOS_adj_csp') ---
    'p_df_SvPtsWon%_ratio_l60_tw_ss_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_diff',
    'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_diff',
    'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_diff',
    'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_diff',
    'p_df_SvPtsWon%_ratio_l60_tw_ss_SOS_adj_csp_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_SOS_adj_csp_diff',
    'p_df_SvPtsWon%_ratio_l60_tw_ss_IO_SOS_adj_csp_diff', 'p_df_SvPtsWon%_ratio_l10_tw_ss_IO_SOS_adj_csp_diff',
]]

In [663]:
# Write the modeling dataframe (the no-time-decay-weighting variant of this notebook)
# to CSV; index=False keeps the row index out of the file.
df_for_modeling.to_csv(
    path_or_buf='../data/df_for_modeling_no_tdw.csv',
    index=False,
)