### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

### Load Preprocessed Data

In [2]:
# Read the cleaned, match-level dataset produced by the preprocessing step
cleaned_csv_path = '../data/cleaned_data_for_FeatureDev.csv'
df = pd.read_csv(cleaned_csv_path)

In [None]:
# Summarize the loaded frame: column names, dtypes and non-null counts
df.info()

In [None]:
# Preview the first rows of the loaded frame
df.head()

#### dataframe in by-match organization (first iteration)

### Target Feature Creation: % Pts Won By Player in a Given Match

In [3]:
# Target feature for each player in a given match: share (in %) of all points played
# in the match that the player won, rounded to 2 decimals.

# Points the winner won on serve, and on return (= loser's serve points the loser did not win)
w_serve_pts_won = df["w_1stWon"] + df["w_2ndWon"]
w_return_pts_won = df["l_svpt"] - (df["l_1stWon"] + df["l_2ndWon"])
match_pts_played = df["w_svpt"] + df["l_svpt"]

df["w_pts_won%"] = (((w_serve_pts_won + w_return_pts_won) / match_pts_played) * 100).round(2)

# Loser % pts won is the complement of the winner's. The subtraction is re-rounded so the
# column holds clean 2-decimal values like every other percentage column (a bare
# `100 - x` on floats can leave artifacts such as 52.160000000000004).
df["l_pts_won%"] = (100 - df["w_pts_won%"]).round(2)

In [4]:
# Split the target into its serving and returning components (all in %, 2 decimals).
# A player's return points won are the opponent's serve points that the opponent did not win.

w_serve_pts_won = df["w_1stWon"] + df["w_2ndWon"]
l_serve_pts_won = df["l_1stWon"] + df["l_2ndWon"]

# Winner: serve, then return
df["w_sv_pts_won%"] = (w_serve_pts_won / df["w_svpt"] * 100).round(2)
df["w_ret_pts_won%"] = (((df["l_svpt"] - l_serve_pts_won) / df["l_svpt"]) * 100).round(2)

# Loser: serve, then return
df["l_sv_pts_won%"] = (l_serve_pts_won / df["l_svpt"] * 100).round(2)
df["l_ret_pts_won%"] = (((df["w_svpt"] - w_serve_pts_won) / df["w_svpt"]) * 100).round(2)

In [5]:
# Total points played in the match (both players' serve points combined).
# Not a target itself, but an input to later predictive features.
df["tot_pts"] = df["l_svpt"].add(df["w_svpt"])

#### dataframe in by-player organization (first iteration)

In [None]:
# Re-inspect dtypes and non-null counts of df, including the columns added so far
df.info()

### Retrospective, Surface-Specific Performance Prediction Features by Player per Match

The goal in this section is to generate, for each player in each match to be predicted on, backward-looking predictors of that player's performance in the match. The integration windows and decay weights currently used here were chosen after a number of early experiments (informed by feedback from EDA and simple modeling); more optimization will surely follow in iterations after complex modeling.

In [6]:
# Reshape the by-match frame (one row per match, w_*/l_* columns) into a by-player frame
# (two rows per match, p_*/opp_* columns).

# Stats of the *other* player that are removed from each per-player view
other_player_stats = ["name", "rank", "rank_pts", "ioc", "ent", "hd", "ht", "age",
                      "1stWon", "2ndWon", "SvGms", "pts_won%", "sv_pts_won%", "ret_pts_won%"]

# New labels, applied positionally: the frames keep the column order of df, so the winner
# block precedes the loser block. This must stay in sync with df's column layout.
match_labels = ["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size",
                "m_num", "t_round", "t_rd_num", "m_best_of", "m_score", "m_time(m)"]
player_labels = ["p_id", "p_name", "p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age",
                 "p_svpt", "p_1stWon", "p_2ndWon", "p_SvGms", "p_ace", "p_bpSaved", "p_bpFaced"]
opponent_labels = ["opp_id", "opp_svpt", "opp_ace", "opp_bpSaved", "opp_bpFaced"]
outcome_labels = ["p_pts_won%", "p_sv_pts_won%", "p_ret_pts_won%", "m_tot_pts", "m_outcome"]

# Winner's view: loser detail removed, flagged as a win (m_outcome = 1)
df_winners = df.drop(columns=[f"l_{stat}" for stat in other_player_stats]).assign(m_outcome=1)
df_winners.columns = match_labels + player_labels + opponent_labels + outcome_labels

# Loser's view: winner detail removed, flagged as a loss (m_outcome = 0)
df_losers = df.drop(columns=[f"w_{stat}" for stat in other_player_stats]).assign(m_outcome=0)
df_losers.columns = match_labels + opponent_labels + player_labels + outcome_labels

# Stack both views (concat aligns on column labels) and order rows by player, then
# tournament week and round, all descending
df_player1 = pd.concat([df_winners, df_losers], ignore_index=True)
df_player1 = df_player1.sort_values(by=['p_id', 'tour_wk', 't_rd_num'], ascending=False)

Generated below are a number of retrospective (relative to the match being predicted on) predictive performance features. Unless otherwise specified, aggregations are surface-specific (clay or hard).

In [8]:
# Order rows by player, then tournament week and round (all descending); this layout makes
# the backward-looking stat accrual computed below easier to verify by eye
player_timeline_keys = ['p_id', 'tour_wk', 't_rd_num']
df_player1 = df_player1.sort_values(by=player_timeline_keys, ascending=False)

In [9]:
# Show the first 20 rows of the sorted per-player frame for a visual check
df_player1.head(20)

Unnamed: 0,t_id,t_date,tour_wk,t_name,t_country,t_surf,t_lvl,t_draw_size,m_num,t_round,...,opp_id,opp_svpt,opp_ace,opp_bpSaved,opp_bpFaced,p_pts_won%,p_sv_pts_won%,p_ret_pts_won%,m_tot_pts,m_outcome
19984,2019-560,20190826,2019_24,US Open,USA,Hard,4,128,2059,R128,...,103893,164.0,6.0,9.0,16.0,47.84,56.28,38.41,347.0,0
20367,2019-M014,20191014,2019_29,Moscow,RUS,Hard,1,32,2446,R32,...,200553,72.0,11.0,3.0,5.0,41.29,49.4,31.94,155.0,0
18809,2019-M004,20190225,2019_07,Acapulco,MEX,Hard,1,32,545,R32,...,111456,40.0,2.0,0.0,0.0,37.23,51.85,17.5,94.0,0
2257,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2781,F,...,200282,41.0,2.0,5.0,8.0,59.14,69.23,46.34,93.0,1
2259,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2783,SF,...,200175,58.0,2.0,5.0,8.0,53.66,70.77,34.48,123.0,1
20539,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2794,RR3,...,200005,77.0,11.0,3.0,4.0,50.0,69.33,31.17,152.0,0
2271,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2795,RR2,...,144707,44.0,2.0,4.0,8.0,62.5,72.73,52.27,88.0,1
2268,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2792,RR1,...,126207,69.0,7.0,2.0,5.0,52.14,71.83,31.88,140.0,1
20447,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,2528,R16,...,104792,79.0,17.0,3.0,3.0,47.33,66.2,30.38,150.0,0
2191,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,2541,R32,...,104259,62.0,8.0,6.0,9.0,55.15,64.86,43.55,136.0,1


In [10]:
# Mean % of TOTAL points won in successive 10-match blocks (1-10 matches back, 11-20 back,
# ... 51-60 back), per player and surface, always excluding the match being predicted on.
# EDA and modeling later require a minimum match history FOR BOTH PLAYERS, so NaNs from
# windows that are too short are tolerated here rather than aggressively backfilled.

stat_col = 'p_pts_won%'
surface_keys = ['p_id', 't_surf']

# Reverse the descending-sorted frame so rolling windows accumulate from earlier rows
df_player1 = df_player1.iloc[::-1]

# Most recent block: one prior match is enough; shift(1) leaves out the current match
df_player1[f'{stat_col}_1to10'] = df_player1.groupby(surface_keys)[stat_col].transform(
    lambda s: s.rolling(window=10, min_periods=1).mean().round(2).shift(1))

# Older blocks: at least 3 matches required in the window, lagged to the block's start;
# when unavailable, fall back to the (already filled) next more recent block
more_recent_col = f'{stat_col}_1to10'
for block_label, lag in (('11to20', 11), ('21to30', 21), ('31to40', 31), ('41to50', 41), ('51to60', 51)):
    block_col = f'{stat_col}_{block_label}'
    df_player1[block_col] = df_player1.groupby(surface_keys)[stat_col].transform(
        lambda s, lag=lag: s.rolling(window=10, min_periods=3).mean().round(2).shift(lag))
    df_player1[block_col] = df_player1[block_col].fillna(df_player1[more_recent_col])
    more_recent_col = block_col

# Back to the original row order
df_player1 = df_player1.iloc[::-1]

In [11]:
# Recency-weighted average of the six 10-match blocks of TOTAL pts won %:
# weight 6 for the most recent block down to 1 for the oldest (weights sum to 21).
block_weights = (('1to10', 6), ('11to20', 5), ('21to30', 4), ('31to40', 3), ('41to50', 2), ('51to60', 1))

newest_label, newest_weight = block_weights[0]
weighted_total = df_player1[f'p_pts_won%_{newest_label}'] * newest_weight
for block_label, weight in block_weights[1:]:
    weighted_total = weighted_total + df_player1[f'p_pts_won%_{block_label}'] * weight

df_player1["p_pts_won%_l60_decay"] = (weighted_total / sum(w for _, w in block_weights)).round(2)

# The per-block columns were only scaffolding for the weighted average
df_player1 = df_player1.drop(columns=[f'p_pts_won%_{block_label}' for block_label, _ in block_weights])

In [12]:
# Mean % of SERVE points won in successive 10-match blocks (1-10 matches back, 11-20 back,
# ... 51-60 back), per player and surface, always excluding the match being predicted on.
# EDA and modeling later require a minimum match history FOR BOTH PLAYERS, so NaNs from
# windows that are too short are tolerated here rather than aggressively backfilled.

stat_col = 'p_sv_pts_won%'
surface_keys = ['p_id', 't_surf']

# Reverse the descending-sorted frame so rolling windows accumulate from earlier rows
df_player1 = df_player1.iloc[::-1]

# Most recent block: one prior match is enough; shift(1) leaves out the current match
df_player1[f'{stat_col}_1to10'] = df_player1.groupby(surface_keys)[stat_col].transform(
    lambda s: s.rolling(window=10, min_periods=1).mean().round(2).shift(1))

# Older blocks: at least 3 matches required in the window, lagged to the block's start;
# when unavailable, fall back to the (already filled) next more recent block
more_recent_col = f'{stat_col}_1to10'
for block_label, lag in (('11to20', 11), ('21to30', 21), ('31to40', 31), ('41to50', 41), ('51to60', 51)):
    block_col = f'{stat_col}_{block_label}'
    df_player1[block_col] = df_player1.groupby(surface_keys)[stat_col].transform(
        lambda s, lag=lag: s.rolling(window=10, min_periods=3).mean().round(2).shift(lag))
    df_player1[block_col] = df_player1[block_col].fillna(df_player1[more_recent_col])
    more_recent_col = block_col

# Back to the original row order
df_player1 = df_player1.iloc[::-1]


In [13]:
# Recency-weighted average of the six 10-match blocks of SERVE pts won %:
# weight 6 for the most recent block down to 1 for the oldest (weights sum to 21).
block_weights = (('1to10', 6), ('11to20', 5), ('21to30', 4), ('31to40', 3), ('41to50', 2), ('51to60', 1))

newest_label, newest_weight = block_weights[0]
weighted_total = df_player1[f'p_sv_pts_won%_{newest_label}'] * newest_weight
for block_label, weight in block_weights[1:]:
    weighted_total = weighted_total + df_player1[f'p_sv_pts_won%_{block_label}'] * weight

df_player1["p_sv_pts_won%_l60_decay"] = (weighted_total / sum(w for _, w in block_weights)).round(2)

# The per-block columns were only scaffolding for the weighted average
df_player1 = df_player1.drop(columns=[f'p_sv_pts_won%_{block_label}' for block_label, _ in block_weights])


In [14]:
# Mean % of RETURN points won in successive 10-match blocks (1-10 matches back, 11-20 back,
# ... 51-60 back), per player and surface, always excluding the match being predicted on.
# EDA and modeling later require a minimum match history FOR BOTH PLAYERS, so NaNs from
# windows that are too short are tolerated here rather than aggressively backfilled.

stat_col = 'p_ret_pts_won%'
surface_keys = ['p_id', 't_surf']

# Reverse the descending-sorted frame so rolling windows accumulate from earlier rows
df_player1 = df_player1.iloc[::-1]

# Most recent block: one prior match is enough; shift(1) leaves out the current match
df_player1[f'{stat_col}_1to10'] = df_player1.groupby(surface_keys)[stat_col].transform(
    lambda s: s.rolling(window=10, min_periods=1).mean().round(2).shift(1))

# Older blocks: at least 3 matches required in the window, lagged to the block's start;
# when unavailable, fall back to the (already filled) next more recent block
more_recent_col = f'{stat_col}_1to10'
for block_label, lag in (('11to20', 11), ('21to30', 21), ('31to40', 31), ('41to50', 41), ('51to60', 51)):
    block_col = f'{stat_col}_{block_label}'
    df_player1[block_col] = df_player1.groupby(surface_keys)[stat_col].transform(
        lambda s, lag=lag: s.rolling(window=10, min_periods=3).mean().round(2).shift(lag))
    df_player1[block_col] = df_player1[block_col].fillna(df_player1[more_recent_col])
    more_recent_col = block_col

# Back to the original row order
df_player1 = df_player1.iloc[::-1]


In [15]:
# Recency-weighted average of the six 10-match blocks of RETURN pts won %:
# weight 6 for the most recent block down to 1 for the oldest (weights sum to 21).
block_weights = (('1to10', 6), ('11to20', 5), ('21to30', 4), ('31to40', 3), ('41to50', 2), ('51to60', 1))

newest_label, newest_weight = block_weights[0]
weighted_total = df_player1[f'p_ret_pts_won%_{newest_label}'] * newest_weight
for block_label, weight in block_weights[1:]:
    weighted_total = weighted_total + df_player1[f'p_ret_pts_won%_{block_label}'] * weight

df_player1["p_ret_pts_won%_l60_decay"] = (weighted_total / sum(w for _, w in block_weights)).round(2)

# The per-block columns were only scaffolding for the weighted average
df_player1 = df_player1.drop(columns=[f'p_ret_pts_won%_{block_label}' for block_label, _ in block_weights])


In [16]:
# Player ACE % (aces per serve point, in %) for each match, then its mean in successive
# 10-match blocks (1-10 matches back, 11-20 back, ... 51-60 back), per player and surface,
# always excluding the match being predicted on.
# EDA and modeling later require a minimum match history FOR BOTH PLAYERS, so NaNs from
# windows that are too short are tolerated here rather than aggressively backfilled.

# Per-match ace rate; input to the decay-weighted 60-match feature
df_player1["p_ace%"] = (df_player1["p_ace"].div(df_player1["p_svpt"]) * 100).round(2)

stat_col = 'p_ace%'
surface_keys = ['p_id', 't_surf']

# Reverse the descending-sorted frame so rolling windows accumulate from earlier rows
df_player1 = df_player1.iloc[::-1]

# Most recent block: one prior match is enough; shift(1) leaves out the current match
df_player1[f'{stat_col}_1to10'] = df_player1.groupby(surface_keys)[stat_col].transform(
    lambda s: s.rolling(window=10, min_periods=1).mean().round(2).shift(1))

# Older blocks: at least 3 matches required in the window, lagged to the block's start;
# when unavailable, fall back to the (already filled) next more recent block
more_recent_col = f'{stat_col}_1to10'
for block_label, lag in (('11to20', 11), ('21to30', 21), ('31to40', 31), ('41to50', 41), ('51to60', 51)):
    block_col = f'{stat_col}_{block_label}'
    df_player1[block_col] = df_player1.groupby(surface_keys)[stat_col].transform(
        lambda s, lag=lag: s.rolling(window=10, min_periods=3).mean().round(2).shift(lag))
    df_player1[block_col] = df_player1[block_col].fillna(df_player1[more_recent_col])
    more_recent_col = block_col

# Back to the original row order
df_player1 = df_player1.iloc[::-1]

In [17]:
# Recency-weighted average of the six 10-match blocks of ace %:
# weight 6 for the most recent block down to 1 for the oldest (weights sum to 21).
block_weights = (('1to10', 6), ('11to20', 5), ('21to30', 4), ('31to40', 3), ('41to50', 2), ('51to60', 1))

newest_label, newest_weight = block_weights[0]
weighted_total = df_player1[f'p_ace%_{newest_label}'] * newest_weight
for block_label, weight in block_weights[1:]:
    weighted_total = weighted_total + df_player1[f'p_ace%_{block_label}'] * weight

df_player1["p_ace%_l60_decay"] = (weighted_total / sum(w for _, w in block_weights)).round(2)

# The per-block columns were only scaffolding for the weighted average
df_player1 = df_player1.drop(columns=[f'p_ace%_{block_label}' for block_label, _ in block_weights])

In [18]:
# Player ACED % (as a returner: opponent aces per opponent serve point, in %) for each
# match, then its mean in successive 10-match blocks (1-10 matches back, 11-20 back,
# ... 51-60 back), per player and surface, always excluding the match being predicted on.
# EDA and modeling later require a minimum match history FOR BOTH PLAYERS, so NaNs from
# windows that are too short are tolerated here rather than aggressively backfilled.

# Per-match rate at which the player was aced
df_player1["p_aced%"] = (df_player1["opp_ace"].div(df_player1["opp_svpt"]) * 100).round(2)

stat_col = 'p_aced%'
surface_keys = ['p_id', 't_surf']

# Reverse the descending-sorted frame so rolling windows accumulate from earlier rows
df_player1 = df_player1.iloc[::-1]

# Most recent block: one prior match is enough; shift(1) leaves out the current match
df_player1[f'{stat_col}_1to10'] = df_player1.groupby(surface_keys)[stat_col].transform(
    lambda s: s.rolling(window=10, min_periods=1).mean().round(2).shift(1))

# Older blocks: at least 3 matches required in the window, lagged to the block's start;
# when unavailable, fall back to the (already filled) next more recent block
more_recent_col = f'{stat_col}_1to10'
for block_label, lag in (('11to20', 11), ('21to30', 21), ('31to40', 31), ('41to50', 41), ('51to60', 51)):
    block_col = f'{stat_col}_{block_label}'
    df_player1[block_col] = df_player1.groupby(surface_keys)[stat_col].transform(
        lambda s, lag=lag: s.rolling(window=10, min_periods=3).mean().round(2).shift(lag))
    df_player1[block_col] = df_player1[block_col].fillna(df_player1[more_recent_col])
    more_recent_col = block_col

# Back to the original row order
df_player1 = df_player1.iloc[::-1]

In [19]:
# Recency-weighted average of the six 10-match blocks of aced %:
# weight 6 for the most recent block down to 1 for the oldest (weights sum to 21).
block_weights = (('1to10', 6), ('11to20', 5), ('21to30', 4), ('31to40', 3), ('41to50', 2), ('51to60', 1))

newest_label, newest_weight = block_weights[0]
weighted_total = df_player1[f'p_aced%_{newest_label}'] * newest_weight
for block_label, weight in block_weights[1:]:
    weighted_total = weighted_total + df_player1[f'p_aced%_{block_label}'] * weight

df_player1["p_aced%_l60_decay"] = (weighted_total / sum(w for _, w in block_weights)).round(2)

# The per-block columns were only scaffolding for the weighted average
df_player1 = df_player1.drop(columns=[f'p_aced%_{block_label}' for block_label, _ in block_weights])

In [20]:
# Break point SAVE % (as a server): per-match rate, then its plain mean over up to the
# previous 60 matches on the same surface. A decay-weighted variant was tried but did
# worse than the unweighted mean because break points are rare events.

# Per-match share of break points faced that were saved
df_player1["p_bp_save%"] = (df_player1["p_bpSaved"].div(df_player1["p_bpFaced"]) * 100).round(2)

# Reverse the row order for the rolling pass, add the lookback mean (shift(1) excludes the
# current match), then reverse back
df_player1 = (
    df_player1.iloc[::-1]
    .assign(**{
        'p_bp_save%_l60': lambda frame: frame.groupby(['p_id', 't_surf'])['p_bp_save%'].transform(
            lambda s: s.rolling(window=60, min_periods=1).mean().round(2).shift(1))
    })
    .iloc[::-1]
)

In [21]:
# Break point CONVERSION % (as a returner): per-match rate, then its plain mean over up to
# the previous 60 matches on the same surface. A decay-weighted variant was tried but did
# worse than the unweighted mean because break points are rare events.

# Per-match share of the opponent's break points faced that the opponent failed to save
opp_bp_save_ratio = df_player1["opp_bpSaved"] / df_player1["opp_bpFaced"]
df_player1["p_bp_convert%"] = ((1 - opp_bp_save_ratio) * 100).round(2)

# Reverse the row order for the rolling pass, add the lookback mean (shift(1) excludes the
# current match), then reverse back
df_player1 = (
    df_player1.iloc[::-1]
    .assign(**{
        'p_bp_convert%_l60': lambda frame: frame.groupby(['p_id', 't_surf'])['p_bp_convert%'].transform(
            lambda s: s.rolling(window=60, min_periods=1).mean().round(2).shift(1))
    })
    .iloc[::-1]
)

In [22]:
# Variability (standard deviation, not decay weighted) of TOTAL, SERVE and RETURN pts won %
# over up to the previous 60 matches on the same surface; shift(1) excludes the current match.

# Reverse once for all three rolling passes, then restore the original row order
df_player1 = df_player1.iloc[::-1]

for pct_col in ('p_pts_won%', 'p_sv_pts_won%', 'p_ret_pts_won%'):
    df_player1[f'{pct_col}_std_l60'] = df_player1.groupby(['p_id', 't_surf'])[pct_col].transform(
        lambda s: s.rolling(window=60, min_periods=1).std().round(2).shift(1))

df_player1 = df_player1.iloc[::-1]

In [None]:
# Check dtypes and non-null counts of the per-player frame, including the features added so far
df_player1.info()

### Fatigue and Stamina Predictive Features

In [23]:
# Number of earlier matches in the sample for the player (any surface), counted over a
# rolling window of 300 rows and therefore capped at 300; shift(1) excludes the current match.

# Reverse the row order for the rolling count, then reverse back
df_player1 = (
    df_player1.iloc[::-1]
    .assign(p_matches=lambda frame: frame.groupby('p_id')['p_id'].transform(
        lambda ids: ids.rolling(300, min_periods=1).count().shift(1)))
    .iloc[::-1]
)

# A player's first match in the sample has no history (NaN) and is set to 1
# (these matches will be filtered out before modeling anyhow)
df_player1['p_matches'] = df_player1['p_matches'].fillna(1)

In [24]:
# Time on court across up to the player's last 5 matches within the same tournament week as
# the match being predicted on. A missing slot is filled with 60 minutes, on the assumption
# that the player did an hour of match-intensity work that day instead of a match.

# shift(-k) reads the row k positions further down within each (player, tournament week)
# group, so this relies on the frame's current descending sort for that to be an earlier match
minutes_in_week = df_player1.groupby(['p_id', 'tour_wk'])['m_time(m)']
prior_minutes = [minutes_in_week.shift(-matches_back).fillna(60) for matches_back in range(1, 6)]

# Decay-weighted total (1, .8, .6, .4, .2 from most to least recent) and plain total
recency_weights = (1, .8, .6, .4, .2)
decayed_minutes = prior_minutes[0] * recency_weights[0]
plain_minutes = prior_minutes[0] * 1
for minutes, weight in zip(prior_minutes[1:], recency_weights[1:]):
    decayed_minutes = decayed_minutes + minutes * weight
    plain_minutes = plain_minutes + minutes * 1

df_player1['p_tot_time_l5_decay'] = decayed_minutes
df_player1['p_tot_time_l5'] = plain_minutes  # empirically tested to discern that no decay performs slightly better than decay

In [25]:
# "Body battery": recent time on court (fatigue) scaled down by match experience (stamina).
# The match count enters the denominator as a cube root, a choice based on prediction-quality
# feedback from a simple (linear) model.

matches_cbrt = np.cbrt(df_player1["p_matches"])

# Decay-weighted and plain versions, mirroring the two time-on-court totals
for time_col, battery_col in (("p_tot_time_l5_decay", "p_stamina_adj_fatigue_decay"),
                              ("p_tot_time_l5", "p_stamina_adj_fatigue")):
    df_player1[battery_col] = (df_player1[time_col] / matches_cbrt).round(2)

In [26]:
# Total points played across up to the player's last 5 matches within the same tournament
# week as the match being predicted on. A missing slot is filled with 100 points, on the
# assumption that the player did an hour of match-intensity work that day instead of a match.

# shift(-k) reads the row k positions further down within each (player, tournament week)
# group, so this relies on the frame's current descending sort for that to be an earlier match
points_in_week = df_player1.groupby(['p_id', 'tour_wk'])['m_tot_pts']
prior_points = [points_in_week.shift(-matches_back).fillna(100) for matches_back in range(1, 6)]

# Decay-weighted total (1, .8, .6, .4, .2 from most to least recent) and plain total
recency_weights = (1, .8, .6, .4, .2)
decayed_points = prior_points[0] * recency_weights[0]
plain_points = prior_points[0] * 1
for points, weight in zip(prior_points[1:], recency_weights[1:]):
    decayed_points = decayed_points + points * weight
    plain_points = plain_points + points * 1

df_player1['p_tot_pts_l5_decay'] = decayed_points
df_player1['p_tot_pts_l5'] = plain_points  # empirically tested to discern that no decay performs slightly better than decay

#### Below, a given player's H2H wins versus an opponent prior to the match being predicted on are computed in a surface-specific manner (though across the entire 2012-2019 sample)

In [27]:
# Head-to-head wins against this opponent on this surface before the current match:
# running sum of m_outcome per (player, opponent, surface), shifted by one so the current
# match is excluded. The 2000-row window is a generous bound on any pair's meetings.
# Reverse the row order for the rolling pass, then reverse back.
df_player1 = (
    df_player1.iloc[::-1]
    .assign(p_H2H_w=lambda frame: frame.groupby(['p_id', 'opp_id', 't_surf'])['m_outcome'].transform(
        lambda outcomes: outcomes.rolling(window=2000, min_periods=1).sum().shift(1)))
    .iloc[::-1]
)

# No earlier meeting on this surface -> zero prior wins
df_player1['p_H2H_w'] = df_player1['p_H2H_w'].fillna(0)
df_player1

Unnamed: 0,t_id,t_date,tour_wk,t_name,t_country,t_surf,t_lvl,t_draw_size,m_num,t_round,...,p_sv_pts_won%_std_l60,p_ret_pts_won%_std_l60,p_matches,p_tot_time_l5_decay,p_tot_time_l5,p_stamina_adj_fatigue_decay,p_stamina_adj_fatigue,p_tot_pts_l5_decay,p_tot_pts_l5,p_H2H_w
19984,2019-560,20190826,2019_24,US Open,USA,Hard,4,128,2059,R128,...,,,1.0,180.0,300.0,180.00,300.00,300.0,500.0,0.0
20367,2019-M014,20191014,2019_29,Moscow,RUS,Hard,1,32,2446,R32,...,,,1.0,180.0,300.0,180.00,300.00,300.0,500.0,0.0
18809,2019-M004,20190225,2019_07,Acapulco,MEX,Hard,1,32,545,R32,...,,,1.0,180.0,300.0,180.00,300.00,300.0,500.0,0.0
2257,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2781,F,...,6.46,8.02,19.0,237.4,380.0,88.97,142.41,373.4,603.0,0.0
2259,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2783,SF,...,6.68,8.36,18.0,235.4,365.0,89.82,139.27,366.4,580.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12825,2014-414,20140714,2014_19,Hamburg,GER,Clay,1,48,16430,R32,...,19.04,12.79,4.0,178.0,298.0,112.13,187.73,296.0,496.0,0.0
12809,2014-414,20140714,2014_19,Hamburg,GER,Clay,1,48,16414,R64,...,16.07,5.08,3.0,180.0,300.0,124.81,208.01,300.0,500.0,0.0
31024,2014-321,20140707,2014_18,Stuttgart,GER,Clay,1,28,16360,R32,...,3.75,6.72,2.0,180.0,300.0,142.87,238.11,300.0,500.0,0.0
30686,2014-308,20140428,2014_13,Munich,GER,Clay,1,28,15713,R32,...,,,1.0,180.0,300.0,180.00,300.00,300.0,500.0,0.0


In [28]:
#Save current df prior to another transformation back to by-match organization
#df_player1.to_csv('../data/df_player1.csv', index=False)

Converting briefly back to by-match organization so that we can obtain the data we need to compute player "Strength of Schedule" (SOS) at the time of each match to be predicted on, across a range of features. The concept is the same as you might have seen in football or soccer analytics. At the time of every match to be predicted on, we have a number of (mostly time-decay weighted) assessments of how the player performed over the last stretch of time (60 matches) on the same surface as the match to be played. However, we want to normalize these predictive features by the aggregate strength of the schedule of opponents the player faced in the stretch over which those features were generated. For example, winning 60% of your serve points against a schedule of opponents who had historically yielded 65% of opponent serve points (i.e., won only 35% of their own return points) is not as impressive as winning 60% of your serve points against a schedule of opponents who had historically yielded 55% of opponent serve points (i.e., won 45% of their own return points).

In [29]:
# Rebuild a by-match frame: split the per-player rows by outcome and left-join each winner row
# to the loser row carrying the same m_num. Columns other than m_num exist on both sides, so the
# merge suffixes them: _x = winner side, _y = loser side.
# NOTE(review): assumes m_num identifies exactly one match across the whole sample; a repeated
# m_num would multiply rows in the join — confirm.
df_winners2 = df_player1[df_player1['m_outcome'] == 1]
df_losers2 = df_player1[df_player1['m_outcome'] == 0]
df_match2 = df_winners2.merge(df_losers2, on='m_num', how = 'left')

# Back to by-player organization, picking up the reciprocal (opponent) columns per player needed to make SOS calculations

# Winner's view: drop the winner's raw per-match counts and every loser-side (_y) column except the
# loser's lookback aggregates (*_l60_decay / *_l60), which are relabelled p_opp_* below
df_winners2 = df_match2.drop(["p_svpt_x", "p_1stWon_x", "p_2ndWon_x", "p_SvGms_x", "p_ace_x", "p_bpSaved_x", "p_bpFaced_x", "opp_svpt_x", "opp_ace_x", "opp_bpSaved_x", "opp_bpFaced_x", "m_outcome_x", "t_id_y", "t_date_y", "tour_wk_y", "t_name_y", "t_country_y", "t_surf_y", "t_lvl_y", "t_draw_size_y", "t_round_y", "t_rd_num_y", "m_best_of_y", "m_score_y", "m_time(m)_y", "p_id_y", "p_name_y", "p_rank_y", "p_rank_pts_y", "p_country_y", "p_ent_y", "p_hd_y", "p_ht_y", "p_age_y", "p_svpt_y", "p_1stWon_y", "p_2ndWon_y", "p_SvGms_y", "p_ace_y", "p_bpSaved_y", "p_bpFaced_y", "opp_id_y", "opp_svpt_y", "opp_ace_y", "opp_bpSaved_y", "opp_bpFaced_y", "p_pts_won%_y", "p_sv_pts_won%_y", "p_ret_pts_won%_y", "m_tot_pts_y", "m_outcome_y", "p_ace%_y", "p_aced%_y", "p_bp_save%_y", "p_bp_convert%_y", "p_pts_won%_std_l60_y", "p_sv_pts_won%_std_l60_y", "p_ret_pts_won%_std_l60_y", "p_matches_y", "p_tot_time_l5_decay_y", "p_tot_time_l5_y", "p_stamina_adj_fatigue_decay_y", "p_stamina_adj_fatigue_y", "p_tot_pts_l5_decay_y", "p_tot_pts_l5_y", "p_H2H_w_y"], axis = 1)
# Re-add the outcome flag (m_outcome_x was dropped above); it lands as the last column
df_winners2["m_outcome"] = 1

# Renaming columns to remove winner-loser descriptions so we can re-concatenate winners and losers.
# Labels are applied by POSITION, so this list must follow the exact column order left by the drop above.
df_winners2 = df_winners2.set_axis(["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "m_num", "t_round", "t_rd_num", "m_best_of", "m_score","m_time(m)", "p_id", "p_name","p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "opp_id", "p_pts_won%", "p_sv_pts_won%", "p_ret_pts_won%", "m_tot_pts", "p_pts_won%_l60_decay", "p_sv_pts_won%_l60_decay", "p_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_matches", "p_tot_time_l5_decay", "p_tot_time_l5", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue", "p_tot_pts_l5_decay", "p_tot_pts_l5", "p_H2H_w", "p_opp_pts_won%_l60_decay", "p_opp_sv_pts_won%_l60_decay", "p_opp_ret_pts_won%_l60_decay", "p_opp_ace%_l60_decay", "p_opp_aced%_l60_decay", "p_opp_bp_save%_l60", "p_opp_bp_convert%_l60", "m_outcome"], axis=1)

# Loser's view: keep the winner-side (_x) tournament/match descriptors, m_tot_pts and lookback
# aggregates (these become p_opp_*), plus the loser's own (_y) identity and feature columns; drop the rest
df_losers2 = df_match2.drop(["p_id_x", "p_name_x", "p_rank_x", "p_rank_pts_x", "p_country_x", "p_ent_x", "p_hd_x", "p_ht_x", "p_age_x", "p_svpt_x", "p_1stWon_x", "p_2ndWon_x", "p_SvGms_x", "p_ace_x", "p_bpSaved_x", "p_bpFaced_x", "opp_id_x", "opp_svpt_x", "opp_ace_x", "opp_bpSaved_x", "opp_bpFaced_x", "p_pts_won%_x", "p_sv_pts_won%_x", "p_ret_pts_won%_x", "m_outcome_x", "p_ace%_x", "p_aced%_x", "p_bp_save%_x", "p_bp_convert%_x", "p_pts_won%_std_l60_x", "p_sv_pts_won%_std_l60_x", "p_ret_pts_won%_std_l60_x", "p_matches_x", "p_tot_time_l5_decay_x", "p_tot_time_l5_x", "p_stamina_adj_fatigue_decay_x", "p_stamina_adj_fatigue_x", "p_tot_pts_l5_decay_x", "p_tot_pts_l5_x", "p_H2H_w_x", "t_id_y", "t_date_y", "tour_wk_y", "t_name_y", "t_country_y", "t_surf_y", "t_lvl_y", "t_draw_size_y", "t_round_y", "t_rd_num_y", "m_best_of_y", "m_score_y", "m_time(m)_y", "p_svpt_y", "p_1stWon_y", "p_2ndWon_y", "p_SvGms_y", "p_ace_y", "p_bpSaved_y", "p_bpFaced_y", "opp_svpt_y", "opp_ace_y", "opp_bpSaved_y", "opp_bpFaced_y", "m_tot_pts_y", "m_outcome_y"], axis = 1)
# Re-add the outcome flag (both suffixed copies were dropped above); it lands as the last column
df_losers2["m_outcome"] = 0

# Renaming columns to remove winner-loser descriptions so we can re-concatenate winners and losers.
# Labels are applied by POSITION: the winner-side leftovers come first (as p_opp_*), then the loser's own columns.
df_losers2 = df_losers2.set_axis(["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "m_num", "t_round", "t_rd_num", "m_best_of", "m_score","m_time(m)", "m_tot_pts", "p_opp_pts_won%_l60_decay", "p_opp_sv_pts_won%_l60_decay", "p_opp_ret_pts_won%_l60_decay", "p_opp_ace%_l60_decay", "p_opp_aced%_l60_decay", "p_opp_bp_save%_l60", "p_opp_bp_convert%_l60", "p_id", "p_name","p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "opp_id", "p_pts_won%", "p_sv_pts_won%", "p_ret_pts_won%", "p_pts_won%_l60_decay", "p_sv_pts_won%_l60_decay", "p_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_matches", "p_tot_time_l5_decay", "p_tot_time_l5", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue", "p_tot_pts_l5_decay", "p_tot_pts_l5", "p_H2H_w", "m_outcome"], axis=1)

#Re-merge data, but now with no separate columns for winners and losers 
df_player2 = pd.concat([df_winners2, df_losers2], ignore_index=True)
df_player2 = df_player2[["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "t_round", "t_rd_num", "m_num", "m_best_of", "m_outcome", "m_score","m_time(m)", "m_tot_pts", "p_id", "p_name", "opp_id", "p_H2H_w", "p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches", "p_pts_won%", "p_pts_won%_l60_decay", "p_sv_pts_won%", "p_sv_pts_won%_l60_decay", "p_ret_pts_won%", "p_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60',  "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue", "p_tot_pts_l5", "p_tot_pts_l5_decay", "p_opp_pts_won%_l60_decay", "p_opp_sv_pts_won%_l60_decay", "p_opp_ret_pts_won%_l60_decay", "p_opp_ace%_l60_decay", "p_opp_aced%_l60_decay", "p_opp_bp_save%_l60", "p_opp_bp_convert%_l60"]]
df_player2 = df_player2.sort_values(by=['p_id','tour_wk','t_rd_num'], ascending = False)

In [32]:
# Preview the first 20 rows of the re-stacked by-player frame (one row per player per match)
df_player2.head(20)

Unnamed: 0,t_id,t_date,tour_wk,t_name,t_country,t_surf,t_lvl,t_draw_size,t_round,t_rd_num,...,p_stamina_adj_fatigue,p_tot_pts_l5,p_tot_pts_l5_decay,p_opp_pts_won%_l60_decay,p_opp_sv_pts_won%_l60_decay,p_opp_ret_pts_won%_l60_decay,p_opp_ace%_l60_decay,p_opp_aced%_l60_decay,p_opp_bp_save%_l60,p_opp_bp_convert%_l60
34850,2019-560,20190826,2019_24,US Open,USA,Hard,4,128,R128,1.0,...,300.0,500.0,300.0,47.61,61.44,33.11,5.86,10.52,59.47,38.93
18294,2019-M014,20191014,2019_29,Moscow,RUS,Hard,1,32,R32,1.0,...,300.0,500.0,300.0,,,,,,,
19382,2019-M004,20190225,2019_07,Acapulco,MEX,Hard,1,32,R32,1.0,...,300.0,500.0,300.0,46.68,58.45,34.84,3.87,9.01,58.49,35.1
0,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,F,5.0,...,142.41,603.0,373.4,53.06,68.31,38.59,6.07,9.06,57.58,45.8
1,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,SF,4.0,...,139.27,580.0,366.4,51.35,67.1,36.52,5.63,12.56,53.44,41.7
18384,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,RR3,3.0,...,125.23,528.0,320.0,49.42,65.74,31.96,9.58,7.74,57.94,45.38
2,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,RR2,2.0,...,129.37,540.0,340.0,49.96,63.37,36.69,2.96,9.5,50.61,43.73
3,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,RR1,1.0,...,121.64,500.0,300.0,49.46,64.66,34.04,8.05,11.43,61.53,38.22
28706,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,R16,2.0,...,136.92,536.0,336.0,52.31,65.07,39.02,9.34,5.12,62.93,41.97
4,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,R32,1.0,...,127.59,500.0,300.0,50.24,65.97,34.59,6.0,9.95,62.1,41.96


In [None]:
# Summarize columns, dtypes and non-null counts of the by-player frame
df_player2.info()

In [33]:
# Save the by-player frame to CSV (row index omitted) so it can be reviewed outside the notebook
df_player2.to_csv('../data/df_player2.csv', index=False)

### "Strength of Schedule" Calculation and Adjustment for Predictive Features

For each past-performance-based predictive feature (computed per player over the last 60 surface-specific matches prior to the match being predicted on), a "Strength of Schedule" (SOS) adjustment is calculated below. For a given player, time decay-weighted (for most features) performance over the last 60 matches (which we are using to predict performance in the match at hand) is adjusted by how far above or below the sample average that player's opponents during that stretch had THEMSELVES performed over THEIR last 60 matches heading into their match with the player of interest. Strength-of-schedule adjustment is common practice in team sports, but here it requires much more computation because it is recomputed relative to each match in a very large sample per player (in the NFL, for example, you only need to re-compute 16 times per season).

In [34]:
# Strength of Schedule (SOS) building blocks for % TOTAL points won.
# For each (player, surface) history, average the opponents' own pre-match, decay-weighted
# l60 % points won over six consecutive 10-match buckets looking back from each match
# (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by at least one row,
# so the match being predicted on never contributes to its own bucket values.

# Reverse the row order before rolling. df_player2 was sorted descending by
# p_id / tour_wk / t_rd_num when it was assembled, so this walks each player's matches
# in ascending order (assumes nothing has re-sorted the frame since).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_pts_won%_l60_decay'
_sos_prefix = 'p_pts_won%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]


In [35]:
# Collapse the six SOS buckets into one decay-weighted figure per row (most recent bucket
# weighted 6, oldest weighted 1), then express it as the % of points the player's opponents
# were expected to YIELD (100 - opponents' expected % points won). This is later contrasted
# with the player's ACTUAL l60 performance before the match being predicted on.
_sos_weights = {
    'p_pts_won%_SOS_1to10': 6,
    'p_pts_won%_SOS_11to20': 5,
    'p_pts_won%_SOS_21to30': 4,
    'p_pts_won%_SOS_31to40': 3,
    'p_pts_won%_SOS_41to50': 2,
    'p_pts_won%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_pts%"] = (100 - _sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [36]:
# Surface-level baselines for the total-points SOS adjustment: the sample mean of the
# opponents' pre-match l60 % points won on each surface, flipped to the % of points the
# field ALLOWS on average (100 - mean).
_opp_pts_won = df_player2['p_opp_pts_won%_l60_decay']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS1 = 100 - _opp_pts_won[_clay_rows].mean()
mean_hard_SOS1 = 100 - _opp_pts_won[_hard_rows].mean()
mean_clay_SOS1, mean_hard_SOS1

(49.74067648486775, 49.83854232313211)

In [37]:
# SOS-adjust the player's decay-weighted l60 % points won: scale it by the ratio of what the
# average field allows on that surface to what this player's actual opponents were expected
# to allow. A ratio above 1 (opponents expected to yield less than the field average)
# raises the player's figure; a ratio below 1 lowers it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.
for _surface, _field_allows in (("Clay", mean_clay_SOS1), ("Hard", mean_hard_SOS1)):
    _on_surface = df_player2["t_surf"] == _surface
    _schedule_ratio = _field_allows / df_player2["p_expected_opp_yield_pts%"]
    df_player2.loc[_on_surface, "p_SOS_adj_pts_won%_l60_decay"] = (
        df_player2["p_pts_won%_l60_decay"] * _schedule_ratio
    ).round(2)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2b.csv', index=False)

In [38]:
# Strength of Schedule (SOS) building blocks for opponents' % SERVE points won.
# For each (player, surface) history, average the opponents' own pre-match, decay-weighted
# l60 % serve points won over six consecutive 10-match buckets looking back from each match
# (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by at least one row,
# so the match being predicted on never contributes to its own bucket values.

# Reverse the row order before rolling (assumes df_player2 is still in the descending
# p_id / tour_wk / t_rd_num order it was built with, so matches are walked in ascending order).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_sv_pts_won%_l60_decay'
_sos_prefix = 'p_sv_pts_won%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]


In [39]:
# Collapse the six serve-strength SOS buckets into one decay-weighted figure per row (most
# recent bucket weighted 6, oldest weighted 1), then express it as the % of RETURN points
# the player's opponents were expected to YIELD (100 - opponents' expected % serve points
# won). This is later contrasted with the player's ACTUAL l60 return performance.
_sos_weights = {
    'p_sv_pts_won%_SOS_1to10': 6,
    'p_sv_pts_won%_SOS_11to20': 5,
    'p_sv_pts_won%_SOS_21to30': 4,
    'p_sv_pts_won%_SOS_31to40': 3,
    'p_sv_pts_won%_SOS_41to50': 2,
    'p_sv_pts_won%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_ret_pts%"] = (100 - _sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [40]:
# Surface-level baselines for the return-points SOS adjustment: the sample mean of the
# opponents' pre-match l60 % serve points won on each surface, flipped to the % of RETURN
# points the field ALLOWS on average (100 - mean).
_opp_sv_pts_won = df_player2['p_opp_sv_pts_won%_l60_decay']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS2 = 100 - _opp_sv_pts_won[_clay_rows].mean()
mean_hard_SOS2 = 100 - _opp_sv_pts_won[_hard_rows].mean()
mean_clay_SOS2, mean_hard_SOS2

(37.810126234018405, 35.88599084905211)

In [41]:
# SOS-adjust the player's decay-weighted l60 % RETURN points won: scale it by the ratio of
# the return points the average field allows on that surface to the return points this
# player's actual opponents were expected to allow. A ratio above 1 (opponents expected to
# yield less than the field average) raises the player's figure; below 1 lowers it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.
for _surface, _field_allows in (("Clay", mean_clay_SOS2), ("Hard", mean_hard_SOS2)):
    _on_surface = df_player2["t_surf"] == _surface
    _schedule_ratio = _field_allows / df_player2["p_expected_opp_yield_ret_pts%"]
    df_player2.loc[_on_surface, "p_SOS_adj_ret_pts_won%_l60_decay"] = (
        df_player2["p_ret_pts_won%_l60_decay"] * _schedule_ratio
    ).round(2)

In [42]:
# Strength of Schedule (SOS) building blocks for opponents' % RETURN points won.
# For each (player, surface) history, average the opponents' own pre-match, decay-weighted
# l60 % return points won over six consecutive 10-match buckets looking back from each match
# (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by at least one row,
# so the match being predicted on never contributes to its own bucket values.

# Reverse the row order before rolling (assumes df_player2 is still in the descending
# p_id / tour_wk / t_rd_num order it was built with, so matches are walked in ascending order).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_ret_pts_won%_l60_decay'
_sos_prefix = 'p_ret_pts_won%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]


In [43]:
# Collapse the six return-strength SOS buckets into one decay-weighted figure per row (most
# recent bucket weighted 6, oldest weighted 1), then express it as the % of SERVE points
# the player's opponents were expected to YIELD (100 - opponents' expected % return points
# won). This is later contrasted with the player's ACTUAL l60 serve performance.
_sos_weights = {
    'p_ret_pts_won%_SOS_1to10': 6,
    'p_ret_pts_won%_SOS_11to20': 5,
    'p_ret_pts_won%_SOS_21to30': 4,
    'p_ret_pts_won%_SOS_31to40': 3,
    'p_ret_pts_won%_SOS_41to50': 2,
    'p_ret_pts_won%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_sv_pts%"] = (100 - _sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [44]:
# Surface-level baselines for the serve-points SOS adjustment: the sample mean of the
# opponents' pre-match l60 % return points won on each surface, flipped to the % of SERVE
# points the field ALLOWS on average (100 - mean).
_opp_ret_pts_won = df_player2['p_opp_ret_pts_won%_l60_decay']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS3 = 100 - _opp_ret_pts_won[_clay_rows].mean()
mean_hard_SOS3 = 100 - _opp_ret_pts_won[_hard_rows].mean()
mean_clay_SOS3, mean_hard_SOS3

(61.6650712089336, 63.762513920662926)

In [45]:
# SOS-adjust the player's decay-weighted l60 % SERVE points won: scale it by the ratio of
# the serve points the average field allows on that surface to the serve points this
# player's actual opponents were expected to allow. A ratio above 1 (opponents expected to
# yield less than the field average) raises the player's figure; below 1 lowers it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.
for _surface, _field_allows in (("Clay", mean_clay_SOS3), ("Hard", mean_hard_SOS3)):
    _on_surface = df_player2["t_surf"] == _surface
    _schedule_ratio = _field_allows / df_player2["p_expected_opp_yield_sv_pts%"]
    df_player2.loc[_on_surface, "p_SOS_adj_sv_pts_won%_l60_decay"] = (
        df_player2["p_sv_pts_won%_l60_decay"] * _schedule_ratio
    ).round(2)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2c.csv', index=False)

In [46]:
# Strength of Schedule (SOS) building blocks for opponents' ACE %.
# For each (player, surface) history, average the opponents' own pre-match, decay-weighted
# l60 ace % over six consecutive 10-match buckets looking back from each match
# (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by at least one row,
# so the match being predicted on never contributes to its own bucket values.

# Reverse the row order before rolling (assumes df_player2 is still in the descending
# p_id / tour_wk / t_rd_num order it was built with, so matches are walked in ascending order).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_ace%_l60_decay'
_sos_prefix = 'p_ace%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]


In [47]:
# Collapse the six opponent ace% SOS buckets into one decay-weighted figure per row (most
# recent bucket weighted 6, oldest weighted 1). No "100 - x" flip is applied here: the
# opponents' expected ace % is used directly as the ACED % the player was expected to
# concede. This is later contrasted with the player's ACTUAL l60 aced %.
_sos_weights = {
    'p_ace%_SOS_1to10': 6,
    'p_ace%_SOS_11to20': 5,
    'p_ace%_SOS_21to30': 4,
    'p_ace%_SOS_31to40': 3,
    'p_ace%_SOS_41to50': 2,
    'p_ace%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_aced%"] = (_sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [48]:
# Surface-level baselines for the aced% SOS adjustment: the sample mean of the opponents'
# pre-match l60 ace % on each surface. The mean is used as-is (no "100 - x" flip), since
# the opponents' ace % is itself the ACED % a player facing the field concedes on average.
_opp_ace_pct = df_player2['p_opp_ace%_l60_decay']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS4 = _opp_ace_pct[_clay_rows].mean()
mean_hard_SOS4 = _opp_ace_pct[_hard_rows].mean()
mean_clay_SOS4, mean_hard_SOS4

(5.607325619032206, 8.752371045020887)

In [49]:
# SOS-adjust the player's decay-weighted l60 ACED %: scale it by the ratio of the average
# field's ace % on that surface to the ace % this player's actual opponents were expected
# to produce. Facing bigger-than-average servers (ratio below 1) lowers the player's aced %;
# facing weaker-than-average servers (ratio above 1) raises it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.

# The expected ace % can be exactly 0 (e.g. a short history of opponents who had hit no
# aces), which would turn the ratio into +/-inf. Treat those rows as "no usable schedule
# information" by masking the zero denominator to NaN.
_expected_aced = df_player2["p_expected_opp_yield_aced%"]
_expected_aced = _expected_aced.where(_expected_aced != 0)

for _surface, _field_mean in (("Clay", mean_clay_SOS4), ("Hard", mean_hard_SOS4)):
    _on_surface = df_player2["t_surf"] == _surface
    df_player2.loc[_on_surface, "p_SOS_adj_aced%_l60_decay"] = (
        df_player2["p_aced%_l60_decay"] * (_field_mean / _expected_aced)
    ).round(2)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2d.csv', index=False)

In [50]:
# Strength of Schedule (SOS) building blocks for opponents' ACED %.
# For each (player, surface) history, average the opponents' own pre-match, decay-weighted
# l60 aced % over six consecutive 10-match buckets looking back from each match
# (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by at least one row,
# so the match being predicted on never contributes to its own bucket values.

# Reverse the row order before rolling (assumes df_player2 is still in the descending
# p_id / tour_wk / t_rd_num order it was built with, so matches are walked in ascending order).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_aced%_l60_decay'
_sos_prefix = 'p_aced%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]


In [51]:
# Collapse the six opponent aced% SOS buckets into one decay-weighted figure per row (most
# recent bucket weighted 6, oldest weighted 1). No "100 - x" flip is applied here: the
# opponents' expected aced % is used directly as the ACE % the player was expected to
# achieve against them. This is later contrasted with the player's ACTUAL l60 ace %.
_sos_weights = {
    'p_aced%_SOS_1to10': 6,
    'p_aced%_SOS_11to20': 5,
    'p_aced%_SOS_21to30': 4,
    'p_aced%_SOS_31to40': 3,
    'p_aced%_SOS_41to50': 2,
    'p_aced%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_ace%"] = (_sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [52]:
# Surface-level baselines for the ace% SOS adjustment: the sample mean of the opponents'
# pre-match l60 aced % on each surface. The mean is used as-is (no "100 - x" flip), since
# the opponents' aced % is itself the ACE % the field allows on average.
_opp_aced_pct = df_player2['p_opp_aced%_l60_decay']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS5 = _opp_aced_pct[_clay_rows].mean()
mean_hard_SOS5 = _opp_aced_pct[_hard_rows].mean()
mean_clay_SOS5, mean_hard_SOS5

(5.38411393429361, 8.504145551862615)

In [53]:
# SOS-adjust the player's decay-weighted l60 ACE %: scale it by the ratio of the ace % the
# average field allows on that surface to the ace % this player's actual opponents were
# expected to allow. Facing opponents who get aced less than average (ratio above 1) raises
# the player's ace %; facing opponents who get aced more than average lowers it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.

# The expected allowed ace % can be exactly 0 (e.g. a short history of opponents who had
# never been aced), which would turn the ratio into +/-inf. Treat those rows as "no usable
# schedule information" by masking the zero denominator to NaN.
_expected_ace = df_player2["p_expected_opp_yield_ace%"]
_expected_ace = _expected_ace.where(_expected_ace != 0)

for _surface, _field_mean in (("Clay", mean_clay_SOS5), ("Hard", mean_hard_SOS5)):
    _on_surface = df_player2["t_surf"] == _surface
    df_player2.loc[_on_surface, "p_SOS_adj_ace%_l60_decay"] = (
        df_player2["p_ace%_l60_decay"] * (_field_mean / _expected_ace)
    ).round(2)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2e.csv', index=False)

In [54]:
# Strength of Schedule (SOS) building blocks for opponents' BREAK POINT SAVE %.
# For each (player, surface) history, average the opponents' own pre-match l60 break point
# save % (this input is NOT decay-weighted) over six consecutive 10-match buckets looking
# back from each match (previous 1-10, 11-20, ..., 51-60). Every rolling mean is shifted by
# at least one row, so the match being predicted on never contributes to its own buckets.

# Reverse the row order before rolling (assumes df_player2 is still in the descending
# p_id / tour_wk / t_rd_num order it was built with, so matches are walked in ascending order).
df_player2 = df_player2.iloc[::-1]

_sos_keys = ['p_id', 't_surf']
_sos_source = 'p_opp_bp_save%_l60'
_sos_prefix = 'p_bp_save%_SOS_'

# Most recent bucket: one prior opponent with data is enough to produce a value.
df_player2[_sos_prefix + '1to10'] = df_player2.groupby(_sos_keys)[_sos_source].transform(
    lambda hist: hist.rolling(window=10, min_periods=1).mean().round(2).shift(1)
)

# Older buckets require at least 3 opponents with data inside the window; when a bucket
# is unavailable it inherits the (already back-filled) value of the next more recent bucket.
_sos_newer = _sos_prefix + '1to10'
for _sos_lag in (11, 21, 31, 41, 51):
    _sos_bucket = f'{_sos_prefix}{_sos_lag}to{_sos_lag + 9}'
    _sos_rolled = df_player2.groupby(_sos_keys)[_sos_source].transform(
        lambda hist: hist.rolling(window=10, min_periods=3).mean().round(2).shift(_sos_lag)
    )
    df_player2[_sos_bucket] = _sos_rolled.fillna(df_player2[_sos_newer])
    _sos_newer = _sos_bucket

# Put the rows back in their original order.
df_player2 = df_player2.iloc[::-1]

In [55]:
# Collapse the six break-point-save SOS buckets into one decay-weighted figure per row (most
# recent bucket weighted 6, oldest weighted 1), then express it as the BREAK POINT
# CONVERSION % the player's opponents were expected to YIELD (100 - opponents' expected
# break point save %). This is later contrasted with the player's ACTUAL l60 conversion %.
_sos_weights = {
    'p_bp_save%_SOS_1to10': 6,
    'p_bp_save%_SOS_11to20': 5,
    'p_bp_save%_SOS_21to30': 4,
    'p_bp_save%_SOS_31to40': 3,
    'p_bp_save%_SOS_41to50': 2,
    'p_bp_save%_SOS_51to60': 1,
}

# Summed bucket by bucket, newest first; a NaN in any bucket makes the row's result NaN.
_sos_weighted_total = sum(df_player2[_col] * _w for _col, _w in _sos_weights.items())
df_player2["p_expected_opp_yield_bp_convert%"] = (100 - _sos_weighted_total / sum(_sos_weights.values())).round(2)

# The per-bucket columns were only scaffolding; remove them.
df_player2.drop(columns=list(_sos_weights), inplace=True)

In [56]:
# Surface-level baselines for the break-point-conversion SOS adjustment: the sample mean of
# the opponents' pre-match l60 break point save % on each surface, flipped to the % of
# BREAK POINT CONVERSIONS the field ALLOWS on average (100 - mean).
_opp_bp_save = df_player2['p_opp_bp_save%_l60']
_clay_rows = df_player2['t_surf'] == "Clay"
_hard_rows = df_player2['t_surf'] == "Hard"

mean_clay_SOS6 = 100 - _opp_bp_save[_clay_rows].mean()
mean_hard_SOS6 = 100 - _opp_bp_save[_hard_rows].mean()
mean_clay_SOS6, mean_hard_SOS6

(42.236740362811396, 40.70573353487378)

In [57]:
# SOS-adjust the player's l60 BREAK POINT CONVERSION %: scale it by the ratio of the
# conversion % the average field allows on that surface to the conversion % this player's
# actual opponents were expected to allow. Facing opponents who save more break points than
# average (ratio above 1) raises the player's figure; facing weaker savers lowers it.
# Only Clay and Hard rows are filled; rows on any other surface are left as NaN.

# The expected allowed conversion % can be exactly 0 (opponents whose prior break point
# save % averaged 100, e.g. over a very short history), which would turn the ratio into
# +/-inf. Treat those rows as "no usable schedule information" by masking the zero
# denominator to NaN.
_expected_bp_convert = df_player2["p_expected_opp_yield_bp_convert%"]
_expected_bp_convert = _expected_bp_convert.where(_expected_bp_convert != 0)

for _surface, _field_mean in (("Clay", mean_clay_SOS6), ("Hard", mean_hard_SOS6)):
    _on_surface = df_player2["t_surf"] == _surface
    df_player2.loc[_on_surface, "p_SOS_adj_bp_convert%_l60"] = (
        df_player2["p_bp_convert%_l60"] * (_field_mean / _expected_bp_convert)
    ).round(2)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2f.csv', index=False)

In [58]:
# Break point converted% 'Strength of Schedule' for the past 60 opponents of a given player in a given match.
# Each opponent's (non decay-weighted) last-60 performance prior to facing the player of interest is used
# (surface-specific). From this we get the "expected" performance over the player's last 60 matches, which is later
# used to SOS-adjust the player's own last-60 figure on the surface (see the calculations that follow).

def _add_bp_convert_sos_windows(frame):
    """Add the six lagged 10-row SOS window columns for opponent bp convert% to `frame` in place.

    Each window is a rolling mean (rounded to 2 dp) of 'p_opp_bp_convert%_l60' within a (p_id, t_surf) group,
    shifted so that it covers rows 1-10, 11-20, ... 51-60 before the current row. Windows beyond the first need
    at least 3 observations; where one is missing it falls back to the (already filled) next-nearer window.
    """
    def lagged_window_mean(lag, min_obs):
        return frame.groupby(['p_id','t_surf'])['p_opp_bp_convert%_l60'].transform(
            lambda s: s.rolling(window=10, min_periods=min_obs).mean().round(2).shift(lag)
        )

    frame['p_bp_convert%_SOS_1to10'] = lagged_window_mean(1, 1)

    nearer = 'p_bp_convert%_SOS_1to10'
    for first in (11, 21, 31, 41, 51):
        label = 'p_bp_convert%_SOS_' + str(first) + 'to' + str(first + 9)
        frame[label] = lagged_window_mean(first, 3).fillna(frame[nearer])
        nearer = label


# Row order is flipped for the rolling calculation and restored afterwards
# (presumably the frame is stored newest-first - verify against the upstream sort).
df_player2 = df_player2.iloc[::-1]

_add_bp_convert_sos_windows(df_player2)

df_player2 = df_player2.iloc[::-1]

In [59]:
# Blend the six 10-match SOS windows into a single decay-weighted average (weights 6, 5, 4, 3, 2, 1 for
# windows 1to10 ... 51to60; the weights sum to 21). The blended opponent bp convert% is then flipped (100 - x) so it
# reads as the BP SAVE% the player's last-60 surface-specific opponents were expected to YIELD. This is contrasted
# directly with the player's ACTUAL bp save% over the last 60 prior to the match to be predicted on.

df_player2["p_expected_opp_yield_bp_save%"] = (
    df_player2['p_bp_convert%_SOS_1to10'].mul(6)
    .add(df_player2['p_bp_convert%_SOS_11to20'].mul(5))
    .add(df_player2['p_bp_convert%_SOS_21to30'].mul(4))
    .add(df_player2['p_bp_convert%_SOS_31to40'].mul(3))
    .add(df_player2['p_bp_convert%_SOS_41to50'].mul(2))
    .add(df_player2['p_bp_convert%_SOS_51to60'].mul(1))
    .div(21)
    .rsub(100)
    .round(2)
)

# The per-window columns were only needed for the blend above, so remove them
df_player2.drop(
    columns=[
        "p_bp_convert%_SOS_1to10",
        "p_bp_convert%_SOS_11to20",
        "p_bp_convert%_SOS_21to30",
        "p_bp_convert%_SOS_31to40",
        "p_bp_convert%_SOS_41to50",
        "p_bp_convert%_SOS_51to60",
    ],
    inplace=True,
)

In [60]:
# Surface-specific sample means of opponent l60 bp convert%, flipped (100 - mean) so they are expressed as the
# pct of BREAK PTS SAVED the field ALLOWS on average. These are the baselines used to scale a player's l60
# performance by how their opponents' l60 performance (surface-specific) compares with the sample mean.

mean_clay_SOS7 = 100 - df_player2.loc[df_player2['t_surf'].eq("Clay"), 'p_opp_bp_convert%_l60'].mean()
mean_hard_SOS7 = 100 - df_player2.loc[df_player2['t_surf'].eq("Hard"), 'p_opp_bp_convert%_l60'].mean()
mean_clay_SOS7, mean_hard_SOS7

(56.84343651243825, 58.91272000691238)

In [61]:
# Combines the pieces above: the player's actual bp save% over the last 60 is scaled by
# (surface mean yield / expected yield of the opponents actually faced), where the expected yield comes from the
# opponents' aggregate performance over THEIR l60 prior to facing the player.
# If the opponents were expected to yield less than the surface average, the ratio is above 1 and the player's
# figure is scaled up; if they were expected to yield more, it is scaled down.
df_player2.loc[df_player2["t_surf"].eq("Clay"), "p_SOS_adj_bp_save%_l60"] = (
    df_player2["p_bp_save%_l60"]
    .mul(df_player2["p_expected_opp_yield_bp_save%"].rdiv(mean_clay_SOS7))
    .round(2)
)

df_player2.loc[df_player2["t_surf"].eq("Hard"), "p_SOS_adj_bp_save%_l60"] = (
    df_player2["p_bp_save%_l60"]
    .mul(df_player2["p_expected_opp_yield_bp_save%"].rdiv(mean_hard_SOS7))
    .round(2)
)

In [None]:
#Save to review
#df_player2.to_csv('../data/df_player2g.csv', index=False)

In [None]:
# Review dtypes and non-null counts of the by-player frame now that the SOS-adjusted break point columns exist
df_player2.info()

In [62]:
# Keep only the columns needed downstream (tournament/match info, player info, targets and retrospective features)
# before converting back to by-match format, where the within-match player differentials are calculated.
df_player3 = df_player2.loc[:, [
    # tournament / match descriptors
    "t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "t_rd_num",
    "m_num", "m_best_of", "m_outcome", "m_time(m)", "m_tot_pts",
    # player descriptors
    "p_id", "p_name", "p_H2H_w", "p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches",
    # points won (total / serve / return): in-match value, l60 decay, SOS-adjusted l60 decay
    "p_pts_won%", "p_pts_won%_l60_decay", "p_SOS_adj_pts_won%_l60_decay",
    "p_sv_pts_won%", "p_sv_pts_won%_l60_decay", "p_SOS_adj_sv_pts_won%_l60_decay",
    "p_ret_pts_won%", "p_ret_pts_won%_l60_decay", "p_SOS_adj_ret_pts_won%_l60_decay",
    # aces hit / aces conceded
    "p_ace%", "p_ace%_l60_decay", "p_SOS_adj_ace%_l60_decay",
    "p_aced%", "p_aced%_l60_decay", "p_SOS_adj_aced%_l60_decay",
    # break points saved / converted
    "p_bp_save%", "p_bp_save%_l60", "p_SOS_adj_bp_save%_l60",
    "p_bp_convert%", "p_bp_convert%_l60", "p_SOS_adj_bp_convert%_l60",
    # l60 variability
    "p_pts_won%_std_l60", "p_sv_pts_won%_std_l60", "p_ret_pts_won%_std_l60",
    # recent workload / fatigue
    "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue",
    "p_tot_pts_l5", "p_tot_pts_l5_decay",
]]

In [None]:
# Check the columns, dtypes and non-null counts of the trimmed by-player frame
df_player3.info()

In [63]:
#Save prior to conversion back to by-match format for computation of player differential features per match.
#df_player3.to_csv('../data/df_player3.csv', index=False)

### Player vs Player Differentials in Predictive Features By Match

We now convert the dataframe back to by-match format to compute player differentials for the predictive features aligned to each match to be predicted on. A few additional by-player features are computed in the process as well. After these features are computed, the dataframe is converted back to by-player format for output to EDA.

In [64]:
# Split the by-player frame into winner rows (m_outcome == 1) and loser rows (m_outcome == 0)
df_winners3 = df_player3.loc[df_player3['m_outcome'].eq(1)]
df_losers3 = df_player3.loc[df_player3['m_outcome'].eq(0)]

# Pair winner and loser on match number: winner columns get the _x suffix, loser columns _y.
# Tournament/match-level fields appear on both sides, so the loser-side (_y) copies are discarded
# and the winner-side (_x) copies get their original names back.
df_match3 = (
    df_winners3
    .merge(df_losers3, on='m_num', how = 'left')
    .drop(columns=[
        "t_id_y", "t_date_y", "tour_wk_y", "t_name_y", "t_country_y", "t_surf_y",
        "t_lvl_y", "t_draw_size_y", "t_rd_num_y", "m_best_of_y", "m_time(m)_y", "m_tot_pts_y",
    ])
    .rename(columns={
        't_id_x': 't_id',
        't_date_x': 't_date',
        'tour_wk_x': 'tour_wk',
        't_name_x': 't_name',
        't_country_x': 't_country',
        't_surf_x': 't_surf',
        't_lvl_x': 't_lvl',
        't_draw_size_x': 't_draw_size',
        't_rd_num_x': 't_rd_num',
        'm_best_of_x': 'm_best_of',
        'm_time(m)_x': 'm_time(m)',
        'm_tot_pts_x': 'm_tot_pts',
    })
)

#### Rankings-Related Player Differential Predictive Features By Match

In [67]:
# ATP ranking differential, winner (_x) vs loser (_y) and loser vs winner.
# The sign is flipped so that, consistent with the ranking points differential, a positive number means a
# better ranking than the opponent.
max_winners, max_losers = df_match3['p_rank_x'].max(), df_match3['p_rank_y'].max()
max_sample = max(max_winners, max_losers)

# A player with no ranking is placed one spot below the worst ranking seen in the sample
df_match3[['p_rank_x', 'p_rank_y']] = df_match3[['p_rank_x', 'p_rank_y']].fillna(max_sample + 1)

df_match3["p_rank_diff_x"] = df_match3["p_rank_x"].sub(df_match3["p_rank_y"]).mul(-1)
df_match3["p_rank_diff_y"] = df_match3["p_rank_diff_x"].mul(-1)

In [68]:
# Natural log of each player's ranking (rounded to 2 dp), then the sign-flipped difference.
# Rationale: one ranking place is assumed to separate players more the closer they are to the top of the rankings.
df_match3["p_log_rank_x"] = df_match3["p_rank_x"].pipe(np.log).round(2)
df_match3["p_log_rank_y"] = df_match3["p_rank_y"].pipe(np.log).round(2)
df_match3["p_log_rank_diff_x"] = df_match3["p_log_rank_x"].sub(df_match3["p_log_rank_y"]).mul(-1)
df_match3["p_log_rank_diff_y"] = df_match3["p_log_rank_diff_x"].mul(-1)

In [69]:
# ATP ranking points differential, winner (_x) minus loser (_y), and the mirrored loser-vs-winner value.
# A player with no ranking points recorded is treated as having 0.
df_match3[['p_rank_pts_x', 'p_rank_pts_y']] = df_match3[['p_rank_pts_x', 'p_rank_pts_y']].fillna(0)
df_match3["p_rank_pts_diff_x"] = df_match3["p_rank_pts_x"].sub(df_match3["p_rank_pts_y"])
df_match3["p_rank_pts_diff_y"] = df_match3["p_rank_pts_diff_x"].mul(-1)

#### Basic Player Characteristics Differential Predictive Features By Match

In [70]:
# Height differential (in cm): winner (_x) minus loser (_y), and the mirrored loser-vs-winner value
df_match3["p_ht_diff_x"] = df_match3["p_ht_x"].sub(df_match3["p_ht_y"])
df_match3["p_ht_diff_y"] = df_match3["p_ht_diff_x"].mul(-1)

In [71]:
# Age differential (in years): winner (_x) minus loser (_y), and the mirrored loser-vs-winner value
df_match3["p_age_diff_x"] = df_match3["p_age_x"].sub(df_match3["p_age_y"])
df_match3["p_age_diff_y"] = df_match3["p_age_diff_x"].mul(-1)

In [72]:
# Lefty-vs-righty markers (1 = True, 0 = False):
#   _x is 1 when the winner is Left-Handed and the loser Right-Handed;
#   _y is 1 when the loser is Left-Handed and the winner Right-Handed.
df_match3['p_L_opp_R_x'] = np.where(df_match3['p_hd_x'].eq('L') & df_match3['p_hd_y'].eq('R'), 1, 0)
df_match3['p_L_opp_R_y'] = np.where(df_match3['p_hd_y'].eq('L') & df_match3['p_hd_x'].eq('R'), 1, 0)

# A small number of low-match-count players in the sample have unknown (U) handedness, even after checking the ATP site;
# matches involving them get 0 in both markers.

In [73]:
# Convert player handedness itself to a numeric encoding:
#   "L" (lefties)  -> 2
#   "R" (righties) -> 1
#   "U" (unknown)  -> 1, i.e. unknowns are grouped with the right-handers
# NOTE(review): the previous comment on this cell described unknowns as converting to 0, but the code assigns 1.
# The assigned value is left unchanged here - confirm which one is intended.
df_match3["p_hd_x"] = pd.to_numeric(df_match3["p_hd_x"].replace({"L": 2, "R": 1, "U": 1}))
df_match3["p_hd_y"] = pd.to_numeric(df_match3["p_hd_y"].replace({"L": 2, "R": 1, "U": 1}))

In [74]:
# Home-country markers (1 = True, 0 = False):
#   _x is 1 when the tourney is held in the winner's country but not in the loser's;
#   _y is 1 when the tourney is held in the loser's country but not in the winner's.
df_match3['p_HCA_opp_N_x'] = np.where(
    df_match3['t_country'].eq(df_match3['p_country_x']) & df_match3['t_country'].ne(df_match3['p_country_y']), 1, 0
)
df_match3['p_HCA_opp_N_y'] = np.where(
    df_match3['t_country'].ne(df_match3['p_country_x']) & df_match3['t_country'].eq(df_match3['p_country_y']), 1, 0
)

#### Retrospective Player Performance Differential Predictive Features By Match

In [75]:
# % total points won in the previous 60 matches (surface-specific; decay-weighted): winner (_x) minus loser (_y),
# plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_pts_won%_l60_decay_diff_x"] = df_match3["p_pts_won%_l60_decay_x"].sub(df_match3["p_pts_won%_l60_decay_y"])
df_match3["p_pts_won%_l60_decay_diff_y"] = df_match3["p_pts_won%_l60_decay_diff_x"].mul(-1)

In [76]:
# % total points won in the previous 60 matches (surface-specific; decay-weighted): winner (_x) minus loser (_y),
# plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_pts_won%_l60_decay_diff_x"] = df_match3["p_SOS_adj_pts_won%_l60_decay_x"].sub(df_match3["p_SOS_adj_pts_won%_l60_decay_y"])
df_match3["p_SOS_adj_pts_won%_l60_decay_diff_y"] = df_match3["p_SOS_adj_pts_won%_l60_decay_diff_x"].mul(-1)

In [77]:
# "OFFENSE VS OFFENSE": % SERVE points won in the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_sv_pts_won%_l60_decay_diff_x"] = df_match3["p_sv_pts_won%_l60_decay_x"].sub(df_match3["p_sv_pts_won%_l60_decay_y"])
df_match3["p_sv_pts_won%_l60_decay_diff_y"] = df_match3["p_sv_pts_won%_l60_decay_diff_x"].mul(-1)

In [78]:
# "OFFENSE VS OFFENSE": % SERVE points won in the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_sv_pts_won%_l60_decay_diff_x"] = df_match3["p_SOS_adj_sv_pts_won%_l60_decay_x"].sub(df_match3["p_SOS_adj_sv_pts_won%_l60_decay_y"])
df_match3["p_SOS_adj_sv_pts_won%_l60_decay_diff_y"] = df_match3["p_SOS_adj_sv_pts_won%_l60_decay_diff_x"].mul(-1)

In [79]:
# "DEFENSE VS DEFENSE": % RETURN points won in the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_ret_pts_won%_l60_decay_diff_x"] = df_match3["p_ret_pts_won%_l60_decay_x"].sub(df_match3["p_ret_pts_won%_l60_decay_y"])
df_match3["p_ret_pts_won%_l60_decay_diff_y"] = df_match3["p_ret_pts_won%_l60_decay_diff_x"].mul(-1)

In [80]:
# "DEFENSE VS DEFENSE": % RETURN points won in the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_ret_pts_won%_l60_decay_diff_x"] = df_match3["p_SOS_adj_ret_pts_won%_l60_decay_x"].sub(df_match3["p_SOS_adj_ret_pts_won%_l60_decay_y"])
df_match3["p_SOS_adj_ret_pts_won%_l60_decay_diff_y"] = df_match3["p_SOS_adj_ret_pts_won%_l60_decay_diff_x"].mul(-1)

In [81]:
# "OFFENSE VS DEFENSE": player % SERVE points won minus OPPONENT % RETURN points won over the previous 60 matches
# (surface-specific; decay-weighted). _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_sv_opp_ret_pts_won%_l60_decay_diff_x"] = df_match3["p_sv_pts_won%_l60_decay_x"].sub(df_match3["p_ret_pts_won%_l60_decay_y"])
df_match3["p_sv_opp_ret_pts_won%_l60_decay_diff_y"] = df_match3["p_sv_pts_won%_l60_decay_y"].sub(df_match3["p_ret_pts_won%_l60_decay_x"])

In [82]:
# "OFFENSE VS DEFENSE": player % SERVE points won minus OPPONENT % RETURN points won over the previous 60 matches
# (surface-specific; decay-weighted). _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff_x"] = df_match3["p_SOS_adj_sv_pts_won%_l60_decay_x"].sub(df_match3["p_SOS_adj_ret_pts_won%_l60_decay_y"])
df_match3["p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff_y"] = df_match3["p_SOS_adj_sv_pts_won%_l60_decay_y"].sub(df_match3["p_SOS_adj_ret_pts_won%_l60_decay_x"])

In [83]:
# "DEFENSE VS OFFENSE": player % RETURN points won minus OPPONENT % SERVE points won over the previous 60 matches
# (surface-specific; decay-weighted). _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_ret_opp_sv_pts_won%_l60_decay_diff_x"] = df_match3["p_ret_pts_won%_l60_decay_x"].sub(df_match3["p_sv_pts_won%_l60_decay_y"])
df_match3["p_ret_opp_sv_pts_won%_l60_decay_diff_y"] = df_match3["p_ret_pts_won%_l60_decay_y"].sub(df_match3["p_sv_pts_won%_l60_decay_x"])

In [84]:
# "DEFENSE VS OFFENSE": player % RETURN points won minus OPPONENT % SERVE points won over the previous 60 matches
# (surface-specific; decay-weighted). _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff_x"] = df_match3["p_SOS_adj_ret_pts_won%_l60_decay_x"].sub(df_match3["p_SOS_adj_sv_pts_won%_l60_decay_y"])
df_match3["p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff_y"] = df_match3["p_SOS_adj_ret_pts_won%_l60_decay_y"].sub(df_match3["p_SOS_adj_sv_pts_won%_l60_decay_x"])

In [85]:
# "OFFENSE VS OFFENSE": player ace% minus OPPONENT ace% over the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_ace%_l60_decay_diff_x"] = df_match3["p_ace%_l60_decay_x"].sub(df_match3["p_ace%_l60_decay_y"])
df_match3["p_ace%_l60_decay_diff_y"] = df_match3["p_ace%_l60_decay_diff_x"].mul(-1)

In [86]:
# "OFFENSE VS OFFENSE": player ace% minus OPPONENT ace% over the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_ace%_l60_decay_diff_x"] = df_match3["p_SOS_adj_ace%_l60_decay_x"].sub(df_match3["p_SOS_adj_ace%_l60_decay_y"])
df_match3["p_SOS_adj_ace%_l60_decay_diff_y"] = df_match3["p_SOS_adj_ace%_l60_decay_diff_x"].mul(-1)

In [87]:
# "DEFENSE VS DEFENSE": player aced% minus OPPONENT aced% over the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_aced%_l60_decay_diff_x"] = df_match3["p_aced%_l60_decay_x"].sub(df_match3["p_aced%_l60_decay_y"])
df_match3["p_aced%_l60_decay_diff_y"] = df_match3["p_aced%_l60_decay_diff_x"].mul(-1)

In [88]:
# "DEFENSE VS DEFENSE": player aced% minus OPPONENT aced% over the previous 60 matches (surface-specific; decay-weighted),
# winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_aced%_l60_decay_diff_x"] = df_match3["p_SOS_adj_aced%_l60_decay_x"].sub(df_match3["p_SOS_adj_aced%_l60_decay_y"])
df_match3["p_SOS_adj_aced%_l60_decay_diff_y"] = df_match3["p_SOS_adj_aced%_l60_decay_diff_x"].mul(-1)

In [89]:
# "OFFENSE VS DEFENSE": player ace% minus OPPONENT aced% over the previous 60 matches (surface-specific; decay-weighted).
# _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_ace%_opp_aced%_l60_decay_diff_x"] = df_match3["p_ace%_l60_decay_x"].sub(df_match3["p_aced%_l60_decay_y"])
df_match3["p_ace%_opp_aced%_l60_decay_diff_y"] = df_match3["p_ace%_l60_decay_y"].sub(df_match3["p_aced%_l60_decay_x"])

In [90]:
# "OFFENSE VS DEFENSE": player ace% minus OPPONENT aced% over the previous 60 matches (surface-specific; decay-weighted).
# _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_ace%_opp_aced%_l60_decay_diff_x"] = df_match3["p_SOS_adj_ace%_l60_decay_x"].sub(df_match3["p_SOS_adj_aced%_l60_decay_y"])
df_match3["p_SOS_adj_ace%_opp_aced%_l60_decay_diff_y"] = df_match3["p_SOS_adj_ace%_l60_decay_y"].sub(df_match3["p_SOS_adj_aced%_l60_decay_x"])

In [91]:
# "DEFENSE VS OFFENSE": player aced% minus OPPONENT ace% over the previous 60 matches (surface-specific; decay-weighted).
# _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_aced%_opp_ace%_l60_decay_diff_x"] = df_match3["p_aced%_l60_decay_x"].sub(df_match3["p_ace%_l60_decay_y"])
df_match3["p_aced%_opp_ace%_l60_decay_diff_y"] = df_match3["p_aced%_l60_decay_y"].sub(df_match3["p_ace%_l60_decay_x"])

In [92]:
# "DEFENSE VS OFFENSE": player aced% minus OPPONENT ace% over the previous 60 matches (surface-specific; decay-weighted).
# _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_aced%_opp_ace%_l60_decay_diff_x"] = df_match3["p_SOS_adj_aced%_l60_decay_x"].sub(df_match3["p_SOS_adj_ace%_l60_decay_y"])
df_match3["p_SOS_adj_aced%_opp_ace%_l60_decay_diff_y"] = df_match3["p_SOS_adj_aced%_l60_decay_y"].sub(df_match3["p_SOS_adj_ace%_l60_decay_x"])

In [93]:
# "DEFENSE VS DEFENSE": player bp saved% minus OPPONENT bp saved% over the previous 60 matches
# (surface-specific; non-decay weighted), winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_bp_save%_l60_diff_x"] = df_match3["p_bp_save%_l60_x"].sub(df_match3["p_bp_save%_l60_y"])
df_match3["p_bp_save%_l60_diff_y"] = df_match3["p_bp_save%_l60_diff_x"].mul(-1)

In [94]:
# "DEFENSE VS DEFENSE": player bp saved% minus OPPONENT bp saved% over the previous 60 matches
# (surface-specific; non-decay weighted), winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_bp_save%_l60_diff_x"] = df_match3["p_SOS_adj_bp_save%_l60_x"].sub(df_match3["p_SOS_adj_bp_save%_l60_y"])
df_match3["p_SOS_adj_bp_save%_l60_diff_y"] = df_match3["p_SOS_adj_bp_save%_l60_diff_x"].mul(-1)

In [95]:
# "OFFENSE VS OFFENSE": player bp convert% minus OPPONENT bp convert% over the previous 60 matches
# (surface-specific; non-decay weighted), winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_bp_convert%_l60_diff_x"] = df_match3["p_bp_convert%_l60_x"].sub(df_match3["p_bp_convert%_l60_y"])
df_match3["p_bp_convert%_l60_diff_y"] = df_match3["p_bp_convert%_l60_diff_x"].mul(-1)

In [96]:
# "OFFENSE VS OFFENSE": player bp convert% minus OPPONENT bp convert% over the previous 60 matches
# (surface-specific; non-decay weighted), winner (_x) minus loser (_y), plus the mirrored value for the loser.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_bp_convert%_l60_diff_x"] = df_match3["p_SOS_adj_bp_convert%_l60_x"].sub(df_match3["p_SOS_adj_bp_convert%_l60_y"])
df_match3["p_SOS_adj_bp_convert%_l60_diff_y"] = df_match3["p_SOS_adj_bp_convert%_l60_diff_x"].mul(-1)

In [97]:
# "OFFENSE VS DEFENSE": player bp convert% minus OPPONENT bp save% over the previous 60 matches
# (surface-specific; non-decay weighted). _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_bp_convert%_opp_bp_save%_l60_diff_x"] = df_match3["p_bp_convert%_l60_x"].sub(df_match3["p_bp_save%_l60_y"])
df_match3["p_bp_convert%_opp_bp_save%_l60_diff_y"] = df_match3["p_bp_convert%_l60_y"].sub(df_match3["p_bp_save%_l60_x"])

In [98]:
# "OFFENSE VS DEFENSE": player bp convert% minus OPPONENT bp save% over the previous 60 matches
# (surface-specific; non-decay weighted). _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version, so it must be built from the p_SOS_adj_* inputs.
# (Previously the non-adjusted columns were used here, which made this feature an exact duplicate of
# p_bp_convert%_opp_bp_save%_l60_diff.)

df_match3["p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff_x"] = df_match3["p_SOS_adj_bp_convert%_l60_x"] - df_match3["p_SOS_adj_bp_save%_l60_y"]
df_match3["p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff_y"] = df_match3["p_SOS_adj_bp_convert%_l60_y"] - df_match3["p_SOS_adj_bp_save%_l60_x"]

In [99]:
# "DEFENSE VS OFFENSE": player bp save% minus OPPONENT bp convert% over the previous 60 matches
# (surface-specific; non-decay weighted). _x is from the winner's side, _y from the loser's side.
# This is the NON-Strength of Schedule Adjusted version.

df_match3["p_bp_save%_opp_bp_convert%_l60_diff_x"] = df_match3["p_bp_save%_l60_x"].sub(df_match3["p_bp_convert%_l60_y"])
df_match3["p_bp_save%_opp_bp_convert%_l60_diff_y"] = df_match3["p_bp_save%_l60_y"].sub(df_match3["p_bp_convert%_l60_x"])

In [100]:
# "DEFENSE VS OFFENSE": player bp save% minus OPPONENT bp convert% over the previous 60 matches
# (surface-specific; non-decay weighted). _x is from the winner's side, _y from the loser's side.
# This is the Strength of Schedule Adjusted version.

df_match3["p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff_x"] = df_match3["p_SOS_adj_bp_save%_l60_x"].sub(df_match3["p_SOS_adj_bp_convert%_l60_y"])
df_match3["p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff_y"] = df_match3["p_SOS_adj_bp_save%_l60_y"].sub(df_match3["p_SOS_adj_bp_convert%_l60_x"])

#### Retrospective Stamina and Fatigue Player Performance Differential Predictive Features By Match

In [101]:
# Difference in decay-adjusted minutes played over up to the last 5 (within-tournament) matches:
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_tot_time_l5_decay_diff_x"] = df_match3["p_tot_time_l5_decay_x"].sub(df_match3["p_tot_time_l5_decay_y"])
df_match3["p_tot_time_l5_decay_diff_y"] = df_match3["p_tot_time_l5_decay_diff_x"].mul(-1)

In [102]:
# Difference in NON-decay-adjusted minutes played over up to the last 5 (within-tournament) matches:
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_tot_time_l5_diff_x"] = df_match3["p_tot_time_l5_x"].sub(df_match3["p_tot_time_l5_y"])
df_match3["p_tot_time_l5_diff_y"] = df_match3["p_tot_time_l5_diff_x"].mul(-1)

In [103]:
# Difference in decay-adjusted total points played over up to the last 5 (within-tournament) matches:
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_tot_pts_l5_decay_diff_x"] = df_match3["p_tot_pts_l5_decay_x"].sub(df_match3["p_tot_pts_l5_decay_y"])
df_match3["p_tot_pts_l5_decay_diff_y"] = df_match3["p_tot_pts_l5_decay_diff_x"].mul(-1)

In [104]:
# Difference in NON-decay-adjusted total points played over up to the last 5 (within-tournament) matches:
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_tot_pts_l5_diff_x"] = df_match3["p_tot_pts_l5_x"].sub(df_match3["p_tot_pts_l5_y"])
df_match3["p_tot_pts_l5_diff_y"] = df_match3["p_tot_pts_l5_diff_x"].mul(-1)

In [105]:
# Difference in total matches played in the entire sample (non-surface specific):
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_matches_diff_x"] = df_match3["p_matches_x"].sub(df_match3["p_matches_y"])
df_match3["p_matches_diff_y"] = df_match3["p_matches_diff_x"].mul(-1)

In [106]:
# Difference in stamina-adjusted fatigue (decay-weighted total time played last 5 component):
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_stamina_adj_fatigue_decay_diff_x"] = df_match3["p_stamina_adj_fatigue_decay_x"].sub(df_match3["p_stamina_adj_fatigue_decay_y"])
df_match3["p_stamina_adj_fatigue_decay_diff_y"] = df_match3["p_stamina_adj_fatigue_decay_diff_x"].mul(-1)

In [107]:
# Difference in stamina-adjusted fatigue (NON-decay-weighted total time played last 5 component):
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_stamina_adj_fatigue_diff_x"] = df_match3["p_stamina_adj_fatigue_x"].sub(df_match3["p_stamina_adj_fatigue_y"])
df_match3["p_stamina_adj_fatigue_diff_y"] = df_match3["p_stamina_adj_fatigue_diff_x"].mul(-1)

In [108]:
# Past head-to-head differential (surface-specific, but with no time constraints):
# winner (_x) minus loser (_y), plus the mirrored value for the loser.

df_match3["p_H2H_diff_x"] = df_match3["p_H2H_w_x"].sub(df_match3["p_H2H_w_y"])
df_match3["p_H2H_diff_y"] = df_match3["p_H2H_diff_x"].mul(-1)

Now back to by-player organization one final time. A few additional features will be computed in relation to court speed prediction, and then the data will be prepped for the next stage (exploratory data analysis).

In [121]:
# Persist the by-match frame (player columns plus all differential features) without the index column
df_match3.to_csv(path_or_buf='../data/df_match3.csv', index=False)

In [128]:
# Build the winner-side by-player frame from the by-match frame.
# Step 1: drop both merged outcome flags and every loser (_y) column (raw features and loser-perspective differentials),
# leaving the shared tournament/match columns plus the winner (_x) columns.
df_winners4 = df_match3.drop(["m_outcome_x", "m_outcome_y", "p_id_y", "p_name_y", "p_H2H_w_y", "p_rank_y", "p_rank_pts_y", "p_country_y", "p_ent_y", "p_hd_y", "p_ht_y", "p_age_y", "p_matches_y", "p_pts_won%_y", "p_pts_won%_l60_decay_y", "p_SOS_adj_pts_won%_l60_decay_y", "p_sv_pts_won%_y", "p_sv_pts_won%_l60_decay_y", "p_SOS_adj_sv_pts_won%_l60_decay_y", "p_ret_pts_won%_y", "p_ret_pts_won%_l60_decay_y", "p_SOS_adj_ret_pts_won%_l60_decay_y", "p_ace%_y", "p_ace%_l60_decay_y", "p_SOS_adj_ace%_l60_decay_y", "p_aced%_y", "p_aced%_l60_decay_y", "p_SOS_adj_aced%_l60_decay_y", "p_bp_save%_y", "p_bp_save%_l60_y", "p_SOS_adj_bp_save%_l60_y", "p_bp_convert%_y", "p_bp_convert%_l60_y", "p_SOS_adj_bp_convert%_l60_y", "p_pts_won%_std_l60_y", "p_sv_pts_won%_std_l60_y", "p_ret_pts_won%_std_l60_y", "p_tot_time_l5_y", "p_tot_time_l5_decay_y", "p_stamina_adj_fatigue_decay_y", "p_stamina_adj_fatigue_y", "p_tot_pts_l5_y", "p_tot_pts_l5_decay_y", "p_rank_diff_y", "p_log_rank_y", "p_log_rank_diff_y", "p_rank_pts_diff_y", "p_ht_diff_y", "p_age_diff_y", "p_L_opp_R_y", "p_HCA_opp_N_y", "p_pts_won%_l60_decay_diff_y", "p_SOS_adj_pts_won%_l60_decay_diff_y", "p_sv_pts_won%_l60_decay_diff_y", "p_SOS_adj_sv_pts_won%_l60_decay_diff_y", "p_ret_pts_won%_l60_decay_diff_y", "p_SOS_adj_ret_pts_won%_l60_decay_diff_y", "p_sv_opp_ret_pts_won%_l60_decay_diff_y", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff_y", "p_ret_opp_sv_pts_won%_l60_decay_diff_y", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff_y", "p_ace%_l60_decay_diff_y", "p_SOS_adj_ace%_l60_decay_diff_y", "p_aced%_l60_decay_diff_y", "p_SOS_adj_aced%_l60_decay_diff_y", "p_ace%_opp_aced%_l60_decay_diff_y", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff_y", "p_aced%_opp_ace%_l60_decay_diff_y", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff_y", "p_bp_save%_l60_diff_y", "p_SOS_adj_bp_save%_l60_diff_y", "p_bp_convert%_l60_diff_y", "p_SOS_adj_bp_convert%_l60_diff_y", "p_bp_convert%_opp_bp_save%_l60_diff_y", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff_y", 
"p_bp_save%_opp_bp_convert%_l60_diff_y", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff_y", "p_tot_time_l5_decay_diff_y", "p_tot_time_l5_diff_y", "p_tot_pts_l5_decay_diff_y", "p_tot_pts_l5_diff_y", "p_matches_diff_y", "p_stamina_adj_fatigue_decay_diff_y", "p_stamina_adj_fatigue_diff_y", "p_H2H_diff_y"], axis=1)
# Step 2: every row in this frame is a winner, so re-create the outcome flag as the last column
df_winners4["m_outcome"] = 1

# Step 3: rename columns to remove the winner/loser (_x/_y) descriptions so winners and losers can be re-concatenated.
# NOTE: set_axis assigns the names POSITIONALLY - this list must match the remaining column order exactly
# (shared tournament/match columns, winner features, winner differentials in creation order, then m_outcome).
# Adding, removing or reordering an upstream feature requires updating this list in step.
df_winners4 = df_winners4.set_axis(["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "t_rd_num", "m_num", "m_best_of", "m_time(m)", "m_tot_pts", "p_id", "p_name", "p_H2H_w", "p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches", "p_pts_won%", "p_pts_won%_l60_decay", "p_SOS_adj_pts_won%_l60_decay", "p_sv_pts_won%", "p_sv_pts_won%_l60_decay", "p_SOS_adj_sv_pts_won%_l60_decay", "p_ret_pts_won%", "p_ret_pts_won%_l60_decay", "p_SOS_adj_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_SOS_adj_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_SOS_adj_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_SOS_adj_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_SOS_adj_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue", "p_tot_pts_l5", "p_tot_pts_l5_decay", "p_opp_rank_diff", "p_log_rank", "p_opp_log_rank_diff", "p_opp_rank_pts_diff","p_opp_ht_diff","p_opp_age_diff","p_L_opp_R","p_HCA_opp_N", "p_pts_won%_l60_decay_diff", "p_SOS_adj_pts_won%_l60_decay_diff", "p_sv_pts_won%_l60_decay_diff", "p_SOS_adj_sv_pts_won%_l60_decay_diff", "p_ret_pts_won%_l60_decay_diff", "p_SOS_adj_ret_pts_won%_l60_decay_diff", "p_sv_opp_ret_pts_won%_l60_decay_diff", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff", "p_ret_opp_sv_pts_won%_l60_decay_diff", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff", "p_ace%_l60_decay_diff", "p_SOS_adj_ace%_l60_decay_diff", "p_aced%_l60_decay_diff", "p_SOS_adj_aced%_l60_decay_diff", "p_ace%_opp_aced%_l60_decay_diff", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff", "p_aced%_opp_ace%_l60_decay_diff", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff", "p_bp_save%_l60_diff", "p_SOS_adj_bp_save%_l60_diff", "p_bp_convert%_l60_diff", "p_SOS_adj_bp_convert%_l60_diff", "p_bp_convert%_opp_bp_save%_l60_diff", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff", 
"p_bp_save%_opp_bp_convert%_l60_diff", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff", "p_tot_time_l5_decay_diff", "p_tot_time_l5_diff", "p_tot_pts_l5_decay_diff", "p_tot_pts_l5_diff", "p_matches_diff", "p_stam_adj_fatigue_decay_diff", "p_stam_adj_fatigue_diff", "p_H2H_diff", "m_outcome"], axis=1)

#Dropping winner (_x) columns for remerge by player
df_losers4 = df_match3.drop(["m_outcome_x", "m_outcome_y", "p_id_x", "p_name_x", "p_H2H_w_x", "p_rank_x", "p_rank_pts_x", "p_country_x", "p_ent_x", "p_hd_x", "p_ht_x", "p_age_x", "p_matches_x", "p_pts_won%_x", "p_pts_won%_l60_decay_x", "p_SOS_adj_pts_won%_l60_decay_x", "p_sv_pts_won%_x", "p_sv_pts_won%_l60_decay_x", "p_SOS_adj_sv_pts_won%_l60_decay_x", "p_ret_pts_won%_x", "p_ret_pts_won%_l60_decay_x", "p_SOS_adj_ret_pts_won%_l60_decay_x", "p_ace%_x", "p_ace%_l60_decay_x", "p_SOS_adj_ace%_l60_decay_x", "p_aced%_x", "p_aced%_l60_decay_x", "p_SOS_adj_aced%_l60_decay_x", "p_bp_save%_x", "p_bp_save%_l60_x", "p_SOS_adj_bp_save%_l60_x", "p_bp_convert%_x", "p_bp_convert%_l60_x", "p_SOS_adj_bp_convert%_l60_x", "p_pts_won%_std_l60_x", "p_sv_pts_won%_std_l60_x", "p_ret_pts_won%_std_l60_x", "p_tot_time_l5_x", "p_tot_time_l5_decay_x", "p_stamina_adj_fatigue_decay_x", "p_stamina_adj_fatigue_x", "p_tot_pts_l5_x", "p_tot_pts_l5_decay_x", "p_rank_diff_x", "p_log_rank_x", "p_log_rank_diff_x", "p_rank_pts_diff_x", "p_ht_diff_x", "p_age_diff_x", "p_L_opp_R_x", "p_HCA_opp_N_x", "p_pts_won%_l60_decay_diff_x", "p_SOS_adj_pts_won%_l60_decay_diff_x", "p_sv_pts_won%_l60_decay_diff_x", "p_SOS_adj_sv_pts_won%_l60_decay_diff_x", "p_ret_pts_won%_l60_decay_diff_x", "p_SOS_adj_ret_pts_won%_l60_decay_diff_x", "p_sv_opp_ret_pts_won%_l60_decay_diff_x", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff_x", "p_ret_opp_sv_pts_won%_l60_decay_diff_x", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff_x", "p_ace%_l60_decay_diff_x", "p_SOS_adj_ace%_l60_decay_diff_x", "p_aced%_l60_decay_diff_x", "p_SOS_adj_aced%_l60_decay_diff_x", "p_ace%_opp_aced%_l60_decay_diff_x", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff_x", "p_aced%_opp_ace%_l60_decay_diff_x", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff_x", "p_bp_save%_l60_diff_x", "p_SOS_adj_bp_save%_l60_diff_x", "p_bp_convert%_l60_diff_x", "p_SOS_adj_bp_convert%_l60_diff_x", "p_bp_convert%_opp_bp_save%_l60_diff_x", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff_x", 
"p_bp_save%_opp_bp_convert%_l60_diff_x", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff_x", "p_tot_time_l5_decay_diff_x", "p_tot_time_l5_diff_x", "p_tot_pts_l5_decay_diff_x", "p_tot_pts_l5_diff_x", "p_matches_diff_x", "p_stamina_adj_fatigue_decay_diff_x", "p_stamina_adj_fatigue_diff_x", "p_H2H_diff_x"], axis=1)
df_losers4["m_outcome"] = 0

#Renaming columns to remove winner-loser descriptions so we can re-concatenate winners and losers
df_losers4 = df_losers4.set_axis(["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "t_rd_num", "m_num", "m_best_of", "m_time(m)", "m_tot_pts", "p_id", "p_name", "p_H2H_w", "p_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches", "p_pts_won%", "p_pts_won%_l60_decay", "p_SOS_adj_pts_won%_l60_decay", "p_sv_pts_won%", "p_sv_pts_won%_l60_decay", "p_SOS_adj_sv_pts_won%_l60_decay", "p_ret_pts_won%", "p_ret_pts_won%_l60_decay", "p_SOS_adj_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_SOS_adj_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_SOS_adj_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_SOS_adj_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_SOS_adj_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue_decay", "p_stamina_adj_fatigue", "p_tot_pts_l5", "p_tot_pts_l5_decay", "p_opp_rank_diff", "p_log_rank", "p_opp_log_rank_diff", "p_opp_rank_pts_diff","p_opp_ht_diff","p_opp_age_diff","p_L_opp_R","p_HCA_opp_N", "p_pts_won%_l60_decay_diff", "p_SOS_adj_pts_won%_l60_decay_diff", "p_sv_pts_won%_l60_decay_diff", "p_SOS_adj_sv_pts_won%_l60_decay_diff", "p_ret_pts_won%_l60_decay_diff", "p_SOS_adj_ret_pts_won%_l60_decay_diff", "p_sv_opp_ret_pts_won%_l60_decay_diff", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff", "p_ret_opp_sv_pts_won%_l60_decay_diff", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff", "p_ace%_l60_decay_diff", "p_SOS_adj_ace%_l60_decay_diff", "p_aced%_l60_decay_diff", "p_SOS_adj_aced%_l60_decay_diff", "p_ace%_opp_aced%_l60_decay_diff", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff", "p_aced%_opp_ace%_l60_decay_diff", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff", "p_bp_save%_l60_diff", "p_SOS_adj_bp_save%_l60_diff", "p_bp_convert%_l60_diff", "p_SOS_adj_bp_convert%_l60_diff", "p_bp_convert%_opp_bp_save%_l60_diff", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff", 
"p_bp_save%_opp_bp_convert%_l60_diff", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff", "p_tot_time_l5_decay_diff", "p_tot_time_l5_diff", "p_tot_pts_l5_decay_diff", "p_tot_pts_l5_diff", "p_matches_diff", "p_stam_adj_fatigue_decay_diff", "p_stam_adj_fatigue_diff", "p_H2H_diff", "m_outcome"], axis=1)

# Stack the winner rows and the loser rows into one long per-player frame. Both inputs were
# relabelled above with the same generic column names, so they line up column-for-column.
# ignore_index=True discards the two original row indexes and renumbers the rows from 0.
df_player4 = pd.concat([df_winners4, df_losers4], ignore_index=True)
#df_player4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36538 entries, 0 to 36537
Data columns (total 97 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   t_id                                          36538 non-null  object 
 1   t_date                                        36538 non-null  int64  
 2   tour_wk                                       36538 non-null  object 
 3   t_name                                        36538 non-null  object 
 4   t_country                                     36538 non-null  object 
 5   t_surf                                        36538 non-null  object 
 6   t_lvl                                         36538 non-null  int64  
 7   t_draw_size                                   36538 non-null  int64  
 8   t_rd_num                                      36488 non-null  float64
 9   m_num                                         36538 non-null 

In [129]:
# Preview the first 30 rows of the combined winner + loser per-player frame.
df_player4.head(30)

Unnamed: 0,t_id,t_date,tour_wk,t_name,t_country,t_surf,t_lvl,t_draw_size,t_rd_num,m_num,...,p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff,p_tot_time_l5_decay_diff,p_tot_time_l5_diff,p_tot_pts_l5_decay_diff,p_tot_pts_l5_diff,p_matches_diff,p_stam_adj_fatigue_decay_diff,p_stam_adj_fatigue_diff,p_H2H_diff,m_outcome
0,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,5.0,2781,...,20.07,16.8,11.0,44.8,49.0,-80.0,41.28,62.64,0.0,1
1,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,4.0,2783,...,23.62,29.2,28.0,41.0,44.0,-18.0,27.37,37.21,0.0,1
2,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2.0,2795,...,17.59,-15.0,-15.0,13.0,13.0,-5.0,1.65,5.77,0.0,1
3,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,1.0,2792,...,21.91,0.0,0.0,0.0,0.0,-106.0,36.6,60.99,1.0,1
4,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,1.0,2541,...,10.11,0.0,0.0,0.0,0.0,-287.0,49.66,82.78,0.0,1
5,2019-7485,20191014,2019_29,Antwerp,BEL,Hard,1,32,3.0,2407,...,7.0,-42.0,-48.0,-54.8,-63.0,-106.0,37.31,65.95,0.0,1
6,2019-7485,20191014,2019_29,Antwerp,BEL,Hard,1,32,2.0,2411,...,-14.35,9.0,9.0,11.0,11.0,-290.0,60.84,98.62,0.0,1
7,2019-7485,20191014,2019_29,Antwerp,BEL,Hard,1,32,1.0,2419,...,12.86,0.0,0.0,0.0,0.0,4.0,-18.73,-31.22,0.0,1
8,2019-0439,20190715,2019_18,Umag,CRO,Clay,1,32,1.0,1705,...,-14.1,0.0,0.0,0.0,0.0,-10.0,32.27,53.8,0.0,1
9,2019-M009,20190513,2019_15,Rome Masters,ITA,Clay,2,64,1.0,1139,...,-6.54,0.0,0.0,0.0,0.0,-242.0,114.06,190.1,0.0,1


In [130]:
# Build df_player5 by selecting columns of df_player4 in an explicit order: tournament/match
# metadata first, then player attributes, raw and rolling performance stats, and finally the
# player-vs-opponent differential features, with m_outcome last. Relative to df_player4, related
# columns are placed together (e.g. p_log_rank right after p_rank, and each non-decay differential
# ahead of its decay-weighted counterpart).
# NOTE(review): this looks like a pure re-ordering that keeps every df_player4 column - confirm
# that no column was dropped unintentionally.
df_player5 = df_player4[["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_lvl", "t_draw_size", "t_rd_num", "m_num", "m_best_of", "m_time(m)", "m_tot_pts", "p_id", "p_name", "p_rank", "p_log_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches", "p_H2H_w", "p_pts_won%", "p_pts_won%_l60_decay", "p_SOS_adj_pts_won%_l60_decay", "p_sv_pts_won%", "p_sv_pts_won%_l60_decay", "p_SOS_adj_sv_pts_won%_l60_decay", "p_ret_pts_won%", "p_ret_pts_won%_l60_decay", "p_SOS_adj_ret_pts_won%_l60_decay", "p_ace%", "p_ace%_l60_decay", "p_SOS_adj_ace%_l60_decay", "p_aced%", "p_aced%_l60_decay", "p_SOS_adj_aced%_l60_decay", "p_bp_save%", "p_bp_save%_l60", "p_SOS_adj_bp_save%_l60", "p_bp_convert%", "p_bp_convert%_l60", "p_SOS_adj_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue", "p_stamina_adj_fatigue_decay", "p_tot_pts_l5", "p_tot_pts_l5_decay", "p_opp_rank_diff", "p_opp_log_rank_diff", "p_opp_rank_pts_diff", "p_opp_ht_diff", "p_opp_age_diff", "p_L_opp_R", "p_HCA_opp_N", "p_pts_won%_l60_decay_diff", "p_SOS_adj_pts_won%_l60_decay_diff", "p_sv_pts_won%_l60_decay_diff", "p_SOS_adj_sv_pts_won%_l60_decay_diff", "p_ret_pts_won%_l60_decay_diff", "p_SOS_adj_ret_pts_won%_l60_decay_diff", "p_sv_opp_ret_pts_won%_l60_decay_diff", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff", "p_ret_opp_sv_pts_won%_l60_decay_diff", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff", "p_ace%_l60_decay_diff", "p_SOS_adj_ace%_l60_decay_diff", "p_aced%_l60_decay_diff", "p_SOS_adj_aced%_l60_decay_diff", "p_ace%_opp_aced%_l60_decay_diff", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff", "p_aced%_opp_ace%_l60_decay_diff", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff", "p_bp_save%_l60_diff", "p_SOS_adj_bp_save%_l60_diff", "p_bp_convert%_l60_diff", "p_SOS_adj_bp_convert%_l60_diff", "p_bp_convert%_opp_bp_save%_l60_diff", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff", 
"p_bp_save%_opp_bp_convert%_l60_diff", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff", "p_tot_time_l5_diff", "p_tot_time_l5_decay_diff", "p_tot_pts_l5_diff", "p_tot_pts_l5_decay_diff", "p_matches_diff", "p_stam_adj_fatigue_diff", "p_stam_adj_fatigue_decay_diff",  "p_H2H_diff", "m_outcome"]]

In [131]:
# Order the rows by player, then tour week, then round number - every key descending - so that
# each player's matches sit together with the latest ones on top. This layout makes it easier to
# check by eye the backward-looking stat accrual calculations made further down.
df_player5 = df_player5.sort_values(
    ['p_id', 'tour_wk', 't_rd_num'],
    ascending=[False, False, False],
)

In [132]:
# Preview the first 30 rows after sorting to spot-check the per-player ordering.
df_player5.head(30)

Unnamed: 0,t_id,t_date,tour_wk,t_name,t_country,t_surf,t_lvl,t_draw_size,t_rd_num,m_num,...,p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff,p_tot_time_l5_diff,p_tot_time_l5_decay_diff,p_tot_pts_l5_diff,p_tot_pts_l5_decay_diff,p_matches_diff,p_stam_adj_fatigue_diff,p_stam_adj_fatigue_decay_diff,p_H2H_diff,m_outcome
34850,2019-560,20190826,2019_24,US Open,USA,Hard,4,128,1.0,2059,...,,-0.0,-0.0,-0.0,-0.0,-225.0,250.75,150.45,-0.0,0
18294,2019-M014,20191014,2019_29,Moscow,RUS,Hard,1,32,1.0,2446,...,,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0
19382,2019-M004,20190225,2019_07,Acapulco,MEX,Hard,1,32,1.0,545,...,,-0.0,-0.0,-0.0,-0.0,-25.0,198.73,119.24,-0.0,0
0,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,5.0,2781,...,20.07,11.0,16.8,49.0,44.8,-80.0,62.64,41.28,0.0,1
1,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,4.0,2783,...,23.62,28.0,29.2,44.0,41.0,-18.0,37.21,27.37,0.0,1
18384,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,3.0,2794,...,25.72,-61.0,-58.0,-52.0,-54.6,-16.0,5.82,-2.9,-0.0,0
2,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,2.0,2795,...,17.59,-15.0,-15.0,13.0,13.0,-5.0,5.77,1.65,0.0,1
3,2019-7696,20191105,2019_33,NextGen Finals,ITA,Hard,3,8,1.0,2792,...,21.91,0.0,0.0,0.0,0.0,-106.0,60.99,36.6,1.0,1
28706,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,2.0,2528,...,16.15,-7.0,-7.0,-25.0,-25.0,-286.0,86.58,54.71,1.0,0
4,2019-0337,20191021,2019_30,Vienna,AUT,Hard,1,32,1.0,2541,...,10.11,0.0,0.0,0.0,0.0,-287.0,82.78,49.66,0.0,1


In [133]:
# Persist the sorted per-player frame; index=False keeps the row index out of the CSV.
df_player5.to_csv('../data/df_player5.csv', index=False)

Ideally, a model of court conditions would be generated as close to real time as possible before each match we want to predict. Once a sufficient number of matches have been played in a given tournament, priors on court speed can be updated as well. 

For now, we will use each tournament's ace% from the previous year (when available) as a proxy for court speed. Conditions, of course, are dictated by a number of factors, including the balls, altitude, watering frequency (clay) and sand incorporation in the mix for hard courts. Also, indoor conditions tend to be faster than outdoor. This is challenging to model because these variables are seldom all the same from year to year at a given venue. Plus, even the weather at the time of a match will make a considerable difference in court conditions, possibly even greater than the underlying "weather neutral" conditions. 

In [134]:
# Step 1 of the by-tournament ace%: keep only the columns the calculation needs -
# the player's name, the tournament name and id, and the player's ace% for the match.
t_ace_perc = df_player5.loc[:, ["p_name", "t_name", "t_id", "p_ace%"]]
t_ace_perc.head()

Unnamed: 0,p_name,t_name,t_id,p_ace%
34850,Zachary Svajda,US Open,2019-560,1.09
18294,Alibek Kachmazov,Moscow,2019-M014,4.82
19382,Emilio Nava,Acapulco,2019-M004,3.7
0,Jannik Sinner,NextGen Finals,2019-7696,3.85
1,Jannik Sinner,NextGen Finals,2019-7696,7.69


In [135]:
# Before averaging per tournament and year, drop every row belonging to the three most extreme
# ace outliers (apologies to Andy Roddick and Milos Raonic). Whether or not they are in the
# draw - especially when they go deep - noticeably shifts a single tournament's ace statistics.
# Rows are removed when p_name contains the given surname.
for outlier_surname in ("Karlovic", "Isner", "Opelka"):
    t_ace_perc = t_ace_perc[~t_ace_perc['p_name'].str.contains(outlier_surname)]
t_ace_perc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35812 entries, 34850 to 26087
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   p_name  35812 non-null  object 
 1   t_name  35812 non-null  object 
 2   t_id    35812 non-null  object 
 3   p_ace%  35812 non-null  float64
dtypes: float64(1), object(3)
memory usage: 1.4+ MB


In [136]:
# Mean ace% per tournament edition (one t_id / t_name pair per tournament per year), computed
# on the rows left after the extreme-server outliers were removed above.
# 'p_ace%' is selected explicitly before aggregating: the frame still carries the string column
# 'p_name', and pandas >= 2.0 raises a TypeError when asked to average a non-numeric column
# instead of silently dropping it. The double brackets keep the result a one-column DataFrame.
t_ace_perc = t_ace_perc.groupby(['t_id', 't_name'])[['p_ace%']].mean().round(2)
t_ace_perc.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,p_ace%
t_id,t_name,Unnamed: 2_level_1
2012-1536,Madrid Masters,8.01
2012-1720,Bangkok,5.26
2012-2276,Zagreb,8.24
2012-301,Auckland,6.55
2012-308,Munich,7.43
2012-314,Gstaad,6.85
2012-316,Bastad,4.55
2012-319,Kitzbuhel,5.28
2012-321,Stuttgart,5.58
2012-328,Basel,9.18


In [137]:
# Put the editions of each tournament name next to each other and, within a name, order them by
# t_id descending. The t_id values shown in this notebook start with the year, so this lists the
# most recent edition first; the prior-edition lookup in a later cell depends on this order.
t_ace_perc = t_ace_perc.sort_values(['t_name', 't_id'], ascending=[False, False])
t_ace_perc

Unnamed: 0_level_0,Unnamed: 1_level_0,p_ace%
t_id,t_name,Unnamed: 2_level_1
2019-9164,Zhuhai,6.46
2015-2276,Zagreb,7.94
2014-2276,Zagreb,8.76
2013-2276,Zagreb,8.28
2012-2276,Zagreb,8.24
...,...,...
2016-M004,Acapulco,8.10
2015-807,Acapulco,5.89
2014-807,Acapulco,7.50
2013-807,Acapulco,4.70


In [138]:
# The column now holds a tournament-level mean rather than a per-player value, so relabel it.
t_ace_perc = t_ace_perc.rename(columns={'p_ace%': 't_ace%'})

In [139]:
# For every tournament edition, look up the ace% of the same tournament's preceding edition
# (where one exists) to serve as the court-speed proxy.
# Rows were sorted newest-first within each t_name in the preceding cell, so shift(-1) pulls the
# value from the next row down in the group, i.e. the next-older edition; the oldest edition of
# each tournament gets NaN.
# NOTE(review): "next-older edition" is the previous calendar year only when the tournament
# appears in consecutive years - confirm this is acceptable for tournaments with gaps.
t_ace_perc["t_ace%_last"] = (
    t_ace_perc
    .groupby(level='t_name')['t_ace%']
    .shift(-1)
)
t_ace_perc

Unnamed: 0_level_0,Unnamed: 1_level_0,t_ace%,t_ace%_last
t_id,t_name,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-9164,Zhuhai,6.46,
2015-2276,Zagreb,7.94,8.76
2014-2276,Zagreb,8.76,8.28
2013-2276,Zagreb,8.28,8.24
2012-2276,Zagreb,8.24,
...,...,...,...
2016-M004,Acapulco,8.10,5.89
2015-807,Acapulco,5.89,7.50
2014-807,Acapulco,7.50,4.70
2013-807,Acapulco,4.70,4.66


In [140]:
# Attach the prior-edition ace% to every player/match row with a left join on t_id.
# The tournament table is first flattened to plain (t_id, t_ace%_last) columns so the join key
# is an ordinary column on both sides; how='left' keeps every row of df_player5 and leaves NaN
# where a tournament has no earlier edition in the sample.
df_player6 = df_player5.merge(
    t_ace_perc.reset_index()[['t_id', 't_ace%_last']],
    on='t_id',
    how='left',
)

For tournaments without a prior year to assess conditions from (mostly tournaments from the first year of the sample, 2012, that we won't actually make predictions on), we will just use the overall sample mean for the tournament's surface (hard or clay).

In [141]:
# Working copy for the per-surface ace% fallback: the same columns as the by-tournament
# calculation plus the surface each match was played on.
surface_ace_perc_means = df_player5.loc[:, ["p_name", "t_name", "t_id", "t_surf", "p_ace%"]]
surface_ace_perc_means.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36538 entries, 34850 to 26087
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   p_name  36538 non-null  object 
 1   t_name  36538 non-null  object 
 2   t_id    36538 non-null  object 
 3   t_surf  36538 non-null  object 
 4   p_ace%  36538 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.7+ MB


In [142]:
# Same treatment as for the by-tournament means: drop the rows of the three extreme ace
# outliers before computing the per-surface averages used to fill in the NaNs.
# Rows are removed when p_name contains the given surname.
for outlier_surname in ("Karlovic", "Isner", "Opelka"):
    surface_ace_perc_means = surface_ace_perc_means[
        ~surface_ace_perc_means['p_name'].str.contains(outlier_surname)
    ]
surface_ace_perc_means.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35812 entries, 34850 to 26087
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   p_name  35812 non-null  object 
 1   t_name  35812 non-null  object 
 2   t_id    35812 non-null  object 
 3   t_surf  35812 non-null  object 
 4   p_ace%  35812 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.6+ MB


In [143]:
# Mean ace% over all remaining matches on each surface in the sample (clay or hard court).
# These per-surface means are the fallback used to fill missing prior-edition values.
# 'p_ace%' is selected explicitly before aggregating: the frame still holds the string columns
# p_name / t_name / t_id, and pandas >= 2.0 raises a TypeError when asked to average them
# instead of silently dropping them. The column is then relabelled to the tournament-level
# name 't_ace%' (rebinding the result rather than renaming in place).
surface_ace_perc_means = (
    surface_ace_perc_means
    .groupby('t_surf')[['p_ace%']]
    .mean()
    .round(2)
    .rename(columns={'p_ace%': 't_ace%'})
)
surface_ace_perc_means

Unnamed: 0_level_0,t_ace%
t_surf,Unnamed: 1_level_1
Clay,5.46
Hard,8.46


In [144]:
# Show the (clay, hard) fallback means. They are looked up by surface label rather than by
# position: integer keys on a string-labelled Series rely on deprecated positional fallback
# and on the groups happening to sort as Clay, Hard.
surface_ace_perc_means.loc["Clay", "t_ace%"], surface_ace_perc_means.loc["Hard", "t_ace%"]

(5.46, 8.46)

In [145]:
# Where a match has no prior-edition ace% (NaN after the left join), fall back to the sample
# mean for the surface the match was played on.
# Each row's t_surf label is mapped to its surface mean and only the missing values are filled.
# Looking the means up by label avoids positional [0]/[1] indexing (which depends on the group
# order being Clay, Hard) and covers every surface present in surface_ace_perc_means.
# This must run while t_surf still holds the surface names, i.e. before the numeric encoding.
df_player6["t_ace%_last"] = df_player6["t_ace%_last"].fillna(
    df_player6["t_surf"].map(surface_ace_perc_means["t_ace%"])
)

In [146]:
# Numerically encode the surface from here on: Hard Court -> 2, Clay Court -> 1.
# (Only the surface is converted in this cell; handedness is not touched here.)
# Any value other than "Hard" / "Clay" is left as-is by replace(), and pd.to_numeric then
# converts the column to a numeric dtype.
df_player6["t_surf"] = pd.to_numeric(
    df_player6["t_surf"].replace({"Hard": 2, "Clay": 1})
)

In [148]:
# Remove the player's ace% for the match itself so it cannot accidentally be included in
# predictions.
df_player6 = df_player6.drop(columns=["p_ace%"])

In [149]:
# Inspect columns, dtypes and non-null counts after the merge, fill, encoding and drop steps.
df_player6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36538 entries, 0 to 36537
Data columns (total 97 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   t_id                                          36538 non-null  object 
 1   t_date                                        36538 non-null  int64  
 2   tour_wk                                       36538 non-null  object 
 3   t_name                                        36538 non-null  object 
 4   t_country                                     36538 non-null  object 
 5   t_surf                                        36538 non-null  int64  
 6   t_lvl                                         36538 non-null  int64  
 7   t_draw_size                                   36538 non-null  int64  
 8   t_rd_num                                      36488 non-null  float64
 9   m_num                                         36538 non-null 

In [150]:
# Creates a dataframe for the EDA stage containing tournament/match "metadata" (including the
# t_ace%_last court-speed proxy), the player's id and name, and the player-vs-opponent
# differential predictive features.
# NOTE(review): no target column (e.g. 'p_pts_won%' or 'm_outcome') is part of this selection -
# confirm whether the EDA stage expects the target in this frame.
df_player_diffs = df_player6[["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_ace%_last", "t_lvl", "t_draw_size", "t_rd_num", "m_num", "m_best_of", "m_time(m)", "m_tot_pts", "p_id", "p_name", "p_opp_rank_diff", "p_opp_log_rank_diff", "p_opp_rank_pts_diff", "p_opp_ht_diff", "p_opp_age_diff", "p_L_opp_R", "p_HCA_opp_N", "p_pts_won%_l60_decay_diff", "p_SOS_adj_pts_won%_l60_decay_diff", "p_sv_pts_won%_l60_decay_diff", "p_SOS_adj_sv_pts_won%_l60_decay_diff", "p_ret_pts_won%_l60_decay_diff", "p_SOS_adj_ret_pts_won%_l60_decay_diff", "p_sv_opp_ret_pts_won%_l60_decay_diff", "p_SOS_adj_sv_opp_ret_pts_won%_l60_decay_diff", "p_ret_opp_sv_pts_won%_l60_decay_diff", "p_SOS_adj_ret_opp_sv_pts_won%_l60_decay_diff", "p_ace%_l60_decay_diff", "p_SOS_adj_ace%_l60_decay_diff", "p_aced%_l60_decay_diff", "p_SOS_adj_aced%_l60_decay_diff", "p_ace%_opp_aced%_l60_decay_diff", "p_SOS_adj_ace%_opp_aced%_l60_decay_diff", "p_aced%_opp_ace%_l60_decay_diff", "p_SOS_adj_aced%_opp_ace%_l60_decay_diff", "p_bp_save%_l60_diff", "p_SOS_adj_bp_save%_l60_diff", "p_bp_convert%_l60_diff", "p_SOS_adj_bp_convert%_l60_diff", "p_bp_convert%_opp_bp_save%_l60_diff", "p_SOS_adj_bp_convert%_opp_bp_save%_l60_diff", "p_bp_save%_opp_bp_convert%_l60_diff", "p_SOS_adj_bp_save%_opp_bp_convert%_l60_diff", "p_tot_time_l5_diff", "p_tot_time_l5_decay_diff", "p_tot_pts_l5_diff", "p_tot_pts_l5_decay_diff", "p_matches_diff", "p_stam_adj_fatigue_diff", "p_stam_adj_fatigue_decay_diff",  "p_H2H_diff"]]

In [151]:
# Saves the player differential features (plus metadata) for the EDA stage.
# index=False keeps the row index out of the CSV.
df_player_diffs.to_csv('../data/df_player_diffs.csv', index=False)

In [152]:
# Creates a dataframe for the EDA stage containing tournament/match "metadata" (including the
# t_ace%_last court-speed proxy), player attributes, the per-match 'p_pts_won%' column (the
# target feature, % pts won), and the player's own raw (non-differential) predictive features:
# rolling / decay-weighted performance stats, their SOS-adjusted variants, the rolling standard
# deviations, and the recent time / points / fatigue load columns.
df_player_raw = df_player6[["t_id", "t_date", "tour_wk", "t_name", "t_country", "t_surf", "t_ace%_last", "t_lvl", "t_draw_size", "t_rd_num", "m_num", "m_best_of", "m_time(m)", "m_tot_pts", "p_id", "p_name", "p_rank", "p_log_rank", "p_rank_pts", "p_country", "p_ent", "p_hd", "p_ht", "p_age", "p_matches", "p_H2H_w", "p_pts_won%", "p_pts_won%_l60_decay", "p_SOS_adj_pts_won%_l60_decay", "p_sv_pts_won%_l60_decay", "p_SOS_adj_sv_pts_won%_l60_decay", "p_ret_pts_won%_l60_decay", "p_SOS_adj_ret_pts_won%_l60_decay", "p_ace%_l60_decay", "p_SOS_adj_ace%_l60_decay", "p_aced%_l60_decay", "p_SOS_adj_aced%_l60_decay", "p_bp_save%_l60", "p_SOS_adj_bp_save%_l60", "p_bp_convert%_l60", "p_SOS_adj_bp_convert%_l60", "p_pts_won%_std_l60",'p_sv_pts_won%_std_l60','p_ret_pts_won%_std_l60', "p_tot_time_l5", "p_tot_time_l5_decay", "p_stamina_adj_fatigue", "p_stamina_adj_fatigue_decay", "p_tot_pts_l5", "p_tot_pts_l5_decay"]]

In [153]:
# Saves player raw features (plus metadata and target feature) for EDA stage.
# The frame written must be df_player_raw (not df_player_diffs) so that df_player_raw.csv
# actually contains the raw features rather than a second copy of the differential features.
# index=False keeps the row index out of the CSV.
df_player_raw.to_csv('../data/df_player_raw.csv', index=False)