In [1]:
import os
import pandas as pd
import numpy as np
import dill

from elo_funcs import elo_predict, fiveodds, player_update_elo, process_elo

from sklearn.metrics import brier_score_loss, accuracy_score, roc_auc_score

In [2]:
pd.options.display.max_columns =100

In [3]:
round_dict = {
    'RR' : 0,
    'ER' : 0,
    'BR' : 0,
    'R128' : 1,
    'R64' : 2,
    'R32' : 3,
    'R16' : 4,
    'QF' : 5,
    'Q4' : 5,
    'SF' : 6,
    'F' : 7
}

In [4]:
folder = "C:/Users/jyoung/Projects/tennis_project/tennis_data/tennis_atp-master"

df = pd.DataFrame()

for file in os.listdir(f'{folder}/'):

  if '20' in file and 'doubles' not in file:
    df_ = pd.read_csv(f'{folder}/{file}')
    df = pd.concat([df, df_])

print(df.shape)
df['round'] = df['round'].map(round_dict)
df['total_score'] = [sum([int(y) for y in x if y.isdigit()]) if type(
    x) == str else -1 for x in df.score]

df = df[df['winner_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[df['loser_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[~df['winner_name'].str.contains('Unknown')].copy()
df = df[~df['loser_name'].str.contains('Unknown')].copy()
df = df[df['winner_name'] != df['loser_name']].copy()

df.sort_values(by=['tourney_date', 'tourney_id', 'round'],
               ascending=[True, True, True],
               inplace=True)

# # convert the tourney date column to datetime
df['tourney_date'] = pd.to_datetime(
    [str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:8] for x in df.tourney_date]          
    )

df['w_2ndsvOpps'] = df['w_svpt'].sub(df['w_1stIn'])
df['l_2ndsvOpps'] = df['l_svpt'].sub(df['l_1stIn'])

df['w_1stReturnOpps'] = df['l_1stIn']
df['w_2ndReturnOpps'] = df['l_svpt'].sub(df['l_1stIn'])

df['w_1stReturnPts'] = df['l_1stIn'] - df['l_1stWon']
df['w_2ndReturnPts'] = df['l_2ndsvOpps'] - (df['l_2ndWon'] + df['l_df'])

df['l_1stReturnOpps'] = df['w_1stIn']
df['l_2ndReturnOpps'] = df['w_svpt'].sub(df['w_1stIn'])

df['l_1stReturnPts'] = df['w_1stIn'] - df['w_1stWon']
df['l_2ndReturnPts'] = df['w_2ndsvOpps'] - (df['w_2ndWon'] + df['w_df'])

df['w_bpOpps'] = df['l_bpFaced']
df['w_bpWon'] = df['l_bpFaced'] - df['l_bpSaved']

df['l_bpOpps'] = df['w_bpFaced']
df['l_bpWon'] = df['w_bpFaced'] - df['w_bpSaved']


print('shape before dropping match dupes:', df.shape)

df.drop_duplicates(subset=['winner_name', 'loser_name', 'tourney_id'],
                   inplace=True)

print('shape after dropping match dupes:', df.shape)

df.reset_index(drop=True, inplace=True)

(722957, 53)
shape before dropping match dupes: (572975, 68)
shape after dropping match dupes: (572775, 68)


In [5]:
# creating separate winner and loser columns before stacking them to compute rolling
# statistics for each player

winner_df = df[['winner_name', 'tourney_date', 'tourney_id', 'round', 'total_score',
               'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
               'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'w_2ndsvOpps', 'w_1stReturnOpps',
               'w_1stReturnPts', 'w_2ndReturnOpps', 'w_2ndReturnPts', 'w_bpOpps', 'w_bpWon']].copy()

loser_df = df[['loser_name', 'tourney_date', 'tourney_id', 'round', 'total_score',
              'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
              'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'l_2ndsvOpps', 'l_1stReturnOpps',
              'l_1stReturnPts', 'l_2ndReturnOpps', 'l_2ndReturnPts', 'l_bpOpps', 'l_bpWon']].copy()

# renaming the columns so they're identical
winner_df.columns = [x.split('_')[1] if '_' in x else 'round' for x in winner_df.columns]
loser_df.columns = [x.split('_')[1] if '_' in x else 'round' for x in loser_df.columns]

# concatenating the dfs and renaming/sorting columns
combined_df = pd.concat([winner_df, loser_df], axis=0)

combined_df.rename(columns={'date' : 'tourney_date', 'id' :'tourney_id'}, inplace=True)

combined_df.sort_values(by=['tourney_date', 'tourney_id', 'round'],
                        ascending=[True, True, True],
                        inplace=True)

combined_df.reset_index(drop=True, inplace=True)

# calculating total matches played
combined_df['matches_played'] = combined_df.groupby('name').cumcount() + 1

# calculating rolling statistics for each player
combined_df['rolling_svpts'] = combined_df.groupby('name')['svpt'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_1stIn'] = combined_df.groupby('name')['1stIn'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_1stIn_perc'] = combined_df['rolling_1stIn'].div(combined_df['rolling_svpts'])

combined_df['rolling_1stWon'] = combined_df.groupby('name')['1stWon'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_1stWon_perc'] = combined_df['rolling_1stWon'].div(combined_df['rolling_1stIn'])

combined_df['rolling_2ndsvOpps'] = combined_df.groupby('name')['2ndsvOpps'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_2ndWon'] = combined_df.groupby('name')['2ndWon'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_2ndWon_perc'] = combined_df['rolling_2ndWon'].div(combined_df['rolling_2ndsvOpps'])

combined_df['rolling_aces'] = combined_df.groupby('name')['ace'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_aces_perc'] = combined_df['rolling_aces'].div(combined_df['rolling_svpts'])

combined_df['rolling_dfs'] = combined_df.groupby('name')['df'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_dfs_perc'] = combined_df['rolling_dfs'].div(combined_df['rolling_svpts'])

combined_df['rolling_1stRetOpps'] = combined_df.groupby('name')['1stReturnOpps'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_1stRetPts'] = combined_df.groupby('name')['1stReturnPts'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_1stRet_perc'] = combined_df['rolling_1stRetPts'].div(combined_df['rolling_1stRetOpps'])

combined_df['rolling_2ndRetOpps'] = combined_df.groupby('name')['2ndReturnOpps'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_2ndRetPts'] = combined_df.groupby('name')['2ndReturnPts'].transform(lambda x: x.shift().rolling(50).sum())
combined_df['rolling_2ndRet_perc'] = combined_df['rolling_2ndRetPts'].div(combined_df['rolling_2ndRetOpps'])

# creating a dataframe with the dates of each tourney played
tourney_dates = (
    combined_df
    .groupby(['name', 'tourney_id'])['tourney_date']
    .last()
    .to_frame()
    .reset_index()
    .sort_values(by='tourney_date')
)

# calculating each player's days of rest, or days since last tournament
tourney_dates['days_rest'] = (
    tourney_dates
    .groupby('name')['tourney_date']
    .transform(lambda x: (x - x.shift()).dt.days.fillna(-1000))
)

# adding the days of rest back to the main df
combined_df = (
    combined_df
    .merge(tourney_dates,
           how='left',
           left_on=['name', 'tourney_id', 'tourney_date'],
           right_on=['name', 'tourney_id', 'tourney_date'])
)

# dropping columns (may be unnecesary, check this)
combined_df = (
    combined_df
    .drop(
        columns=[x for x in combined_df.columns if 'w_' in x or 'l_' in x])
    .dropna(subset='ace')
)

In [6]:
df = (
    df.merge(
        combined_df,
        how='left',
        left_on=['tourney_date', 'tourney_id', 'round', 'winner_name'],
        right_on=['tourney_date', 'tourney_id', 'round', 'name'])
    .rename(columns={'matches_played' : 'winner_mp', 'days_rest' : 'winner_days_rest'})
    .merge(
        combined_df,
        how='left',
        left_on=['tourney_date', 'tourney_id', 'round', 'loser_name'],
        right_on=['tourney_date', 'tourney_id', 'round', 'name'])
    .rename(columns={'matches_played' : 'loser_mp', 'days_rest' : 'loser_days_rest'})
    )

print("shape before dropping dupes:", df.shape)

df.drop_duplicates(subset=['winner_name', 'loser_name', 'tourney_id'],
                   keep='first',
                   inplace=True)

print("shape after dropping dupes:", df.shape)

df.reset_index(drop=True, inplace=True)

df.columns = [x.replace('_y', '_loser') for x in df.columns]
df.columns = [x.replace('_x', '_winner') for x in df.columns]

shape before dropping dupes: (657058, 144)
shape after dropping dupes: (572775, 144)


In [7]:
# creating dataframes for each surface

hard_df = df[df['surface'] == 'Hard'].copy()
clay_df = df[df['surface'] == 'Clay'].copy()
grass_df = df[df['surface'] == 'Grass'].copy()
carpet_df = df[df['surface'] == 'Carpet'].copy()

# calculating surface elo and adding surface elo win probabilities to the df
hard_dict, hard_df = process_elo(hard_df)
clay_dict, clay_df = process_elo(clay_df)
grass_dict, grass_df = process_elo(grass_df)
carpet_dict, carpet_df = process_elo(carpet_df)

In [8]:
# bringing all the dataframes back together

df = (
    pd.concat(
        [hard_df, clay_df, grass_df, carpet_df])
    .sort_values(by=['tourney_date', 'tourney_id', 'round'],
                 ascending=[True, True, True])
    .reset_index(drop=True)
)

# creating a dict for handedness
hand_dict = df.groupby("winner_name")['winner_hand'].last().to_dict()

# creating a dict for matches played
matches_played_dict = (
    combined_df
    .groupby('name')['matches_played']
    .last()
    .to_dict()
)

# creating a dict for last tourney date
last_match_dict = (
    combined_df
    .groupby('name')['tourney_date']
    .last()
    .to_dict()
)

print("shape:", df.shape)

shape: (572775, 145)


In [9]:
# creating a player dict for entire dataset to calc general ELO

general_elo_dict = {name : 1500 for name in df.winner_name.unique()}

for name in df.loser_name.unique():
  if name not in general_elo_dict:
    general_elo_dict[name] = 1500

In [10]:
# cell where i did some manual grid search; also where general elo calculatd

for i in range(0, 1):

  # general_elo_dict = {name : 1500 for name in df.winner_name.unique()}

  for name in df.loser_name.unique():
    if name not in general_elo_dict:
      general_elo_dict[name] = 1500

  winner_probs = []

  for w, l, b in zip(df.winner_name, df.loser_name, df.best_of):
    d = player_update_elo(winner=w, loser=l, base_k=55, n=42, j=5, player_dict=general_elo_dict)
    winner_probs.append(d)

y = [1] * len(df)

df['winner_elo_proba'] = winner_probs

print(i, brier_score_loss(y, winner_probs))
print(i, accuracy_score(y, [round(x, 0) for x in winner_probs]))

0 0.2023881650389599
0 0.6818960324734844


In [11]:
df.groupby('winner_name')[['rolling_1stWon_perc_winner', 'rolling_2ndWon_perc_winner']].last().dropna().sort_values(by='rolling_1stWon_perc_winner', ascending=False).head(50)

Unnamed: 0_level_0,rolling_1stWon_perc_winner,rolling_2ndWon_perc_winner
winner_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Ivo Karlovic,0.834576,0.565536
Roger Federer,0.811826,0.611671
Sam Groth,0.809278,0.528807
Sam Querrey,0.806706,0.484737
Jerzy Janowicz,0.80541,0.514493
Gilles Muller,0.80352,0.491813
Mark Philippoussis,0.801401,0.51671
Mardy Fish,0.797721,0.534543
Kevin Anderson,0.795772,0.519191
Daniil Medvedev,0.792411,0.530448


In [12]:
return_df = (
    combined_df[~combined_df['ace'].isnull()]
    .groupby('name', group_keys=True)[['1stReturnOpps', '1stReturnPts', '2ndReturnOpps', '2ndReturnPts']]
    .sum()
    .dropna()
    .sort_values(by='1stReturnOpps')
)

return_df['return_pt_perc'] = (
    (return_df['1stReturnPts'] + return_df['2ndReturnPts'])
    .div
    (return_df['1stReturnOpps'] + return_df['2ndReturnOpps'])
)

return_df[return_df['1stReturnOpps'] > 10000].sort_values(by='return_pt_perc', ascending=False).head(30)

Unnamed: 0_level_0,1stReturnOpps,1stReturnPts,2ndReturnOpps,2ndReturnPts,return_pt_perc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Guillermo Coria,14777.0,5235.0,10575.0,4866.0,0.39843
Diego Schwartzman,30308.0,10708.0,20373.0,9223.0,0.393264
Filippo Volandri,28437.0,10099.0,19752.0,8733.0,0.390795
Rafael Nadal,59603.0,20433.0,37650.0,17503.0,0.390075
Novak Djokovic,57753.0,19514.0,37383.0,17250.0,0.386436
David Ferrer,50825.0,17000.0,34437.0,15754.0,0.384157
Rui Machado,10549.0,3647.0,7109.0,3120.0,0.383226
David Guez,13569.0,4659.0,8796.0,3909.0,0.383099
Pedro Sousa,17933.0,6300.0,11254.0,4880.0,0.383047
Flavio Cipolla,17462.0,6194.0,11392.0,4839.0,0.382373


In [13]:
with open('general_elo_dict.pkl', 'wb') as file:
    dill.dump(general_elo_dict, file)

with open('hard_dict.pkl', 'wb') as file:
    dill.dump(hard_dict, file)

with open('clay_dict.pkl', 'wb') as file:
    dill.dump(clay_dict, file)

with open('grass_dict.pkl', 'wb') as file:
    dill.dump(grass_dict, file)

with open('carpet_dict.pkl', 'wb') as file:
    dill.dump(carpet_dict, file)

In [14]:
df.to_csv(f'{folder}/processed_apt.csv', index=False)