In [29]:
# first of all we import the python libraries we need
import numpy as np
import pandas as pd
import sklearn as sklearn
import matplotlib.pyplot as plt

In [30]:
# Load every ATP season from 2015 through 2024 into a single dataframe.
all_matches = pd.concat(
    [pd.read_csv(f'data/atp_matches_{year}.csv') for year in range(2015, 2025)],
    ignore_index=True,
)

# Columns that are unnecessary or noisy for this analysis and are removed below.
unnecessary = [
    'score',
    # w_* in-match stat columns
    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
    'w_bpSaved', 'w_bpFaced',
    # l_* in-match stat columns
    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms',
    'l_bpSaved', 'l_bpFaced',
    # ranking points, identifiers and remaining player/tournament metadata
    'winner_rank_points', 'loser_rank_points', 'winner_name', 'loser_name',
    'winner_id', 'loser_id', 'winner_ioc', 'loser_ioc', 'winner_entry', 'loser_entry',
    'tourney_id', 'tourney_name', 'match_num', 'winner_hand', 'loser_hand',
    'tourney_date', 'winner_seed', 'loser_seed', 'winner_ht', 'loser_ht',
]

# Drop the unneeded columns and one-hot encode the two categorical columns,
# surface and tourney_level. drop_first=True removes the first (alphabetically
# lowest) level of each column; that level becomes the implicit baseline in
# which every indicator of the group is False.
df = pd.get_dummies(
    all_matches.drop(columns=unnecessary),
    columns=['surface', 'tourney_level'],
    drop_first=True,
)

# Ordinal encoding of the tournament round; round robin ('RR') is encoded as 0.
# NOTE(review): labels that are missing from this mapping become NaN after
# .map(); the float dtype of `round` in the displayed output suggests that at
# least one such label exists in the data -- confirm and extend the mapping.
round_map = {
    'R128': 1,
    'R64': 2,
    'R32': 3,
    'R16': 4,
    'QF': 5,
    'SF': 6,
    'F': 7,
    'RR': 0,
}

df = (
    df
    # Remove the Davis Cup (D) and level-F indicator columns. This removes only
    # the indicator columns, not the matches: those rows stay in the data and
    # end up with every remaining tourney-level flag set to False.
    # NOTE(review): if the matches themselves should be excluded, filter the
    # rows on tourney_level before encoding instead.
    .drop(columns=['tourney_level_D', 'tourney_level_F'])
    .assign(round=lambda frame: frame['round'].map(round_map))
    # tourney_level_A is intentionally not listed: drop_first=True above has
    # already removed it (it sorts first), so it is the baseline level.
    .rename(columns={
        'tourney_level_G': 'Grand Slam',
        'tourney_level_O': 'Olympics',
        'tourney_level_M': 'Masters',
    })
)

# Remove matches that lasted 20 minutes or less because these are likely to be
# walkovers or incomplete matches. Rows with a missing duration are removed as
# well, since a NaN comparison evaluates to False.
MIN_MATCH_MINUTES = 20
df = df[df["minutes"] > MIN_MATCH_MINUTES].copy()

# Split the cleaned data into two dataframes: one for best-of-5 matches and one
# for best-of-3 matches.
df_best5 = df[df["best_of"] == 5].copy()
df_best3 = df[df["best_of"] == 3].copy()
display(df_best5)
display(df_best3)

Unnamed: 0,draw_size,winner_age,loser_age,best_of,round,minutes,winner_rank,loser_rank,surface_Clay,surface_Grass,surface_Hard,Grand Slam,Masters,Olympics
139,128,27.6,25.5,5,1.0,109.0,1.0,116.0,False,False,True,True,False,False
140,128,23.9,27.0,5,1.0,152.0,88.0,61.0,False,False,True,True,False,False
141,128,30.3,18.7,5,1.0,207.0,92.0,212.0,False,False,True,True,False,False
142,128,31.1,27.9,5,1.0,162.0,33.0,103.0,False,False,True,True,False,False
143,128,29.7,29.9,5,1.0,127.0,21.0,118.0,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27407,8,20.3,20.5,5,0.0,102.0,41.0,128.0,False,False,True,False,False,False
27408,8,20.3,19.6,5,0.0,112.0,41.0,138.0,False,False,True,False,False,False
27409,8,20.5,19.8,5,0.0,116.0,128.0,50.0,False,False,True,False,False,False
27410,8,19.6,19.8,5,0.0,93.0,138.0,50.0,False,False,True,False,False,False


Unnamed: 0,draw_size,winner_age,loser_age,best_of,round,minutes,winner_rank,loser_rank,surface_Clay,surface_Grass,surface_Hard,Grand Slam,Masters,Olympics
0,28,25.5,23.7,3,3.0,65.0,153.0,220.0,False,False,True,False,False,False
1,28,33.4,22.3,3,3.0,104.0,73.0,123.0,False,False,True,False,False,False
2,28,22.9,30.0,3,3.0,68.0,125.0,21.0,False,False,True,False,False,False
3,28,27.8,27.4,3,3.0,69.0,31.0,72.0,False,False,True,False,False,False
4,28,25.4,33.6,3,3.0,144.0,34.0,110.0,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27667,4,18.8,21.8,3,0.0,77.0,1109.0,740.0,True,False,False,False,False,False
27668,4,26.9,23.2,3,0.0,131.0,554.0,748.0,False,False,True,False,False,False
27669,4,27.3,26.4,3,0.0,131.0,416.0,,False,False,True,False,False,False
27670,4,26.9,27.3,3,0.0,159.0,554.0,416.0,False,False,True,False,False,False
