# Feature Extraction

## Wins & Loses of All Players in ATP Matches (until today)


In [49]:
finals = atpmatches[(atpmatches['round'] == 'F')]
semifinals = atpmatches[(atpmatches['round'] == 'SF')]

#get players who won a title (final)
titles_group = finals.groupby('winner_name').size()
#get players who reach a final match - whether won or lost
finals_group = semifinals.groupby('winner_name').size()

#get all players who won a game or lost a game so as to count wins/losses
w_group = atpmatches.groupby('winner_name').size()
l_group = atpmatches.groupby('loser_name').size()

scores = pd.DataFrame({'total_wins': w_group, 'total_loses': l_group}).fillna(0)
scores[['total_wins', 'total_loses']] = scores[['total_wins', 'total_loses']].astype(int)
scores = scores.reindex(['total_wins', 'total_loses'], axis=1)

scores['total_matches'] = scores['total_wins'] + scores['total_loses']
scores['perc_of_total_wins'] = np.round(scores['total_wins']*100/scores['total_matches'],2)

scores.index.name = 'player_name'

scores = scores.join(pd.DataFrame(finals_group, columns = ['finals'],)).fillna(0)
scores = scores.join(pd.DataFrame(titles_group, columns = ['titles'],)).fillna(0)
   
    
scores['titles'] = scores['titles'].astype('int')
scores['finals'] = scores['finals'].astype('int')
    
scores = scores.sort_values(['titles', 'total_wins'], ascending=False)
scores.head()

Unnamed: 0_level_0,total_wins,total_loses,total_matches,perc_of_total_wins,finals,titles
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Roger Federer,1160,250,1410,82.27,151,102
Rafael Nadal,971,198,1169,83.06,121,87
Novak Djokovic,920,179,1099,83.71,117,86
Andy Murray,637,209,846,75.3,64,45
Andy Roddick,563,188,751,74.97,50,31


In [50]:
print('Players evolved: ', scores.shape[0])

Players evolved:  693


In [51]:
scores = scores.sort_values(['player_name'])
# uncomment the above line only for the first time 
# do not re-run it !! 
scores = scores.reset_index(level=0)
scores.head()

Unnamed: 0,player_name,total_wins,total_loses,total_matches,perc_of_total_wins,finals,titles
0,Adam Pavlasek,5,8,13,38.46,0,0
1,Adrian Garcia,8,21,29,27.59,0,0
2,Adrian Mannarino,200,254,454,44.05,10,1
3,Adrian Menendez Maceiras,5,18,23,21.74,0,0
4,Adrian Ungur,6,27,33,18.18,0,0


In [52]:
scores.to_csv('atp/atp_allwinslosses.csv')

In [53]:
atpplayers = pd.merge(left=players, right=scores, left_on='name', right_on='player_name')
atpplayers = atpplayers[['player_name', 'player_id', 'name_first', 'name_last', 'hand', 'dob', 
                        'ioc', 'height', 'total_wins', 'total_loses', 'total_matches', 
                        'perc_of_total_wins', 'finals', 'titles']]
print('Shape of Dataframe: ', atpplayers.shape[0])
atpplayers.head()

Shape of Dataframe:  694


Unnamed: 0,player_name,player_id,name_first,name_last,hand,dob,ioc,height,total_wins,total_loses,total_matches,perc_of_total_wins,finals,titles
0,Adam Pavlasek,106361,Adam,Pavlasek,R,19941008,CZE,186.0,5,8,13,38.46,0,0
1,Adrian Garcia,103201,Adrian,Garcia,R,19780525,CHI,175.0,8,21,29,27.59,0,0
2,Adrian Mannarino,105173,Adrian,Mannarino,L,19880629,FRA,183.0,200,254,454,44.05,10,1
3,Adrian Menendez Maceiras,104629,Adrian,Menendez Maceiras,R,19851028,ESP,,5,18,23,21.74,0,0
4,Adrian Ungur,104494,Adrian,Ungur,R,19850125,ROU,178.0,6,27,33,18.18,0,0


### Hard Surface

In [54]:
hard = atpmatches[(atpmatches['surface'] == 'Hard')]
    
w_group = hard.groupby('winner_name').size()
l_group = hard.groupby('loser_name').size()
    
h_surf = pd.DataFrame({'wins_hard': w_group, 'losses_hard': l_group}).fillna(0)
h_surf[['wins_hard', 'losses_hard']] = h_surf[['wins_hard', 'losses_hard']].astype(int)
    
h_surf = h_surf.reindex(['wins_hard', 'losses_hard'], axis=1)
    
h_surf['matches_hard'] = h_surf['wins_hard'] + h_surf['losses_hard']
h_surf['percentage_hard'] = np.round(h_surf['wins_hard']*100/h_surf['matches_hard'],2)
    
h_surf.index.name = 'player_name'
    
h_surf = h_surf.sort_values(['wins_hard', 'percentage_hard'], ascending=False)

print('Shape of Dataframe: ', h_surf.shape[0])

h_surf.to_csv('atp/atp_WinsHardSurface.csv')

h_surf.head()

Shape of Dataframe:  686


Unnamed: 0_level_0,wins_hard,losses_hard,matches_hard,percentage_hard
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Roger Federer,730,147,877,83.24
Novak Djokovic,592,109,701,84.45
Rafael Nadal,464,131,595,77.98
Andy Murray,434,140,574,75.61
Andy Roddick,399,127,526,75.86


### Clay Surface

In [55]:
clay = atpmatches[(atpmatches['surface'] == 'Clay')]
    
w_group = clay.groupby('winner_name').size()
l_group = clay.groupby('loser_name').size()
    
cl_surf = pd.DataFrame({'wins_clay': w_group, 'losses_clay': l_group}).fillna(0)
cl_surf[['wins_clay', 'losses_clay']] = cl_surf[['wins_clay', 'losses_clay']].astype(int)
    
cl_surf = cl_surf.reindex(['wins_clay', 'losses_clay'], axis=1)
    
cl_surf['matches_clay'] = cl_surf['wins_clay'] + cl_surf['losses_clay']
cl_surf['percentage_clay'] = np.round(cl_surf['wins_clay']*100/cl_surf['matches_clay'],2)
    
cl_surf.index.name = 'player_name'
    
cl_surf = cl_surf.sort_values(['wins_clay', 'percentage_clay'], ascending=False)

print('Shape of Dataframe: ', cl_surf.shape[0])

cl_surf.to_csv('atp/atp_WinsClaySurface.csv')

cl_surf.head()

Shape of Dataframe:  635


Unnamed: 0_level_0,wins_clay,losses_clay,matches_clay,percentage_clay
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rafael Nadal,436,43,479,91.02
David Ferrer,301,136,437,68.88
Nicolas Almagro,253,136,389,65.04
Tommy Robredo,241,122,363,66.39
Novak Djokovic,223,50,273,81.68


### Grass Surface

In [23]:
grass = atpmatches[(atpmatches['surface'] == 'Grass')]
    
w_group = grass.groupby('winner_name').size()
l_group = grass.groupby('loser_name').size()
    
g_surf = pd.DataFrame({'wins_grass': w_group, 'losses_grass': l_group}).fillna(0)
g_surf[['wins_grass', 'losses_grass']] = g_surf[['wins_grass', 'losses_grass']].astype(int)
    
g_surf = g_surf.reindex(['wins_grass', 'losses_grass'], axis=1)
    
g_surf['matches_grass'] = g_surf['wins_grass'] + g_surf['losses_grass']
g_surf['percentage_grass'] = np.round(g_surf['wins_grass']*100/g_surf['matches_grass'],2)
    
g_surf.index.name = 'player_name'

g_surf = g_surf.sort_values(['wins_grass', 'percentage_grass'], ascending=False)

print('Shape of Dataframe: ', g_surf.shape[0])

g_surf.to_csv('atp/atp_WinsGrassSurface.csv')

g_surf.head()

Shape of Dataframe:  745


Unnamed: 0_level_0,wins_grass,losses_grass,matches_grass,percentage_grass
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Roger Federer,187,27,214,87.38
Lleyton Hewitt,103,30,133,77.44
Andy Murray,102,23,125,81.6
Novak Djokovic,101,16,117,86.32
Feliciano Lopez,83,45,128,64.84


### Carpet Surface

In [24]:
carpet = atpmatches[(atpmatches['surface'] == 'Carpet')]
    
w_group = carpet.groupby('winner_name').size()
l_group = carpet.groupby('loser_name').size()
    
c_surf = pd.DataFrame({'wins_carpet': w_group, 'losses_carpet': l_group}).fillna(0)
c_surf[['wins_carpet', 'losses_carpet']] = c_surf[['wins_carpet', 'losses_carpet']].astype(int)
    
c_surf = c_surf.reindex(['wins_carpet', 'losses_carpet'], axis=1)
    
c_surf['matches_carpet'] = c_surf['wins_carpet'] + c_surf['losses_carpet']
c_surf['percentage_carpet'] = np.round(c_surf['wins_carpet']*100/c_surf['matches_carpet'],2)
    
c_surf.index.name = 'player_name'
    
c_surf = c_surf.sort_values(['wins_carpet', 'percentage_carpet'], ascending=False)

print('Shape of Dataframe: ', c_surf.shape[0])

c_surf.to_csv('atp/atp_WinsCarpetSurface.csv')

c_surf.head()

Shape of Dataframe:  276


Unnamed: 0_level_0,wins_carpet,losses_carpet,matches_carpet,percentage_carpet
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ivan Ljubicic,40,22,62,64.52
Marat Safin,38,14,52,73.08
Roger Federer,37,13,50,74.0
Mikhail Youzhny,31,17,48,64.58
Nikolay Davydenko,29,14,43,67.44


### All Together

In [25]:
hard_carpet = pd.concat([h_surf, c_surf],axis=1)
hard_carpet['matches1'] = hard_carpet['matches_hard'] + hard_carpet['matches_carpet']

grass_clay = pd.concat([g_surf, cl_surf], axis=1)
grass_clay['matches2'] = grass_clay['matches_grass'] + grass_clay['matches_clay']

In [26]:
allsurfaces = pd.concat([hard_carpet, grass_clay], axis=1)
allsurfaces['matches'] = allsurfaces['matches1'] + allsurfaces['matches2']

In [27]:
allsurfaces = allsurfaces[['matches', 'percentage_hard', 'percentage_carpet', 'percentage_grass', 'percentage_clay']]
allsurfaces.fillna(0, inplace=True)
allsurfaces.to_csv('atp/atp_allsurfaces_PERCENTAGE_.csv')

In [28]:
allsurfaces = allsurfaces.sort_values(['matches'], ascending=False)
allsurfaces.head()

Unnamed: 0,matches,percentage_hard,percentage_carpet,percentage_grass,percentage_clay
Roger Federer,1420.0,83.35,74.0,87.38,76.92
Rafael Nadal,1173.0,78.06,16.67,78.65,91.06
Novak Djokovic,1106.0,84.5,55.56,86.32,81.95
David Ferrer,1055.0,63.36,43.75,61.76,69.35
Fernando Verdasco,944.0,52.78,57.14,53.26,59.46


In [29]:
print('Shape of Dataframe: ', allsurfaces.shape[0])

Shape of Dataframe:  1444


In [30]:
allsurfaces = allsurfaces.reset_index(level=0)
allsurfaces.rename(columns = {'index': 'player_name'}, inplace=True)
allsurfaces.head()

Unnamed: 0,player_name,matches,percentage_hard,percentage_carpet,percentage_grass,percentage_clay
0,Roger Federer,1420.0,83.35,74.0,87.38,76.92
1,Rafael Nadal,1173.0,78.06,16.67,78.65,91.06
2,Novak Djokovic,1106.0,84.5,55.56,86.32,81.95
3,David Ferrer,1055.0,63.36,43.75,61.76,69.35
4,Fernando Verdasco,944.0,52.78,57.14,53.26,59.46


In [31]:
allsurfaces = allsurfaces.sort_values(['player_name'])
allsurfaces.head()

Unnamed: 0,player_name,matches,percentage_hard,percentage_carpet,percentage_grass,percentage_clay
1263,Abdulla Hajji,0.0,0.0,0.0,0.0,0.0
811,Adam Chadaj,0.0,0.0,0.0,0.0,50.0
1262,Adam Kennedy,0.0,0.0,0.0,0.0,0.0
1261,Adam Pavlasek,0.0,0.0,0.0,50.0,50.0
1370,Adrian Andreev,0.0,25.0,0.0,0.0,0.0


In [32]:
atpplayers_ = pd.merge(left=atpplayers, right=allsurfaces, left_on='player_name', right_on='player_name')
print('Shape of Dataframe: ', atpplayers_.shape[0])
atpplayers_.head()

Shape of Dataframe:  1450


Unnamed: 0,player_name,player_id,name_first,name_last,hand,dob,ioc,height_,total_wins,total_loses,total_matches,perc_of_total_wins,finals,titles,matches,percentage_hard,percentage_carpet,percentage_grass,percentage_clay
0,Abdulla Hajji,108993,Abdulla,Hajji,U,19901205,QAT,185,0,3,3,0.0,0,0,0.0,0.0,0.0,0.0,0.0
1,Adam Chadaj,104360,Adam,Chadaj,L,19840501,POL,178,1,1,2,50.0,0,0,0.0,0.0,0.0,0.0,50.0
2,Adam Kennedy,104126,Adam,Kennedy,R,19830209,AUS,185,0,3,3,0.0,0,0,0.0,0.0,0.0,0.0,0.0
3,Adam Pavlasek,106361,Adam,Pavlasek,R,19941008,CZE,186,5,9,14,35.71,0,0,0.0,0.0,0.0,50.0,50.0
4,Adrian Andreev,202090,Adrian,Andreev,R,20010512,BUL,185,2,6,8,25.0,0,0,0.0,25.0,0.0,0.0,0.0


# GOOD GOOD GOOD GOOD

## Wins on specific Tournament

In [33]:
def wins_on_tour_per_player(winsOnTour, tourney):
    
    tourney_ = str(tourney)
    matches = winsOnTour[(winsOnTour['tourney_name'] == tourney_)]
    finals = matches[(matches['round'] == 'F')]
    semifinals = matches[(matches['round'] == 'SF')]
    
    #get players who won a title (final)
    titles_group = finals.groupby('winner_name').size()
    #get players who reach a final match - whether won or lost
    finals_group = semifinals.groupby('winner_name').size()
    
    #get all players who won a game or lost a game so as to count wins/losses
    w_group = matches.groupby('winner_name').size()
    l_group = matches.groupby('loser_name').size()
    
    scores = pd.DataFrame({'wins': w_group, 'losses': l_group}).fillna(0)
    scores[['wins', 'losses']] = scores[['wins', 'losses']].astype(int)
    
    scores = scores.reindex(['wins', 'losses'], axis=1)
    
    scores['matches'] = scores['wins'] + scores['losses']
    scores['percentage'] = np.round(scores['wins']*100/scores['matches'],2)
    
    scores.index.name = 'player_name'
    
    scores = scores.join(pd.DataFrame(finals_group, columns = ['finals'],)).fillna(0)
    scores = scores.join(pd.DataFrame(titles_group, columns = ['titles'],)).fillna(0)
    
    scores['titles'] = scores['titles'].astype('int')
    scores['finals'] = scores['finals'].astype('int')
    
    #scores = scores.sort_values(['titles', 'wins'], ascending=False)
    return scores

In [34]:
tourlist = ['Adelaide', 'Doha', 'Chennai', 'Auckland', 'Sydney', 'Australian Open',
            'San Jose', 'Dubai', 'San Jose', 'Dubai', 'Marseille', 'Memphis',
            'Rotterdam', 'London', 'Mexico City', 'Copenhagen', 'Delray Beach',
            'Santiago', 'Bogota', 'Scottsdale', 'Indian Wells Masters', 'Miami Masters',
            'Casablanca', 'Atlanta', 'Estoril', 'Monte Carlo Masters', 'Barcelona', 
            'Munich', 'Mallorca', 'Orlando', 'Rome Masters', 'Hamburg Masters', 
            'Dusseldorf', 'St. Poelten', 'Roland Garros', "Queen's Club", 'Halle',
            's Hertogenbosch', 'Nottingham', 'Wimbledon', 'Gstaad', 'Newport', 'Bastad',
            'Amsterdam', 'Stuttgart Outdoor', 'Umag', 'Kitzbuhel', 'Los Angeles', 
            'San Marino', 'Canada Masters', 'Cincinnati Masters', 'Washington', 
            'Indianapolis', 'Long Island', 'US Open', 'Tashkent', 'Bucharest', 
            'Sydney Olympics', 'Palermo', 'Hong Kong', 'Tokyo', 'Vienna', 'Toulouse',
            'Shanghai', 'Basel', 'Moscow', 'Stuttgart Masters', 'Lyon', 'St. Petersburg',
            'Paris Masters', 'Stockholm', 'Brighton', 'Masters Cup', 'Milan', 
            'Vina del Mar', 'Buenos Aires', 'Acapulco', 'Houston', 'Stuttgart', 
            'Sopot', 'Costa Do Sauipe', 'Amersfoort', 'Madrid Masters', 'Valencia',
            'Bangkok', 'Metz', 'Athens Olympics', 'Ho Chi Minh City', 'Zagreb',
            'Las Vegas', 'Poertschach', 'Mumbai', 'Warsaw', 'Beijing Olympics', 
            'Brisbane', 'Johannesburg', 'Belgrade', 'Eastbourne', 'Hamburg', 
            'Kuala Lumpur', 'Shanghai Masters', 'Nice', 'Montpellier', 'Winston-Salem',
            'Sao Paulo', 'London Olympics', 'Rio de Janeiro', 'Shenzhen', 'Quito',
            'Istanbul', 'Geneva', 'Sofia', 'Marrakech', 'Los Cabos', 'Rio Olympics',
            'Chengdu', 'Antwerp', 'Budapest', 'Antalya', 'NextGen Finals', 'Pune', 
            'New York', 'Cordoba', 'Zhuhai', 'Atp Cup', 'ATP Rio de Janeiro', 'Us Open',
            'St Petersburg', 'Cologne 1', 'Sardinia', 'Cologne 2', 'Nur-Sultan', 
            'San Diego', 'Great Ocean Road Open', 'Murray River Open', 'Singapore', 
            'Marbella', 'Cagliari', 'Parma', 'Belgrade 2', 'Tokyo Olympics']

In [35]:
tour = atpmatches.copy()

for t in tourlist:
    #print(t)
    sc = wins_on_tour_per_player(tour, t)
    #print(sc)
    sc.to_csv('atp/atp_winslossesOn_' + t + '.csv')


In [36]:
#Example
wins_on_tour_per_player(tour, 'Atlanta')

Unnamed: 0_level_0,wins,losses,matches,percentage,finals,titles
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adrian Mannarino,1,2,3,33.33,0,0
Alejandro Falla,0,1,1,0.00,0,0
Alejandro Gonzalez,0,1,1,0.00,0,0
Alex Bogomolov Jr,0,2,2,0.00,0,0
Alex Bolt,0,1,1,0.00,0,0
...,...,...,...,...,...,...
Xavier Malisse,6,5,11,54.55,1,0
Yasutaka Uchiyama,0,1,1,0.00,0,0
Yen Hsun Lu,5,4,9,55.56,0,0
Yoshihito Nishioka,3,1,4,75.00,0,0


## Same Handedness

In [37]:
pd.set_option('display.max_columns', None)

In [38]:
atpmatches.loc[(atpmatches['winner_hand']==atpmatches['loser_hand']), 'handedness'] = 1
atpmatches.loc[((atpmatches['winner_hand']=='R') & (atpmatches['loser_hand']=='L')), 'handedness'] = 2
atpmatches.loc[((atpmatches['winner_hand']=='L') & (atpmatches['loser_hand']=='R')), 'handedness'] = 3

In [39]:
atpmatches.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,handedness
0,2019-M020,Brisbane,Hard,32,A,20181231,300,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,6-4 3-6 6-2,3,F,124.0,3.0,3.0,77.0,44.0,31.0,17.0,13.0,3.0,6.0,8.0,6.0,100.0,54.0,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,1.0
1,2019-M020,Brisbane,Hard,32,A,20181231,299,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,7-6(6) 6-2,3,SF,82.0,10.0,1.0,52.0,33.0,28.0,14.0,10.0,0.0,1.0,17.0,2.0,77.0,52.0,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,1.0
2,2019-M020,Brisbane,Hard,32,A,20181231,298,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,104871,,,Jeremy Chardy,R,188.0,FRA,31.882272,6-2 6-2,3,SF,66.0,2.0,2.0,47.0,33.0,26.0,9.0,8.0,2.0,2.0,10.0,3.0,46.0,27.0,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,1.0
3,2019-M020,Brisbane,Hard,32,A,20181231,297,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,200282,7.0,,Alex De Minaur,R,183.0,AUS,19.868583,6-4 7-6(2),3,QF,106.0,12.0,2.0,68.0,43.0,34.0,15.0,11.0,4.0,5.0,1.0,2.0,81.0,60.0,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,1.0
4,2019-M020,Brisbane,Hard,32,A,20181231,296,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,105683,5.0,,Milos Raonic,R,196.0,CAN,28.010951,6-7(2) 6-3 6-4,3,QF,129.0,12.0,3.0,105.0,68.0,48.0,25.0,16.0,8.0,8.0,29.0,5.0,94.0,56.0,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,1.0


## Home Factor

In [40]:
home = atpmatches.copy()

In [41]:
conditions = [
    (home['tourney_name'] == 'Adelaide'), (home['tourney_name'] == 'Doha'),
    (home['tourney_name'] == 'Chennai'), (home['tourney_name'] == 'Auckland'),
    (home['tourney_name'] == 'Sydney'), (home['tourney_name'] == 'Australian Open'),
    (home['tourney_name'] == 'San Jose'), (home['tourney_name'] == 'Dubai'),
    (home['tourney_name'] == 'Marseille'), (home['tourney_name'] == 'Memphis'),
    (home['tourney_name'] == 'Rotterdam'), (home['tourney_name'] == 'London'),
    (home['tourney_name'] == 'Mexico City'), (home['tourney_name'] == 'Copenhagen'),
    (home['tourney_name'] == 'Delray Beach'), (home['tourney_name'] == 'Santiago'),
    (home['tourney_name'] == 'Bogota'), (home['tourney_name'] == 'Scottsdale'),
    (home['tourney_name'] == 'Indian Wells Masters'), 
    (home['tourney_name'] == 'Miami Masters'), (home['tourney_name'] == 'Casablanca'),
    (home['tourney_name'] == 'Atlanta'), (home['tourney_name'] == 'Estoril'),
    (home['tourney_name'] == 'Monte Carlo Masters'), (home['tourney_name'] == 'Barcelona'),
    (home['tourney_name'] == 'Munich'), (home['tourney_name'] == 'Mallorca'),
    (home['tourney_name'] == 'Orlando'), (home['tourney_name'] == 'Rome Masters'),
    (home['tourney_name'] == 'Hamburg Masters'), (home['tourney_name'] == 'Dusseldorf'),
    (home['tourney_name'] == 'St. Poelten'), (home['tourney_name'] == 'Roland Garros'),
    (home['tourney_name'] == "Queen's Club"), (home['tourney_name'] == 'Halle'),
    (home['tourney_name'] == 's Hertogenbosch'), (home['tourney_name'] == 'Nottingham'),
    (home['tourney_name'] == 'Wimbledon'), (home['tourney_name'] == 'Gstaad'),
    (home['tourney_name'] == 'Newport'), (home['tourney_name'] == 'Bastad'),
    (home['tourney_name'] == 'Amsterdam'), (home['tourney_name'] == 'Stuttgart Outdoor'),
    (home['tourney_name'] == 'Umag'), (home['tourney_name'] == 'Kitzbuhel'),  
    (home['tourney_name'] == 'Los Angeles'), (home['tourney_name'] == 'San Marino'),
    (home['tourney_name'] == 'Canada Masters'), (home['tourney_name'] == 'Cincinnati Masters'),
    (home['tourney_name'] == 'Washington'), (home['tourney_name'] == 'Indianapolis'),
    (home['tourney_name'] == 'Long Island'), (home['tourney_name'] == 'US Open'),
    (home['tourney_name'] == 'Tashkent'), (home['tourney_name'] == 'Bucharest'), 
    (home['tourney_name'] == 'Sydney Olympics'), (home['tourney_name'] == 'Palermo'), 
    (home['tourney_name'] == 'Hong Kong'), (home['tourney_name'] == 'Tokyo'), 
    (home['tourney_name'] == 'Vienna'), (home['tourney_name'] == 'Toulouse'), 
    (home['tourney_name'] == 'Shanghai'), (home['tourney_name'] == 'Basel'), 
    (home['tourney_name'] == 'Moscow'), (home['tourney_name'] == 'Stuttgart Masters'),     
    (home['tourney_name'] == 'Lyon'), (home['tourney_name'] == 'St. Petersburg'), 
    (home['tourney_name'] == 'Paris Masters'), (home['tourney_name'] == 'Stockholm'),
    (home['tourney_name'] == 'Brighton'), (home['tourney_name'] == 'Masters Cup'),
    (home['tourney_name'] == 'Milan'), (home['tourney_name'] == 'Vina del Mar'),
    (home['tourney_name'] == 'Buenos Aires'), (home['tourney_name'] == 'Acapulco'),
    (home['tourney_name'] == 'Houston'), (home['tourney_name'] == 'Stuttgart'),
    (home['tourney_name'] == 'Sopot'), (home['tourney_name'] == 'Costa Do Sauipe'),
    (home['tourney_name'] == 'Amersfoort'), (home['tourney_name'] == 'Madrid Masters'),
    (home['tourney_name'] == 'Valencia'), (home['tourney_name'] == 'Bangkok'),
    (home['tourney_name'] == 'Metz'), (home['tourney_name'] == 'Athens Olympics'),
    (home['tourney_name'] == 'Beijing'), (home['tourney_name'] == 'New Haven'),
    (home['tourney_name'] == 'Ho Chi Minh City'), (home['tourney_name'] == 'Zagreb'),
    (home['tourney_name'] == 'Las Vegas'), (home['tourney_name'] == 'Poertschach'),
    (home['tourney_name'] == 'Mumbai'), (home['tourney_name'] == 'Warsaw'), 
    (home['tourney_name'] == 'Beijing Olympics'), (home['tourney_name'] == 'Brisbane'),
    (home['tourney_name'] == 'Johannesburg'), (home['tourney_name'] == 'Belgrade'),
    (home['tourney_name'] == 'Eastbourne'), (home['tourney_name'] == 'Hamburg'),
    (home['tourney_name'] == 'Kuala Lumpur'), (home['tourney_name'] == 'Shanghai Masters'),
    (home['tourney_name'] == 'Nice'), (home['tourney_name'] == 'Montpellier'),
    (home['tourney_name'] == 'Winston-Salem'), (home['tourney_name'] == 'Sao Paulo'), 
    (home['tourney_name'] == 'London Olympics'), (home['tourney_name'] == 'Rio de Janeiro'),
    (home['tourney_name'] == 'Shenzhen'), (home['tourney_name'] == 'Quito'),
    (home['tourney_name'] == 'Istanbul'), (home['tourney_name'] == 'Geneva'),
    (home['tourney_name'] == 'Sofia'), (home['tourney_name'] == 'Marrakech'),
    (home['tourney_name'] == 'Los Cabos'), (home['tourney_name'] == 'Rio Olympics'),
    (home['tourney_name'] == 'Chengdu'), (home['tourney_name'] == 'Antwerp'),
    (home['tourney_name'] == 'Budapest'), (home['tourney_name'] == 'Antalya'),
    (home['tourney_name'] == 'NextGen Finals'), (home['tourney_name'] == 'Pune'),
    (home['tourney_name'] == 'New York'), (home['tourney_name'] == 'Cordoba'),
    (home['tourney_name'] == 'Zhuhai'), (home['tourney_name'] == 'Atp Cup'), 
    (home['tourney_name'] == 'ATP Rio de Janeiro'), (home['tourney_name'] == 'Us Open'),
    (home['tourney_name'] == 'St Petersburg'), (home['tourney_name'] == 'Cologne 1'),
    (home['tourney_name'] == 'Sardinia'), (home['tourney_name'] == 'Cologne 2'),
    (home['tourney_name'] == 'Nur-Sultan'), (home['tourney_name'] == 'San Diego'),
    (home['tourney_name'] == 'Great Ocean Road Open'), (home['tourney_name'] == 'Murray River Open'),
    (home['tourney_name'] == 'Singapore'), (home['tourney_name'] == 'Marbella'), 
    (home['tourney_name'] == 'Cagliari'), (home['tourney_name'] == 'Parma'), 
    (home['tourney_name'] == 'Belgrade 2'), (home['tourney_name'] == 'Tokyo Olympics')    
]

values = [ 
    'AUS', 'QAT', 'IND', 'NZL', 'AUS', 'AUS', 'CRI', 'UAE', 'FRA', 'USA', 
    'NDL', 'GBR', 'MEX', 'DNK', 'USA', 'CHI', 'COL', 'USA', 'USA', 'USA', 
    'MAR', 'USA', 'PRT', 'MON', 'ESP', 'GER', 'ESP', 'USA', 'ITA', 'GER', 
    'GER', 'AUT', 'FRA', 'GBR', 'GER', 'NLD', 'GBR', 'GBR', 'SUI', 'GBR', 
    'SWE', 'NLD', 'GER', 'CRO', 'AUT', 'USA', 'SMR', 'CAN', 'USA', 'USA', 
    'USA', 'USA', 'USA', 'UZB', 'ROU', 'AUS', 'ITA', 'CHN', 'JPN', 'AUT', 
    'FRA', 'CHN', 'SUI', 'RUS', 'GER', 'FRA', 'RUS', 'FRA', 'SWE', 'GBR', 
    'USA', 'ITA', 'CHI', 'ARG', 'MEX', 'USA', 'GER', 'POL', 'BRA', 'NDL', 
    'ESP', 'ESP', 'THA', 'FRA', 'GRE', 'CHN', 'USA', 'VNM', 'CRO', 'USA',
    'AUT', 'IND', 'POL', 'CHN', 'AUS', 'RSA', 'SRB', 'GBR', 'GER', 'MAS', 
    'CHN', 'FRA', 'FRA', 'USA', 'BRA', 'GBR', 'BRA', 'CHN', 'ECU', 'TUR', 
    'SUI', 'BGR', 'MAR', 'MEX', 'BRA', 'CHN', 'BEL', 'HUN', 'TUR', 'ITA', 
    'IND', 'USA', 'ESP', 'CHN', 'AUS', 'BRA', 'USA', 'RUS', 'GER', 'ITA', 
    'GER', 'KAZ', 'AUS', 'AUS', 'SGP', 'ESP', 'ITA', 'ITA', 'SRB', 'JPN', 
    'USA'
]

In [42]:
home['location_country'] = np.select(conditions, values)

In [43]:
home = home[['tourney_id', 'tourney_name', 'location_country', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'
       ]]
home.head()

Unnamed: 0,tourney_id,tourney_name,location_country,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2019-M020,Brisbane,AUS,Hard,32,A,20181231,300,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,6-4 3-6 6-2,3,F,124.0,3.0,3.0,77.0,44.0,31.0,17.0,13.0,3.0,6.0,8.0,6.0,100.0,54.0,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0
1,2019-M020,Brisbane,AUS,Hard,32,A,20181231,299,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,7-6(6) 6-2,3,SF,82.0,10.0,1.0,52.0,33.0,28.0,14.0,10.0,0.0,1.0,17.0,2.0,77.0,52.0,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0
2,2019-M020,Brisbane,AUS,Hard,32,A,20181231,298,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,104871,,,Jeremy Chardy,R,188.0,FRA,31.882272,6-2 6-2,3,SF,66.0,2.0,2.0,47.0,33.0,26.0,9.0,8.0,2.0,2.0,10.0,3.0,46.0,27.0,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0
3,2019-M020,Brisbane,AUS,Hard,32,A,20181231,297,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,200282,7.0,,Alex De Minaur,R,183.0,AUS,19.868583,6-4 7-6(2),3,QF,106.0,12.0,2.0,68.0,43.0,34.0,15.0,11.0,4.0,5.0,1.0,2.0,81.0,60.0,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0
4,2019-M020,Brisbane,AUS,Hard,32,A,20181231,296,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,105683,5.0,,Milos Raonic,R,196.0,CAN,28.010951,6-7(2) 6-3 6-4,3,QF,129.0,12.0,3.0,105.0,68.0,48.0,25.0,16.0,8.0,8.0,29.0,5.0,94.0,56.0,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0


In [44]:
# 1 is for the win, 2 is for lose for games in homecountry
home['home_advantage'] = np.select([(home['winner_ioc'] == home['location_country'])], [1])
home['home_advantage'] = np.select([(home['loser_ioc'] == home['location_country'])], [2])
home.head()

Unnamed: 0,tourney_id,tourney_name,location_country,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,home_advantage
0,2019-M020,Brisbane,AUS,Hard,32,A,20181231,300,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,6-4 3-6 6-2,3,F,124.0,3.0,3.0,77.0,44.0,31.0,17.0,13.0,3.0,6.0,8.0,6.0,100.0,54.0,34.0,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,0
1,2019-M020,Brisbane,AUS,Hard,32,A,20181231,299,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,7-6(6) 6-2,3,SF,82.0,10.0,1.0,52.0,33.0,28.0,14.0,10.0,0.0,1.0,17.0,2.0,77.0,52.0,36.0,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,0
2,2019-M020,Brisbane,AUS,Hard,32,A,20181231,298,105453,2.0,,Kei Nishikori,R,178.0,JPN,29.004791,104871,,,Jeremy Chardy,R,188.0,FRA,31.882272,6-2 6-2,3,SF,66.0,2.0,2.0,47.0,33.0,26.0,9.0,8.0,2.0,2.0,10.0,3.0,46.0,27.0,15.0,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,0
3,2019-M020,Brisbane,AUS,Hard,32,A,20181231,297,104542,,PR,Jo-Wilfried Tsonga,R,188.0,FRA,33.705681,200282,7.0,,Alex De Minaur,R,183.0,AUS,19.868583,6-4 7-6(2),3,QF,106.0,12.0,2.0,68.0,43.0,34.0,15.0,11.0,4.0,5.0,1.0,2.0,81.0,60.0,38.0,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,2
4,2019-M020,Brisbane,AUS,Hard,32,A,20181231,296,106421,4.0,,Daniil Medvedev,R,198.0,RUS,22.885695,105683,5.0,,Milos Raonic,R,196.0,CAN,28.010951,6-7(2) 6-3 6-4,3,QF,129.0,12.0,3.0,105.0,68.0,48.0,25.0,16.0,8.0,8.0,29.0,5.0,94.0,56.0,46.0,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,0


In [45]:
home.to_csv('atp/atp_matches_update.csv')

## Head-to-Head 

In [46]:
def geth2hforplayer(matches,name):
    """
    get all head-to-heads of the player
    returns list of all head-to-heads 
    like:
    if name = 'Roger Federer' then 
    the result ['Sergi Bruguera', 0, 1]
    means that Roger Federer had 0 wins over Bruguera
    and Sergi Bruguera had 1 win over Federer.
    """
    
    matches = matches[(matches['winner_name'] == name) | (matches['loser_name'] == name)]
    h2hs = {}
    for index, match in matches.iterrows():
        if (match['winner_name'] == name):
            if (match['loser_name'] not in h2hs):
                h2hs[match['loser_name']] = {}
                h2hs[match['loser_name']]['l'] = 0
                h2hs[match['loser_name']]['w'] = 1
            else:
                h2hs[match['loser_name']]['w'] = h2hs[match['loser_name']]['w']+1
        elif (match['loser_name'] == name):
            if (match['winner_name'] not in h2hs):
                h2hs[match['winner_name']] = {}
                h2hs[match['winner_name']]['w'] = 0
                h2hs[match['winner_name']]['l'] = 1
            else:
                h2hs[match['winner_name']]['l'] = h2hs[match['winner_name']]['l']+1

    h2hlist = []
    for k, v in h2hs.items():
        h2hlist.append([k, v['w'],v['l']])

    if (len(h2hlist) == 0):
        return ''
    else:
        return sorted(h2hlist, key=itemgetter(1,2))

In [47]:
# change the name based on the player you want
head2headresults = geth2hforplayer(atpmatches, 'Roger Federer')
head2headresults

[['Andrey Rublev', 0, 1],
 ['Thanasi Kokkinakis', 0, 1],
 ['Pablo Andujar', 0, 1],
 ['Felix Auger Aliassime', 0, 1],
 ['Andrea Gaudenzi', 0, 1],
 ['Evgeny Donskoy', 0, 1],
 ['Felix Mantilla', 0, 1],
 ['Markus Hantschk', 0, 1],
 ['Andrei Medvedev', 0, 1],
 ['James Sekulov', 0, 1],
 ['Francisco Clavet', 0, 1],
 ['Sergi Bruguera', 0, 1],
 ['Richard Fromberg', 0, 1],
 ['Franco Squillari', 0, 2],
 ['Patrick Rafter', 0, 2],
 ['Kyle Edmund', 1, 0],
 ['Denis Shapovalov', 1, 0],
 ['Oscar Otte', 1, 0],
 ['Casper Ruud', 1, 0],
 ['Pierre Hugues Herbert', 1, 0],
 ['Lloyd Harris', 1, 0],
 ['Jay Clarke', 1, 0],
 ['Juan Ignacio Londero', 1, 0],
 ['Sumit Nagal', 1, 0],
 ['Alex De Minaur', 1, 0],
 ['Ruben Bemelmans', 1, 0],
 ['Hyeon Chung', 1, 0],
 ['Matthew Ebden', 1, 0],
 ['Yoshihito Nishioka', 1, 0],
 ['Marius Copil', 1, 0],
 ['Tennys Sandgren', 1, 0],
 ['Bobby Reynolds', 1, 0],
 ['Ruben Ramirez Hidalgo', 1, 0],
 ['Frederico Gil', 1, 0],
 ['Jan Vacek', 1, 0],
 ['Maximo Gonzalez', 1, 0],
 ['Thiago Alv

## Percentage of Ace over Double Faults for the previous 12 months. 
higher value means higher serving speed and accuracy


In [48]:
atpmatches.sort_values(['tourney_date'], ascending=False, inplace=True)
atpmatches['tourney_date'] = atpmatches['tourney_date'].astype('int')

# thewroume oti to 'NOW' einai 31.12.2020 
# otan tha trexei kanonika auto tha allaksei se alli imerominia. 

period = atpmatches[(atpmatches['tourney_date'] >= 20191231)]
period = period[(period['tourney_date'] <= 20201231)]

w_group_a = period.groupby('winner_name')['w_ace'].sum()
l_group_a = period.groupby('loser_name')['l_ace'].sum()
w_group_df = period.groupby('winner_name')['w_df'].sum()
l_group_df = period.groupby('loser_name')['l_df'].sum()

aces = pd.DataFrame({'w_total_aces': w_group_a, 'l_total_aces': l_group_a, 'w_total_df': w_group_df, 'l_total_df': l_group_df}).fillna(0)
aces[['w_total_aces', 'l_total_aces', 'w_total_df', 'l_total_df']] = aces[['w_total_aces', 'l_total_aces', 'w_total_df', 'l_total_df']].astype(int)
aces = aces.reindex(['w_total_aces', 'l_total_aces', 'w_total_df', 'l_total_df'], axis=1)
aces.index.name = 'player_name'
aces['total_aces'] = aces['w_total_aces'] + aces['l_total_aces']
aces['total_dfs'] = aces['w_total_df'] + aces['l_total_df']
#perc['w_srv_speed'] = (perc['w_ace']/perc['w_df'])*100
#perc['l_srv_speed'] = (perc['l_ace']/perc['l_df'])*100

aces.drop('w_total_aces', axis=1, inplace=True)
aces.drop('l_total_aces', axis=1, inplace=True)
aces.drop('w_total_df', axis=1, inplace=True)
aces.drop('l_total_df', axis=1, inplace=True)

#serving speep and quality
aces['srvspqual'] = (aces['total_aces']/aces['total_dfs'])*100

aces.sort_values(['srvspqual'], ascending=False, inplace=True)


In [49]:
aces.head(20)

Unnamed: 0_level_0,total_aces,total_dfs,srvspqual
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Noah Rubin,2,0,inf
Thomas Fabbiano,1,0,inf
Juan Pablo Varillas,3,0,inf
Pedro Cachin,9,1,900.0
Enzo Couacaud,9,1,900.0
John Patrick Smith,8,1,800.0
Reilly Opelka,370,48,770.833333
Prajnesh Gunneswaran,30,4,750.0
Nick Kyrgios,169,23,734.782609
Daniel Elahi Galan,27,4,675.0


## Percentage of Games Won over the last 12 months
## and Wins over the last 12 months

In [50]:
atpmatches.sort_values(['tourney_date'], ascending=False, inplace=True)
atpmatches['tourney_date'] = atpmatches['tourney_date'].astype('int')

# thewroume oti to 'NOW' einai 31.12.2020 
# otan tha trexei kanonika auto tha allaksei se alli imerominia. 

period = atpmatches[(atpmatches['tourney_date'] >= 20191231)]
period = period[(period['tourney_date'] <= 20201231)]

w_group = period.groupby('winner_name').size()
l_group = period.groupby('loser_name').size()

scores = pd.DataFrame({'total_wins': w_group, 'total_loses': l_group}).fillna(0)
scores[['total_wins', 'total_loses']] = scores[['total_wins', 'total_loses']].astype(int)
scores = scores.reindex(['total_wins', 'total_loses'], axis=1)
scores['total_matches'] = scores['total_wins'] + scores['total_loses']
scores['perc_of_total_wins'] = np.round(scores['total_wins']*100/scores['total_matches'],2)

scores.index.name = 'player_name'

scores.sort_values(['perc_of_total_wins'], ascending=False, inplace=True)

scores.head(20)

Unnamed: 0_level_0,total_wins,total_loses,total_matches,perc_of_total_wins
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Novak Djokovic,41,4,45,91.11
Roger Federer,5,1,6,83.33
Andrey Rublev,39,10,49,79.59
Rafael Nadal,27,7,34,79.41
Nick Kyrgios,6,2,8,75.0
Renzo Olivo,3,1,4,75.0
Daniil Medvedev,27,10,37,72.97
Gael Monfils,16,6,22,72.73
Dominic Thiem,24,9,33,72.73
Milos Raonic,23,9,32,71.88


## Total Career Aces
maybe i should put and the total number of matches. 

In [54]:
atpmatches.sort_values(['tourney_date'], ascending=False, inplace=True)
atpmatches['tourney_date'] = atpmatches['tourney_date'].astype('int')

w_group_a = atpmatches.groupby('winner_name')['w_ace'].sum()
l_group_a = atpmatches.groupby('loser_name')['l_ace'].sum()

aces = pd.DataFrame({'w_total_aces': w_group_a, 'l_total_aces': l_group_a}).fillna(0)
aces[['w_total_aces', 'l_total_aces']] = aces[['w_total_aces', 'l_total_aces']].astype(int)
aces = aces.reindex(['w_total_aces', 'l_total_aces'], axis=1)
aces.index.name = 'player_name'
aces['total_aces'] = aces['w_total_aces'] + aces['l_total_aces']

In [55]:
aces.sort_values(['total_aces'], ascending=False, inplace=True)
aces.head(10)

Unnamed: 0_level_0,w_total_aces,l_total_aces,total_aces
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ivo Karlovic,7385,6170,13555
John Isner,8534,4438,12972
Roger Federer,9231,1905,11136
Feliciano Lopez,5767,4086,9853
Andy Roddick,6871,2005,8876
Sam Querrey,4997,3526,8523
Milos Raonic,5785,2268,8053
Ivan Ljubicic,4928,2909,7837
Kevin Anderson,4767,2716,7483
Marin Cilic,5035,2215,7250


## Percentage of making the 1st Serves (Total)

1st Serves that go in!! 

PERCENTAGE OS PROS TI? 

In [56]:
#atpmatches.sort_values(['tourney_date'], ascending=False, inplace=True)
#atpmatches['tourney_date'] = atpmatches['tourney_date'].astype('int')

w_group_s = atpmatches.groupby('winner_name')['w_1stIn'].sum()
l_group_s = atpmatches.groupby('loser_name')['l_1stIn'].sum()

aces = pd.DataFrame({'w_total_serves': w_group_s, 'l_total_serves': l_group_s}).fillna(0)
aces[['w_total_serves', 'l_total_serves']] = aces[['w_total_serves', 'l_total_serves']].astype(int)
aces = aces.reindex(['w_total_serves', 'l_total_serves'], axis=1)
aces.index.name = 'player_name'
aces['total_1stInserves'] = aces['w_total_serves'] + aces['l_total_serves']

In [58]:
aces.sort_values(['total_1stInserves'], ascending=False, inplace=True)
aces.head(10)

Unnamed: 0_level_0,w_total_serves,l_total_serves,total_1stInserves
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Roger Federer,54300,14369,68669
Rafael Nadal,47633,11288,58921
Novak Djokovic,45830,10055,55885
Fernando Verdasco,28024,23232,51256
David Ferrer,32965,18223,51188
Feliciano Lopez,23668,21770,45438
Tommy Robredo,26240,16683,42923
John Isner,25740,16942,42682
Mikhail Youzhny,22502,19228,41730
Tomas Berdych,26003,14748,40751


## 1st Serve Points Won and Percentage (Total)

## 2nd Serve Points Won and Percentage (Total)

## Service Games Won (Total)

## Break-points Saved (Total) 

In [67]:
w_group = atpmatches.groupby('winner_name')['w_bpSaved'].sum()
l_group = atpmatches.groupby('loser_name')['l_bpSaved'].sum()

aces = pd.DataFrame({'WbreakpointsSaved': w_group, 'LbreakpointsSaved': l_group}).fillna(0)
aces[['WbreakpointsSaved','LbreakpointsSaved']].astype(int)
aces = aces.reindex(['WbreakpointsSaved','LbreakpointsSaved'], axis=1)
aces.index.name = 'player_name'
aces['total'] = aces['WbreakpointsSaved'] + aces['LbreakpointsSaved']

aces.head(10)

Unnamed: 0_level_0,WbreakpointsSaved,LbreakpointsSaved,total
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdulla Hajji,0.0,5.0,5.0
Adam Chadaj,6.0,3.0,9.0
Adam Kennedy,0.0,11.0,11.0
Adam Pavlasek,24.0,46.0,70.0
Adrian Andreev,13.0,23.0,36.0
Adrian Bohane,0.0,4.0,4.0
Adrian Cruciat,0.0,7.0,7.0
Adrian Garcia,22.0,106.0,128.0
Adrian Mannarino,751.0,1232.0,1983.0
Adrian Menendez Maceiras,33.0,75.0,108.0


## Break-points Faced (Total) 

In [68]:
w_group = atpmatches.groupby('winner_name')['w_bpFaced'].sum()
l_group = atpmatches.groupby('loser_name')['l_bpFaced'].sum()

aces = pd.DataFrame({'WbreakpointsFaced': w_group, 'LbreakpointsFaced': l_group}).fillna(0)
aces[['WbreakpointsFaced','LbreakpointsFaced']].astype(int)
aces = aces.reindex(['WbreakpointsFaced','LbreakpointsFaced'], axis=1)
aces.index.name = 'player_name'
aces['total'] = aces['WbreakpointsFaced'] + aces['LbreakpointsFaced']

aces.head(10)

Unnamed: 0_level_0,WbreakpointsFaced,LbreakpointsFaced,total
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdulla Hajji,0.0,21.0,21.0
Adam Chadaj,9.0,7.0,16.0
Adam Kennedy,0.0,22.0,22.0
Adam Pavlasek,32.0,91.0,123.0
Adrian Andreev,17.0,49.0,66.0
Adrian Bohane,0.0,7.0,7.0
Adrian Cruciat,0.0,14.0,14.0
Adrian Garcia,35.0,186.0,221.0
Adrian Mannarino,1113.0,2232.0,3345.0
Adrian Menendez Maceiras,46.0,150.0,196.0


## Percentage of break points saved ( saved/faced )

In [72]:
w_group_s = atpmatches.groupby('winner_name')['w_bpSaved'].sum()
l_group_s = atpmatches.groupby('loser_name')['l_bpSaved'].sum()

w_group_f = atpmatches.groupby('winner_name')['w_bpFaced'].sum()
l_group_f = atpmatches.groupby('loser_name')['l_bpFaced'].sum()

aces = pd.DataFrame({'WbreakpointsSaved': w_group_s, 'LbreakpointsSaved': l_group_s, 'WbreakpointsFaced': w_group_f, 'LbreakpointsFaced': l_group_f}).fillna(0)

aces[['WbreakpointsSaved','LbreakpointsSaved', 'WbreakpointsFaced','LbreakpointsFaced']].astype(int)

aces = aces.reindex(['WbreakpointsSaved','LbreakpointsSaved', 'WbreakpointsFaced','LbreakpointsFaced'], axis=1)
                                                       
aces.index.name = 'player_name'

aces['total_s'] = aces['WbreakpointsSaved'] + aces['LbreakpointsSaved']
aces['total_f'] = aces['WbreakpointsFaced'] + aces['LbreakpointsFaced']
aces['percentage'] = (aces['total_s']/aces['total_f'])*100

aces.head(10)

Unnamed: 0_level_0,WbreakpointsSaved,LbreakpointsSaved,WbreakpointsFaced,LbreakpointsFaced,total_s,total_f,percentage
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abdulla Hajji,0.0,5.0,0.0,21.0,5.0,21.0,23.809524
Adam Chadaj,6.0,3.0,9.0,7.0,9.0,16.0,56.25
Adam Kennedy,0.0,11.0,0.0,22.0,11.0,22.0,50.0
Adam Pavlasek,24.0,46.0,32.0,91.0,70.0,123.0,56.910569
Adrian Andreev,13.0,23.0,17.0,49.0,36.0,66.0,54.545455
Adrian Bohane,0.0,4.0,0.0,7.0,4.0,7.0,57.142857
Adrian Cruciat,0.0,7.0,0.0,14.0,7.0,14.0,50.0
Adrian Garcia,22.0,106.0,35.0,186.0,128.0,221.0,57.918552
Adrian Mannarino,751.0,1232.0,1113.0,2232.0,1983.0,3345.0,59.282511
Adrian Menendez Maceiras,33.0,75.0,46.0,150.0,108.0,196.0,55.102041


## Percentage of Making the First Serve

## Percentage of Making the First Serve and Win It

In [73]:
w_group_s = atpmatches.groupby('winner_name')['w_1stIn'].sum()
l_group_s = atpmatches.groupby('loser_name')['l_1stIn'].sum()

w_group_w = atpmatches.groupby('winner_name')['w_1stWon'].sum()
l_group_w = atpmatches.groupby('loser_name')['l_1stWon'].sum()

aces = pd.DataFrame({'w_1st_serves': w_group_s, 'l_1st_serves': l_group_s, 'w_1stWon': w_group_w, 'l_1stWon': l_group_w }).fillna(0)
aces[['w_1st_serves','l_1st_serves','w_1stWon','l_1stWon']].astype(int)
aces = aces.reindex(['w_1st_serves','l_1st_serves','w_1stWon','l_1stWon'], axis=1)
aces.index.name = 'player_name'
aces['1stServe'] = aces['w_1st_serves'] + aces['l_1st_serves']
aces['1stWon'] = aces['w_1stWon'] + aces['l_1stWon']
aces['percentage1stIn1stWon'] = (aces['1stWon']/aces['1stServe'])*100

aces.head(10)

Unnamed: 0_level_0,w_1st_serves,l_1st_serves,w_1stWon,l_1stWon,1stServe,1stWon,percentage1stIn1stWon
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abdulla Hajji,0.0,83.0,0.0,41.0,83.0,41.0,49.39759
Adam Chadaj,53.0,47.0,37.0,26.0,100.0,63.0,63.0
Adam Kennedy,0.0,127.0,0.0,80.0,127.0,80.0,62.992126
Adam Pavlasek,322.0,427.0,240.0,279.0,749.0,519.0,69.29239
Adrian Andreev,105.0,263.0,69.0,148.0,368.0,217.0,58.967391
Adrian Bohane,0.0,38.0,0.0,28.0,38.0,28.0,73.684211
Adrian Cruciat,0.0,70.0,0.0,48.0,70.0,48.0,68.571429
Adrian Garcia,343.0,969.0,263.0,614.0,1312.0,877.0,66.844512
Adrian Mannarino,9812.0,11973.0,7277.0,7812.0,21785.0,15089.0,69.263255
Adrian Menendez Maceiras,342.0,876.0,253.0,560.0,1218.0,813.0,66.748768


## Current round number

## Percentage of games won on the same type of surface

## Percentage of games won against the same opponent

## Percentage of games won against player with same handedness as the current opponent

## Percentage of games won in the same tournament

## Games won before in the same round

## Court Speed 
Court Speed Index (1 - 100): 80 * cube-root(Ace % * (Service Points Won % - 50%) * (Service Games Won % - 50%)) - 56, where statistics figures are adjusted with server's and returner's relative figure difference averaged by season and surface

## Tournament Participation 
Participation percentage measures how much the best players participate in the draw (100% if all top players participate in the draw). Formula: sum(ParticipationWeight(rank)) / sum(ParticipationWeight(1..PlayerCount)): i.e. the sum of participation weights of all players in the draw compared to maximal participation weight if all top players would have participated, where participation weight depend on ranking (see Participation Weights in the Glossary page).

## Tournament Strength 
Tournament strength measures the strength of the participating players based on weighted Elo ratings of the participating players. Formula: sum(ParticipationWeight(EloSeeding) * (Elo - 1500) / 400) * BestOfFactor, i.e. weighted sum of participating players strengths based on Elo Rating multiplied by best-of factor: 1.25 for Grand Slam, 1 for other tournaments (better player has ~25% more chance to win in best-of-5 compared to best-of-3), where weight depends on Elo-based seeding (see Participation Weights in the Glossary page).

## Tournament Elo Rating 
Average Elo rating measures the average strength of the participating players. Formula: sum(ParticipationWeight(EloSeeding) * EloRating) / sum(ParticipationWeight(1..PlayerCount)), i.e. weighted average of participating players Elo ratings at the beginning of the tournament, where weight depend on Elo-based seeding (see Participation Weights in the Glossary page).


## Participation Weights
Participation weights: Weights of players participating in the tournament based on their ranking or seeding (depends on context): [1: 100, 2: 85, 3: 75, 4: 67, 5: 60, 6: 55, 7-8: 50, 9-10: 45, 11-13: 40, 14-16: 35, 17-20: 30, 21-25: 25, 26-30: 20, 31-35: 16, 36-40: 13, 41-45: 10, 46-50: 8, 51-60: 6, 61-70: 5, 71-80: 4, 81-100: 3, 101-150: 2, 151-200: 1]

## Title Difficulty 
A factor of difficulty to win the title compared to a difficulty for an average title winner to win an average tournament event of the same tournament level (calculation steps: first, probabilities to win the matches on the path to a title for an average title winner of the same tournament level are calculated based on average Elo Ratings of the title winners as well as Elo Ratings of the opponents the actual winner has faced (P = 1 / (1 + 10 ^ ((AvgWinnerElo - ActualOpponentElo) / 400)); second, difficulty to win the title is calculated as 1 - the product of winning probabilities from the first step (difficulty = 1 - title winning probability); third, title difficulty is normalized so that the average difficulty of the same tournament level is 1). Example: Difficulty factor of 1.125 means that a title was 12.5% harder to win compared to an average title of the same tournament level.

## Rivalry Score 
Rivalry score: sum(1 + match GOAT points), where match GOAT points are: [GS F: 8, GS SF: 4, GS QF: 2, GS R16: 1, TF F: 6, TF SF: 3, TF RR: 1, M F: 4, M SF: 2...] as in Big Wins match factor in GOAT Points legend

## Match Greatness Score 
Match Greatness Score is proportional to match factor (depending on tournament level and round as in Big Wins), player rankings factor (as in Big Wins), player career-high rankings factor (as in Big Wins), player Elo Ratings (((WinnerElo - 1500) / 400 + (LoserElo - 1500) / 400) / 2) and match length (sqrt(sets * (games + 2 * tie-breaks)))

## Mental Toughness Rating 
Mental Toughness Rating compares players in pressure situations: Mental Toughness Rating = Mental Points won / Mental Points lost

## Mental Point
Mental Points are weighted pressure situations: Mental Point = 2 * Best-of-3 Deciding Set + 4 * Best-of-5 Deciding Set + 2 * Final Match + Non-Deciding Set Tie-Break + 2 * Deciding Set Tie Break

## Draw Bonus 
Draw Bonus represents how much a player has benefited or penalized with the actual draw. It is calculated as a relative difference in the title-winning percentage of the actual draw compared to the average draw without seeding and with random byes. Thus Draw Bonus incorporates both draw luck and draw seeding factors.

## First Serve Effectiveness
First Serve Effectiveness: 1st serve points won % divided by 2nd serve points won %

## Service In-play Points Won % 
Service in-play points won % (excluding aces and double faults)

## Serve Rating
Ace % - Double Faults % + 1st Serve % + 1st Serve Points Won % + 2nd Serve Points Won % + Break Points Saved % + Service Games Won %

## Return In-play Points Won % 
Return in-play points won % (excluding aces and double faults against)

## Return Rating 
1st Serve Return Points Won % + 2nd Serve Return Points Won % + Break Points Converted % + Return Games Won %

## Total 2nd Serve In-play Points Won % 
Total (serve + return) 2nd serve in-play points won %

## Return to Service Points Ratio 
Return points played divided by service points played

## Points Dominance Ratio 
Points Dominance Ratio: % of return points won divided by % of service points lost

## In-play Points Dominance Ratio 
In-play Points Dominance Ratio: % of return in-play points won divided by % of service in-play points lost (excluding aces and double faults)

## 2nd Serve In-play Points Dominance Ratio 
2-nd Serve In-play Points Dominance Ratio: % of 2-nd serve return in-play points won divided by % of 2-nd serve service in-play points lost (excluding double faults)

## Games Dominance Ratio 
Games Dominance Ratio: % of return games won divided by % of service games lost

## Break Points Ratio 
Break Points Ratio: % of break points converted divided by % of faced break points lost

## Over-Performing Ratio 
Points to Matches Over-Performing Ratio: % of matches won divided by % of total points won

## Points to Sets Over-Performing Ratio 
Points to Sets Over-Performing Ratio: % of sets won divided by % of total points won

## Points to Games Over-Performing Ratio 
Points to Games Over-Performing Ratio: % of games won divided by % of total points won

## Service Points to Service Games Over-Performing Ratio 
Service Points to Service Games Over-Performing Ratio: % of service games won divided by % of service points won

## Return Points to Return Games Over-Performing Ratio 
Return Points to Return Games Over-Performing Ratio: % of return games won divided by % of return points won

## Points to Tie-Breaks Over-Performing Ratio 
Points to Tie-Breaks Over-Performing Ratio: % of tie-breaks won divided by % of total points won

## Games to Matches Over-Performing Ratio 
Games to Matches Over-Performing Ratio: % of matches won divided by % of games won

## Games to Sets Over-Performing Ratio 
Games to Sets Over-Performing Ratio: % of sets won divided by % of games won

## Sets to Matches Over-Performing Ratio 
Sets to Matches Over-Performing Ratio: % of matches won divided by % of sets won

## Break Points Over-Performing Ratio 
Break Points Over-Performing Ratio: % of break points won (saved + converted) divided by % of total points won

## Break Points Saved Over-Performing Ratio 
Break Points Saved Over-Performing Ratio: % of break points saved divided by % of service points won

## Break Points Converted Over-Performing Ratio 
Break Points Converted Over-Performing Ratio: % of break points converted divided by % of return points won

## Opponent Rank 
Average opponent rank (geometric mean)

## Opponent Elo Rating 
Average opponent Elo rating (arithmetic mean)

## Upsets scored 
Matches won over higher-ranked players (according to ATP ranking)

## Upsets scored % 
% of Matches won over higher-ranked players (according to ATP ranking)

## Upsets against 
Matches lost from lower-ranked players (according to ATP ranking)

## Upsets against %
% of Matches lost from lower-ranked players (according to ATP ranking)

## Upsets 
Matches won over higher-ranked players + Matches lost from lower-ranked players (according to ATP ranking)

## Upsets %
% Matches won over higher-ranked players + Matches lost from lower-ranked players (according to ATP ranking)