In [15]:
import numpy as np
import pandas as pd
import glob

## Data Cleaning

### Atp Matches
Gather all ATP Matches files into one file

In [16]:
allfiles = glob.glob("atp/atp_matches_" + "????.csv")
atpmatches = pd.DataFrame()
atpmatcheslist = list()
for file in allfiles:
    df = pd.read_csv(file, index_col=None, header=0)
    atpmatcheslist.append(df)
atpmatches = pd.concat(atpmatcheslist)

In [17]:
pd.set_option('display.max_columns', None)

In [18]:
atpmatches.sort_values(['tourney_date', 'tourney_id'], inplace=True)
atpmatches.head(3)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
1026,1968-580,Australian Chps.,Grass,64,G,19680119,1,110023,,,Richard Coulthard,R,,AUS,,107760,,,Max Senior,R,,AUS,,12-10 7-5 4-6 7-5,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1027,1968-580,Australian Chps.,Grass,64,G,19680119,2,109803,,,John Brown,R,,AUS,27.520876,106964,,,Ernie Mccabe,R,,AUS,,6-3 6-2 6-4,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1028,1968-580,Australian Chps.,Grass,64,G,19680119,3,100257,,,Ross Case,R,,AUS,16.21629,110024,15.0,,Gondo Widjojo,R,,INA,,6-4 3-6 6-3 7-5,5,R64,,,,,,,,,,,,,,,,,,,,,,,


### Players
Gather players' file and drop useless columns

In [30]:
players = pd.read_csv('atp/atp_players.csv')
players.head(3)

Unnamed: 0,player_id,name_first,name_last,hand,dob,ioc,height,wikidata_id
0,100001,Gardnar,Mulloy,R,19131122,USA,,Q54544
1,100002,Pancho,Segura,R,19210620,ECU,,Q54581
2,100003,Frank,Sedgman,R,19271002,AUS,,Q962049


In [31]:
# drop 'wikidata_id' column 
players['player_name'] = players['name_first'] + ' ' + players['name_last']
players = players.sort_values(['player_name'])
players = players[['player_id', 'player_name', 'name_first', 'name_last', 'hand', 'dob', 'ioc', 'height']]
print('Shape of Dataframe: ', players.shape[0])
players.head(3)

Shape of Dataframe:  55614


Unnamed: 0,player_id,player_name,name_first,name_last,hand,dob,ioc,height
16955,116971,?? Baillie,??,Baillie,U,,USA,
49642,204608,A Aguilar,A,Aguilar,R,,MEX,
47812,202778,A Alvarez,A,Alvarez,U,,ESP,


#### Remove unfinished Matches, Davis Cup Matches, Players with less than 9 played matches and the matches that include these players

In [32]:
matches_before = atpmatches.shape[0]

#remove unfinished matches
atpmatches = atpmatches[atpmatches['score'].str.contains(r'RET') == False]
atpmatches = atpmatches[atpmatches['score'].str.contains(r'W/O') == False]
atpmatches = atpmatches[atpmatches['score'].str.contains(r'DEF') == False]

matches_after1 = atpmatches.shape[0]
matches_unfinished = matches_before - matches_after1

In [33]:
# remove Davis Cup matches to make easier my life
atpmatches = atpmatches[atpmatches['tourney_level'].str.contains('D') == False]

matches_after2 = atpmatches.shape[0]
matches_Davis = matches_after1 - matches_after2

In [40]:
# find total matches of every player and remove those with less than 9 played games
w_group = atpmatches.groupby('winner_name').size()
l_group = atpmatches.groupby('loser_name').size()

matches = pd.DataFrame({'total_wins': w_group, 'total_losses': l_group}).fillna(0)
matches[['total_wins', 'total_losses']] = matches[['total_wins', 'total_losses']].astype(int)
matches = matches.reindex(['total_wins', 'total_losses'], axis=1)
matches['total_matches'] = matches['total_wins'] + matches['total_losses']

#matches = matches[['total_matches']]
matches.index.name = 'player_name'
matches.reset_index(level=0, inplace=True)

print(matches.shape[0])
playerslist = list(matches['player_name'])
print(len(playerslist))

atpmatches = atpmatches[atpmatches.winner_name.isin(playerslist)]
atpmatches = atpmatches[atpmatches.loser_name.isin(playerslist)]

matches_after3 = atpmatches.shape[0]
matches_ = matches_after2 - matches_after3
matches.head(10)

4836
4836


Unnamed: 0,player_name,total_wins,total_losses,total_matches
0,,0,1,1
1,A D'Orsongna,0,1,1
2,A Difrancesco,0,1,1
3,A El Ghani,0,1,1
4,A Ferguson,1,1,2
5,A Fine,0,1,1
6,A G Barker,1,1,2
7,A Galbraith,0,1,1
8,A H Davies,0,1,1
9,A Hernandez,0,1,1


In [35]:
atpmatches.sort_values(['tourney_date', 'tourney_id'], inplace=True)
print(atpmatches.shape[0])
atpmatches.head(5)

162980


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
1026,1968-580,Australian Chps.,Grass,64,G,19680119,1,110023,,,Richard Coulthard,R,,AUS,,107760,,,Max Senior,R,,AUS,,12-10 7-5 4-6 7-5,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1027,1968-580,Australian Chps.,Grass,64,G,19680119,2,109803,,,John Brown,R,,AUS,27.520876,106964,,,Ernie Mccabe,R,,AUS,,6-3 6-2 6-4,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1028,1968-580,Australian Chps.,Grass,64,G,19680119,3,100257,,,Ross Case,R,,AUS,16.21629,110024,15.0,,Gondo Widjojo,R,,INA,,6-4 3-6 6-3 7-5,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1029,1968-580,Australian Chps.,Grass,64,G,19680119,4,100105,5.0,,Allan Stone,R,,AUS,22.264203,110025,,,Robert Layton,R,,AUS,,6-4 6-2 6-1,5,R64,,,,,,,,,,,,,,,,,,,,,,,
1030,1968-580,Australian Chps.,Grass,64,G,19680119,5,109966,,,Warren Jacques,R,,AUS,29.861739,110026,,,Bert Kearney,R,,AUS,,6-4 6-1 7-5,5,R64,,,,,,,,,,,,,,,,,,,,,,,


In [37]:
players = players[players.player_name.isin(playerslist)]
print(players.shape[0])
players.head(5)

4975


Unnamed: 0,player_id,player_name,name_first,name_last,hand,dob,ioc,height
54582,209548,A D'Orsongna,A,D'Orsongna,U,,,
9851,109866,A Difrancesco,A,Difrancesco,R,,AUS,
10407,110422,A El Ghani,A,El Ghani,U,,UAE,
54863,209829,A Ferguson,A,Ferguson,U,,,
54683,209649,A Fine,A,Fine,U,,GBR,


In [12]:
percentage = ((matches_before - matches_after3)/matches_before) * 100

print('Matches before cleaning: ', matches_before)
print("#########################################")
print('Unfinished Matches removed: ', matches_unfinished)
print('Davis-Cup Matches removed: ', matches_Davis)
print('Matches removed of players with less than 9 played games: ', matches_)
print("#########################################")
print('Matches after cleaning: ', matches_after3)
print('Percentage of matches removed: ', percentage, '%')

Matches before cleaning:  180305
#########################################
Unfinished Matches removed:  4734
Davis-Cup Matches removed:  13752
Matches removed of players with less than 9 played games:  7030
#########################################
Matches after cleaning:  154789
Percentage of matches removed:  14.151576495382823 %


In [13]:
print('Players before cleaning: ', matches.shape[0])
print("#########################################")
print('Players with less than 9 matches: ', playersOff.shape[0])
print("#########################################")
print('Players after cleaning: ', playersOn.shape[0])

Players before cleaning:  4820
#########################################
Players with less than 9 matches:  2932
#########################################
Players after cleaning:  1888


### Save the cleaned Dataset

In [14]:
atpmatches.to_csv('~/Desktop/atp/prod/atpmatches_cleaned.csv')
players.to_csv('~/Desktop/atp/prod/atpplayers_cleaned.csv')

## Player Parameters

### Height

In [None]:
h_w = atpmatches.groupby('winner_name')['winner_ht'].mean()
h_l = atpmatches.groupby('loser_name')['loser_ht'].mean()

height_w = pd.DataFrame({'height_': h_w}).fillna(0)
height_l = pd.DataFrame({'height_': h_l}).fillna(0)

height_w.index.name = 'player_name'
height_l.index.name = 'player_name'

height_w = height_w.reset_index(level=0)
height_l = height_l.reset_index(level=0)

height = pd.concat([height_w, height_l])
height = height.sort_values(['player_name'])
height.drop_duplicates(subset='player_name', inplace=True)

In [None]:
height['height_'] = height['height_'].replace({0: np.nan})
height_average = height['height_'].mean()
height_average = round(height_average)

height['height_'].fillna(height_average, inplace=True)
height.head(10)

In [None]:
atpplayers = pd.merge(left=players, right=height, left_on='player_name', right_on='player_name')
print('Shape of Dataframe: ', atpplayers.shape[0])
atpplayers.head()

In [None]:
atpplayers.drop(['height'], axis=1, inplace=True)

In [None]:
atpplayers = atpplayers[['player_id', 'player_name', 'hand', 'dob', 'ioc', 'height_']]

In [None]:
atpplayers.head()

## Surface

In [None]:
hard = atpmatches[(atpmatches['surface'] == 'Hard')]
clay = atpmatches[(atpmatches['surface'] == 'Clay')]
grass = atpmatches[(atpmatches['surface'] == 'Grass')]
carpet = atpmatches[(atpmatches['surface'] == 'Carpet')]

total_matches = hard.shape[0] + clay.shape[0] + grass.shape[0] + carpet.shape[0]

print('Total Matches: ', total_matches)
print('Hard Surface Matches: ', hard.shape[0], '(', round((hard.shape[0]/total_matches)*100, 3), '%)')
print('Clay Surface Matches: ', clay.shape[0], '(', round((clay.shape[0]/total_matches)*100, 3), '%)')
print('Grass Surface Matches: ', grass.shape[0], '(', round((grass.shape[0]/total_matches)*100, 3), '%)')
print('Carpet Surface Matches: ', carpet.shape[0], '(', round((carpet.shape[0]/total_matches)*100, 3), '%)')

### Hard Surface 

In [None]:
w_group = hard.groupby('winner_name').size()
l_group = hard.groupby('loser_name').size()
    
h_surf = pd.DataFrame({'wins_hard': w_group, 'losses_hard': l_group}).fillna(0)
h_surf[['wins_hard', 'losses_hard']] = h_surf[['wins_hard', 'losses_hard']].astype(int)

h_surf = h_surf.reindex(['wins_hard', 'losses_hard'], axis=1)
h_surf['matches_hard'] = h_surf['wins_hard'] + h_surf['losses_hard']
h_surf['hard(%)'] = np.round((h_surf['wins_hard']/h_surf['matches_hard'])*100,2)
    
h_surf.index.name = 'player_name'
h_surf = h_surf.sort_values(['player_name'])
print('Shape of Dataframe: ', h_surf.shape[0])

h_surf.head(3)

In [None]:
h_surf.to_csv('~/Desktop/atp/prod/atp_winsOnHardSurface.csv')

### Clay Surface

In [None]:
w_group = clay.groupby('winner_name').size()
l_group = clay.groupby('loser_name').size()
    
cl_surf = pd.DataFrame({'wins_clay': w_group, 'losses_clay': l_group}).fillna(0)
cl_surf[['wins_clay', 'losses_clay']] = cl_surf[['wins_clay', 'losses_clay']].astype(int)
    
cl_surf = cl_surf.reindex(['wins_clay', 'losses_clay'], axis=1)  
cl_surf['matches_clay'] = cl_surf['wins_clay'] + cl_surf['losses_clay']
cl_surf['clay(%)'] = np.round((cl_surf['wins_clay']/cl_surf['matches_clay'])*100,2)
    
cl_surf.index.name = 'player_name'
cl_surf = cl_surf.sort_values(['player_name'])
print('Shape of Dataframe: ', cl_surf.shape[0])

cl_surf.head(3)

In [None]:
cl_surf.to_csv('~/Desktop/atp/prod/atp_winsOnClaySurface.csv')

### Grass Surface

In [None]:
w_group = grass.groupby('winner_name').size()
l_group = grass.groupby('loser_name').size()
    
g_surf = pd.DataFrame({'wins_grass': w_group, 'losses_grass': l_group}).fillna(0)
g_surf[['wins_grass', 'losses_grass']] = g_surf[['wins_grass', 'losses_grass']].astype(int)

g_surf = g_surf.reindex(['wins_grass', 'losses_grass'], axis=1)
g_surf['matches_grass'] = g_surf['wins_grass'] + g_surf['losses_grass']
g_surf['grass(%)'] = np.round((g_surf['wins_grass']/g_surf['matches_grass'])*100,2)
    
g_surf.index.name = 'player_name'
g_surf = g_surf.sort_values(['player_name'])
print('Shape of Dataframe: ', g_surf.shape[0])

g_surf.head(3)

In [None]:
g_surf.to_csv('~/Desktop/atp/prod/atp_winsOnGrassSurface.csv')

### Carpet Surface 

In [None]:
w_group = carpet.groupby('winner_name').size()
l_group = carpet.groupby('loser_name').size()
    
c_surf = pd.DataFrame({'wins_carpet': w_group, 'losses_carpet': l_group}).fillna(0)
c_surf[['wins_carpet', 'losses_carpet']] = c_surf[['wins_carpet', 'losses_carpet']].astype(int)  
c_surf = c_surf.reindex(['wins_carpet', 'losses_carpet'], axis=1)
    
c_surf['matches_carpet'] = c_surf['wins_carpet'] + c_surf['losses_carpet']
c_surf['carpet(%)'] = np.round((c_surf['wins_carpet']/c_surf['matches_carpet'])*100,2)
    
c_surf.index.name = 'player_name'
c_surf = c_surf.sort_values(['player_name'])
print('Shape of Dataframe: ', c_surf.shape[0])

c_surf.head(3)

In [None]:
c_surf.to_csv('~/Desktop/atp/prod/atp_winsOnCarpetSurface.csv')

### All Types Summary

In [None]:
hard_carpet = pd.concat([h_surf, c_surf],axis=1).fillna(0)
hard_carpet['matches1'] = hard_carpet['matches_hard'] + hard_carpet['matches_carpet']
hard_carpet = hard_carpet[['matches_hard', 'hard(%)', 'matches_carpet', 'carpet(%)', 'matches1']]

grass_clay = pd.concat([g_surf, cl_surf], axis=1).fillna(0)
grass_clay['matches2'] = grass_clay['matches_grass'] + grass_clay['matches_clay']
grass_clay = grass_clay[['matches_grass', 'grass(%)', 'matches_clay', 'clay(%)', 'matches2']]

allsurfaces = pd.concat([hard_carpet, grass_clay], axis=1)
allsurfaces['matches'] = allsurfaces['matches1'] + allsurfaces['matches2']

allsurfaces = allsurfaces[['matches', 'hard(%)', 'carpet(%)', 'grass(%)', 'clay(%)']]
allsurfaces.rename(columns = {'matches': 'total_matches'}, inplace=True)

allsurfaces = allsurfaces.reset_index(level=0)
allsurfaces.rename(columns = {'index': 'player_name'}, inplace=True)
allsurfaces = allsurfaces.sort_values(['player_name'])
allsurfaces.head(10)

In [None]:
atpplayers = pd.merge(left=atpplayers, right=allsurfaces, left_on='player_name', right_on='player_name')
atpplayers.head(10)

In [None]:
atpplayers = atpplayers[['player_id', 'player_name', 'hand', 'dob', 'ioc', 'height_', 'total_matches', 'hard(%)', 'carpet(%)', 'grass(%)', 'clay(%)']]
atpplayers.to_csv('~/Desktop/atp/prod/atpplayers.csv')

## Performance Metrics per Match

In [None]:
metrics = atpmatches.copy()
metrics.head(5)

### Percentage of Successful First Serves
$$ 1stS(\%) = \frac{1stIn}{svpt} $$

In [None]:
metrics['w_1stS(%)'] = round((metrics['w_1stIn']/metrics['w_svpt'])*100,2)
metrics['l_1stS(%)'] = round((metrics['l_1stIn']/metrics['l_svpt'])*100,2)
metrics.head(5)

### Number of Successful Second Serves
$$ 2ndIn = svpt - 1stIn $$

In [None]:
metrics['w_2ndIn'] = metrics['w_svpt'] - metrics['w_1stIn']
metrics['l_2ndIn'] = metrics['l_svpt'] - metrics['l_1stIn']
metrics.head(5)

### Percentage of First Serve Points Won
$$ 1stSvpt(\%) = \frac{1stWon}{1stIn} $$

In [None]:
metrics['w_1st_svpt(%)'] = round((metrics['w_1stWon']/metrics['w_1stIn'])*100,2)
metrics['l_1st_svpt(%)'] = round((metrics['l_1stWon']/metrics['l_1stIn'])*100,2)
metrics.head(5)

### Percentage of Second Serve Points Won
$$ 2ndSvpt(\%) = \frac{2nWon}{2ndIn} $$

In [None]:
metrics['w_2nd_svpt(%)'] = round((metrics['w_2ndWon']/metrics['w_2ndIn'])*100,2)
metrics['l_2nd_svpt(%)'] = round((metrics['l_2ndWon']/metrics['l_2ndIn'])*100,2)
metrics.head(5)

### Number of First Serve Return Points Won
$$ winner_{1stSreturnPointsWon} = loser_{1stIn} - loser_{1stWon} $$

In [None]:
metrics['w_1stS_rtpWon'] = metrics['l_1stIn'] - metrics['l_1stWon']
metrics['l_1stS_rtpWon'] = metrics['w_1stIn'] - metrics['w_1stWon']
metrics.head(5)

### Number of Second Serve Return Points Won
$$ winner_{2ndSreturnPointsWon} = loser_{2ndIn} - loser_{2ndWon} $$

In [None]:
metrics['w_2ndS_rtpWon'] = metrics['l_2ndIn'] - metrics['l_2ndWon']
metrics['l_2ndS_rtpWon'] = metrics['w_2ndIn'] - metrics['w_2ndWon']
metrics.head(5)

### Percentage of First Serve Return Points Won
$$ winner_{1stSreturnPointsWon}(\%) = \frac{winner_{1stSreturnPointsWon}}{loser_{1stIn}} $$

In [None]:
metrics['w_1stS_rtpWon(%)'] = round((metrics['w_1stS_rtpWon']/metrics['l_1stIn'])*100,2)
metrics['l_1stS_rtpWon(%)'] = round((metrics['l_1stS_rtpWon']/metrics['w_1stIn'])*100,2)
metrics.head(5)

### Percentage of Second Serve Return Points Won
$$ winner_{2ndSreturnPointsWon}(\%) = \frac{winner_{2ndSreturnPointsWon}}{loser_{2ndIn}} $$

In [None]:
metrics['w_2ndS_rtpWon(%)'] = round((metrics['w_2ndS_rtpWon']/metrics['l_2ndIn'])*100,2)
metrics['l_2ndS_rtpWon(%)'] = round((metrics['l_2ndS_rtpWon']/metrics['w_2ndIn'])*100,2)
metrics.head(5)

### Percentage of Break Points Won
$$ bpWon(\%) = \frac{bpSaved}{bpFaced} $$

In [None]:
metrics['w_bpWon(%)'] = round((metrics['w_bpSaved']/metrics['w_bpFaced'])*100,2)
metrics['l_bpWon(%)'] = round((metrics['l_bpSaved']/metrics['l_bpFaced'])*100,2)
metrics.head(5)

### Number of Break Points Won
$$ winner_{bpWon} = loser_{bpFaced} - loser_{bpSaved} $$

In [None]:
metrics['w_bpWon'] = metrics['l_bpFaced'] - metrics['l_bpSaved']
metrics['l_bpWon'] = metrics['w_bpFaced'] - metrics['w_bpSaved']
metrics.head(5)

### Percentage of Break Points Converted
$$ winner_{bpConv}(\%) = \frac{winner_{bpWon}}{loser_{bpFaced}} $$

In [None]:
metrics['w_bpConv(%)'] = round((metrics['w_bpWon']/metrics['l_bpFaced'])*100,2)
metrics['l_bpConv(%)'] = round((metrics['l_bpWon']/metrics['w_bpFaced'])*100,2)
metrics.head(5)

### Summarize and Save in a csv file

In [None]:
metrics.sort_values(['tourney_date', 'tourney_id'], inplace = True)
metrics.to_csv('~/Desktop/atp/prod/performance_metrics.csv')

## Other Important Features 

### Wins and Losses of All Players in ATP Matches 

In [None]:
finals = atpmatches[(atpmatches['round'] == 'F')]
semifinals = atpmatches[(atpmatches['round'] == 'SF')]

titles_group = finals.groupby('winner_name').size()
finals_group = semifinals.groupby('winner_name').size()

w_group = atpmatches.groupby('winner_name').size()
l_group = atpmatches.groupby('loser_name').size()

scores = pd.DataFrame({'Wins': w_group, 'Losses': l_group}).fillna(0)
scores[['Wins', 'Losses']] = scores[['Wins', 'Losses']].astype(int)
scores = scores.reindex(['Wins', 'Losses'], axis=1)

scores['Matches'] = scores['Wins'] + scores['Losses']
scores['Wins(%)'] = np.round(scores['Wins']*100/scores['Matches'],2)

scores.index.name = 'player_name'

scores = scores.join(pd.DataFrame(finals_group, columns = ['Finals'],)).fillna(0)
scores = scores.join(pd.DataFrame(titles_group, columns = ['Titles'],)).fillna(0)

scores['Titles'] = scores['Titles'].astype('int')
scores['Finals'] = scores['Finals'].astype('int')
    
scores = scores.sort_values(['Titles', 'Wins'], ascending=False)
scores.head()

In [None]:
scores.sort_values(['player_name'], inplace=True)
scores.reset_index(level=0, inplace=True)
scores.head(3)

In [None]:
atpplayers = pd.merge(left=atpplayers, right=scores, left_on='player_name', right_on='player_name')
print('Shape of Dataframe: ', atpplayers.shape[0])
atpplayers.head(3)

In [None]:
atpplayers.drop(['total_matches'],axis=1, inplace=True)
atpplayers[['height_']] = atpplayers[['height_']].astype('int')
atpplayers.head(3)

### Previous Wins and Losses on Specific Tournament

In [None]:
def wins_on_tour_per_player(winsOnTour, tourney):
    tourney_ = str(tourney)
    matches = winsOnTour[(winsOnTour['tourney_name'] == tourney_)]
    finals = matches[(matches['round'] == 'F')]
    semifinals = matches[(matches['round'] == 'SF')]
    
    titles_group = finals.groupby('winner_name').size()
    finals_group = semifinals.groupby('winner_name').size()
    
    w_group = matches.groupby('winner_name').size()
    l_group = matches.groupby('loser_name').size()
    
    scores = pd.DataFrame({'wins': w_group, 'losses': l_group}).fillna(0)
    scores[['wins', 'losses']] = scores[['wins', 'losses']].astype(int)
    
    scores = scores.reindex(['wins', 'losses'], axis=1)
    
    scores['matches'] = scores['wins'] + scores['losses']
    scores['percentage'] = np.round(scores['wins']*100/scores['matches'],2)
    
    scores.index.name = 'player_name'
    
    scores = scores.join(pd.DataFrame(finals_group, columns = ['finals'],)).fillna(0)
    scores = scores.join(pd.DataFrame(titles_group, columns = ['titles'],)).fillna(0)
    
    scores['titles'] = scores['titles'].astype('int')
    scores['finals'] = scores['finals'].astype('int')
    return scores

In [None]:
# INSERT THE TOURNAMENT YOU WANT FROM THE LIST
tournaments = ['Adelaide', 'Auckland', 'Australian Open', 'Atlanta', 'Amsterdam','Acapulco',
               'Amersfoort', 'Athens Olympics', 'Antwerp', 'Antalya',  'Atp Cup',
               'Bogota', 'Barcelona', 'Bastad', 'Bucharest', 'Basel', 'Brighton','Buenos Aires',
               'Bangkok', 'Beijing Olympics','Brisbane', 'Belgrade','Budapest','Belgrade 2',
               'Casablanca', 'Chennai', 'Copenhagen', 'Canada Masters', 'Cincinnati Masters', 
               'Costa Do Sauipe', 'Chengdu', 'Cordoba', 'Cologne 1','Cologne 2','Cagliari',
               'Doha', 'Dubai', 'Delray Beach', 'Dusseldorf',
               'Estoril','Eastbourne',
               'Gstaad', 'Geneva', 'Great Ocean Road Open', 
               'Hamburg Masters', 'Halle', 'Hong Kong', 'Houston', 'Ho Chi Minh City','Hamburg',
               'Indian Wells Masters', 'Indianapolis','Istanbul',
               'Johannesburg',
               'Kitzbuhel','Kuala Lumpur',
               'London', 'Los Angeles', 'Long Island','Lyon','Las Vegas', 'London Olympics', 
               'Los Cabos', 
               'Mallorca', 'Marseille', 'Memphis','Mexico City','Miami Masters', 'Monte Carlo Masters',
               'Munich','Moscow', 'Masters Cup', 'Milan', 'Madrid Masters', 'Metz', 'Mumbai',
               'Montpellier','Marrakech', 'Murray River Open', 'Marbella',
               'Nottingham','Newport', 'Nice', 'NextGen Finals', 'New York', 'Nur-Sultan',
               "Queen's Club",'Quito',
               'Rome Masters', 'Roland Garros', 'Rotterdam','Rio de Janeiro','Rio Olympics',
               'ATP Rio de Janeiro', 
               'Orlando',
               'Palermo','Paris Masters','Poertschach', 'Pune', 'Parma',
               'San Jose', 'Santiago', 'Scottsdale', 'Sydney', 'St. Poelten', 's Hertogenbosch',
               'Stuttgart Outdoor', 'San Marino', 'Sydney Olympics', 'Shanghai','Stuttgart Masters',
               'St. Petersburg', 'Stockholm', 'Stuttgart','Sopot', 'Shanghai Masters','Sao Paulo',
               'Shenzhen', 'Sofia', 'St Petersburg', 'Sardinia', 'San Diego', 'Singapore', 
               'Tashkent', 'Tokyo', 'Toulouse','Tokyo Olympics',
               'Umag', 'US Open', 
               'Vienna', 'Vina del Mar', 'Valencia',
               'Wimbledon','Washington', 'Warsaw', 'Winston-Salem',
               'Zagreb',  'Zhuhai' ]

In [None]:
wins_on_tour_per_player(atpmatches, 'Umag')