In [1]:
import pandas as pd
from urllib.request import urlopen  
import os.path as osp
import os
import logging
import zipfile
from glob import glob
logging.getLogger().setLevel('INFO')

## Helpers

In [2]:
def download_file(url_str, path):
    url = urlopen(url_str)
    output = open(path, 'wb')       
    output.write(url.read())
    output.close()  
    
def extract_file(archive_path, target_dir):
    zip_file = zipfile.ZipFile(archive_path, 'r')
    zip_file.extractall(target_dir)
    zip_file.close()

## Download the dataset

In [3]:
BASE_URL = 'http://tennis-data.co.uk'
DATA_DIR = "tennis_data"
ATP_DIR = './{}/ATP'.format(DATA_DIR)
WTA_DIR = './{}/WTA'.format(DATA_DIR)

ATP_URLS = [BASE_URL + "/%i/%i.zip" % (i,i) for i in range(2000,2019)]
WTA_URLS = [BASE_URL + "/%iw/%i.zip" % (i,i) for i in range(2007,2019)]

os.makedirs(osp.join(ATP_DIR, 'archives'), exist_ok=True)
os.makedirs(osp.join(WTA_DIR, 'archives'), exist_ok=True)

for files, directory in ((ATP_URLS, ATP_DIR), (WTA_URLS, WTA_DIR)):
    for dl_path in files:
        logging.info("downloading & extracting file %s", dl_path)
        archive_path = osp.join(directory, 'archives', osp.basename(dl_path))
        download_file(dl_path, archive_path)
        extract_file(archive_path, directory)
    
ATP_FILES = sorted(glob("%s/*.xls*" % ATP_DIR))
WTA_FILES = sorted(glob("%s/*.xls*" % WTA_DIR))

df_atp = pd.concat([pd.read_excel(f) for f in ATP_FILES], ignore_index=True)
df_wta = pd.concat([pd.read_excel(f) for f in WTA_FILES], ignore_index=True)

logging.info("%i matches ATP in df_atp", df_atp.shape[0])
logging.info("%i matches WTA in df_wta", df_wta.shape[0])

INFO:root:downloading & extracting file http://tennis-data.co.uk/2000/2000.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2001/2001.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2002/2002.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2003/2003.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2004/2004.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2005/2005.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2006/2006.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2007/2007.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2008/2008.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2009/2009.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2010/2010.zip
INFO:root:downloading & extracting file http://tennis-data.co.uk/2011/2011.zip
INFO:root:downloading & extracting file http://tenni

## Your work

### Questions :
#### 1. Who are the three ATP players with the most wins ?

In [4]:
df_atp['Winner'].value_counts(sort=True, ascending=False)[0:3]

Federer R.     1121
Nadal R.        891
Djokovic N.     802
Name: Winner, dtype: int64

The ATP players with the most wins are in order :  
_ Federer R. with 1121 wins  
_ Nadal R. with 891 wins  
_ Djokovic N. with 802 wins.  

#### 2. How many sets did the player “ Federer R.” win in total ?

In [5]:
(df_atp[df_atp['Winner']==df_atp['Winner'].value_counts(sort=True, ascending=False).index[0]])['Wsets'].sum()

2546.0

Federer R. win in total 2546 sets.

#### 3. How many sets did the player “ Federer R.” win during the years 2016 and 2017 ?

In [6]:
df_atp['Date'] = pd.to_datetime(df_atp['Date'])
df_atp2016 = df_atp[df_atp['Date'].dt.year== 2016]
ind2016 = (df_atp2016['Winner']=='Federer R.') | (df_atp2016['Loser']=='Federer R.')
df_atp2016[ind2016]['Wsets'].sum()

68.0

Federer R. won during 2016, 68 sets.

In [7]:
df_atp2017 = df_atp[df_atp['Date'].dt.year== 2017]
ind2017 = (df_atp2017['Winner']=='Federer R.') | (df_atp2017['Loser']=='Federer R.')
df_atp2017[ind2017]['Wsets'].sum()

131.0

Federer R. won during 2017, 131 sets.

#### 4. For each match, what is the percentage of victories of the winner in the past ?

In [8]:
def f(ind,df):
    y = (df.index<ind) & ((df['Winner'] == df.loc[ind]['Winner']) | 
                               (df['Loser'] == df.loc[ind]['Loser']))
    tot = df[y].shape[0]
    z = (df.index<ind) & (df['Winner'] == df.loc[ind]['Winner'])
    win = df[z].shape[0]
    #print(df_atp.loc[ind]['Winner']," ",tot," ",win)
    if tot==0:
        return 0
    else:
        return(win/tot)

df_atp['taux'] = pd.Series(range(df_atp.shape[0]))
df_atp['win_rate'] = df_atp['taux'].apply(lambda x: f(x,df_atp)).copy()
df_wta['taux'] = pd.Series(range(df_wta.shape[0]))
df_wta['win_rate'] = df_wta['taux'].apply(lambda x: f(x,df_wta)).copy()

In [9]:
df_atp.drop('taux',axis=1,inplace=True)
df_wta.drop('taux',axis=1,inplace=True)

In [10]:
df_atp.loc[500:510]
df_wta.loc[500:510]

Unnamed: 0,AvgL,AvgW,B365L,B365W,Best of,CBL,CBW,Comment,Court,Date,...,UBW,W1,W2,W3,WPts,WRank,WTA,Winner,Wsets,win_rate
500,,,2.5,1.5,3,2.35,1.55,Completed,Outdoor,2007-03-01,...,1.45,7.0,7.0,,487.75,47.0,14,Dulko G.,2.0,0.875
501,,,1.66,2.1,3,1.8,1.95,Completed,Outdoor,2007-03-01,...,2.05,6.0,6.0,,327.0,83.0,14,Schruff J.,2.0,0.25
502,,,4.5,1.16,3,4.5,1.18,Retired,Outdoor,2007-03-01,...,1.18,6.0,3.0,,759.0,27.0,14,Garbin T.,1.0,0.909091
503,,,2.2,1.61,3,2.2,1.63,Completed,Outdoor,2007-03-01,...,1.7,5.0,6.0,6.0,454.5,52.0,14,Loit E.,2.0,0.615385
504,,,3.5,1.28,3,3.35,1.3,Completed,Outdoor,2007-03-01,...,1.3,7.0,6.0,,528.0,42.0,14,Pennetta F.,2.0,0.625
505,,,1.25,3.75,3,1.28,3.5,Completed,Outdoor,2007-03-02,...,3.5,5.0,7.0,6.0,123.75,216.0,14,Errani S.,2.0,0.333333
506,,,3.5,1.28,3,3.35,1.3,Completed,Outdoor,2007-03-02,...,1.34,6.0,6.0,,327.0,83.0,14,Schruff J.,2.0,1.0
507,,,3.0,1.36,3,3.35,1.3,Completed,Outdoor,2007-03-02,...,1.35,0.0,6.0,6.0,454.5,52.0,14,Loit E.,2.0,0.75
508,,,4.0,1.22,3,3.8,1.24,Completed,Outdoor,2007-03-03,...,1.24,6.0,6.0,,528.0,42.0,14,Pennetta F.,2.0,0.857143
509,,,1.5,2.5,3,1.55,2.35,Completed,Outdoor,2007-03-04,...,2.45,7.0,6.0,,454.5,52.0,14,Loit E.,2.0,0.714286


In [11]:
df_atp.to_csv('atp.csv')
df_wta.to_csv('wta.csv')

# Prédiction WTA