# Exercise: Tennis

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import zipfile
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

In this exercise we will load the Tennis dataset, choose one of the datasets, and fit a logistic model to try and predict if the player won a game.

In [7]:
filename = '../data/tennis.zip'
tennis_zip = zipfile.ZipFile(filename)
for f in tennis_zip.filelist: 
    print(f.filename)    

data/Andy-Murray.csv
data/Novak-Djokovic.csv
data/Rafael-Nadal.csv
data/Roger-Federer.csv


In [8]:
player = 'Rafael-Nadal'
path = 'data/{}.csv'
path = path.format(player.replace(' ', '-'))
with tennis_zip.open(path) as f:
    df = pd.read_csv(f)
df.head()

Unnamed: 0,year,tournament,start date,type,surface,draw,atp points,atp ranking,tournament prize money,round,...,player2 2nd serve return points total,player2 break points converted won,player2 break points converted total,player2 return games played,player2 total service points won,player2 total service points total,player2 total return points won,player2 total return points total,player2 total points won,player2 total points total
0,2001,"Seville, Spain",17.09.2001,CH,Outdoor: Clay,Draw: 32,5.0,,$650,R32,...,,,,,,,,,,
1,2001,"Seville, Spain",17.09.2001,CH,Outdoor: Clay,Draw: 32,5.0,,$650,R16,...,,,,,,,,,,
2,2001,"Spain F10, Madrid",10.09.2001,FU,Outdoor: Hard,Draw: 32,,,$117,R32,...,,,,,,,,,,
3,2002,"Spain F20, Gran Canaria",25.11.2002,FU,Outdoor: Carpet,Draw: 32,18.0,238.0,"$1,950",R32,...,,,,,,,,,,
4,2002,"Spain F20, Gran Canaria",25.11.2002,FU,Outdoor: Carpet,Draw: 32,18.0,238.0,"$1,950",R16,...,,,,,,,,,,


Step-by-step instructions:

- Use whatever table columns you want as features (i.e. independent variables, $x$s), see `df.columns` for ideas.
- Create a new column that has a boolean specifiying if the plater won the game (see the columns `player1 name` and `winner`).
- Create a regression model and fit it to the chosen data (using scikit, statsmodels, or our own hand-crafted).
- Check the accuracy (score) of the fitted model.
- Use the `pickle` model to dump the fitted model so it can be used again later for prediction or re-fitting.

For bonus points:
- Use [`sklearn.model_selection.train_test_split`](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split) (click the link for docs) to split the `X` and `y` into a training and testing set.
- Fit the model on the training set.
- Score the model on the test set.

Solution is given in [solutions/logistic-tennis.py](../solutions/logistic-tennis.py).

In [9]:
features = ['player1 aces', 'player1 double faults']
model_filename = 'logistic_tennis.model'

df['win'] = df['player1 name'] == df['winner']
target = 'win'
idx = np.isfinite(df[features]).all(axis=1)
df = df[idx]
df.head()

Unnamed: 0,year,tournament,start date,type,surface,draw,atp points,atp ranking,tournament prize money,round,...,player2 break points converted won,player2 break points converted total,player2 return games played,player2 total service points won,player2 total service points total,player2 total return points won,player2 total return points total,player2 total points won,player2 total points total,win
49,2002,"Mallorca, Spain",29.04.2002,WS,Outdoor: Clay,Draw: 32,15,762.0,"$5,850",R32,...,3.0,9.0,10.0,28.0,59.0,27.0,66.0,55.0,125.0,True
50,2002,"Mallorca, Spain",29.04.2002,WS,Outdoor: Clay,Draw: 32,15,762.0,"$5,850",R16,...,4.0,9.0,8.0,35.0,48.0,29.0,59.0,64.0,107.0,False
51,2003,"Basel, Switzerland",20.10.2003,WS,Indoor: Carpet,Draw: 32,5,48.0,"$10,000",R32,...,2.0,3.0,15.0,68.0,97.0,33.0,98.0,101.0,195.0,False
52,2003,"ATP Masters Series Madrid, Spain",13.10.2003,SU,Indoor: Hard,Draw: 48,5,49.0,"$7,500",R64,...,6.0,10.0,14.0,46.0,82.0,37.0,80.0,83.0,162.0,False
53,2003,"Lyon, France",06.10.2003,WS,Indoor: Carpet,Draw: 32,5,47.0,"$7,950",R32,...,4.0,8.0,10.0,39.0,55.0,34.0,72.0,73.0,127.0,False


In [11]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df[features], df[target], test_size=0.75)

model = LogisticRegression()
model.fit(X_train, y_train)
print("Accuracy:", model.score(X_test, y_test))

Accuracy: 0.8323586744639376


In [12]:
with open(model_filename, 'wb') as f:
    pickle.dump(model, f)
    print("Saving model to", model_filename)

Saving model to logistic_tennis.model


In [13]:
print('Prediction:')
print(X_test.iloc[0], y_test.iloc[0])

Prediction:
player1 aces             2.0
player1 double faults    1.0
Name: 657, dtype: float64 False
