# Roger Federer Match Length History Pilot Analysis

Now this is exciting ! We will now go beyond analysing generalized (ie. non player specific) match length data and take a look at a specific player : the GOAT Roger Federer. There are several reasons for this choice (many matches at the top level, long career spanning more than two decades, evolution in terms of playstyle) but mainly Federer is my tennis hero.

The goal here is to obtain a series of weighted variables which would allow us to predict the length of a Roger Federer match against a given opponent, under a given set of match conditions.

In [57]:
### IMPORTS ###

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [58]:
### CLEAN FEDERER MATCHES TABLES ###

atp = pd.read_csv("atp_cat.csv")

fed_won = atp[atp["winner_name"] == "Roger Federer"]    # 1163 wins
fed_lost = atp[atp["loser_name"] == "Roger Federer"]    # 261 loses (82% winrate)

# We'll drop all of Federer's information except his age
fed_won = fed_won.drop(labels=["winner_name", "winner_hand", "winner_ht", "winner_ioc"], axis=1)
fed_won = fed_won.rename(columns={"winner_age": "fed_age", "winner_rank": "fed_rank", "winner_rank_points": "fed_rank_points",
                                  "loser_name":"opp_name", "loser_hand":"opp_hand", "loser_ht":"opp_ht", "loser_ioc":"opp_ioc", "loser_age":"opp_age",
                                  "loser_rank": "opp_rank", "loser_rank_points": "opp_rank_points"})
fed_won["fed_won"] = "1"

fed_lost = fed_lost.drop(labels=["loser_name", "loser_hand", "loser_ht", "loser_ioc"], axis=1)
fed_lost = fed_lost.rename(columns={"loser_age": "fed_age", "loser_rank": "fed_rank", "loser_rank_points": "fed_rank_points",
                                    "winner_name":"opp_name", "winner_hand":"opp_hand", "winner_ht":"opp_ht", "winner_ioc":"opp_ioc", "winner_age":"opp_age",
                                    "winner_rank": "opp_rank", "winner_rank_points": "opp_rank_points"})
fed_lost["fed_won"] = "0"

fed = pd.concat([fed_won, fed_lost])
fed.head(5)    # Clean Table of all Roger Federer ATP matches

Unnamed: 0.1,Unnamed: 0,tourney_name,surface,tourney_level,tourney_date,fed_age,opp_name,opp_hand,opp_ht,opp_ioc,opp_age,score,best_of,round,minutes,fed_rank,fed_rank_points,opp_rank,opp_rank_points,fed_won
24932,3086,Toulouse,Hard,A,19980928,17.138946,Guillaume Raoux,R,180.0,FRA,28.618754,6-2 6-2,3,R32,60.0,878.0,9.0,45.0,859.0,1
24941,3095,Toulouse,Hard,A,19980928,17.138946,Richard Fromberg,R,196.0,AUS,28.418891,6-1 7-6(5),3,R16,85.0,878.0,9.0,43.0,927.0,1
25666,251,Marseille,Hard,A,19990201,17.483915,Carlos Moya,R,190.0,ESP,22.431211,7-6(1) 3-6 6-3,3,R32,113.0,243.0,173.0,5.0,3178.0,1
25682,267,Marseille,Hard,A,19990201,17.483915,Jerome Golmard,L,188.0,FRA,25.396304,6-7(6) 7-6(5) 7-6(5),3,R16,140.0,243.0,173.0,63.0,743.0,1
25849,479,Rotterdam,Carpet,A,19990215,17.522245,Guillaume Raoux,R,180.0,FRA,29.002053,6-7(4) 7-5 7-6(3),3,R32,149.0,178.0,262.0,71.0,691.0,1


## Federer Generalities

### Surface

In [59]:
fed_surface = pd.pivot_table(fed, values="minutes", index = ["best_of", "opp_hand"], columns=["surface"]) # average duration per surface
display(fed_surface)
fed['surface'].value_counts(ascending=True)

Unnamed: 0_level_0,surface,Carpet,Clay,Grass,Hard
best_of,opp_hand,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,L,61.4,95.363636,72.9,82.361111
3,R,93.1875,88.39375,89.388235,89.526678
5,L,,161.625,131.315789,131.05
5,R,170.75,134.097561,124.031579,130.214634


Carpet     57
Grass     209
Clay      280
Hard      878
Name: surface, dtype: int64

## Simple Regression Decision Tree

We will create a simple decision tree using five features :
- Surface  
- Best of  
- Opponent hand  
- Opponent height  
- Ranking difference  

### Formatting

In [60]:
fed1 = fed[["minutes", "surface", "opp_hand", "opp_ht"]]
fed1["rank_diff"] = fed["fed_rank"] - fed["opp_rank"]

# One-Hot Encoding
fed1 = pd.get_dummies(data=fed1, columns=["surface", "opp_hand"])

fed1 = fed1.dropna(axis=0, how='any')

fed1.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,minutes,opp_ht,rank_diff,surface_Carpet,surface_Clay,surface_Grass,surface_Hard,opp_hand_L,opp_hand_R
24932,60.0,180.0,833.0,0,0,0,1,0,1
24941,85.0,196.0,835.0,0,0,0,1,0,1
25666,113.0,190.0,238.0,0,0,0,1,0,1
25682,140.0,188.0,180.0,0,0,0,1,1,0
25849,149.0,180.0,107.0,1,0,0,0,0,1


### Decision Tree

In [61]:
# SETUP #

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

target='minutes'

X = fed1.drop([target], axis=1)
y = fed1[target]

X = scale(X)
y = scale(y)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=.3)

#### Exemple of Regression Tree

In [62]:
# dtr = DecisionTreeRegressor(max_depth=6, min_samples_split=.1)

# dtr.fit(Xtrain, Ytrain)

# Ypred = dtr.predict(Xtest)

# x_ax = range(len(Ytest))
# plt.plot(x_ax, Ytest, linewidth=1, label="original")
# plt.plot(x_ax, Ypred, linewidth=1.1, label="predicted")
# plt.title("Y-test and y-predicted data")
# plt.xlabel('X-axis')
# plt.ylabel('Y-axis')
# plt.legend(loc='best',fancybox=True, shadow=True)
# plt.grid(True)
# plt.show()

# plt.figure(figsize=(30,15))
# tree.plot_tree(dtr,
#           filled=True,
#           rounded=True,
#           fontsize=10,
#           feature_names=["opp_ht", "rank_diff", "surface_Carpet", "surface_Clay", "surface_Grass", "surface_Hard", "opp_hand_L", "opp_hand_R"])

# plt.savefig('tree_high_dpi', dpi=600)

### Cross Validation

In [67]:
# K-FOLD CROSS VALIDATION

K = 10

# MAX_DEPTH=np.arange(3, 10)
# MIN_SAMPLE=[0.5, 0.4, 0.3, 0.2, 0.1, 0.05]
MAX_DEPTH=np.arange(3, 5)
print(MAX_DEPTH)
MIN_SAMPLE=[0.1]

scores = [[] for n in range(len(MAX_DEPTH))]   # sklearn Decision Tree score (max = 1)
MSE = [[] for n in range(len(MAX_DEPTH))]      # Mean Squared Error
RMSE = [[] for n in range(len(MAX_DEPTH))]     # Root Mean Squared Error

for i in range(len(MAX_DEPTH)) :

    for j in range(len(MIN_SAMPLE)) :

        d = MAX_DEPTH[i]
        s = MIN_SAMPLE[j]

        # print(f"Max depth : {d} , Min sample : {s}")

        dtr = DecisionTreeRegressor(max_depth=d, min_samples_split=s)

        dtr.fit(Xtrain, Ytrain)

        score = dtr.score(Xtrain, Ytrain)   
        scores[i].append(score)
        # print(f"score : {score}.")

        Ypred = dtr.predict(Xtest)

        mse = mean_squared_error(Ytest, Ypred)  # mean of (y_true - y_pred)**2

        MSE[i].append(mse)
        # print(f"MSE : {mse}.")
        RMSE[i].append(mse*(1/2.0))
        # print(f"RMSE : {mse*(1/2.0)}.")

scores_df = pd.DataFrame(data=scores, index=MAX_DEPTH, columns=MIN_SAMPLE)
index = scores_df.index
index.name = "scores"
display(scores_df)
MSE_df = pd.DataFrame(data=MSE, index=MAX_DEPTH, columns=MIN_SAMPLE)
index = MSE_df.index
index.name = "MSE"
display(MSE_df)
RMSE_df = pd.DataFrame(data=MSE, index=MAX_DEPTH, columns=MIN_SAMPLE)
index = RMSE_df.index
index.name = "RMSE"
display(RMSE_df)

[3 4]


Unnamed: 0_level_0,0.1
scores,Unnamed: 1_level_1
3,0.115449
4,0.126662


Unnamed: 0_level_0,0.1
MSE,Unnamed: 1_level_1
3,0.8831
4,0.87284


Unnamed: 0_level_0,0.1
RMSE,Unnamed: 1_level_1
3,0.8831
4,0.87284


In [71]:
CV = model_selection.KFold(n_splits=K,shuffle=True)

lr_errors = []
ann_errors = []
baseline_errors= []

for train_index, test_index in CV.split(X, y):
    print(f'   K-validation fold: {j+1}/{K}')  
    # Inner cross-validation loop. Use cross-validation to select optimal model
    
    # extract training and test set for current CV fold
    X_Kj_train = X[train_index,:]
    y_Kj_train = y[train_index]
    X_Kj_test = X[test_index,:]
    y_Kj_test = y[test_index]
    
    # coeff_error = len(X_Dj_val)/len(X)

	# Fit and evaluate models
        
    j+=1


# # For each s compute: Ê_gen_Ms
# lr_E_gen = np.sum(errors_lr,0)
# ann_E_gen = np.sum(errors_ann,0)

# # select optimal model
# lr_optimal_model = lr_all_models[np.argmin(lr_E_gen)]
# ann_optimal_model = ann_all_models[np.argmin(ann_E_gen)]

# # train optimal model on Di_par
# lr_optimal_model.fit(X_Di_par, y_Di_par)
# ann_optimal_model.fit(X_Di_par, y_Di_par)

# # test trained optimal model on Di_test

# # LOGISTIC REGRESSION
# lr_y_Di_prediction = lr_optimal_model.predict(X_Di_test)
# lr_E_i_test = np.sum(lr_y_Di_prediction!=y_Di_test)
# # [ optimal parameter lambda , outer_coeff_error * E_i_test]
# lr_errors.append([lr_lambda_list[np.argmin(lr_E_gen)], lr_E_i_test])
    
# # BASELINE
# counter = dict(Counter(y_Di_par))
# largest_class, _ = sorted(counter.items(), reverse=True, key=lambda x:x[1])[0]
# baseline_errors.append(len(y_Di_test[y_Di_test != largest_class]))

   K-validation fold: 21/10
[[-0.89379627 10.28185544 -0.20548047 ...  0.78920046 -0.36072217
   0.36072217]
 [ 1.4339619  10.30547073 -0.20548047 ...  0.78920046 -0.36072217
   0.36072217]
 [ 0.56105259  3.2563079  -0.20548047 ...  0.78920046 -0.36072217
   0.36072217]
 ...
 [-0.89379627 -0.34502319 -0.20548047 ... -1.26710519 -0.36072217
   0.36072217]
 [ 0.99750725  0.29258953 -0.20548047 ... -1.26710519 -0.36072217
   0.36072217]
 [ 1.4339619   0.32801246 -0.20548047 ... -1.26710519 -0.36072217
   0.36072217]]
yellow
[-1.04158292 -0.42258819  0.27068591 ...  0.22116633  0.04784781
  0.17164676]
   K-validation fold: 22/10
[[-0.89379627 10.28185544 -0.20548047 ...  0.78920046 -0.36072217
   0.36072217]
 [ 1.4339619  10.30547073 -0.20548047 ...  0.78920046 -0.36072217
   0.36072217]
 [ 0.27008282  2.57146462 -0.20548047 ...  0.78920046  2.77221666
  -2.77221666]
 ...
 [-0.89379627 -0.34502319 -0.20548047 ... -1.26710519 -0.36072217
   0.36072217]
 [ 0.99750725  0.29258953 -0.20548047