In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [206]:
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the 2012 workbook. `sep` is a delimiter option for text/CSV readers and
# is not a `pd.read_excel` parameter (recent pandas raises TypeError on it),
# so it is not passed here.
df = pd.read_excel('datasets/2012.xls')

In [5]:
# List the columns available in the loaded workbook.
df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW',
       'PSL', 'SJW', 'SJL', 'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

In [212]:
# Keep only the ranking, points and PSW/PSL odds columns, discard rows with
# any missing value, then attach the target column.
match_columns = ['WRank', 'LRank', 'WPts', 'LPts', 'PSW', 'PSL']
df1 = (
    df[match_columns]
    .dropna()
    # Target: True when WRank is numerically lower than LRank.
    # NOTE(review): this label is rank-based, while the feature helpers pick
    # the favourite from the PSW/PSL odds -- confirm the mismatch is intended.
    .assign(fav_won=lambda matches: matches.WRank < matches.LRank)
)

In [213]:
# Empty feature table; its columns are filled in further down.
feature_names = ['Fodds', 'NFodds', 'Frank', 'NFrank', 'Fpts', 'NFpts']
X = pd.DataFrame(columns=feature_names)

In [214]:
def fav_rank(row):
    """Return the ranking of the odds favourite for one match row.

    ``row.WRank`` is returned when ``row.PSW`` is not higher than
    ``row.PSL`` (ties included); otherwise ``row.LRank`` is returned.
    """
    winner_is_favourite = row.PSW <= row.PSL
    return row.WRank if winner_is_favourite else row.LRank

def not_fav_rank(row):
    """Return the ranking of the odds underdog for one match row.

    This is the exact complement of the favourite selection
    (``row.PSW <= row.PSL`` means the ``W`` side is the favourite):
    ``row.LRank`` is returned when ``row.PSW <= row.PSL``, otherwise
    ``row.WRank``.

    The previous ``>=`` test returned ``row.WRank`` on an odds tie, which is
    the same value the favourite helper returns, so both features described
    the same player for tied matches.
    """
    winner_is_favourite = row.PSW <= row.PSL
    return row.LRank if winner_is_favourite else row.WRank
    
def fav_pts(row):
    """Return the ranking points of the odds favourite for one match row.

    ``row.WPts`` is returned when ``row.PSW`` is not higher than
    ``row.PSL`` (ties included); otherwise ``row.LPts`` is returned.
    """
    winner_is_favourite = row.PSW <= row.PSL
    return row.WPts if winner_is_favourite else row.LPts

def not_fav_pts(row):
    """Return the ranking points of the odds underdog for one match row.

    This is the exact complement of the favourite selection
    (``row.PSW <= row.PSL`` means the ``W`` side is the favourite):
    ``row.LPts`` is returned when ``row.PSW <= row.PSL``, otherwise
    ``row.WPts``.

    The previous ``>=`` test returned ``row.WPts`` on an odds tie, which is
    the same value the favourite helper returns, so both features described
    the same player for tied matches.
    """
    winner_is_favourite = row.PSW <= row.PSL
    return row.LPts if winner_is_favourite else row.WPts

In [215]:
# Odds features: the lower of the two PSW/PSL prices is the favourite's,
# the higher one the underdog's.
odds_pair = df1[['PSL', 'PSW']]
X['Fodds'] = odds_pair.min(axis=1)
X['NFodds'] = odds_pair.max(axis=1)

# Rank and points features, resolved row by row by the helper functions.
row_features = (
    ('Frank', fav_rank),
    ('NFrank', not_fav_rank),
    ('Fpts', fav_pts),
    ('NFpts', not_fav_pts),
)
for feature_name, pick_value in row_features:
    X[feature_name] = df1.apply(pick_value, axis=1)

# Encode the boolean target as 0/1.
y = label_binarize(df1.fav_won, classes=[0, 1])

In [216]:
# Preview the first rows of the cleaned match table, including the target.
df1.head()

Unnamed: 0,WRank,LRank,WPts,LPts,PSW,PSL,fav_won
0,275.0,80.0,171,653.0,4.65,1.23,False
1,15.0,74.0,1925,671.0,1.32,3.71,True
2,73.0,23.0,685,1630.0,3.29,1.38,False
3,42.0,52.0,970,900.0,1.7,2.27,True
4,54.0,85.0,893,630.0,1.41,3.13,True


In [217]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,Fodds,NFodds,Frank,NFrank,Fpts,NFpts
0,1.23,4.65,80.0,275.0,653.0,171.0
1,1.32,3.71,15.0,74.0,1925.0,671.0
2,1.38,3.29,23.0,73.0,1630.0,685.0
3,1.7,2.27,42.0,52.0,970.0,900.0
4,1.41,3.13,54.0,85.0,893.0,630.0


In [218]:
# Show the first five encoded labels.
y[:5]

array([[0],
       [1],
       [0],
       [1],
       [1]])

In [226]:
# Flatten the single-column label array into a 1-D vector.
y = y.ravel()

#### Logistic Regression

In [227]:
# Logistic-regression classifier, allowed up to 200 solver iterations.
logreg = LogisticRegression(max_iter=200)

In [228]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [229]:
# Fit the classifier on the training split.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [230]:
# Inspect the fitted coefficients (one per feature column of X).
logreg.coef_

array([[-1.38571360e-01,  1.98053380e-01, -3.09730328e-03,
         1.77154816e-03,  1.06517222e-04, -1.03345956e-04]])

In [231]:
# Predict labels for the held-out test split.
y_pred = logreg.predict(X_test)

In [232]:
from sklearn import metrics

# Share of held-out matches whose label was predicted correctly.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

0.7007722007722008


#### Test su dati out-of-sample (OOS): stagione 2019

In [237]:
# Load the 2019 workbook as out-of-sample data. `sep` is a delimiter option for
# text/CSV readers and is not a `pd.read_excel` parameter (recent pandas raises
# TypeError on it), so it is not passed here.
# Note: this rebinds `df` and `df1`, replacing the 2012 tables.
df = pd.read_excel('datasets/2019.xlsx')

# Same preparation as for the training data: keep the ranking, points and
# PSW/PSL odds columns, add the rank-based target, drop incomplete rows.
df1 = df[['WRank','LRank','WPts','LPts','PSW','PSL']].copy()
df1['fav_won'] = df1.WRank < df1.LRank
df1.dropna(inplace=True)

In [238]:
# Rebuild the feature table for the out-of-sample matches with the same
# column layout used for training.
X = pd.DataFrame(columns=['Fodds', 'NFodds', 'Frank', 'NFrank', 'Fpts', 'NFpts'])

# Odds features: lower PSW/PSL price for the favourite, higher for the underdog.
odds_pair = df1[['PSL', 'PSW']]
X['Fodds'] = odds_pair.min(axis=1)
X['NFodds'] = odds_pair.max(axis=1)

# Rank and points features, resolved row by row by the helper functions.
row_features = (
    ('Frank', fav_rank),
    ('NFrank', not_fav_rank),
    ('Fpts', fav_pts),
    ('NFpts', not_fav_pts),
)
for feature_name, pick_value in row_features:
    X[feature_name] = df1.apply(pick_value, axis=1)

# Encode the boolean target as 0/1 and flatten it to a 1-D vector.
y = label_binarize(df1.fav_won, classes=[0, 1]).ravel()

In [239]:
# Predict labels for every out-of-sample row with the already-fitted model.
y_pred = logreg.predict(X)

In [240]:
# Accuracy of the fitted model on the out-of-sample data.
print(metrics.accuracy_score(y, y_pred))

0.6281951975213013


## Proviamo a predire

In [126]:
# Load the player rankings table used for the manual look-ups below.
rankings = pd.read_csv('datasets/Rankings.csv')

Player H

In [149]:
# Points of the first listed player whose name contains 'Sinner' (case-sensitive match).
player_h_mask = rankings['name'].str.contains('Sinner')
rankings.loc[player_h_mask, 'points'].iloc[0]

710

Player A

In [150]:
# Points of the first listed player whose name contains 'busta', ignoring case.
player_a_mask = rankings['name'].str.contains('busta', case=False)
rankings.loc[player_a_mask, 'points'].iloc[0]

1332

In [153]:
# Preview the first rows of the rankings table.
rankings.head()

Unnamed: 0,rank,name,country_name,country_id,points,bestRank,bestRankDate,rankDiff,pointsDiff,bestPoints
0,1,Novak Djokovic,Serbia,SRB,9720,1,2011-07-04,0.0,0.0,16950
1,2,Rafael Nadal,Spain,ESP,9395,1,2008-08-18,0.0,0.0,15390
2,3,Roger Federer,Switzerland,SUI,7130,1,2004-02-02,0.0,0.0,12315
3,4,Dominic Thiem,Austria,AUT,7045,4,2017-11-06,0.0,0.0,7045
4,5,Daniil Medvedev,Russian Federation,RUS,5890,4,2019-09-09,0.0,-70.0,5960
