###### RatingPoints notebook overview

In this notebook I will investigate the predictive value of rating points data.

###### Imports

In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [7]:
# read in all singles matches
s_matches = pd.read_csv('../Data/singles_matches_df.csv',parse_dates = ['tourney_date'], 
                      infer_datetime_format = True)

  s_matches = pd.read_csv('../Data/singles_matches_df.csv',parse_dates = ['tourney_date'],


In [32]:
# select non null ranking points ['tourney_date','winner_id','loser_id','winner_rank_points', 'loser_rank_points']
lr_matches = s_matches[s_matches['loser_rank_points'].notnull() & 
          s_matches['winner_rank_points'].notnull()]
lr_matches = lr_matches[
    ['tourney_date','winner_id','loser_id','winner_rank_points', 
     'loser_rank_points', 'winner_rank', 'loser_rank']]
# get rid of matches in which participants have zero ranking points
lr_matches = lr_matches.query('winner_rank_points != 0 and loser_rank_points != 0')

In [44]:
# create derived factors from rating points that may be predictive of outcome
lr_matches['higher_rp_wins'] = lr_matches['winner_rank_points']>\
                                lr_matches['loser_rank_points']

# convert outcome dtype from boolean to integer
lr_matches['higher_rp_wins'] = lr_matches['higher_rp_wins'].astype(int)

# raw differential in rank points between winner and loser
lr_matches['rp_differential'] = lr_matches['winner_rank_points'] - \
                                lr_matches['loser_rank_points']

# differential weighted by the inverse of the sum of both players' rank points
lr_matches['rp_weighted_differential'] = lr_matches['rp_differential']/ \
                                            (lr_matches['winner_rank_points'] + 
                                            lr_matches['loser_rank_points'])

lr_matches

Unnamed: 0,tourney_date,winner_id,loser_id,winner_rank_points,loser_rank_points,winner_rank,loser_rank,higher_rp_wins,rp_differential,rp_weighted_differential
126311,1990-01-01,101971,101439,213.0,421.0,114.0,45.0,0,-208.0,-0.328076
126312,1990-01-01,101124,100752,437.0,313.0,42.0,62.0,1,124.0,0.165333
126313,1990-01-01,100790,102000,88.0,133.0,239.0,168.0,0,-45.0,-0.203620
126314,1990-01-01,101421,101482,218.0,168.0,105.0,134.0,1,50.0,0.129534
126315,1990-01-01,101332,101320,197.0,505.0,122.0,32.0,0,-308.0,-0.438746
...,...,...,...,...,...,...,...,...,...,...
876971,2022-12-26,207134,208892,6.0,3.0,1178.0,1461.0,1,3.0,0.333333
876973,2022-12-26,207987,208519,31.0,12.0,716.0,966.0,1,19.0,0.441860
876974,2022-12-26,144932,209977,23.0,8.0,799.0,1117.0,1,15.0,0.483871
876976,2022-12-26,209079,210696,7.0,2.0,1125.0,1548.0,1,5.0,0.555556


In [None]:
# Testing initial logistic regression model

In [47]:
# Select the predictive and the target columns and split into train and test
X  = lr_matches[['rp_weighted_differential','rp_differential',
                 'winner_rank_points','loser_rank_points']]
y = lr_matches['higher_rp_wins']
X_train, X_test, y_train, y_test = train_test_split(X,y) 

In [57]:
# I will initialize with regularizations between 1.e-01 and 1.e+03
logreg = LogisticRegressionCV(Cs = np.logspace(-1,3,5))

In [59]:
logreg.fit(X_train, y_train)

LogisticRegressionCV(Cs=array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]))

In [60]:
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(1.0, 1.0)

In [62]:
logreg.predict(X_train)

320205    0
146833    0
373990    0
565113    0
537014    0
         ..
838411    0
216395    0
159769    0
311484    0
217850    0
Name: higher_rp_wins, Length: 463624, dtype: int64

In [1]:
import random


In [4]:
[random.choice([0,1]) for _ in range(0,lr_matches.shape(0))]

NameError: name 'lr_matches' is not defined

In [43]:
lr_matches

Unnamed: 0,tourney_date,winner_id,loser_id,winner_rank_points,loser_rank_points,winner_rank,loser_rank,higher_rp_wins,rp_differential,rp_weighted_differential
126311,1990-01-01,101971,101439,213.0,421.0,114.0,45.0,0,-208.0,-0.328076
126312,1990-01-01,101124,100752,437.0,313.0,42.0,62.0,1,124.0,0.165333
126313,1990-01-01,100790,102000,88.0,133.0,239.0,168.0,0,-45.0,-0.203620
126314,1990-01-01,101421,101482,218.0,168.0,105.0,134.0,1,50.0,0.129534
126315,1990-01-01,101332,101320,197.0,505.0,122.0,32.0,0,-308.0,-0.438746
...,...,...,...,...,...,...,...,...,...,...
876971,2022-12-26,207134,208892,6.0,3.0,1178.0,1461.0,1,3.0,0.333333
876973,2022-12-26,207987,208519,31.0,12.0,716.0,966.0,1,19.0,0.441860
876974,2022-12-26,144932,209977,23.0,8.0,799.0,1117.0,1,15.0,0.483871
876976,2022-12-26,209079,210696,7.0,2.0,1125.0,1548.0,1,5.0,0.555556


In [31]:
lr_matches.query('winner_rank_points == 0 or loser_rank_points == 0')

Unnamed: 0,tourney_date,winner_id,loser_id,winner_rank_points,loser_rank_points,winner_rank,loser_rank
46007,1973-10-21,100051,100215,0.0,0.0,105.0,189.0
46013,1973-10-21,100156,100178,0.0,0.0,174.0,123.0
46016,1973-10-21,100205,100141,0.0,0.0,42.0,77.0
46019,1973-10-21,100309,100030,0.0,0.0,32.0,40.0
46021,1973-10-21,100321,100309,0.0,0.0,22.0,32.0
...,...,...,...,...,...,...,...
73027,1980-01-01,100631,100090,0.0,0.0,69.0,274.0
73028,1980-01-01,100446,100285,0.0,0.0,45.0,86.0
73029,1980-01-01,100631,100259,0.0,0.0,69.0,47.0
73030,1980-01-01,100446,100631,0.0,0.0,45.0,69.0


In [10]:
singles_matches['loser_rank_points']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
876973    12.0
876974     8.0
876975     NaN
876976     2.0
876977     6.0
Name: loser_rank_points, Length: 876978, dtype: float64

In [8]:
singles_matches[singles_matches[]]

UndefinedVariableError: name 'np' is not defined