In [2]:
import os
import pandas as pd
import numpy as np
import pickle

from scipy.optimize import minimize


from trueskillthroughtime import Game, Player, History, Gaussian

In [3]:
folder = "C:/Users/yaobv/tennis_project/tennis_atp"

df = pd.DataFrame()

for file in os.listdir(f'{folder}'):

    if file.endswith('csv') and '20' in file and 'doubles' not in file:
        df_ = pd.read_csv(f'{folder}/{file}')
        df = pd.concat([df, df_])


df = df[df['winner_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[df['loser_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[~df['winner_name'].str.contains('Unknown')].copy()
df = df[~df['loser_name'].str.contains('Unknown')].copy()
df = df[df['winner_name'] != df['loser_name']].copy()

df.sort_values(by=['tourney_date', 'tourney_id', 'round'],
               ascending=[True, True, True],
               inplace=True)

print('shape before dropping match dupes:', df.shape)


df.drop_duplicates(subset=['winner_name', 'loser_name', 'tourney_id'],
                   inplace=True)

print('shape after dropping match dupes:', df.shape)

df.reset_index(inplace=True, drop=True)

shape before dropping match dupes: (581807, 53)
shape after dropping match dupes: (581746, 53)


In [4]:
columns = zip(df.winner_name, df.loser_name, df.surface)

composition = [[[winner, winner + surface], [loser, loser + surface]]
               for winner, loser, surface in columns]

In [5]:
all_atp_players = set([val for sublist in zip(df.winner_name, df.loser_name) for val in sublist])

In [19]:
priors = dict([(p, Player(Gaussian(0., 1.51), 1.5, 0.098) ) for p in all_atp_players])

In [20]:
true_skill_history_priors = History(composition=composition,
                                    priors=priors,
                                    beta=0,
                                    sigma=0.5,
                                    gamma=0.01)

In [21]:
true_skill_history_priors.log_evidence()

-346716.8869531072

In [34]:
def training(x):
    
    a, b = x
    
    priors = dict([(p, Player(Gaussian(0, a), 1.5, 0.98)) for p in all_atp_players])
    
    true_skill_history_priors = History(composition=composition,
                                    priors=priors,
                                    beta=0,
                                    sigma=b,
                                    gamma=0.01)
    
    log_ev = true_skill_history_priors.log_evidence()
    
    return -1 * log_ev  

In [None]:
to_start = [1.3, 0.5]

bounds = ((0.9, 1.6), (0.3, 0.8))

result = minimize(training, to_start, bounds=bounds, method='SLSQP')

In [39]:
true_skill_history_priors.convergence(epsilon=0.01, iterations=10)

Iteration =  0 , step =  (3.7638065299271775, 0.9568087274906079)
Iteration =  1 , step =  (0.5992915591564723, 0.04234960836586121)
Iteration =  2 , step =  (0.3750498157579929, 0.014743538780252585)
Iteration =  3 , step =  (0.27690898982839673, 0.010851022039432667)
Iteration =  4 , step =  (0.23299169677792353, 0.009073882388377585)
Iteration =  5 , step =  (0.20274338728320362, 0.007937127428417012)
Iteration =  6 , step =  (0.1811644466948925, 0.006970466322375524)
Iteration =  7 , step =  (0.16183371420684978, 0.006148849143449153)
Iteration =  8 , step =  (0.14468375524345944, 0.0054461237981418)
Iteration =  9 , step =  (0.12951581360599285, 0.0048408627363858425)
End


((0.12951581360599285, 0.0048408627363858425), 10)

In [40]:
true_skill_history_priors.log_evidence()

-326440.88258121174

In [41]:
ts_dict = true_skill_history_priors.learning_curves()

In [42]:
small_dict = dict()

for key in ts_dict.keys():
    small_dict[key] = ts_dict[key][-1]

In [82]:
sorted(small_dict, key=lambda x: [list(y[1])[0] for y in x.values()])

AttributeError: 'str' object has no attribute 'values'

In [86]:
top_players = dict()

for player, rating in small_dict.items():
    
    top_players[player] = list(rating[1])[0]

In [89]:
sorted(top_players.items(), key=lambda x: x[1], reverse=True)[:25]

[('Novak Djokovic', 9.273663390050773),
 ('Robin Soderling', 8.39284263036838),
 ('Carlos Alcaraz', 7.804666508346942),
 ('Stefanos Tsitsipas', 7.778554616843257),
 ('Matteo Berrettini', 7.575524197051474),
 ('Alexander Zverev', 7.443913394475121),
 ('Rafael Nadal', 7.383220595871317),
 ('Nick Kyrgios', 7.349614231153415),
 ('Felix Auger Aliassime', 7.344618655079774),
 ('Roger Federer', 7.246479432474963),
 ('Holger Rune', 7.1376156564818585),
 ('Daniil Medvedev', 7.034209250580584),
 ('Jannik Sinner', 6.998889875131355),
 ('Juan Martin del Potro', 6.855875722959703),
 ('Sebastian Korda', 6.846484537383871),
 ('Taylor Fritz', 6.8386449332453205),
 ('Cameron Norrie', 6.711937934450516),
 ('Casper Ruud', 6.67839116853247),
 ('Frances Tiafoe', 6.583157251182714),
 ('Hubert Hurkacz', 6.579836456538137),
 ('Andrey Rublev', 6.537087781448688),
 ('David Nalbandian', 6.487021689170225),
 ('Alex De Minaur', 6.482223026403858),
 ('Karen Khachanov', 6.451622281774738),
 ('Borna Coric', 6.4344927

In [90]:
clay_players = [x for x in top_players.keys() if x.endswith('Clay')]
clay_dict = dict()

for p, r in small_dict.items():
    if p in clay_players:
        clay_dict[p] = list(r[1])[0]        

In [91]:
sorted(clay_dict.items(), key=lambda x: x[1], reverse=True)[:25]

[('Filippo VolandriClay', 2.034824026340198),
 ('Rafael NadalClay', 1.7398750755004595),
 ('Daniel Gimeno TraverClay', 1.6657520755190967),
 ('Federico DelbonisClay', 1.5986774537875765),
 ('Potito StaraceClay', 1.5332930373432294),
 ('Adrian UngurClay', 1.515806148660574),
 ('Pablo AndujarClay', 1.5147155265139758),
 ('Gaston GaudioClay', 1.477879059682873),
 ('Albert MontanesClay', 1.4769406537309206),
 ('Marco CecchinatoClay', 1.452026189346882),
 ('David SanchezClay', 1.4302984612738923),
 ('Albert PortasClay', 1.4165234043733175),
 ('Ruben Ramirez HidalgoClay', 1.4083556448557817),
 ('Oscar HernandezClay', 1.3961809715631273),
 ('Laslo DjereClay', 1.3620709147301602),
 ('Nicolas AlmagroClay', 1.3514624896343512),
 ('Facundo BagnisClay', 1.343836397127044),
 ('Alessio Di MauroClay', 1.326519431471131),
 ('Simone VagnozziClay', 1.3139490365486515),
 ('Mariano PuertaClay', 1.3109418951115865),
 ('Jose AcasusoClay', 1.308203922288961),
 ('Eric ProdonClay', 1.2930829933484944),
 ('Pere

In [92]:
grass_players = [x for x in top_players.keys() if x.endswith('Grass')]
grass_dict = dict()

for p, r in small_dict.items():
    if p in grass_players:
        grass_dict[p] = list(r[1])[0] 

In [95]:
sorted(grass_dict.items(), key=lambda x: x[1], reverse=True)[:25]

[('Nicolas MahutGrass', 0.9815734879625846),
 ('Jonas BjorkmanGrass', 0.9696293926751484),
 ('Lleyton HewittGrass', 0.914744212310804),
 ('Alexander PoppGrass', 0.8929656637929466),
 ('Feliciano LopezGrass', 0.8813820554518281),
 ('Matthew EbdenGrass', 0.8738608290646352),
 ('Florian MayerGrass', 0.867749971425988),
 ('Xavier MalisseGrass', 0.8299195211179867),
 ('Andy RoddickGrass', 0.8096794153589679),
 ('Travis RettenmaierGrass', 0.801000919879447),
 ('Jimmy WangGrass', 0.7986896906286571),
 ('Sam GrothGrass', 0.7966195671447891),
 ('Brydan KleinGrass', 0.793607356014414),
 ('David PrinosilGrass', 0.7642178753230623),
 ('Adrian MannarinoGrass', 0.7443137519752823),
 ('Denis KudlaGrass', 0.7369892897936908),
 ('Ivo KarlovicGrass', 0.733736667576267),
 ('Mate PavicGrass', 0.7237718612659084),
 ('Jamie DelgadoGrass', 0.7211601581981387),
 ('Benjamin BeckerGrass', 0.7056740250967313),
 ('Rainer SchuettlerGrass', 0.7040587148081976),
 ('Mario AncicGrass', 0.7039355615959108),
 ('Bernard 

In [96]:
hard_players = [x for x in top_players.keys() if x.endswith('Hard')]
hard_dict = dict()

for p, r in small_dict.items():
    if p in hard_players:
        hard_dict[p] = list(r[1])[0] 

In [97]:
sorted(hard_dict.items(), key=lambda x: x[1], reverse=True)[:25]

[('Vasek PospisilHard', 1.1877398803980928),
 ('James BlakeHard', 1.1804889157417129),
 ('Bradley KlahnHard', 1.1783713766065067),
 ('Jan Michael GambillHard', 1.0992895468054245),
 ('Alex Bogomolov JrHard', 1.0975890290086536),
 ('Michael RussellHard', 1.0861680146939454),
 ('Pablo Vivero GonzalezHard', 1.0861146769901153),
 ('Andres Artunedo MartinavarroHard', 1.0825509962922761),
 ('Danai UdomchokeHard', 1.0712780341100079),
 ('Bjorn PhauHard', 1.0663995153664902),
 ('Ricardas BerankisHard', 1.0194312385107114),
 ('Donald YoungHard', 1.0165654844777523),
 ('Yu Jr WangHard', 1.00611966292732),
 ('Gilles SimonHard', 1.0007019073044348),
 ('Lloyd HarrisHard', 0.9989057720158684),
 ('Cem IlkelHard', 0.9926418864883028),
 ('Tim SmyczekHard', 0.9779132314157563),
 ('Emil RuusuvuoriHard', 0.9736093334852436),
 ('Peng SunHard', 0.9695940196640503),
 ('Andre AgassiHard', 0.9644578489846389),
 ('Gael MonfilsHard', 0.95583861568909),
 ('David GuezHard', 0.9470721905181229),
 ('Andy RoddickHard

In [43]:
with open('mens_big_dict.pickle', 'wb') as file:
    pickle.dump(ts_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
with open('mens_small_dict.pickle', 'wb') as fileb:
    pickle.dump(small_dict, fileb, protocol=pickle.HIGHEST_PROTOCOL)