In [1]:
import math
import os
import pickle

import dill
import numpy as np
import pandas as pd

from trueskillthroughtime import Player, Game, History

from sklearn.metrics import brier_score_loss, log_loss

In [2]:
# Directory holding the ATP data files. Set the TENNIS_DATA_DIR environment
# variable to point the notebook at another location; when it is unset the
# original local path is used, so existing runs behave as before.
folder = os.environ.get(
    "TENNIS_DATA_DIR",
    "C:/Users/jyoung/Projects/tennis_project/tennis_data/tennis_atp-master",
)

# input: processed match table (CSV), read from `folder`
df_file = "processed_apt.csv"

# output: pickled dictionary of TrueSkill learning curves, written to `folder`
dict_file = "trueskill_dict.pickle"

In [3]:
# read the processed match table from the data folder
csv_path = f'{folder}/{df_file}'
df = pd.read_csv(csv_path)

# quick sanity check: table dimensions and the most recent tournament date
print(df.shape)
print("the last day in the df is:", df['tourney_date'].max())

  df = pd.read_csv(f'{folder}/{df_file}')


(579538, 146)
the last day in the df is: 2022-11-14


In [4]:
# Convert the tournament dates to datetimes and express each one as the number
# of whole days elapsed since 2000-01-01 (a fixed reference date, not
# necessarily the first day in the dataset).
#
# NOTE: `times` is not passed to any of the History(...) calls visible in this
# notebook. Using actual days with TrueSkill turned out to be worse than simply
# using the match order (the index), so this is kept for reference only.
tourney_dates = pd.to_datetime(df['tourney_date'])

# vectorized timedelta arithmetic instead of a Python-level loop over every row
times = (tourney_dates - pd.Timestamp('2000-01-01')).dt.days.tolist()

In [5]:
# Let TrueSkill Through Time work its magic on the full match list.
# Each game is [[winner], [loser]]: two single-player teams, winner listed first.
composition = [
    [[winner_name], [loser_name]]
    for winner_name, loser_name in zip(df.winner_name, df.loser_name)
]

true_skill_history = History(composition=composition, sigma=1.6, gamma=0.107)

# log evidence of the freshly built history (before convergence() is run)
print(true_skill_history.log_evidence())

-345912.0420518494


In [10]:
# Optimizing gamma: build a fresh History for each candidate value and compare
# the resulting log evidence (higher is better).

gammas = np.linspace(0.05, 0.12, 12)

for gamma in gammas:

    # Use a dedicated name here. Rebinding `true_skill_history` inside the sweep
    # would leave the last candidate (gamma=0.12) as the model that the later
    # convergence / learning-curve cells operate on when the notebook is run
    # top to bottom.
    candidate_history = History(composition=composition,
                                sigma=1.6,
                                gamma=gamma)
    evidence = candidate_history.log_evidence()

    # third column: exp(log evidence / size), i.e. the log evidence averaged
    # over `size` and mapped back to a probability scale
    # (presumably `size` is the number of games -- TODO confirm)
    print(gamma, evidence, math.exp(evidence / candidate_history.size))

0.05 -344795.15790574765 0.5477298285751804
0.056363636363636366 -344066.60438612517 0.5484269686257985
0.06272727272727273 -343476.0461914203 0.5489927144504749
0.06909090909090909 -343002.26215888205 0.5494470142841851
0.07545454545454545 -342628.4263179383 0.5498057415888105
0.08181818181818182 -342340.981241358 0.5500817288713742
0.08818181818181818 -342128.8460357373 0.5502854970395542
0.09454545454545454 -341982.84844937996 0.5504257800379911
0.1009090909090909 -341895.3099037875 0.5505099093285943
0.10727272727272727 -341859.73694622813 0.5505441005446488
0.11363636363636363 -341870.58811101806 0.550533670640811
0.12 -341923.09520285943 0.5504832047561662


In [6]:
# Run the history's convergence procedure (at most 10 iterations, epsilon 0.01),
# then report the log evidence of the updated model.
true_skill_history.convergence(iterations=10, epsilon=0.01)

converged_log_evidence = true_skill_history.log_evidence()
print(converged_log_evidence)

Iteration =  0 , step =  (3.70091268860161, 1.0748582562223121)
Iteration =  1 , step =  (0.3632730034991545, 0.08242001609325778)
Iteration =  2 , step =  (0.22510792789432577, 0.016388652814405935)
Iteration =  3 , step =  (0.15995661247003934, 0.012968815395214328)
Iteration =  4 , step =  (0.12588781769903035, 0.010715947165092388)
Iteration =  5 , step =  (0.10930435168308028, 0.009182546383559309)
Iteration =  6 , step =  (0.09789675706529355, 0.008109684694700858)
Iteration =  7 , step =  (0.08840028716902548, 0.007273102072066662)
Iteration =  8 , step =  (0.08187400663935307, 0.0065860769009968845)
Iteration =  9 , step =  (0.07656414351253282, 0.006003993347694303)
End
-322922.06826504075


In [7]:
# Extract the learning curves: a dictionary keyed by player name. Each value is
# indexed below as ts_dict[name][i][1], and that element is what gets passed to
# Player(...) to represent the player's rating at entry i.
ts_dict = true_skill_history.learning_curves()

In [8]:
# Per-player counter of matches already processed. It doubles as the index of
# the player's next entry in their learning curve (ts_dict[name]).
match_count_dict = {name: 0 for name in ts_dict}

# For every match, the evidence of a Game built from both players' ratings at
# that point in their curves, with the actual winner listed first.
# NOTE(review): the curves are read after convergence() has been run, so each
# entry presumably already reflects the match being scored -- confirm this is
# intended before treating the column as a pre-match prediction.
ts_win_probs = []

for winner, loser in zip(df.winner_name, df.loser_name):
    try:
        winner_idx = match_count_dict[winner]
        loser_idx = match_count_dict[loser]

        winner_player = Player(ts_dict[winner][winner_idx][1])
        loser_player = Player(ts_dict[loser][loser_idx][1])
        result = Game([[winner_player], [loser_player]]).evidence
    except (KeyError, IndexError):
        # Unknown player name or a curve shorter than expected: fall back to a
        # coin flip. Only lookup failures are caught, so genuine errors (and
        # KeyboardInterrupt) are no longer silently turned into 0.5.
        ts_win_probs.append(0.5)
    else:
        ts_win_probs.append(result)

        # advance both players to their next learning-curve entry
        match_count_dict[winner] += 1
        match_count_dict[loser] += 1


df['winner_ts_win_prob'] = ts_win_probs

In [9]:
def true_skill_win_proba(player_1=None, player_2=None):
    """Return the Game evidence for player_1 (listed first) against player_2.

    Both players are represented by the last entry of their learning curve in
    the module-level ``ts_dict``. Elsewhere in this notebook the same
    construction, with the winner listed first, is stored as the winner's win
    probability, so the result is read as the probability that ``player_1``
    beats ``player_2``.

    Parameters
    ----------
    player_1, player_2 : key of ``ts_dict`` (a player name)
        The defaults of ``None`` are placeholders; both names must be supplied.

    Raises
    ------
    KeyError
        If either name is not a key of ``ts_dict``.
    """
    # element [1] of the final curve entry is the rating handed to Player
    latest_rating_1 = ts_dict[player_1][-1][1]
    latest_rating_2 = ts_dict[player_2][-1][1]

    matchup = Game([[Player(latest_rating_1)], [Player(latest_rating_2)]])
    return matchup.evidence

In [10]:
# example: evidence for the first-named player against the second,
# using each player's latest rating
true_skill_win_proba(player_1='Matthias Bachinger', player_2='Gabriel Decamps')

0.4864514309127883

In [12]:
# Persist the learning-curve dictionary to the data folder with dill,
# using the highest available pickle protocol.

dict_file = "trueskill_dict.pickle"
dict_path = f'{folder}/{dict_file}'

with open(dict_path, 'wb') as out_file:
    dill.dump(ts_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# write the match table, now including 'winner_ts_win_prob', next to the input file
output_csv = f'{folder}/processed_apt_with_ts.csv'
df.to_csv(output_csv, index=False)