In [2]:
import os
import pandas as pd
import numpy as np
import pickle

from scipy.optimize import minimize


from trueskillthroughtime import Game, Player, History, Gaussian

In [6]:
folder = "C:/Users/yaobv/tennis_project/tennis_wta"

df = pd.DataFrame()

for file in os.listdir(f'{folder}'):

    if file.endswith('csv') and '20' in file and 'doubles' not in file and '1920' not in file:
        
        try:    
            df_ = pd.read_csv(f'{folder}/{file}')
            df = pd.concat([df, df_])
            
        except:
            df_ = pd.read_csv(f'{folder}\{file}', encoding='latin-1')
            df = pd.concat([df, df_])


df = df[df['winner_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[df['loser_name'].apply(lambda x: isinstance(x, str))].copy()
df = df[~df['winner_name'].str.contains('Unknown')].copy()
df = df[~df['loser_name'].str.contains('Unknown')].copy()
df = df[df['winner_name'] != df['loser_name']].copy()

df.sort_values(by=['tourney_date', 'tourney_id', 'round'],
               ascending=[True, True, True],
               inplace=True)

print('shape before dropping match dupes:', df.shape)

df.dropna(subset=['surface', 'winner_name', 'loser_name'],
          inplace=True)

df.drop_duplicates(subset=['winner_name', 'loser_name', 'tourney_id'],
                   inplace=True)

print('shape after dropping match dupes:', df.shape)

df.reset_index(inplace=True, drop=True)

  df_ = pd.read_csv(f'{folder}/{file}')
  df_ = pd.read_csv(f'{folder}/{file}')
  df_ = pd.read_csv(f'{folder}/{file}')


shape before dropping match dupes: (455216, 54)
shape after dropping match dupes: (454092, 54)


In [7]:
columns = zip(df.winner_name, df.loser_name, df.surface)

composition = [[[winner, winner + surface], [loser, loser + surface]]
               for winner, loser, surface in columns]

In [12]:
all_wta_players = set([val for sublist in zip(df.winner_name, df.loser_name) for val in sublist])

In [13]:
priors = dict([(p, Player(Gaussian(0., 1.51), 1.5, 0.098) ) for p in all_wta_players])

In [10]:
true_skill_history_priors = History(composition=composition,
                                    priors=priors,
                                    beta=0,
                                    sigma=0.5,
                                    gamma=0.01)

In [11]:
true_skill_history_priors.log_evidence() # initial results with 0, 1.51, 1.5, 0.098

-268809.15454599314

In [16]:
def training(x):
    
    a, b, c = x
    
    priors = dict([(p, Player(Gaussian(0, a), b, c)) for p in all_wta_players])
    
    true_skill_history_priors = History(composition=composition,
                                    priors=priors,
                                    beta=0,
                                    sigma=0.5,
                                    gamma=0.01)
    
    log_ev = true_skill_history_priors.log_evidence()
    
    return -1 * log_ev  

In [17]:
to_start = [1.5, 1.5, 0.1]

bounds = ((1.3, 1.8), (1.3, 1.8), (0.08, 0.12))

result = minimize(training, to_start, bounds=bounds, method='SLSQP')

In [18]:
print(result)

     fun: 268697.5918395683
     jac: array([  1547.52734375,   2240.4609375 , -47232.875     ])
 message: 'Optimization terminated successfully'
    nfev: 4
     nit: 5
    njev: 1
  status: 0
 success: True
       x: array([1.5, 1.5, 0.1])


In [39]:
true_skill_history_priors.convergence(epsilon=0.01, iterations=10)

Iteration =  0 , step =  (3.7638065299271775, 0.9568087274906079)
Iteration =  1 , step =  (0.5992915591564723, 0.04234960836586121)
Iteration =  2 , step =  (0.3750498157579929, 0.014743538780252585)
Iteration =  3 , step =  (0.27690898982839673, 0.010851022039432667)
Iteration =  4 , step =  (0.23299169677792353, 0.009073882388377585)
Iteration =  5 , step =  (0.20274338728320362, 0.007937127428417012)
Iteration =  6 , step =  (0.1811644466948925, 0.006970466322375524)
Iteration =  7 , step =  (0.16183371420684978, 0.006148849143449153)
Iteration =  8 , step =  (0.14468375524345944, 0.0054461237981418)
Iteration =  9 , step =  (0.12951581360599285, 0.0048408627363858425)
End


((0.12951581360599285, 0.0048408627363858425), 10)

In [19]:
true_skill_history_priors.log_evidence()

-268809.15454599314

In [20]:
ts_dict = true_skill_history_priors.learning_curves()

In [21]:
small_dict = dict()

for key in ts_dict.keys():
    small_dict[key] = ts_dict[key][-1]

In [22]:
with open('womens_big_dict.pickle', 'wb') as file:
    pickle.dump(ts_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
with open('womens_small_dict.pickle', 'wb') as fileb:
    pickle.dump(small_dict, fileb, protocol=pickle.HIGHEST_PROTOCOL)