In [16]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch.distributions import constraints

from operator import mul    # or mul=lambda x,y:x*y
from fractions import Fraction
import functools 

from sklearn.model_selection import train_test_split

import pyro
from torch.distributions import constraints
from torch.autograd import Variable
import pyro.distributions as dist
import pyro.optim as optim

# Seed Pyro's random number generator so stochastic steps are repeatable.
pyro.set_rng_seed(1)
# Stop early unless the installed Pyro version string begins with '1.4.0'.
assert pyro.__version__.startswith('1.4.0')

from utils import *
from eval_metrics import *
from GaussianMF import run_gaussian_mf

In [17]:
%matplotlib inline
plt.style.use('default')

logging.basicConfig(format='%(message)s', level=logging.INFO)
# Enable validation checks
pyro.enable_validation(True)
smoke_test = ('CI' in os.environ)
pyro.set_rng_seed(1)

In [18]:
# Load the match results; the 'resultid' column becomes the frame's index.
data = pd.read_csv("utr_train_UPDATED.csv", index_col="resultid")

In [19]:
# unique players: number of distinct ids that appear as either a winner or a loser.
# pd.concat is used instead of Series.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the stacked result is the same.
len(pd.concat([data.winnerid, data.loserid]).unique())

1146

In [20]:
# Distinct values of the resultmonth column.
data.resultmonth.unique()

array(['2019-05', '2019-03', '2019-11', '2019-10', '2019-09', '2019-08',
       '2019-01', '2019-07', '2019-02', '2019-12', '2019-06', '2019-04'],
      dtype=object)

In [21]:
# Derive the 'year' and 'month' columns from resultmonth, one helper per column.
for column, extractor in (("year", extract_year), ("month", extract_month)):
    data[column] = data["resultmonth"].apply(extractor)

In [22]:
# Compute the matchtype column by applying the match_type helper to every row.
data["matchtype"] = data.apply(match_type, axis=1)

In [23]:
# Frequency of each matchtype value; most of the matches are BO3.
data.matchtype.value_counts()

BO3    4374
BO5     178
Name: matchtype, dtype: int64

In [24]:
# Add one column per set (set1 .. set5) by applying set_result row-wise
# with the matching set number.
for set_num in range(1, 6):
    data[f"set{set_num}"] = data.apply(set_result, set_num=set_num, axis=1)

# Keep only the columns needed from here on.
kept_columns = [
    "winnerid", "loserid",
    "year", "month", "matchtype",
    "set1", "set2", "set3", "set4", "set5",
]
data = data[kept_columns]

In [25]:
# Map every distinct set score found in set1..set5 (e.g. '6-0', '6-1', '6-2',
# '6-3', '6-4', '7-5', '7-6') to the probability p on a 0.00..1.00 grid (step 0.01)
# that maximises binomial(n, k, p), where k is the first number in the score and
# n is the sum of both numbers.
# NOTE(review): binomial is an external helper, presumably the binomial pmf - confirm.
#
# The loop variable is deliberately NOT called set_result: that name is the helper
# applied when the set columns are built, and rebinding it to a string here would
# break that cell if it were executed again.
candidate_probs = [pct / 100 for pct in range(0, 101)]

game_prob_dict = {}
for score in np.unique(data[['set1', 'set2', 'set3', 'set4', 'set5']].values):
    games = [int(part) for part in score.split('-')]
    k = games[0]
    n = games[0] + games[1]

    # max() returns the first maximiser, i.e. the smallest p on ties - the same
    # tie-breaking as taking max over an insertion-ordered dict of likelihoods.
    game_prob_dict[score] = max(candidate_probs, key=lambda p: binomial(n, k, p))

In [26]:
# Per-row game win probability, computed by get_game_win_prob using game_prob_dict.
data["game_win_prob"] = data.apply(get_game_win_prob, axis=1, game_prob_dict=game_prob_dict)

# Set- and match-level probabilities are both derived from the game-level column.
game_probs = data["game_win_prob"]
data["set_win_prob"] = game_probs.apply(set_win_prob)
data["match_win_prob"] = game_probs.apply(match_win_prob)

In [27]:
# Rename winnerid/loserid to player1id/player2id; errors="raise" makes the
# rename fail if either source column is missing.
data = data.rename(columns={"winnerid": "player1id", "loserid": "player2id"}, errors="raise")

In [28]:
# Keep only the identifier, date, match-type and match-level probability columns.
data = data[['player1id', 'player2id', 'year', 'month', 'matchtype', 'match_win_prob']]

# Stack the rows produced by reverse_result underneath the originals.
# NOTE(review): reverse_result presumably swaps the two players and complements
# match_win_prob - confirm against utils.
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the original index labels are kept in both halves,
# exactly as append did.
data = pd.concat([data, data.apply(reverse_result, axis=1)])

In [29]:
# Display the frame after the mirrored rows were added (pandas elides the middle rows).
data

Unnamed: 0_level_0,player1id,player2id,year,month,matchtype,match_win_prob
resultid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,57529,3765,2019,5,BO3,0.611425
2,83218,3871,2019,3,BO3,0.802313
4,4021,4487,2019,11,BO3,0.549905
5,1984892,411593,2019,10,BO3,0.795037
7,52294,224678,2019,9,BO3,0.849895
...,...,...,...,...,...,...
6541,2602428,11767,2019,1,BO3,0.676784
6542,1945656,162815,2019,7,BO3,0.370455
6544,4464,3458,2019,9,BO3,0.258908
6546,11161,74588,2019,3,BO3,0.176559


In [30]:
# Hold out 20% of the rows as a test set, with a fixed random_state.
# NOTE(review): the split is taken after the mirrored rows were appended to `data`,
# so a match and its mirror can land on opposite sides of the split - confirm
# this is intended.
train, test = train_test_split(data, random_state=42, test_size=0.2)

# Mean match_win_prob for every (player1id, player2id) pair in the training rows...
pair_means = (
    train
    .groupby(['player1id', 'player2id'])
    .agg({'match_win_prob': lambda x: x.mean(skipna=True)})
    .reset_index()
)

# ...reshaped into a matrix with player1id as rows and player2id as columns.
train_matrix = pair_means.pivot(
    index='player1id',
    columns='player2id',
    values='match_win_prob',
)

In [None]:
# Run run_gaussian_mf (imported from GaussianMF) on the training matrix and the
# train/test frames; its two return values are kept as loss_list and mae_list.
# NOTE(review): k is presumably the latent dimension and mae_tol a stopping
# tolerance on MAE - confirm against GaussianMF.
loss_list, mae_list = run_gaussian_mf(train_matrix, train, test, k=50, mae_tol=0.03)