In [15]:
import pickle
import pandas as pd
import numpy as np
import sys
from IPython.display import display, clear_output
from datetime import datetime
from tqdm import tqdm_notebook
import itertools
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import os
from scipy.sparse import csr_matrix
from scipy.optimize import minimize

Филигранно распикливаем ([с возможной утечкой памяти:)](https://stackoverflow.com/questions/7395542/is-explicitly-closing-files-important))

In [16]:
players_dict = pickle.load(open('data/chgk/players.pkl', 'rb'))
results_dict = pickle.load(open('data/chgk/results.pkl', 'rb'))
tournaments_dict = pickle.load(open('data/chgk/tournaments.pkl', 'rb'))

В тренировочном наборе оставляем только турниры, у которых `dateStart` 2019, в тестовый набор - турниры, у которых `dateStart` 2020

`players` и `tournaments` сразу в датафрейм, `results` пока что в словарь датафреймов 

Фильтруем `results`, оставляем только те, где есть `mask`

In [17]:
results_filtered_train = {}
results_filtered_test = {}
for k, res in results_dict.items():
    t_res_new = []
    for t_res in results_dict[k]:
        if 'mask' in t_res.keys():
            if t_res['mask'] is not None:
                if 'X' not in t_res['mask'] and '?' not in t_res['mask']:
                    t_res_new.append(t_res)
    if t_res_new:
        if tournaments_dict[k]['dateStart'].startswith('2019'):
            results_filtered_train[k] = t_res_new
        if tournaments_dict[k]['dateStart'].startswith('2020'):
            results_filtered_test[k] = t_res_new

In [18]:
tournament_question_count = {}

for k, v in results_filtered_train.items():
    tournament_question_count[k]= max([len(t_res['mask']) for t_res in v])
    
for k, v in results_filtered_test.items():
    tournament_question_count[k]= max([len(t_res['mask']) for t_res in v])

In [19]:
results_filtered_train_len = {}
results_filtered_test_len = {}
for k, v in results_filtered_train.items():
    t_res_ = []
    for t_res in v:
        if len(t_res['mask']) == tournament_question_count[k]:
            t_res_.append(t_res)
    results_filtered_train_len[k] = t_res_

for k, v in results_filtered_test.items():
    t_res_ = []
    for t_res in v:
        if len(t_res['mask']) == tournament_question_count[k]:
            t_res_.append(t_res)
    results_filtered_test_len[k] = t_res_

In [20]:
pq_df = pd.DataFrame(columns=['pid', 'qid', 'tournament', 'tid', 'res'])

pid = []
qid = []
res = []
tournament = []
tid = []
for k, v in results_filtered_train_len.items():
    for t_res in v:
        members = [m['player']['id'] for m in t_res['teamMembers']]
        t_pid = list(itertools.chain.from_iterable(itertools.repeat(m, tournament_question_count[k]) for m in members))
        pid.extend(t_pid)
        t_qid = [f'{k}_{i}' for i in range(tournament_question_count[k])] * len(members)
        qid.extend(t_qid)
        tid.extend([t_res['team']['id']] * len(t_qid))
        tournament.extend([k] * len(t_qid))
        res.extend(list(map(int, t_res['mask'])) * len(members))

pq_df['pid'] = np.int32(pid)
pq_df['qid'] = qid
pq_df['tournament'] = tournament
pq_df['tid'] = tid
pq_df['res'] = np.int8(res)

In [21]:
pq_df.head()

Unnamed: 0,pid,qid,tournament,tid,res
0,6212,4772_0,4772,45556,1
1,6212,4772_1,4772,45556,1
2,6212,4772_2,4772,45556,1
3,6212,4772_3,4772,45556,1
4,6212,4772_4,4772,45556,1


### Тут нужно написать тексточек про математику (почему ВанХот и можно ли сделать без ВанХот)

In [22]:
encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True, dtype=np.int8)
encoder.fit(pq_df)
pq_df_oh = encoder.transform(pq_df[['pid', 'qid']])
clf = LogisticRegression(solver='lbfgs', n_jobs=10)
clf.fit(pq_df_oh, pq_df['res'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=10, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
test_rating = {}
for tournament, result in results_filtered_test_len.items():
    for t_res in result:
        for member in t_res['teamMembers']:
            test_rating[member['player']['id']] = member['rating']

            
rating_test = pd.DataFrame.from_dict({'player_id': list(test_rating.keys()), 'score' :list(test_rating.values())}).sort_values(by='score', ascending=False)
rating_test['place'] = rating_test.reset_index().index + 1

Строим рейтинг

In [29]:
def get_full_test_rating_from_learned_weights(weights, encoder, players_dict, test_rating_df):
    feature_names = np.array([fname.replace('.0','') for fname in encoder.get_feature_names(pq_df.columns[:2])])
    players = feature_names[[fname.startswith('pid') for fname in feature_names]]
    players = np.array(list(map(lambda x: np.int32(x.replace('pid_','')), players)))
    rating = pd.DataFrame({'player_id': players, 'score': weights[:len(players)]})
    rating = rating.sort_values(by='score', ascending=False)
    players_df = pd.DataFrame.from_dict(players_dict, orient='index')
    rating_named = rating.merge(players_df.rename(columns={'id':'player_id'}), on='player_id')
    rating_named['place'] = rating_named.index    
    rating_full = rating_named.merge(test_rating_df, on='player_id', suffixes=['_pred', '_test'])
    corr_kendall = rating_full[['place_pred', 'place_test']].corr(method='kendall')['place_pred']['place_test']
    corr_spearman = rating_full[['place_pred', 'place_test']].corr(method='spearman')['place_pred']['place_test']
    return rating_full, (corr_kendall, corr_spearman)

In [14]:
clf

NameError: name 'clf' is not defined

In [31]:
rating, (corr_kendall, corr_spearmen) = get_full_test_rating_from_learned_weights(clf.coef_[0], encoder, players_dict, rating_test)
print(f'Correlation of Spearman is {corr_spearmen: .5f}')
print(f'Correlation of Kendall is {corr_kendall: .5f}')
rating.head(25)

Correlation of Spearman is  0.69430
Correlation of Kendall is  0.53051


Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
0,27403,3.705755,Максим,Михайлович,Руссо,0,14434,5
1,4270,3.550033,Александра,Владимировна,Брутер,1,14068,8
2,30152,3.421566,Артём,Сергеевич,Сорожкин,2,14741,1
3,27822,3.298092,Михаил,Владимирович,Савченков,4,14665,2
4,28751,3.282823,Иван,Николаевич,Семушин,5,14665,3
5,38196,3.229765,Артём,Александрович,Митрофанов,6,11870,293
6,56647,3.203445,Наталья,Евгеньевна,Горелова,8,11196,452
7,30270,3.171842,Сергей,Леонидович,Спешков,10,14665,4
8,20691,3.163746,Станислав,Григорьевич,Мереминский,11,13177,58
9,12307,3.161685,Светлана,Сергеевна,Иванцова,12,11330,423


### Тут нормально описать про EM-алгоритм который придумал

In [33]:
def sigmoid(X, weights):
    return 1 / (1 + np.exp(-X.dot(weights)))

def log_likelihood(expectations, pq_df_oh, weights):
    sigma = sigmoid(pq_df_oh, weights)
    return np.sum(expectations * np.log(sigma) + (1 - expectations) * np.log(1 - sigma))
    
def weights_gradient(weights, expectations, pq_df_oh):
    return (csr_matrix(expectations - sigmoid(pq_df_oh, weights)).dot(pq_df_oh)).toarray()[0] / np.array((pq_df_oh != 0).sum(axis=0)).ravel()

def Expectation(weights, pq_df, pq_df_oh):
    pq_df_ = pq_df.copy()
    sigma = sigmoid(pq_df_oh, weights)
    pq_df_['sigma'] = sigma
    pq_df_['one_sigma'] = 1 - sigma
    prods_series = 1 - pq_df_.groupby(['tournament', 'tid', 'qid'])['one_sigma'].prod().rename('prod')
    pq_df_ = pq_df_.merge(prods_series, on=['tournament', 'tid', 'qid'])
    expectations = (pq_df_['sigma'] / (pq_df_['prod'])).values
    expectations[pq_df_['res'] == 0] = 0    
    return expectations

def Maximization(expectations, pq_df_oh, weights):
    weights_ = weights
    for i in range(30):
        weights_ += 1.5 * weights_gradient(weights_, expectations, pq_df_oh)
    return weights_

In [49]:
weights = np.random.rand(pq_df_oh.shape[1])
em_df = pd.DataFrame(columns=['iter', 'log_ll', 'Kendall_corr', 'Spearman_corr'])
for i in range(10):
    expectations = Expectation(weights, pq_df, pq_df_oh)
    weights = Maximization(expectations, pq_df_oh, weights)
    rating, (corr_kendall, corr_spearman) = get_full_test_rating_from_learned_weights(weights, encoder, players_dict, rating_test)
    em_df = em_df.append(
       pd.DataFrame(
        {
        'iter': [i + 1],
        'log_ll': [log_likelihood(expectations, pq_df_oh, weights) / pq_df_oh.shape[0]],
        'Kendall_corr': [corr_kendall],
        'Spearman_corr' : [corr_spearman]
    }), ignore_index=True)
    clear_output()
    display(em_df)

Unnamed: 0,iter,log_ll,Kendall_corr,Spearman_corr
0,1,-0.570441,0.498985,0.660464
1,2,-0.421136,0.562888,0.733255
2,3,-0.355133,0.597733,0.774129
3,4,-0.331569,0.619944,0.799898
4,5,-0.323207,0.634035,0.815327
5,6,-0.320151,0.643263,0.824783
6,7,-0.318992,0.649762,0.83106
7,8,-0.318536,0.654674,0.835568
8,9,-0.31835,0.658567,0.839011
9,10,-0.318275,0.661724,0.841726


In [59]:
r = _

In [61]:
r[r['surname'] == 'Николенко']

Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
17,22799,0.260857,Сергей,Игоревич,Николенко,47,13846,15
3544,22797,-0.677679,Любовь,Владимировна,Николенко,5486,6356,4988
11001,115591,-1.357621,Александр,Владимирович,Николенко,19855,3878,8532


In [71]:
rating_test.merge(players.rename(columns={'id':'player_id'}), on='player_id')

Unnamed: 0,player_id,score,place,name,patronymic,surname
0,30152,14741,1,Артём,Сергеевич,Сорожкин
1,27822,14665,2,Михаил,Владимирович,Савченков
2,28751,14665,3,Иван,Николаевич,Семушин
3,30270,14665,4,Сергей,Леонидович,Спешков
4,27403,14434,5,Максим,Михайлович,Руссо
...,...,...,...,...,...,...
28106,208771,0,28107,Дмитрий,Викторович,Булатников
28107,208772,0,28108,Никита,Романович,Горбань
28108,208773,0,28109,Альбина,Евгеньевна,Саксудаева
28109,208774,0,28110,Алина,Олеговна,Пинчук


In [72]:
r

Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
0,22474,1.272906,Илья,Сергеевич,Немец,0,6344,5009
1,707,0.541894,Елена,Андреевна,Александрова,8,2390,10683
2,28751,0.440655,Иван,Николаевич,Семушин,13,14665,3
3,4270,0.416106,Александра,Владимировна,Брутер,16,14068,8
4,32777,0.394327,Павел,Сергеевич,Уточкин,18,10561,706
...,...,...,...,...,...,...,...,...
22168,217208,-6.235938,Алина,Алексеевна,Игушкина,55526,0,24733
22169,202213,-6.356228,Анастасия,Олеговна,Кузьмина,55552,0,21359
22170,202218,-6.357969,Ксения,Владимировна,Сергеева,55577,0,20699
22171,204366,-6.358597,Анна,Сергеевна,Лапшина,55590,0,26076
