In [1]:
import pickle
import pandas as pd
import numpy as np
import sys
from IPython.display import display, clear_output
from datetime import datetime
from tqdm import tqdm_notebook
import itertools
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import os
from scipy.sparse import csr_matrix

Филигранно распикливаем ([с возможной утечкой памяти:)](https://stackoverflow.com/questions/7395542/is-explicitly-closing-files-important))

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


players_dict = _load_pickle('data/chgk/players.pkl')
results_dict = _load_pickle('data/chgk/results.pkl')
tournaments_dict = _load_pickle('data/chgk/tournaments.pkl')

In [3]:
# Split tournaments by the year prefix of 'dateStart': 2019 goes to train, 2020 to test.
tournaments_dict_train = dict(
    (tournament_id, info)
    for tournament_id, info in tournaments_dict.items()
    if info['dateStart'].startswith('2019')
)
tournaments_dict_test = dict(
    (tournament_id, info)
    for tournament_id, info in tournaments_dict.items()
    if info['dateStart'].startswith('2020')
)

`players` и `tournaments` сразу в датафрейм, `results` пока что в словарь датафреймов 

In [4]:
# Keep only the results whose tournament id is in the matching train/test split.
results_dict_train = {
    tournament_id: team_results
    for tournament_id, team_results in results_dict.items()
    if tournament_id in tournaments_dict_train
}
results_dict_test = {
    tournament_id: team_results
    for tournament_id, team_results in results_dict.items()
    if tournament_id in tournaments_dict_test
}

Фильтруем `results`: оставляем только те записи команд, у которых есть непустой `mask` без символов `X` и `?`.

In [5]:
# Keep, per tournament, only the team results that have a mask which is not
# None and contains neither 'X' nor '?'. Tournaments left with no such result
# are dropped; the rest are routed to train (2019) or test (2020) by the year
# prefix of 'dateStart'.
results_filtered_train = {}
results_filtered_test = {}
for tournament_id, team_results in results_dict.items():
    clean_results = [
        team_result
        for team_result in team_results
        if team_result.get('mask') is not None
        and 'X' not in team_result['mask']
        and '?' not in team_result['mask']
    ]
    if not clean_results:
        continue
    date_start = tournaments_dict[tournament_id]['dateStart']
    if date_start.startswith('2019'):
        results_filtered_train[tournament_id] = clean_results
    if date_start.startswith('2020'):
        results_filtered_test[tournament_id] = clean_results

In [6]:
# Number of questions per tournament, taken as the longest mask among its
# filtered team results (train tournaments first, then test tournaments).
tournament_question_count = {
    tournament_id: max(len(team_result['mask']) for team_result in team_results)
    for filtered_results in (results_filtered_train, results_filtered_test)
    for tournament_id, team_results in filtered_results.items()
}

In [7]:
# Drop team results whose mask is shorter than the tournament's question count,
# so every remaining mask in a tournament has the same length.
results_filtered_train_len = {
    tournament_id: [
        team_result
        for team_result in team_results
        if len(team_result['mask']) == tournament_question_count[tournament_id]
    ]
    for tournament_id, team_results in results_filtered_train.items()
}

results_filtered_test_len = {
    tournament_id: [
        team_result
        for team_result in team_results
        if len(team_result['mask']) == tournament_question_count[tournament_id]
    ]
    for tournament_id, team_results in results_filtered_test.items()
}

In [8]:
# Start from an empty frame that only declares the 'pid', 'qid' and 'res'
# columns; the data is assigned column by column in a later cell.
pq_df = pd.DataFrame(columns=['pid', 'qid', 'res'])

In [9]:
# Build one row per (team member, question) for every train team result.
# Within a team the rows are laid out member by member: all questions of the
# first member, then all questions of the second, and so on. Every member gets
# the team's answer mask as the 'res' values.
pid, qid, res, tournament, tid = [], [], [], [], []
for tournament_id, team_results in results_filtered_train_len.items():
    n_questions = tournament_question_count[tournament_id]
    question_ids = [f'{tournament_id}_{i}' for i in range(n_questions)]
    for team_result in team_results:
        member_ids = [member['player']['id'] for member in team_result['teamMembers']]
        answers = [int(symbol) for symbol in team_result['mask']]
        n_rows = n_questions * len(member_ids)
        for member_id in member_ids:
            pid.extend([member_id] * n_questions)
        qid.extend(question_ids * len(member_ids))
        tid.extend([team_result['team']['id']] * n_rows)
        tournament.extend([tournament_id] * n_rows)
        res.extend(answers * len(member_ids))

In [10]:
# Fill the frame column by column. 'tournament' and 'tid' are new columns, so
# they are appended after the pre-declared 'pid', 'qid', 'res'.
pq_df['pid'] = np.asarray(pid, dtype=np.int32)
pq_df['qid'] = qid
pq_df['tournament'] = tournament
pq_df['tid'] = tid
pq_df['res'] = np.asarray(res, dtype=np.int8)

In [11]:
# Show the first rows of the player-question table.
pq_df.head()

Unnamed: 0,pid,qid,res,tournament,tid
0,6212,4772_0,1,4772,45556
1,6212,4772_1,1,4772,45556
2,6212,4772_2,1,4772,45556
3,6212,4772_3,1,4772,45556
4,6212,4772_4,1,4772,45556


In [63]:
# One-hot encode the player and question ids. The encoder is fitted on exactly
# the two columns it transforms, so the encoded matrix has the player columns
# first and the question columns after them. The output is sparse by default.
encoder = OneHotEncoder(dtype=np.int8)
pq_df_oh = encoder.fit_transform(pq_df[['pid', 'qid']])

# Baseline: logistic regression with one coefficient per player and per question.
clf = LogisticRegression(solver='lbfgs', n_jobs=10)
clf.fit(pq_df_oh, pq_df['res'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=10, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [97]:
# Player ids in the order of the encoder's one-hot columns.
# Assumes the encoder's first input column is 'pid' (as in the fitting cell),
# so the first len(players) coefficients belong to players.
players = np.asarray(encoder.categories_[0]).astype(np.int32)
rating = pd.DataFrame({'player_id': players, 'score': clf.coef_[0][:len(players)]})
rating = rating.sort_values(by='score', ascending=False)
players_df = pd.DataFrame.from_dict(players_dict, orient='index')

# Attach player names; 'place' is the row position in the merged table.
rating_named = rating.merge(players_df.rename(columns={'id': 'player_id'}), on='player_id')
rating_named['place'] = rating_named.index

rating_named.head(25)

Unnamed: 0,player_id,score,name,patronymic,surname,place
0,27403,3.705755,Максим,Михайлович,Руссо,0
1,4270,3.550033,Александра,Владимировна,Брутер,1
2,30152,3.421566,Артём,Сергеевич,Сорожкин,2
3,37047,3.415909,Мария,Алексеевна,Юнгер,3
4,27822,3.298092,Михаил,Владимирович,Савченков,4
5,28751,3.282823,Иван,Николаевич,Семушин,5
6,38196,3.229765,Артём,Александрович,Митрофанов,6
7,216863,3.212365,Глеб,Юрьевич,Гаврилов,7
8,56647,3.203445,Наталья,Евгеньевна,Горелова,8
9,3843,3.189115,Светлана,Борисовна,Бомешко,9


In [99]:
# Collect each test player's 'rating' value; if a player appears in several
# team results, the last one seen wins. Only the dict values are iterated, so
# the module-level `tournament` list used to build pq_df is not overwritten.
test_rating = {}
for team_results in results_filtered_test_len.values():
    for team_result in team_results:
        for member in team_result['teamMembers']:
            test_rating[member['player']['id']] = member['rating']

# Rank test players by rating (descending); 'place' is the 0-based position.
rating_test = pd.DataFrame(
    {'player_id': list(test_rating.keys()), 'score': list(test_rating.values())}
).sort_values(by='score', ascending=False)
rating_test['place'] = range(len(rating_test))
rating_full = rating_named.merge(rating_test, on='player_id', suffixes=['_pred', '_test'])

In [100]:
# Show the first 25 rows of the merged predicted-vs-test rating table.
rating_full.head(25)

Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
0,27403,3.705755,Максим,Михайлович,Руссо,0,14434,4
1,4270,3.550033,Александра,Владимировна,Брутер,1,14068,7
2,30152,3.421566,Артём,Сергеевич,Сорожкин,2,14741,0
3,27822,3.298092,Михаил,Владимирович,Савченков,4,14665,1
4,28751,3.282823,Иван,Николаевич,Семушин,5,14665,2
5,38196,3.229765,Артём,Александрович,Митрофанов,6,11870,292
6,56647,3.203445,Наталья,Евгеньевна,Горелова,8,11196,451
7,30270,3.171842,Сергей,Леонидович,Спешков,10,14665,3
8,20691,3.163746,Станислав,Григорьевич,Мереминский,11,13177,57
9,12307,3.161685,Светлана,Сергеевна,Иванцова,12,11330,422


In [101]:
# Correlation matrix of predicted vs. test places, using DataFrame.corr() with
# its default method. NOTE(review): the EM ranking later in this notebook is
# scored with corr('kendall'), so the two recorded numbers use different
# measures and are not directly comparable.
rating_full[['place_pred','place_test']].corr()

Unnamed: 0,place_pred,place_test
place_pred,1.0,0.618296
place_test,0.618296,1.0


In [44]:
def sigmoid(X, weights):
    """Return the logistic function of `X.dot(weights)`, element-wise.

    X       -- object with a `.dot` method (e.g. a NumPy array or SciPy sparse matrix)
    weights -- weight vector passed to `X.dot`
    """
    logits = X.dot(weights)
    return 1 / (1 + np.exp(-logits))

def log_likelihood(expectations, pq_df_oh, weights):
    """Return the summed log-likelihood of `expectations` under the model.

    For every row the term is e * log(p) + (1 - e) * log(1 - p), where
    p = sigmoid(pq_df_oh, weights) and e is the row's expectation; the terms
    are summed over all rows.
    """
    probabilities = sigmoid(pq_df_oh, weights)
    positive_term = expectations * np.log(probabilities)
    negative_term = (1 - expectations) * np.log(1 - probabilities)
    return np.sum(positive_term + negative_term)
    
def weights_gradient(weights, expectations, pq_df_oh):
    """Return the per-column averaged residual used as the update direction.

    The residual `expectations - sigmoid(pq_df_oh, weights)` is summed over the
    rows of each column of `pq_df_oh` and divided by that column's number of
    non-zero entries. `pq_df_oh` must be a SciPy sparse matrix, because the
    product is converted with `.toarray()`.
    """
    residual = csr_matrix(expectations - sigmoid(pq_df_oh, weights))
    column_sums = residual.dot(pq_df_oh).toarray()[0]
    column_counts = np.array((pq_df_oh != 0).sum(axis=0)).ravel()
    return column_sums / column_counts

def Expectation(weights, pq_df, pq_df_oh):
    """E-step: per-row expected value of the hidden "player answered" variable.

    Parameters
    ----------
    weights  : weight vector passed to `sigmoid`.
    pq_df    : DataFrame with columns 'tournament', 'tid', 'qid' and 'res',
               one row per row of `pq_df_oh`, in the same order.
    pq_df_oh : encoded design matrix passed to `sigmoid`.

    Returns
    -------
    numpy.ndarray with one value per row of `pq_df`, in the same row order:
    0 where 'res' is 0, otherwise sigma / (1 - prod(1 - sigma)) with the
    product taken over all rows sharing the same (tournament, tid, qid).

    `pq_df` is not modified.
    """
    sigma = np.asarray(sigmoid(pq_df_oh, weights)).ravel()

    # Probability that every row of the (tournament, team, question) group is
    # "wrong". transform() returns a result aligned with the input rows, so the
    # output keeps the row order of pq_df / pq_df_oh (a merge on the group keys
    # does not guarantee that on every pandas version).
    one_minus_sigma = pd.Series(1 - sigma, index=pq_df.index)
    group_keys = [pq_df['tournament'], pq_df['tid'], pq_df['qid']]
    all_wrong = one_minus_sigma.groupby(group_keys).transform('prod').to_numpy()

    # Probability that at least one row of the group is "right".
    team_correct = 1 - all_wrong

    expectations = sigma / team_correct
    # Where the recorded result is 0, the expectation is set to 0.
    expectations[pq_df['res'].to_numpy() == 0] = 0
    return expectations

def Maximization(expectations, pq_df_oh, n_iter=30, learning_rate=2, init_weights=None):
    """M-step: fit weights to `expectations` with a fixed number of update steps.

    Parameters
    ----------
    expectations  : per-row targets, passed to `weights_gradient`.
    pq_df_oh      : encoded design matrix; its column count sets the weight size.
    n_iter        : number of update steps (default 30).
    learning_rate : multiplier applied to each `weights_gradient` result (default 2).
    init_weights  : optional starting weights. When None (the default), the
                    weights start from `np.random.rand`, one value per column.
                    A supplied array is copied, never modified in place.

    Returns
    -------
    numpy.ndarray of weights after `n_iter` updates.
    """
    if init_weights is None:
        weights = np.random.rand(pq_df_oh.shape[1])
    else:
        # Copy so the in-place updates below do not touch the caller's array.
        weights = np.array(init_weights, dtype=float)
    for _ in range(n_iter):
        weights += learning_rate * weights_gradient(weights, expectations, pq_df_oh)
    return weights

In [45]:
# Seed NumPy's global RNG so the random initialisations (here and inside
# Maximization) give the same result on every run.
np.random.seed(42)

# Alternate E- and M-steps, starting from random weights, and report the
# per-row average log-likelihood after each round.
weights = np.random.rand(pq_df_oh.shape[1])
for i in range(2):
    expectations = Expectation(weights, pq_df, pq_df_oh)
    weights = Maximization(expectations, pq_df_oh)
    print('LLL = ', log_likelihood(expectations, pq_df_oh, weights)/ pq_df_oh.shape[0])

LLL =  -0.5702204595848975
LLL =  -0.421954227876556


In [107]:
# Player ids in the order of the encoder's one-hot columns.
# Assumes the encoder's first input column is 'pid' (as in the fitting cell),
# so the first len(players) EM weights belong to players.
players = np.asarray(encoder.categories_[0]).astype(np.int32)
rating_em = pd.DataFrame({'player_id': players, 'score': weights[:len(players)]})
rating_em = rating_em.sort_values(by='score', ascending=False)
players_df = pd.DataFrame.from_dict(players_dict, orient='index')

# Attach player names; 'place' is the row position in the merged table.
rating_named_em = rating_em.merge(players_df.rename(columns={'id': 'player_id'}), on='player_id')
rating_named_em['place'] = rating_named_em.index

rating_named_em.head(25)

Unnamed: 0,player_id,score,name,patronymic,surname,place
0,190096,1.338966,Дмитрий,Михайлович,Мартьянов,0
1,139511,1.2315,Михаил,Сергеевич,Ганущак,1
2,191874,1.231012,Марина,Александровна,Христенко,2
3,24342,1.21333,Денис,Владимирович,Пахомов,3
4,707,1.161986,Елена,Андреевна,Александрова,4
5,17750,1.109744,Галина,Вячеславовна,Лазарева,5
6,199961,1.073025,Владимир,Александрович,Смирнов,6
7,22474,1.060584,Илья,Сергеевич,Немец,7
8,28751,1.057177,Иван,Николаевич,Семушин,8
9,4270,1.039831,Александра,Владимировна,Брутер,9


In [108]:
# Join the EM ranking with the test ranking on player id; overlapping column
# names get the '_pred' / '_test' suffixes.
rating_full_em = pd.merge(
    rating_named_em,
    rating_test,
    on='player_id',
    suffixes=['_pred', '_test'],
)

In [111]:
# Display the baseline (logistic regression) predicted-vs-test table.
rating_full

Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
0,27403,3.705755,Максим,Михайлович,Руссо,0,14434,4
1,4270,3.550033,Александра,Владимировна,Брутер,1,14068,7
2,30152,3.421566,Артём,Сергеевич,Сорожкин,2,14741,0
3,27822,3.298092,Михаил,Владимирович,Савченков,4,14665,1
4,28751,3.282823,Иван,Николаевич,Семушин,5,14665,2
...,...,...,...,...,...,...,...,...
22168,208188,-2.758978,Валерия,Дмитриевна,Ермошкина,55618,0,26553
22169,208186,-2.898312,Альбина,Маратовна,Гайнулина,55619,0,26552
22170,208183,-2.898312,Василий,Владимирович,Александров,55620,0,26551
22171,208182,-2.898312,Дмитрий,Сергеевич,Рырак,55621,0,26550


In [112]:
# Display the EM predicted-vs-test table.
rating_full_em

Unnamed: 0,player_id,score_pred,name,patronymic,surname,place_pred,score_test,place_test
0,707,1.161986,Елена,Андреевна,Александрова,4,2390,10682
1,22474,1.060584,Илья,Сергеевич,Немец,7,6344,5008
2,28751,1.057177,Иван,Николаевич,Семушин,8,14665,2
3,4270,1.039831,Александра,Владимировна,Брутер,9,14068,7
4,27822,1.025991,Михаил,Владимирович,Савченков,11,14665,1
...,...,...,...,...,...,...,...,...
22168,204366,-4.244970,Анна,Сергеевна,Лапшина,55571,0,26075
22169,202238,-4.247810,Дарья,Руслановна,Яшуткина,55577,0,21356
22170,216696,-4.256974,Александр,,Лиховидов,55586,0,24293
22171,218396,-4.260736,Илья,Олегович,Шахов,55588,0,18962


In [116]:
# Kendall correlation between the EM-predicted places and the test places.
rating_full_em[['place_pred','place_test']].corr('kendall')['place_pred']['place_test']

0.5011805424678661

In [81]:
# Inspect the `rating` score table.
# NOTE(review): the recorded output below shows the same scores as the EM table
# (e.g. player 190096 with 1.338966), although `rating` is assigned from
# clf.coef_ in this file. It looks like stale kernel state from out-of-order
# execution; confirm with Restart & Run All.
rating

Unnamed: 0,player_id,score
29937,190096,1.338966
15872,139511,1.231500
31483,191874,1.231012
3363,24342,1.213330
93,707,1.161986
...,...,...
34770,195531,-4.417665
35859,196739,-4.419624
34754,195514,-4.422932
24915,178474,-4.425285


In [88]:
# Inspect the weight vector produced by the EM loop.
weights

array([-1.2091781 , -0.9383701 , -0.26911188, ..., -0.37335121,
       -0.31807361, -0.40383207])

In [89]:
# Inspect the logistic regression coefficients.
clf.coef_

array([[0.15730694, 1.05967814, 0.17651964, ..., 0.98917429, 1.84226592,
        3.56371494]])