## 0. Подготовительная часть ##

In [55]:
from collections import defaultdict
from datetime import datetime
import os
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from operator import getitem
from collections import OrderedDict, defaultdict
from copy import deepcopy
import requests
import itertools
import math
import scipy
from tqdm import tqdm

#others
from tqdm import tqdm
from IPython.display import display, clear_output

#ml
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import Ridge, LogisticRegression, LogisticRegressionCV, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats.mstats import spearmanr, kendalltau

In [2]:
DATA_PATH = 'chgk_data'
PLAYERS_PATH = os.path.join(DATA_PATH, 'players.pkl')
RESULTS_PATH = os.path.join(DATA_PATH, 'results.pkl')
TOURNAMENTS_PATH = os.path.join(DATA_PATH, 'tournaments.pkl')


## 1. Подготовка данных ##

In [3]:
with open(RESULTS_PATH, 'rb') as file:
    results = pickle.load(file)
with open(PLAYERS_PATH, 'rb') as file:
    players = pickle.load(file)
with open(TOURNAMENTS_PATH, 'rb') as file:
    tournaments = pickle.load(file)

In [4]:
results_clean = {}
for key, value in results.items():
    if len(value) and 'mask' in value[0].keys() and value[0]['mask'] is not None:
        results_clean[key] = value

In [6]:
results_transformed = {}
for tournament_id, value in results_clean.items():
    current = {}
    current['teams'] = []
    current['results'] = []
    current['position'] = []
    current['team_members'] = []
    tournament_isOK = False
    tournament_len = 0
    #print(tournament_id)
    for team_data in value:
        if team_data['mask'] is None or '?' in team_data['mask'] or 'X' in team_data['mask'] or not len(team_data['mask']):
            continue
        if not tournament_len:
            tournament_len = len(team_data['mask'])
            tournament_isOK = True
        if len(team_data['mask']) != tournament_len:
            tournament_isOK = False
            break
        current['teams'].append(team_data['team']['id'])
        current['results'].append(team_data['mask'])
        if 'position' in team_data.keys():
            current['position'].append(team_data['position'])
        else:
            current['position'].append(-1)
        members = [mem['player']['id'] for mem in team_data['teamMembers']]
        current['team_members'].append(members)
    current['tournament_len'] = tournament_len
    if tournament_isOK:
        results_transformed[tournament_id] = current

In [7]:
results = pd.DataFrame(results_transformed).transpose()
results['id'] = results.index
results.reset_index(drop=True, inplace=True)

In [8]:
results.head()

Unnamed: 0,teams,results,position,team_members,tournament_len,id
0,"[1, 2, 670, 173, 175, 188, 240, 176, 5, 7, 3, ...",[011101110110111000110111001111111111001111110...,"[1, 2.5, 2.5, 4.5, 4.5, 6.5, 6.5, 8, 9, 10, 11...","[[1560, 2935, 3270, 4878, 18935, 32979, 36497]...",90,22
1,"[5, 188, 1, 2, 7, 175, 3, 168, 313, 1902, 173,...",[001111101111111001011101111011100001110010111...,"[1, 2, 3, 4, 5, 6, 8.5, 8.5, 8.5, 8.5, 11, 12....","[[1603, 5935, 17050, 18036, 18332, 30990], [32...",90,76
2,"[1, 188, 670, 2, 168, 5335, 5, 313, 414, 700, ...",[110110111101111111011110011110111111100111111...,"[1, 2.5, 2.5, 4, 5, 6, 9, 9, 9, 9, 9, 12.5, 12...","[[2935, 3270, 18935, 20612, 29800, 32979], [32...",90,141
3,"[5, 1, 168, 188, 670, 2, 175, 414, 214, 967, 2...",[101011110010010101111010111110110011011111111...,"[1, 2.5, 2.5, 4.5, 4.5, 6.5, 6.5, 8, 10, 10, 1...","[[5935, 17050, 18036, 18332, 30990, 35065], [2...",90,226
4,"[2, 188, 5, 670, 26, 173, 6000, 1, 168, 407, 1...",[011111000111101111110111111110111101001011111...,"[1, 2, 3.5, 3.5, 6, 6, 6, 8.5, 8.5, 10.5, 10.5...","[[27403, 25882, 30475, 34846, 13551, 707, 2180...",90,315


In [9]:
players = pd.DataFrame(players).transpose()
players['id'] = players['id'].astype(int)

In [10]:
tournaments = pd.DataFrame(tournaments).transpose()
tournaments['id'] = tournaments['id'].astype(int)
tournaments['start_date'] = pd.to_datetime(tournaments['dateStart'].str[:10], infer_datetime_format=True)
tournaments['finish_date'] = pd.to_datetime(tournaments['dateEnd'].str[:10], infer_datetime_format=True)

tournaments_train = tournaments[tournaments['start_date'].dt.year == 2019].reset_index(drop=True)
tournaments_test = tournaments[tournaments['start_date'].dt.year == 2020].reset_index(drop=True)

In [11]:
tournaments_train.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date
0,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-05,2019-01-09
1,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-25,2019-01-29
2,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-03-01,2019-03-05
3,4975,Балтийский Берег. 5 игра,2019-04-05T19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-04-05,2019-04-09
4,4986,ОВСЧ. 6 этап,2019-02-15T20:00:00+03:00,2019-02-19T20:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59140, 'name': 'Борис', 'patronymic': ...",{'dateRequestsAllowedTo': '2019-02-19T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-02-15,2019-02-19


In [12]:
tournaments_test.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date
0,4628,Семь сорок,2020-12-30T16:00:00+03:00,2020-12-30T16:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",{'dateRequestsAllowedTo': '2020-12-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-12-30,2020-12-30
1,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,2020-02-27T23:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 2421, 'name': 'Ася', 'patronymic': 'Се...",{'dateRequestsAllowedTo': '2020-02-27T18:00:00...,"{'1': 13, '2': 13, '3': 13}",2020-02-21,2020-02-27
2,5151,Яровой,2020-08-01T14:00:00+03:00,2020-08-05T14:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 22325, 'name': 'Михаил', 'patronymic':...",{'dateRequestsAllowedTo': '2020-07-24T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2020-08-01,2020-08-05
3,5414,Синхрон северных стран,2020-01-03T19:00:00+03:00,2020-01-10T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2020-01-10T23:59:00...,"{'1': 12, '2': 12, '3': 12}",2020-01-03,2020-01-10
4,5477,Онлайн: Синхрон Урюбджирова,2020-04-18T19:00:00+03:00,2020-04-30T19:00:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 91324, 'name': 'Эрдни', 'patronymic': ...",{'dateRequestsAllowedTo': '2020-04-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-04-18,2020-04-30


In [13]:
'train length: ' + str(tournaments_train.shape[0]), 'test length: ' + str(tournaments_test.shape[0])

('train length: 687', 'test length: 418')

In [14]:
tournaments_results_train = tournaments_train.merge(results, on='id', how='inner')
tournaments_results_test = tournaments_test.merge(results, on='id', how='inner')

In [15]:
tournaments_results_train.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date,teams,results,position,team_members,tournament_len
0,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-05,2019-01-09,"[45556, 1030, 4252, 5444, 40931, 47075, 53185,...","[111111111011111110111111111100010010, 1111111...","[1, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 12...","[[6212, 18332, 18036, 22799, 15456, 26089], [1...",36
1,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-25,2019-01-29,"[45556, 69309, 27522, 67979, 2055, 4032, 6936,...","[111111111111111011110101110111111111, 1111111...","[1.5, 1.5, 3.5, 3.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[[6212, 18332, 18036, 22799, 7008, 26089], [27...",36
2,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-03-01,2019-03-05,"[2865, 69309, 6874, 27522, 56664, 1021, 4622, ...","[111111111111111111111111111101111111, 1111111...","[1.5, 1.5, 4, 4, 4, 7, 7, 7, 13, 13, 13, 13, 1...","[[19411, 24290, 32979, 5195, 33806, 9680], [27...",36
3,4975,Балтийский Берег. 5 игра,2019-04-05T19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-04-05,2019-04-09,"[4174, 45556, 70530, 7864, 7896, 27119, 33088,...","[110101011111111111110011111110111110, 1111010...","[2, 2, 2, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7...","[[13345, 29425, 52183, 34417, 30772, 93424], [...",36
4,5000,Чёрная Быль,2019-04-26T18:00:00+03:00,2019-04-30T18:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59436, 'name': 'Олег', 'patronymic': '...",{'dateRequestsAllowedTo': '2019-04-30T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-04-26,2019-04-30,"[1799, 45365, 47297, 55270, 66744, 70991]","[101111011101110101111011100001011011, 0011110...","[1.5, 1.5, 3, 4, 5.5, 5.5]","[[26972, 101697, 30720, 146440], [133148, 2459...",36


In [16]:
'train length: ' + str(tournaments_results_train.shape[0]), 'test length: ' + str(tournaments_results_test.shape[0])

('train length: 603', 'test length: 156')

## 2. Baseline-модель (логистическая регрессия) ##

Пусть, каждый игрок будет закодирован one-hot вектором и список вопросов так же, тогда попробуем по ним предсказывать вероятность ответа игрока на вопрос

In [25]:
train_players_list = []
tot_players_questions = 0
for team_players_list, n_quest in zip(tournaments_results_train['team_members'], tournaments_results_train['tournament_len']):
    for players_list in team_players_list:
        for player in players_list:
            train_players_list.append(player)
            tot_players_questions += n_quest
            
unique_train_players_list = list(set(train_players_list))
n_unique_train_players_list = len(unique_train_players_list)
n_unique_train_players_list, tot_players_questions

(55150, 13747623)

In [26]:
n_questions = tournaments_results_train['tournament_len'].sum()
n_questions

28194

In [27]:
map_ohe_player_id = {p: i for i, p in enumerate(unique_train_players_list)}
map_ohe_id_player = {i: p for p, i in map_ohe_player_id.items()}

In [31]:
X = scipy.sparse.lil_matrix((tot_players_questions, n_unique_train_players_list + n_questions),  dtype = int)
y = scipy.sparse.lil_matrix((tot_players_questions, 1), dtype = float)

In [53]:
idx = 0
for teams, results in tqdm(zip(tournaments_results_train['team_members'], tournaments_results_train['results'])):
    for team, mask_answers in zip(teams, results):    
        answers = list(map(int,[i for i in mask_answers]))
        for ans_num, answer in enumerate(answers): 
            for player in team:             
                X[idx, map_ohe_player_id[player]] = 1
                X[idx, n_unique_train_players_list + ans_num] = 1
                y[idx] = answer
                idx += 1

603it [18:58,  1.89s/it]


In [54]:
y = y.todense()

In [56]:
lrm = LogisticRegression(max_iter=1000)

In [57]:
lrm.fit(X, y)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [58]:
players_ks = lrm.coef_[0][:n_unique_train_players_list]
map_player_id_k = dict()
for player_id, player in map_ohe_id_player.items():
    map_player_id_k[player] = players_ks[player_id]
        
sorted_ks = dict(sorted(map_player_id_k.items(), key = lambda item: item[1],  reverse=True))

In [63]:
players

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова
12,12,Артур,Евгеньевич,Абальян
13,13,Эрик,Евгеньевич,Абальян
...,...,...,...,...
224700,224700,Артём,Евгеньевич,Садов
224701,224701,Даниил,Олегович,Трефилов
224702,224702,Владимир,Араратович,Басенцян
224703,224703,Руслан,Ринатович,Дауранов


In [68]:
pre_players_rating = pd.DataFrame.from_dict(dict(itertools.islice(sorted_ks.items(), 50)), orient='index', columns=['score'])
pre_players_rating['id'] = pre_players_rating.index.to_series()
players_rating = pre_players_rating.merge(players, on='id', how='inner')
players_rating['player'] = players_rating['name']+' '+players_rating['surname']
players_rating['position'] = range(1, len(players_rating) + 1)

In [72]:
players_rating[['score', 'id', 'player','position']].head(50)

Unnamed: 0,score,id,player,position
0,2.53168,121433,София Савенко,1
1,2.53168,186002,Инга Лоренц,2
2,2.53168,202410,Валентина Подюкова,3
3,2.332983,135951,Артём Стетой,4
4,2.332983,170977,Давид Кан,5
5,2.332983,171845,Михаил Завьялов,6
6,2.332983,199963,Елена Бровченко,7
7,2.332983,216863,Глеб Гаврилов,8
8,2.332983,100134,Алина Бауэр,9
9,2.332983,103161,Надежда Бирюкова,10


In [None]:
realtime_rating = dict()
for player_id in unique_train_players_list:
    url = f'https://rating.chgk.info/api/players/{player_id}/rating/last'
    rating = 0
    try:
        rating = int(requests.get(url).json()['rating_position'])
    except Exception:
        pass
    realtime_rating[player_id] = rating

In [None]:
players_rating['real_rating'] = players_rating['id'].apply(lambda x: realtime_rating[x])

## 3. Оценка качества ##

Для ранжирования команд будем рассчитывать сумму коэффициентов входящих в нее игроков и подавать на вход сигмоиды, таким образом имитируя поведение модели.

In [75]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [76]:
teams_real_ratings = []
teams_pred_ratings = []

for team_list, teams, poistions  in zip(tournaments_results_test['teams'], tournaments_results_test['team_members'], tournaments_results_test['position']):   
    teams_real_ratings.append(list(range(1, len(team_list) + 1)))

    teams_rating = dict()
    for team_id, team in zip(team_list, teams):
        team_scores = []
        for player in team:
            if player in map_player_id_k.keys():
                team_scores.append(map_player_id_k[player])
                
        if (len(team_scores) == 0):
            teams_rating[team_id] = 0
        else:
            teams_rating[team_id] = sigmoid(np.sum(team_scores))

    ranged_teams = list(dict(sorted(teams_rating.items(), key = lambda item: item[1],  reverse=True)).keys())    
    rating = []
    for team_id in team_list:
        rating.append(ranged_teams.index(team_id) + 1)

    teams_pred_ratings.append(rating)

In [78]:
spearmanr_corr = []
kendalltau_corr = []
for i in range(len(teams_real_ratings)):
    spearmanr_corr.append(np.abs(scipy.stats.spearmanr(teams_real_ratings[i], teams_pred_ratings[i]).correlation))
    kendalltau_corr.append(np.abs(scipy.stats.kendalltau(teams_real_ratings[i], teams_pred_ratings[i]).correlation))
        
print(f'Корреляция Спирмена: {np.mean(spearmanr_corr)}')
print(f'Корреляция Кендалла: {np.mean(kendalltau_corr)}')

Корреляция Спирмена: 0.7357880517732427
Корреляция Кендалла: 0.5657925367288706


## 4. EM-модель ##

В разработке(

## 5. Рейтинг турниров ##

In [79]:
questions_ks = lrm.coef_[0][n_unique_train_players_list:]

In [80]:
tournament_rating_scores = dict()
current_idx = 0
for tournament_id, n_questions in zip(tournaments_results_train['id'], tournaments_results_train['tournament_len']):
    tournament_rating_scores[tournament_id] = np.mean(questions_ks[current_idx : current_idx + n_questions])
    current_idx += n_questions

In [82]:
pre_tournament_rating = pd.DataFrame.from_dict(tournament_rating_scores, orient='index', columns=['score'])
pre_tournament_rating['id'] =  pre_tournament_rating.index.to_series()

tournament_rating = pre_tournament_rating.merge(tournaments, on='id', how='inner')
tournament_rating = tournament_rating.sort_values('score')

tournament_rating['position'] = range(1, len(tournament_rating) + 1)

Самые сложные турниры

In [83]:
tournament_rating[['score','id','name','start_date','position']].head(20)

Unnamed: 0,score,id,name,start_date,position
11,-1.505647,5021,Синхрон-lite. Выпуск XXII,2019-02-08,1
3,-1.206059,4975,Балтийский Берег. 5 игра,2019-04-05,2
4,-0.971738,5000,Чёрная Быль,2019-04-26,3
2,-0.71589,4974,Балтийский Берег. 4 игра,2019-03-01,4
8,-0.590138,5011,(а)Синхрон-lite. Лига старта. Эпизод IV,2019-03-01,5
1,-0.403563,4973,Балтийский Берег. 3 игра,2019-01-25,6
541,-0.332286,6012,Зеркало Горького Октября,2019-11-01,7
368,-0.322071,5749,Чемпионат Кипра среди школьников,2019-06-09,8
579,-0.297432,6113,Чацвёртая актава. Ліга нацый: Беларусь,2019-12-12,9
480,-0.295775,5927,Кубок пустого стула,2019-11-08,10


Самые "простые" турниры

In [87]:
tournament_rating[['score','id','name','start_date','position']].tail(20).sort_values('score', ascending=False)

Unnamed: 0,score,id,name,start_date,position
10,1.932754,5013,(а)Синхрон-lite. Лига старта. Эпизод V,2019-04-05,603
12,1.621307,5025,Кубок городов,2019-03-29,602
9,0.698763,5012,Школьный Синхрон-lite. Выпуск 2.5,2019-04-05,601
13,0.501408,5042,Студенческий Кубок княгини Ольги,2019-03-30,600
5,0.440756,5008,Школьный Синхрон-lite. Выпуск 2.3,2019-01-25,599
554,0.255046,6055,Чемпионат Минска. Лига А. Тур первый,2019-10-28,598
352,0.235926,5724,Загадочный ларец,2019-06-21,597
107,0.207243,5400,Триптих. Весна,2019-04-26,596
365,0.206563,5744,Самая лёгкая лодка в мире,2019-08-23,595
278,0.185778,5621,Berlin Alexanderplatz,2019-06-08,594


Визуально, по названиям, кажется, что логично, поскольку верхний топ каежтся более устрашающим и серьезным, чем второй, в сосотаве которого большая часть школьных и студенчеких турниров, в то время как среди сложный есть Чемпионат мира и Линч