## 0. Подготовительная часть ##

In [1]:
from collections import defaultdict
from datetime import datetime
import os
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from operator import getitem
from collections import OrderedDict, defaultdict
from copy import deepcopy
import requests
import itertools
import math

#others
from tqdm import tqdm
from IPython.display import display, clear_output

#ml
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegressionCV, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats.mstats import spearmanr, kendalltau

In [2]:
# Root folder with the pickled rating dumps; every file path below is derived from it.
DATA_PATH = 'chgk_data'
PLAYERS_PATH, RESULTS_PATH, TOURNAMENTS_PATH = (
    os.path.join(DATA_PATH, filename)
    for filename in ('players.pkl', 'results.pkl', 'tournaments.pkl')
)


## 1. Подготовка данных ##

In [37]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code — only load dumps from a trusted source.
results = _read_pickle(RESULTS_PATH)
players = _read_pickle(PLAYERS_PATH)
tournaments = _read_pickle(TOURNAMENTS_PATH)

In [38]:
# Keep only tournaments that have at least one team record and whose first
# record carries a non-null answer mask.
results_clean = {
    tournament_id: team_records
    for tournament_id, team_records in results.items()
    if len(team_records) and team_records[0].get('mask') is not None
}

In [46]:
# Inspect the raw team records of one tournament (key 1526) to see which fields are available.
results_clean[1526]

[{'team': {'id': 168,
   'name': 'Сборная Кирибати',
   'town': {'id': 285, 'name': 'Санкт-Петербург'}},
  'mask': '111010111111111111111111111110101101111111',
  'current': {'name': 'Сборная Кирибати',
   'town': {'id': 285, 'name': 'Санкт-Петербург'}},
  'questionsTotal': 37,
  'synchRequest': None,
  'position': 1,
  'controversials': [],
  'flags': [],
  'teamMembers': [{'flag': 'Б',
    'usedRating': 10105,
    'rating': 10105,
    'player': {'id': 15727,
     'name': 'Александр',
     'patronymic': 'Геннадьевич',
     'surname': 'Коробейников'}},
   {'flag': 'Б',
    'usedRating': 6616,
    'rating': 7941,
    'player': {'id': 8082,
     'name': 'Сергей',
     'patronymic': 'Валерьевич',
     'surname': 'Григорьев'}},
   {'flag': 'Б',
    'usedRating': 4755,
    'rating': 7124,
    'player': {'id': 15664,
     'name': 'Кирилл',
     'patronymic': 'Сергеевич',
     'surname': 'Корконосенко'}},
   {'flag': 'Б',
    'usedRating': 2631,
    'rating': 5261,
    'player': {'id': 5162,


In [62]:
# Reshape the raw per-team records into parallel lists per tournament:
#   teams          - team ids
#   results        - answer masks (one character per question)
#   position       - final positions (-1 when the record has no 'position' key)
#   team_members   - list of player ids for every team
#   tournament_len - length of the answer mask shared by all kept teams
# A tournament is kept only if it has at least one usable mask and every
# usable mask has the same length.
results_transformed = {}
for tournament_id, value in results_clean.items():
    current = {'teams': [], 'results': [], 'position': [], 'team_members': []}
    tournament_isOK = False
    tournament_len = 0
    for team_data in value:
        # `.get` instead of indexing: the upstream filter only guarantees the
        # 'mask' key for the first team, so later records may lack it entirely.
        mask = team_data.get('mask')
        # Skip teams whose mask is missing, empty, or contains '?' / 'X' marks.
        if not mask or '?' in mask or 'X' in mask:
            continue
        if not tournament_len:
            # The first usable mask fixes the expected number of questions.
            tournament_len = len(mask)
            tournament_isOK = True
        elif len(mask) != tournament_len:
            # Masks of different lengths cannot be aligned question by
            # question, so the whole tournament is discarded.
            tournament_isOK = False
            break
        current['teams'].append(team_data['team']['id'])
        current['results'].append(mask)
        current['position'].append(team_data.get('position', -1))
        members = [mem['player']['id'] for mem in team_data['teamMembers']]
        current['team_members'].append(members)
    current['tournament_len'] = tournament_len
    if tournament_isOK:
        results_transformed[tournament_id] = current

In [63]:
# One row per tournament; the tournament id moves from the index into an
# explicit `id` column so the frame can be merged with the tournaments table.
results = (
    pd.DataFrame(results_transformed)
    .T
    .assign(id=lambda frame: frame.index)
    .reset_index(drop=True)
)

In [64]:
# Preview the first rows of the reshaped results table (one row per tournament).
results.head()

Unnamed: 0,teams,results,position,team_members,tournament_len,id
0,"[1, 2, 670, 173, 175, 188, 240, 176, 5, 7, 3, ...",[011101110110111000110111001111111111001111110...,"[1, 2.5, 2.5, 4.5, 4.5, 6.5, 6.5, 8, 9, 10, 11...","[[1560, 2935, 3270, 4878, 18935, 32979, 36497]...",90,22
1,"[5, 188, 1, 2, 7, 175, 3, 168, 313, 1902, 173,...",[001111101111111001011101111011100001110010111...,"[1, 2, 3, 4, 5, 6, 8.5, 8.5, 8.5, 8.5, 11, 12....","[[1603, 5935, 17050, 18036, 18332, 30990], [32...",90,76
2,"[1, 188, 670, 2, 168, 5335, 5, 313, 414, 700, ...",[110110111101111111011110011110111111100111111...,"[1, 2.5, 2.5, 4, 5, 6, 9, 9, 9, 9, 9, 12.5, 12...","[[2935, 3270, 18935, 20612, 29800, 32979], [32...",90,141
3,"[5, 1, 168, 188, 670, 2, 175, 414, 214, 967, 2...",[101011110010010101111010111110110011011111111...,"[1, 2.5, 2.5, 4.5, 4.5, 6.5, 6.5, 8, 10, 10, 1...","[[5935, 17050, 18036, 18332, 30990, 35065], [2...",90,226
4,"[2, 188, 5, 670, 26, 173, 6000, 1, 168, 407, 1...",[011111000111101111110111111110111101001011111...,"[1, 2, 3.5, 3.5, 6, 6, 6, 8.5, 8.5, 10.5, 10.5...","[[27403, 25882, 30475, 34846, 13551, 707, 2180...",90,315


In [17]:
# Turn the raw players mapping into a frame with one row per player and an integer `id`.
players = (
    pd.DataFrame(players)
    .T
    .assign(id=lambda frame: frame['id'].astype(int))
)

In [18]:
# Calendar years (by tournament start date) that define the train / test split.
TRAIN_YEAR = 2019
TEST_YEAR = 2020
# `dateStart` / `dateEnd` are ISO-8601 timestamps such as
# '2020-12-30T16:00:00+03:00'; their first 10 characters are the local date.
DATE_FORMAT = '%Y-%m-%d'

tournaments = pd.DataFrame(tournaments).transpose()
tournaments['id'] = tournaments['id'].astype(int)
# An explicit format replaces `infer_datetime_format=True`, which is deprecated
# in pandas 2.x, and makes parsing deterministic.
tournaments['start_date'] = pd.to_datetime(tournaments['dateStart'].str[:10], format=DATE_FORMAT)
tournaments['finish_date'] = pd.to_datetime(tournaments['dateEnd'].str[:10], format=DATE_FORMAT)

start_year = tournaments['start_date'].dt.year
tournaments_train = tournaments[start_year == TRAIN_YEAR].reset_index(drop=True)
tournaments_test = tournaments[start_year == TEST_YEAR].reset_index(drop=True)

In [20]:
# Preview the first rows of the training split (tournaments that start in 2019).
# NOTE(review): the stored output of this cell lists 2020 tournaments and matches
# the test-split preview row for row — it looks stale; re-run the notebook top to
# bottom to refresh it.
tournaments_train.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date
0,4628,Семь сорок,2020-12-30T16:00:00+03:00,2020-12-30T16:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",{'dateRequestsAllowedTo': '2020-12-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-12-30,2020-12-30
1,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,2020-02-27T23:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 2421, 'name': 'Ася', 'patronymic': 'Се...",{'dateRequestsAllowedTo': '2020-02-27T18:00:00...,"{'1': 13, '2': 13, '3': 13}",2020-02-21,2020-02-27
2,5151,Яровой,2020-08-01T14:00:00+03:00,2020-08-05T14:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 22325, 'name': 'Михаил', 'patronymic':...",{'dateRequestsAllowedTo': '2020-07-24T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2020-08-01,2020-08-05
3,5414,Синхрон северных стран,2020-01-03T19:00:00+03:00,2020-01-10T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2020-01-10T23:59:00...,"{'1': 12, '2': 12, '3': 12}",2020-01-03,2020-01-10
4,5477,Онлайн: Синхрон Урюбджирова,2020-04-18T19:00:00+03:00,2020-04-30T19:00:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 91324, 'name': 'Эрдни', 'patronymic': ...",{'dateRequestsAllowedTo': '2020-04-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-04-18,2020-04-30


In [21]:
# Preview the first rows of the test split (tournaments that start in 2020).
tournaments_test.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date
0,4628,Семь сорок,2020-12-30T16:00:00+03:00,2020-12-30T16:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",{'dateRequestsAllowedTo': '2020-12-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-12-30,2020-12-30
1,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,2020-02-27T23:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 2421, 'name': 'Ася', 'patronymic': 'Се...",{'dateRequestsAllowedTo': '2020-02-27T18:00:00...,"{'1': 13, '2': 13, '3': 13}",2020-02-21,2020-02-27
2,5151,Яровой,2020-08-01T14:00:00+03:00,2020-08-05T14:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 22325, 'name': 'Михаил', 'patronymic':...",{'dateRequestsAllowedTo': '2020-07-24T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2020-08-01,2020-08-05
3,5414,Синхрон северных стран,2020-01-03T19:00:00+03:00,2020-01-10T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2020-01-10T23:59:00...,"{'1': 12, '2': 12, '3': 12}",2020-01-03,2020-01-10
4,5477,Онлайн: Синхрон Урюбджирова,2020-04-18T19:00:00+03:00,2020-04-30T19:00:00+03:00,"{'id': 8, 'name': 'Асинхрон'}",/seasons/53,"[{'id': 91324, 'name': 'Эрдни', 'patronymic': ...",{'dateRequestsAllowedTo': '2020-04-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",2020-04-18,2020-04-30


In [25]:
# Number of tournaments in each split before joining with the results table.
f'train length: {tournaments_train.shape[0]}', f'test length: {tournaments_test.shape[0]}'

('train length: 687', 'test length: 418')

In [65]:
# Inner join on the tournament id: tournaments without usable results are dropped.
tournaments_results_train = pd.merge(tournaments_train, results, how='inner', on='id')
tournaments_results_test = pd.merge(tournaments_test, results, how='inner', on='id')

In [66]:
# Preview the training tournaments joined with their per-team results.
tournaments_results_train.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,start_date,finish_date,teams,results,position,team_members,tournament_len
0,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-05,2019-01-09,"[45556, 1030, 4252, 5444, 40931, 47075, 53185,...","[111111111011111110111111111100010010, 1111111...","[1, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 12...","[[6212, 18332, 18036, 22799, 15456, 26089], [1...",36
1,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-01-25,2019-01-29,"[45556, 69309, 27522, 67979, 2055, 4032, 6936,...","[111111111111111011110101110111111111, 1111111...","[1.5, 1.5, 3.5, 3.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[[6212, 18332, 18036, 22799, 7008, 26089], [27...",36
2,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-03-01,2019-03-05,"[2865, 69309, 6874, 27522, 56664, 1021, 4622, ...","[111111111111111111111111111101111111, 1111111...","[1.5, 1.5, 4, 4, 4, 7, 7, 7, 13, 13, 13, 13, 1...","[[19411, 24290, 32979, 5195, 33806, 9680], [27...",36
3,4975,Балтийский Берег. 5 игра,2019-04-05T19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-04-05,2019-04-09,"[4174, 45556, 70530, 7864, 7896, 27119, 33088,...","[110101011111111111110011111110111110, 1111010...","[2, 2, 2, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7...","[[13345, 29425, 52183, 34417, 30772, 93424], [...",36
4,5000,Чёрная Быль,2019-04-26T18:00:00+03:00,2019-04-30T18:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59436, 'name': 'Олег', 'patronymic': '...",{'dateRequestsAllowedTo': '2019-04-30T23:59:59...,"{'1': 12, '2': 12, '3': 12}",2019-04-26,2019-04-30,"[1799, 45365, 47297, 55270, 66744, 70991]","[101111011101110101111011100001011011, 0011110...","[1.5, 1.5, 3, 4, 5.5, 5.5]","[[26972, 101697, 30720, 146440], [133148, 2459...",36


In [67]:
# Number of tournaments left in each split after the inner join with results.
f'train length: {tournaments_results_train.shape[0]}', f'test length: {tournaments_results_test.shape[0]}'

('train length: 603', 'test length: 156')

## 2. Baseline-модель (логистическая регрессия) ##

Пусть каждый игрок и каждый вопрос закодированы one-hot-векторами; тогда попробуем по ним предсказывать вероятность правильного ответа игрока на вопрос.

In [69]:
# Flatten tournament -> team -> roster into one list of player ids (repeats kept).
train_players_list = [
    player_id
    for tournament_rosters in tournaments_results_train['team_members']
    for roster in tournament_rosters
    for player_id in roster
]

# Deduplicate the ids and report how many distinct players appear in training.
unique_train_players_list = list(set(train_players_list))
n_unique_train_players_list = len(unique_train_players_list)
n_unique_train_players_list

55150

In [70]:
# Total number of questions across all training tournaments
# (sum of the per-tournament answer-mask lengths).
n_questions = tournaments_results_train['tournament_len'].sum()
n_questions

28194

In [71]:
# player id -> one-hot column index, following the order of `unique_train_players_list`
map_ohe_player_id = dict(zip(unique_train_players_list, range(len(unique_train_players_list))))
# one-hot column index -> player id (inverse of the mapping above)
map_ohe_id_player = dict(zip(map_ohe_player_id.values(), map_ohe_player_id.keys()))