In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from collections import defaultdict

from scipy.stats import spearmanr, kendalltau

In [2]:
players = pd.read_pickle('players.pkl')
players = pd.DataFrame(players).T
players['id'] = players['id'].apply(int)
players.head()

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова
12,12,Артур,Евгеньевич,Абальян
13,13,Эрик,Евгеньевич,Абальян


In [3]:
tournaments = pd.read_pickle('tournaments.pkl')
tournaments = pd.DataFrame(tournaments).T
tournaments[['type_id', 'type_name']] = tournaments['type'].apply(lambda x: pd.Series({'type_id':x['id'], 'type_name':x['name']}))

tournaments['year'] = tournaments['dateStart'].apply(lambda x: int(x[:4]))
tournaments = tournaments[(tournaments.year.isin([2019, 2020]))]
tournaments_ids = set(tournaments['id'].to_list())

tournaments.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty,type_id,type_name,year
4628,4628,Семь сорок,2020-12-30T16:00:00+03:00,2020-12-30T16:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",{'dateRequestsAllowedTo': '2020-12-30T23:55:00...,"{'1': 12, '2': 12, '3': 12}",3,Синхрон,2020
4772,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}",3,Синхрон,2019
4957,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,2020-02-27T23:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/53,"[{'id': 2421, 'name': 'Ася', 'patronymic': 'Се...",{'dateRequestsAllowedTo': '2020-02-27T18:00:00...,"{'1': 13, '2': 13, '3': 13}",3,Синхрон,2020
4973,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}",3,Синхрон,2019
4974,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}",3,Синхрон,2019


In [4]:
results = pd.read_pickle('results.pkl')
mask_unique_symbols = set()
for key, val in results.items():
    for team in val:
        if team.get('mask'):
            mask_unique_symbols.update(set(team['mask']))
print(mask_unique_symbols)

{'X', '0', '?', '1'}


In [5]:
#отбираем турниры с масками (есть результаты) и кол-во вопросов у всех команд одинаковое
keys = []
for key, val in results.items():
    if len(val)>0 and key in tournaments_ids:
        mask_fl = 0
        mask_lengths = set()
        for team in val:
            if team.get('mask'):
                mask_fl = 1
                mask_lengths.add(len(team['mask']))
        if mask_fl == 1 and len(mask_lengths) == 1:
            keys.append(key)

In [6]:
keys = set(keys)

In [7]:
tournaments = tournaments[tournaments['id'].isin(keys)]

In [8]:
tournaments.groupby('year').size()

year
2019    663
2020    169
dtype: int64

In [9]:
tournaments_train_ids = set(tournaments[tournaments.year==2019]['id'].to_list())
tournaments_test_ids = set(tournaments[tournaments.year==2020]['id'].to_list())

# 2. Baseline логистическая регрессия

соберем датасет для логистической регрессии:

In [10]:
train_data = []
test_data = []
for tournament_id, result in results.items():
    if tournament_id not in keys:
        continue
    
    for team in result:
        if team.get('mask') is None or 'X' in team['mask'] or '?' in team['mask']:
            continue
        
        team_result = {'tournament_id': tournament_id}
        team_result['team_id'] = team['team']['id']
        team_result['position'] = team['position']
        
        team_result['mask'] = list(map(int, team['mask']))
        if len(team['teamMembers']) == 0:
            continue
        team_result['players'] = [player['player']['id'] for player in team['teamMembers']]
        
        #дадим каждому вопросу уникальный идентификатор
        team_result['questions'] = [f'question_{tournament_id}_{idx}' for idx, question in enumerate(team_result['mask'])]
        
        
        if tournament_id in tournaments_train_ids:
            train_data.append(team_result)
        elif tournament_id in tournaments_test_ids:
            test_data.append(team_result)

In [11]:
y = train_data[0]['mask']
members = train_data[0]['players']

In [12]:
players_stats = {}
for tournament in train_data:
    for player in tournament['players']:
        if players_stats.get(player) is None:
            players_stats[player] = {'cnt_questions': 0, 'cnt_wright_answers': 0}
        players_stats[player]['cnt_questions'] += len(tournament['mask'])
        players_stats[player]['cnt_wright_answers'] += sum(tournament['mask'])
        
players_stats = pd.DataFrame(players_stats).T.reset_index()
players_stats.rename(columns={'index': 'player_id'}, inplace=True)

In [13]:
train = []
test = []
for tournament in train_data:
    for player in tournament['players']:
        for question, result in zip(tournament['questions'], tournament['mask']):
            sample = {'tournament_id': tournament['tournament_id']}
            sample['question_id'] = question
            sample['team_id'] = tournament['team_id']
            sample['text'] = f'{player} {question}'
            sample['player_id'] = player
            sample['team_position'] = tournament['position']
            sample['target'] = result
            train.append(sample)
            


for tournament in test_data:
    for player in tournament['players']:
        for question, result in zip(tournament['questions'], tournament['mask']):
            sample = {'tournament_id': tournament['tournament_id']}
            sample['question_id'] = question
            sample['team_id'] = tournament['team_id']
            sample['text'] = f'{player} {question}'
            sample['player_id'] = player
            sample['team_position'] = tournament['position']
            sample['target'] = result
            test.append(sample)

Собрал датасет следующего вида: каждый сэмпл - турнир-вопрос-игрок

In [14]:
train = pd.DataFrame(train)
test = pd.DataFrame(test)
print(train.shape, test.shape)

(13749583, 7) (3663746, 7)


In [15]:
train.head()

Unnamed: 0,tournament_id,question_id,team_id,text,player_id,team_position,target
0,4772,question_4772_0,45556,6212 question_4772_0,6212,1.0,1
1,4772,question_4772_1,45556,6212 question_4772_1,6212,1.0,1
2,4772,question_4772_2,45556,6212 question_4772_2,6212,1.0,1
3,4772,question_4772_3,45556,6212 question_4772_3,6212,1.0,1
4,4772,question_4772_4,45556,6212 question_4772_4,6212,1.0,1


In [16]:
lr_classifier = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=True)
    ),
    ('classifier', LogisticRegression()
    )
])

In [17]:
lr_classifier.fit(train['text'].values, train['target'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('vectorizer', CountVectorizer(binary=True)),
                ('classifier', LogisticRegression())])

In [18]:
vectorizer = lr_classifier['vectorizer']
clf = lr_classifier['classifier']

In [19]:
weights = pd.DataFrame((zip(vectorizer.get_feature_names(), clf.coef_[0])), columns=['feature_name', 'weight'])

Посчитаем рейтинг игроков исходя из их весов в модели

In [20]:
players_rating = weights[~weights.feature_name.str.contains('question')]

In [21]:
players_rating.rename(columns={'feature_name':'player_id'}, inplace=True)
players_rating['player_id'] = players_rating['player_id'].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_rating['player_id'] = players_rating['player_id'].apply(int)


In [22]:
players_rating = players_rating.merge(players, how='left', left_on='player_id', right_on='id')

In [23]:
players_rating = players_rating.sort_values('weight', ascending=False).reset_index(drop=True)

In [24]:
players_rating = players_rating.merge(players_stats, how='left', on='player_id')

In [25]:
real_players_rating = pd.read_csv('players-release-2020-01-02.csv', usecols=[' ИД', 'Место'])
real_players_rating.rename(columns={' ИД':'player_id'}, inplace=True)
real_players_rating.head()

Unnamed: 0,player_id,Место
0,30152,1
1,28751,2
2,27822,3
3,30270,4
4,27403,5


Топ 40 игроков:

In [26]:
players_rating.merge(real_players_rating, how='left', on='player_id')[:40]

Unnamed: 0,player_id,weight,id,name,patronymic,surname,cnt_questions,cnt_wright_answers,Место
0,27403,3.836724,27403,Максим,Михайлович,Руссо,1796,1411,5.0
1,4270,3.553571,4270,Александра,Владимировна,Брутер,2240,1759,6.0
2,30152,3.399995,30152,Артём,Сергеевич,Сорожкин,4006,3034,1.0
3,37047,3.383565,37047,Мария,Алексеевна,Юнгер,452,312,292.0
4,20691,3.348223,20691,Станислав,Григорьевич,Мереминский,1370,989,96.0
5,28751,3.276845,28751,Иван,Николаевич,Семушин,3071,2385,2.0
6,27822,3.262564,27822,Михаил,Владимирович,Савченков,2666,2056,3.0
7,34328,3.261122,34328,Михаил,Сергеевич,Царёв,366,222,238.0
8,18036,3.260824,18036,Михаил,Ильич,Левандовский,1113,803,28.0
9,3843,3.240876,3843,Светлана,Борисовна,Бомешко,336,227,2572.0


Топ 40 игроков(при условии не меньше 1000 сыгранных вопроса):

In [27]:
players_rating[players_rating.cnt_questions>=1000].merge(real_players_rating, how='left', on='player_id')[:40]

Unnamed: 0,player_id,weight,id,name,patronymic,surname,cnt_questions,cnt_wright_answers,Место
0,27403,3.836724,27403,Максим,Михайлович,Руссо,1796,1411,5.0
1,4270,3.553571,4270,Александра,Владимировна,Брутер,2240,1759,6.0
2,30152,3.399995,30152,Артём,Сергеевич,Сорожкин,4006,3034,1.0
3,20691,3.348223,20691,Станислав,Григорьевич,Мереминский,1370,989,96.0
4,28751,3.276845,28751,Иван,Николаевич,Семушин,3071,2385,2.0
5,27822,3.262564,27822,Михаил,Владимирович,Савченков,2666,2056,3.0
6,18036,3.260824,18036,Михаил,Ильич,Левандовский,1113,803,28.0
7,56647,3.238179,56647,Наталья,Евгеньевна,Горелова,1769,1269,234.0
8,22935,3.226795,22935,Илья,Сергеевич,Новиков,1266,905,183.0
9,30270,3.176472,30270,Сергей,Леонидович,Спешков,3017,2283,4.0


# 3. Оценка качества

Проранжируем команды в турнире по их вероятности ошибиться на вопросе (без учета сложности вопроса).

Предскажем вероятность ошибки на вопрос каждого игрока, тогда вероятность ошибки всей команды произведение вероятностей ошибок всех членов команды <br>
Тогда вероятность дать правильный ответ: 1 - вероятность ошибки всей команды <br>
$P(team=0) = \prod P(player=0)$ <br>
$P(team=1) = 1 - P(team=0)$

In [28]:
train_players = set(train.player_id.unique())

In [29]:
new_test = test[test.player_id.isin(train_players)].groupby(['tournament_id', 'team_id', 'player_id', 'team_position'], as_index=False).size()
#new_test = test.groupby(['tournament_id', 'team_id', 'player_id', 'team_position'], as_index=False).size()

т.к. мы не знаем вес вопросов на тесте, сгенерируем им токен question_unkwnonw

In [30]:
new_test['text'] = new_test['player_id'].apply(str) + ' question_unkwnonw'
new_test.head()

Unnamed: 0,tournament_id,team_id,player_id,team_position,size,text
0,5414,2,6482,2.5,36,6482 question_unkwnonw
1,5414,2,25882,2.5,36,25882 question_unkwnonw
2,5414,2,30475,2.5,36,30475 question_unkwnonw
3,5414,2,32458,2.5,36,32458 question_unkwnonw
4,5414,2,34846,2.5,36,34846 question_unkwnonw


In [31]:
new_test['fail_score'] = lr_classifier.predict_proba(new_test['text'])[:, 0]

In [32]:
test_predict = new_test.groupby(['tournament_id', 'team_id', 'team_position'], as_index=False).fail_score.prod()

In [33]:
test_predict['success_score'] = 1 - test_predict['fail_score']

In [34]:
spearman_results = []
kendal_results = []
for tournament in test_predict.tournament_id.unique():
    res_spearman = spearmanr(test_predict[test_predict.tournament_id==tournament].team_position.values, 
                    test_predict[test_predict.tournament_id==tournament].fail_score.values)[0]
    res_kendal = kendalltau(test_predict[test_predict.tournament_id==tournament].team_position.values, 
                    test_predict[test_predict.tournament_id==tournament].fail_score.values)[0]
    spearman_results.append(res_spearman)
    kendal_results.append(res_kendal)

In [35]:
print(f'spearman: {np.mean(spearman_results)}')

spearman: 0.7737685318813404


In [36]:
print(f'kendall: {np.mean(kendal_results)}')

kendall: 0.6164076807995837


# EM

В качестве вектора скрытых переменных будем использовать следующие вероятности: $z = P(player=1|team)$

Сделаем следующее предположение: $P(player=1|team=0) = 0$

E-шаг: предсказываем вероятности ответа на вопрос игрока при условии команды: $P(player=1|team=1) = \frac{P(player=1 \cap team=1)}{P(team=1)} = \frac{P(team=1|player=1) P(player=1)}{P(team=1)} = \frac{P(player=1)}{P(team=1)}$

M-шаг: максимизируем правдоподобие. Обучаем модель на вероятностях с E-шага



In [37]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.optim.lr_scheduler import StepLR

In [38]:
len(vectorizer.get_feature_names())

83415

In [39]:
X_train = vectorizer.transform(train['text'])
y_train = train['target'].values

In [40]:
X_test = vectorizer.transform(test['text'])
y_test = test['target'].values

In [41]:
print(X_train.shape, X_test.shape)

(13749583, 83415) (3663746, 83415)


In [42]:
X_train_coo = X_train.tocoo()
X_test_coo = X_test.tocoo()

In [43]:
X_test_coo.shape

(3663746, 83415)

In [44]:
X_train_torch = torch.sparse.FloatTensor(torch.LongTensor([X_train_coo.row.tolist(), X_train_coo.col.tolist()]),
                              torch.FloatTensor(X_train_coo.data))
y_train_torch = torch.Tensor(y_train)

In [45]:
X_test_torch = torch.sparse.FloatTensor(torch.LongTensor([X_test_coo.row.tolist(), X_test_coo.col.tolist()]),
                              torch.FloatTensor(X_test_coo.data), size=(3663746,83415))
y_test_torch = torch.Tensor(y_test)

In [46]:
def train_model(
    model,
    opt,
    X_train_torch,
    y_train_torch,
    X_val_torch,
    y_val_torch,
    n_iterations=20
):

    local_train_loss_history = []
    val_loss_history = []

    
    for i in range(n_iterations):
        opt.zero_grad()
        #y_predicted = torch.log_softmax(model(X_train_torch), dim=-1)
        y_predicted = model(X_train_torch)[:, 0]
        loss = loss_function(y_predicted, y_train_torch)

        loss.backward()
        opt.step()
        opt.zero_grad()

        local_train_loss_history.append(loss.data.numpy())
        predictions_val = model(X_val_torch)[:, 0]
        val_loss_history.append(loss_function(predictions_val, y_val_torch).to('cpu').detach().item())
    return local_train_loss_history, val_loss_history

In [47]:
def e_step(X_train_torch, train, model):
    preds = model(X_train_torch)[:, 0]
    preds = preds.data.numpy()
    train['success_proba'] = preds
    train['fail_proba'] = 1 - preds
    agg = train.groupby(['team_id', 'question_id'], as_index=False).agg(fail_team_proba = ('fail_proba', 'prod'),)
    agg['success_team_proba'] = 1 - agg['fail_team_proba']
    train = train.merge(agg[['team_id', 'question_id', 'success_team_proba']], on=['team_id', 'question_id'])
    z = np.clip(train['success_proba']/train['success_team_proba'], 0, 1)
    train.drop(['success_proba', 'fail_proba', 'success_team_proba'], axis=1, inplace=True)
    return z
    

def m_step(z):
    z_torch = torch.Tensor(z)
    local_train_loss_history, val_loss_history =  train_model(model, opt,
                                                             X_train_torch, z_torch, 
                                                             X_test_torch, y_test_torch, 100)
    return local_train_loss_history[-1], val_loss_history[-1]

In [48]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = F.sigmoid(self.linear(x))
        return outputs

In [49]:
X_test4rating = vectorizer.transform(new_test['text'])
X_test4rating_coo = X_test4rating.tocoo()
X_test4rating_torch = torch.sparse.FloatTensor(torch.LongTensor([X_test4rating_coo.row.tolist(), X_test4rating_coo.col.tolist()]),
                              torch.FloatTensor(X_test4rating_coo.data), size=(84663,83415))

In [50]:
model = LogisticRegression(83415, 1)
model.linear.weight = nn.Parameter(torch.Tensor(clf.coef_))
model.linear.bias = nn.Parameter(torch.Tensor(clf.intercept_))
opt = torch.optim.SGD(model.parameters(), lr=0.001)

Напишем свою реализацию BCE, т.к. torch из коробки не поддерживает soft_label (я не нашел)

In [51]:
def loss_function(y_pred, y_true):
    return -(y_true*torch.log(y_pred) + (1 - y_true)*torch.log(1-y_pred)).mean()

In [52]:
for step in range(5):
    z = e_step(X_train_torch, train, model)
    train_loss, val_loss = m_step(z)
    
    try:
        new_test.drop('fail_score', axis=1, inplace=True)
    except:
        pass
    new_test['fail_score'] = 1 - model(X_test4rating_torch)[:, 0].data.numpy()

    test_predict = new_test.groupby(['tournament_id', 'team_id', 'team_position'], as_index=False).fail_score.prod()
    spearman_results = []
    kendal_results = []
    for tournament in test_predict.tournament_id.unique():
        res_spearman = spearmanr(test_predict[test_predict.tournament_id==tournament].team_position.values, 
                        test_predict[test_predict.tournament_id==tournament].fail_score.values)[0]
        res_kendal = kendalltau(test_predict[test_predict.tournament_id==tournament].team_position.values, 
                        test_predict[test_predict.tournament_id==tournament].fail_score.values)[0]
        spearman_results.append(res_spearman)
        kendal_results.append(res_kendal)
    
    print(f'train_loss: {train_loss}, val_loss: {val_loss}')
    print(f'spearman: {np.mean(spearman_results)}')
    print(f'kendal: {np.mean(kendal_results)}')
    print('#################################')



train_loss: 0.8960064649581909, val_loss: 0.707648754119873
spearman: 0.7737376859024166
kendal: 0.6163674365555789
#################################




train_loss: 0.8958763480186462, val_loss: 0.7071412801742554
spearman: 0.7737925607610705
kendal: 0.6163981576009671
#################################




train_loss: 0.8957443237304688, val_loss: 0.706641674041748
spearman: 0.7737960785124696
kendal: 0.6164197328896915
#################################




train_loss: 0.8956105709075928, val_loss: 0.7061499357223511
spearman: 0.7737731449689361
kendal: 0.6164059087077199
#################################




train_loss: 0.895474910736084, val_loss: 0.7056660056114197
spearman: 0.7735697563925025
kendal: 0.6162620271443843
#################################


Лоссы стабильно падают (медленно) <br>
В целом можно этот рост немного ускорить (подобрав гиперпараметры), но в такой реализации это довольно медленно учится и подбор параметров довольно затруднителен

# 5. Рейтинг турниров

Для расчета рейтига вопросов можно взять (по аналогии с рейтигом игроков) коэффициенты модели по вопросам, только в данной ситуации нужно исходить из того что чем меньше вес, тем сложнее вопрос

Зная рейтинг вопросов, можем расчитать рейтинг турниров усреднив в нем вопросы по сложности

In [53]:
questions_rating = weights[weights.feature_name.str.contains('question')]

In [54]:
questions_rating.rename(columns={'feature_name': 'question_id'}, inplace=True)
questions_rating.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,question_id,weight
55151,question_4772_0,2.300731
55152,question_4772_1,1.864197
55153,question_4772_10,2.150981
55154,question_4772_11,0.345801
55155,question_4772_12,5.088316


In [55]:
questions = train.groupby(['tournament_id', 'question_id'], as_index=False).size()

In [56]:
tournaments_rating = questions.merge(questions_rating, on='question_id').groupby('tournament_id',as_index=False).weight.mean()

Топ 40 турниров по сложности вопросов

In [57]:
tournaments_rating.merge(tournaments[['id','name']], left_on='tournament_id', right_on='id').sort_values('weight')[:40]

Unnamed: 0,tournament_id,weight,id,name
595,6149,-3.580522,6149,Чемпионат Санкт-Петербурга. Первая лига
482,5928,-2.448037,5928,Угрюмый Ёрш
35,5159,-2.045916,5159,Первенство правого полушария
571,6101,-1.918391,6101,Воображаемый музей
251,5587,-1.811525,5587,Записки охотника
12,5025,-1.748834,5025,Кубок городов
22,5083,-1.687673,5083,Ускользающая сова
339,5693,-1.653818,5693,Знание – Сила VI
156,5465,-1.582272,5465,Чемпионат России
36,5161,-1.533916,5161,Антибинго
