In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from sklearn.linear_model import LogisticRegression, LinearRegression
from tqdm.notebook import tqdm, trange
import six
import matplotlib.pyplot as plt
import scipy.sparse
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests_html import HTMLSession
import swifter
import itertools
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

# 1 часть. Работа с данными

Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах (поле mask в results.pkl). 

Для унификации предлагаю:
- взять в тренировочный набор турниры с dateStart из 2019 года; 
- в тестовый — турниры с dateStart из 2020 года.


In [2]:
dict_results = pd.read_pickle('data/results.pkl')

In [3]:
dict_players = pd.read_pickle('data/players.pkl')

In [4]:
dict_tournaments = pd.read_pickle('data/tournaments.pkl')

In [5]:
TOURNAMENT_ID_KEY = TEAM_ID_KEY = "id"
TOURNAMENT_NAME_KEY = TEAM_ID_KEY = "name"
TOURNAMENT_TEAMS_KEY = "teams"
TEAM_ANSWERS_KEY = "answers"
TEAM_PLAYERS_KEY = "players"

def make_team_dict(team_id, answers, players):
    res = {
        TEAM_ID_KEY: team_id,
        TEAM_ANSWERS_KEY : answers,
        TEAM_PLAYERS_KEY: players,
    }
    return res


def make_tournament_dict(tournament_id, tournament_name, teams):
    res = {
        TOURNAMENT_ID_KEY: tournament_id,
        TOURNAMENT_NAME_KEY: tournament_name,
        TOURNAMENT_TEAMS_KEY: teams,
    }
    return res

In [6]:
POSSIBLE_ANSWERS = {"0", "1"}

def process_answers(answers_string):
    if not isinstance(answers_string, six.string_types) or set(answers_string) - POSSIBLE_ANSWERS:
        return []
    return [int(a) for a in answers_string]
    

def is_valid_team(team_dict):
    players = team_dict[TEAM_PLAYERS_KEY]
    if not players:
        return False
    
    answers = team_dict[TEAM_ANSWERS_KEY]
    if not answers:
        return False
    
    return True


def is_valid_tournament(tournament_dict):
    teams = tournament_dict[TOURNAMENT_TEAMS_KEY]
    if not teams:
        return False
    
    num_answers = None
    for team in teams:
        answers = team[TEAM_ANSWERS_KEY]
        num_answers = num_answers or len(answers)
        if num_answers != len(answers):
            return False
    
    return True

In [7]:
def tournaments_train_test_split(tournaments, results):
    
    TRAIN_YEAR = 2019
    TEST_YEAR = 2020

    TOURNAMENT_DATE_KEY = "dateStart" 
    TOURNAMENT_ID_KEY = TEAM_ID_KEY = PLAYER_ID_KEY = "id"
    RESULT_ANSWERS_KEY = "mask"
    TEAM_MEMBERS_KEY = "teamMembers"
    TEAM_KEY = "team"
    PLAYER_KEY = "player"
    
    train, test = [], []

    for tournament_info in tqdm(tournaments.values()):
    
        tournament_year = datetime.fromisoformat(tournament_info[TOURNAMENT_DATE_KEY]).year
        if tournament_year not in (TRAIN_YEAR, TEST_YEAR):
            continue
    
        tournament_id = tournament_info[TOURNAMENT_ID_KEY]
        tournament_name = tournament_info[TOURNAMENT_NAME_KEY]
        
        tournament_result = results[tournament_id]
        if not tournament_result:
            continue
    
        teams = []
        for team in tournament_result:
            team_id = team[TEAM_KEY][TEAM_ID_KEY]
        
            answers = process_answers(team.get(RESULT_ANSWERS_KEY))
        
            players = []
            for member in team[TEAM_MEMBERS_KEY]:
                player_id = member[PLAYER_KEY][PLAYER_ID_KEY]
                players.append(player_id)
        
            team_dict = make_team_dict(team_id, answers, players)
            if not is_valid_team(team_dict):
                continue
            teams.append(team_dict)
        
        tournament_dict = make_tournament_dict(tournament_id, tournament_name, teams)
        if not is_valid_tournament(tournament_dict):
            continue
    
        if tournament_year == TRAIN_YEAR:
            train.append(tournament_dict)
        else:
            test.append(tournament_dict)
            
    return train, test

In [8]:
train, test = tournaments_train_test_split(dict_tournaments, dict_results)

  0%|          | 0/5528 [00:00<?, ?it/s]

In [9]:
len(train), len(test)

(604, 156)

In [10]:
unique_players = set()

for tournament in tqdm(train): 
    teams = tournament[TOURNAMENT_TEAMS_KEY]
    for team in teams:
        players = team[TEAM_PLAYERS_KEY]
        unique_players.update(players)
        
player_idx_map = {player: idx for idx, player in enumerate(unique_players)}

  0%|          | 0/604 [00:00<?, ?it/s]

In [11]:
[18490, 116901, 8532, 42346, 123190, 22482]

[18490, 116901, 8532, 42346, 123190, 22482]

In [12]:
num_players = len(player_idx_map)

player_num_answers = np.zeros(num_players)
player_num_correct_answers = np.zeros(num_players)
player_num_tournaments = np.zeros(num_players)
player_teams = [set() for _ in range(num_players)]

num_questions = 0
unique_teams = set()
for tournament in tqdm(train): 
    teams = tournament[TOURNAMENT_TEAMS_KEY]
    
    first_team = True
    for team in teams:
        team_id = team[TEAM_ID_KEY]
        unique_teams.add(team_id)
        
        answers = team[TEAM_ANSWERS_KEY]
        num_answers = len(answers)
        num_correct_answers = sum(answers)
        if first_team:
            num_questions += num_answers
        
        players = team[TEAM_PLAYERS_KEY]
        for player in players:
            player_idx = player_idx_map[player]
            player_num_answers[player_idx] += num_answers
            player_num_correct_answers[player_idx] += num_correct_answers
            player_num_tournaments[player_idx] += 1
            player_teams[player_idx].add(team_id)
        
        first_team = False
        
player_num_teams = np.array([len(pt) for pt in player_teams])
player_correct_ratio = player_num_correct_answers / player_num_answers

  0%|          | 0/604 [00:00<?, ?it/s]

In [13]:
print(f"Уникальных игроков в турнирах для обучения: {num_players}")
print(f"Команд в турнирах для обучения: {len(unique_teams)}")
print(f"Было задано вопросов в турнирах для обучения: {num_questions}")
print(f"В среднем вопросов на турнир: {int(num_questions / len(train))}")

Уникальных игроков в турнирах для обучения: 55151
Команд в турнирах для обучения: 10996
Было задано вопросов в турнирах для обучения: 28264
В среднем вопросов на турнир: 46


# 2 часть. Baseline-модель

Постройте baseline-модель на основе линейной или логистической регрессии, которая будет обучать рейтинг-лист игроков. Замечания и подсказки:
- повопросные результаты — это фактически результаты броска монетки, и их предсказание скорее всего имеет отношение к бинарной классификации;
- в разных турнирах вопросы совсем разного уровня сложности, поэтому модель должна это учитывать; скорее всего, модель должна будет явно обучать не только силу каждого игрока, но и сложность каждого вопроса;
- для baseline-модели можно забыть о командах и считать, что повопросные результаты команды просто относятся к каждому из её игроков.


## Описание решения

Будем обучать логистическую регрессию: 
- в столбцах будут one-hot представление вопросов и one-hot представление игроков
- таргет - правильность ответа на вопрос (1 или 0)
- таблица будет очень большая, поэтому положим ее в scipy.sparse.coo_matrix

Таким образом получим коэффициенты при каждом игроке и сможем получить рейтинг, отранжировав их (у самого лучшего игрока самый большой коэффициент в логрегрессии)

Будем сравнивать с рейтингом на "2020-01-01", потому что в обучались на данных 2019 года и к 2022 реальные рейтинги могли поменяться

In [14]:
CORRECT_LABEL = 1
INCORRECT_LABEL = 0

question_player_correct = [[] for _ in range(num_questions)]
question_player_incorrect = [[] for _ in range(num_questions)]
question_player_correct_team = [[] for _ in range(num_questions)]
question_player_incorrect_team = [[] for _ in range(num_questions)]

question_tournament = [None] * num_questions

max_question_id = -1
for tournament in tqdm(train): 
    tournament_name = tournament[TOURNAMENT_NAME_KEY]
    teams = tournament[TOURNAMENT_TEAMS_KEY]
    for team in teams:
        team_id = team[TEAM_ID_KEY]
        answers = team[TEAM_ANSWERS_KEY]
        players = team[TEAM_PLAYERS_KEY]
        for question_id, answer in enumerate(answers, 1):
            question_id += max_question_id
            
            question_tournament[question_id] = tournament_name
            
            for player in players:
                player_idx = player_idx_map[player]
                if answer == CORRECT_LABEL:
                    question_player_correct[question_id].append(player_idx)
                    question_player_correct_team[question_id].append(team_id)
                else:
                    question_player_incorrect[question_id].append(player_idx)
                    question_player_incorrect_team[question_id].append(team_id)
    
    max_question_id = question_id

  0%|          | 0/604 [00:00<?, ?it/s]

In [15]:
def construct_sparse_matrix(question_player, dtype=None):
    row = []
    col = []
    data = []
    
    max_row_idx = -1
    max_player_idx = -1
    for question_idx, player_idx_collection in enumerate(question_player):
        for row_idx, player_idx in enumerate(player_idx_collection, 1):
            if player_idx > max_player_idx:
                max_player_idx = player_idx
            row_idx += max_row_idx
            
            row.append(row_idx)
            col.append(question_idx)
            data.append(1)
            
            row.append(row_idx)
            col.append(len(question_player) + player_idx)
            data.append(1)
            
        max_row_idx = row_idx
    num_rows = max_row_idx + 1
    num_cols = max_player_idx + 1 + len(question_player)
    coo = scipy.sparse.coo_matrix((data, (row, col)), shape=(num_rows, num_cols), dtype=dtype)
    return coo

In [16]:
def make_Xy(question_player_correct, question_player_incorrect, dtype=None):
    correct_coo = construct_sparse_matrix(question_player_correct, dtype=dtype)
    incorrect_coo = construct_sparse_matrix(question_player_incorrect, dtype=dtype)
    
    X = scipy.sparse.vstack([correct_coo, incorrect_coo]).tocsr()
    y = np.array(
        [CORRECT_LABEL] * correct_coo.shape[0] + [INCORRECT_LABEL] * incorrect_coo.shape[0]
    )
    
    return X, y

In [17]:
def _extend_question_team(t, question_player_team):
    for team_collection in question_player_team:
        t.extend(team_collection)

In [18]:
def make_t(question_player_correct_team, question_player_incorrect_team):
    t = []
    _extend_question_team(t, question_player_correct_team)
    _extend_question_team(t, question_player_incorrect_team)
    t = np.array(t)
    return t

In [19]:
%%time

X, y = make_Xy(question_player_correct, question_player_incorrect, dtype=np.uint8)

CPU times: user 12.7 s, sys: 1.36 s, total: 14.1 s
Wall time: 14.4 s


In [20]:
%%time

lr = LogisticRegression(solver="sag", n_jobs=-1)
lr.fit(X, y)

CPU times: user 4min 53s, sys: 4.85 s, total: 4min 58s
Wall time: 5min 5s


LogisticRegression(n_jobs=-1, solver='sag')

In [21]:
idx_player_map = {idx: player for player, idx in player_idx_map.items()}

player_baseline_ratings_ = {}
for idx, rating in enumerate(lr.coef_.flatten()[len(question_player_correct):]):
    player_baseline_ratings_[idx_player_map[idx]] = rating

player_baseline_ratings_ = dict(sorted(player_baseline_ratings_.items(), key=lambda x: -x[1]))

assert len(player_baseline_ratings_) == len(idx_player_map)

In [22]:
def session_with_retries(session, retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    retry = Retry(total=retries, read=retries, connect=retries, backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def get_player_actual_rating(player_id, verify=False, **get_kws):
    URL = f"https://rating.chgk.info/api/players/{player_id}/rating/"
    RATING_KEY = "rating_position"
    DATE_OF_RATING = "2020-01-01"
    
    with session_with_retries(HTMLSession()) as session:
        response = session.get(URL, verify=verify, **get_kws)
    response_json = response.json()
    if response_json:
        df = pd.DataFrame(response_json)
        idx_rating = abs(pd.to_datetime(df['date']) - pd.to_datetime(DATE_OF_RATING)).idxmin()    
        actual_rating = int(df.iloc[idx_rating][RATING_KEY])
        return actual_rating
    return 0

In [23]:
def nearest(items, pivot):
    return min(items, key=lambda x: abs(x - pivot))

In [26]:
player_baseline_ratings = pd.Series(player_baseline_ratings_).reset_index().head(1000)
player_baseline_ratings.columns = ["player_id", "baseline_score"]
player_baseline_ratings["actual_rating"] = player_baseline_ratings["player_id"].swifter.apply(get_player_actual_rating)
player_baseline_ratings = player_baseline_ratings.loc[player_baseline_ratings["actual_rating"] != 0].copy()
player_baseline_ratings["baseline_rating"] = range(1, len(player_baseline_ratings) + 1)

player_baseline_ratings.head(40)

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,player_id,baseline_score,actual_rating,baseline_rating
0,27403,4.069354,5,1
1,4270,3.932639,6,2
2,28751,3.885278,2,3
3,27822,3.853132,3,4
4,30270,3.758611,4,5
5,30152,3.752961,1,6
6,18036,3.619303,28,7
7,20691,3.61452,96,8
8,87637,3.542938,159,9
9,22799,3.539642,16,10


In [27]:
def kendall(x, y):
    corr, _ = stats.kendalltau(x, y)
    return corr

def spearman(x, y):
    corr, _ = stats.spearmanr(x, y)
    return corr

In [28]:
kendall_corr_baseline_actual_rating = kendall(
    player_baseline_ratings["baseline_rating"], player_baseline_ratings["actual_rating"]
)

spearman_corr_baseline_actual_rating = spearman(
    player_baseline_ratings["baseline_rating"], player_baseline_ratings["actual_rating"]
)

print(f"Корреляция Кенделла между рейтингами игроков и реальными: {kendall_corr_baseline_actual_rating}")
print(f"Корреляция Cпирмена между рейтингами игроков и реальными: {spearman_corr_baseline_actual_rating}")

Корреляция Кенделла между рейтингами игроков и реальными: 0.348702393603322
Корреляция Cпирмена между рейтингами игроков и реальными: 0.49537469337691215


# 3 часть. Качество рейтинг-системы

Качество рейтинг-системы оценивается качеством предсказаний результатов турниров. Но сами повопросные результаты наши модели предсказывать вряд ли смогут, ведь неизвестно, насколько сложными окажутся вопросы в будущих турнирах; да и не нужны эти предсказания сами по себе. Поэтому:

- предложите способ предсказать результаты нового турнира с известными составами, но неизвестными вопросами, в виде ранжирования команд;
- в качестве метрики качества на тестовом наборе давайте считать ранговые корреляции Спирмена и Кендалла (их можно взять в пакете scipy) между реальным ранжированием в результатах турнира и предсказанным моделью, усреднённые по тестовому множеству турниров.


## Описание решения

Попробуем две стратегии ранжирования команд:
- по рейтингу лучшего игрока
- по среднему рейтингу всей команды

In [29]:
NUM_QUESTIONS = int(num_questions / len(train))


def fillna_and_sort_team_player_scores(team_player_scores):
    team_player_scores_flatten = [s for s in itertools.chain(*team_player_scores) if s is not None]
    player_avg_score = np.mean(team_player_scores_flatten)
    
    for i, _ in enumerate(team_player_scores):
        team_player_scores[i] = sorted([s if s is not None else player_avg_score for s in team_player_scores[i]], reverse=True)


def estimated_team_ranking(teams, player_ratings, aggregated_top=1):
    team_player_scores = [None] * len(teams) 
    for idx, team in enumerate(teams):
        players = team[TEAM_PLAYERS_KEY]
        player_scores = [player_ratings.get(player) for player in players]
        team_player_scores[idx] = player_scores
    
    fillna_and_sort_team_player_scores(team_player_scores)
    
    team_estimated_scores = np.array([np.mean(tps[:aggregated_top]) for tps in team_player_scores])
    team_estimated_ranking = np.argsort(-team_estimated_scores) + 1
    
    return team_estimated_ranking


def actual_team_ranking(teams):
    team_actual_scores = [None] * len(teams) 
    for idx, team in enumerate(teams):
        answers = team[TEAM_ANSWERS_KEY]
        team_actual_scores[idx] = sum(answers)
    
    team_actual_scores = np.array(team_actual_scores)
    team_actual_ranking = np.argsort(-team_actual_scores) + 1
    
    return team_actual_ranking


def construct_one_hot(idx, size):
    row = [0]
    col = [idx]
    data = [1]
    one_hot = scipy.sparse.coo_matrix((data, (row, col)), shape=(1, size))
    return one_hot


def construct_question_player_row(question_idx, player_idx, size):
    qsize, psize = size
    question_one_hot = construct_one_hot(question_idx, qsize)
    player_one_hot = construct_one_hot(player_idx, psize)
    res = scipy.sparse.hstack([question_one_hot, player_one_hot])
    return res


def model_predict_team_ranking(model, teams, player_idx_map, question_player, num_questions, correct_answer_prob_threshold=0.5):
    qsize = len(question_player)
    psize = len(player_idx_map)
    
    question_idxs = np.random.choice(range(len(question_player)), size=num_questions)
    
    team_predicted_scores = [0] * len(teams)
    for team_idx, team in enumerate(teams):
        players = team[TEAM_PLAYERS_KEY]
        player_idxs = [player_idx_map[player] for player in players if player in player_idx_map]
        
        for question_idx in question_idxs:
            rows = [
                construct_question_player_row(question_idx, player_idx, size=(qsize, psize))
                for player_idx in player_idxs
            ]
            if not rows:
                continue
            rows = scipy.sparse.vstack(rows)
            probas = model.predict_proba(rows)[:, 0]
            wrong_answer_proba = np.prod(probas)
            correct_answer_proba = 1 - wrong_answer_proba
            if correct_answer_proba >= correct_answer_prob_threshold:
                team_predicted_scores[team_idx] += 1
                
    team_predicted_scores = np.array(team_predicted_scores)
    team_predicted_ranking = np.argsort(-team_predicted_scores) + 1
    
    return team_predicted_ranking

In [40]:
top1_kendall_corrs, top1_spearman_corrs = [], []
top5_kendall_corrs, top5_spearman_corrs = [], []

for tournament in tqdm(test): 
    teams = tournament[TOURNAMENT_TEAMS_KEY]
    
    actual_ranking = actual_team_ranking(teams)
    
    top1_ranking = estimated_team_ranking(teams, player_baseline_ratings_, 1)
    top1_kendall_corrs.append(kendall(actual_ranking, top1_ranking))
    top1_spearman_corrs.append(spearman(actual_ranking, top1_ranking))
    
    top5_ranking = estimated_team_ranking(teams, player_baseline_ratings_, 1000)
    top5_kendall_corrs.append(kendall(actual_ranking, top5_ranking))
    top5_spearman_corrs.append(spearman(actual_ranking, top5_ranking))    

print('\n Лучший игрок в команде:')    
print(f"1) Корреляция Кенделла на тестовом множестве: {np.mean(top1_kendall_corrs)}")
print(f"1) Корреляция Cпирмена на тестовом множестве: {np.mean(top1_spearman_corrs)}")
print('\n Средний рейтинг игроков команды:')    
print(f"2) Корреляция Кенделла на тестовом множестве: {np.mean(top5_kendall_corrs)}")
print(f"2) Корреляция Cпирмена на тестовом множестве: {np.mean(top5_spearman_corrs)}")

  0%|          | 0/156 [00:00<?, ?it/s]


 Лучший игрок в команде:
1) Корреляция Кенделла на тестовом множестве: 0.5428304455477393
1) Корреляция Cпирмена на тестовом множестве: 0.7153277613165121

 Средний рейтинг игроков команды:
2) Корреляция Кенделла на тестовом множестве: 0.5530961871867169
2) Корреляция Cпирмена на тестовом множестве: 0.7201702738804968
