In [None]:
# -*- coding: utf-8 -*-
"""all_trueskill.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1oZ8w6c6tGe-euAZNG05K6jL_dz-BrxDM
"""

!pip install trueskill

import os
import sys
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import random
import pickle
from pickle import dump, load
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
pd.options.mode.chained_assignment = None

# Per-column dtypes for the training table, in a form that can be passed as the
# `dtype` argument of `read_csv`. The string 'boolean' selects pandas' nullable
# boolean dtype for the one column that uses it.
TRAIN_DTYPES = dict(
    row_id=np.uint64,
    timestamp=np.int64,
    user_id=np.uint64,
    content_id=np.uint16,
    content_type_id=np.int8,
    task_container_id=np.uint16,
    user_answer=np.int8,
    answered_correctly=np.int8,
    prior_question_elapsed_time=np.float32,
    prior_question_had_explanation='boolean',
)

def read_csv(file_name = "train.csv", dtype = None, skiprows = None, nrows = None, usecols = None):
    """Read a CSV file into a DataFrame, treating its first line as the header.

    Args:
        file_name: Path (or file-like object) handed to ``pd.read_csv``.
        dtype: Optional dtype or column -> dtype mapping.
        skiprows: Optional rows to skip, forwarded unchanged.
        nrows: Optional maximum number of data rows to read.
        usecols: Optional subset of columns to load.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    reader_options = {
        "dtype": dtype,
        "skiprows": skiprows,
        "nrows": nrows,
        "usecols": usecols,
        # Fixed settings: chunked low-memory parsing, header on the first line.
        "low_memory": True,
        "header": 0,
    }
    return pd.read_csv(file_name, **reader_options)

def read_feather(file_name = "../input/feather-data/train.feather"):
    """Load a Feather file and return it as a DataFrame.

    Args:
        file_name: Location of the Feather file; defaults to the training set
            path under ``../input/feather-data``.

    Returns:
        Whatever ``pd.read_feather`` returns for ``file_name``.
    """
    return pd.read_feather(file_name)

#sigma_df = read_feather("/content/drive/MyDrive/sigma/sigma_df.feather")
#sigma_user_df = read_feather("/content/drive/MyDrive/sigma/result.feather")



In [None]:
# --- Load the interaction log ------------------------------------------------
# Keep only rows with content_type_id == 0, renumber them, then cap the frame
# at the first N_ROWS rows.
N_ROWS = 12005874  # row cap for this run (previously an inline magic number)

tdf = read_feather('../input/feather-data/train.feather')
tdf = tdf[tdf.content_type_id == 0].reset_index(drop = True)
tdf = tdf[0:N_ROWS]

# --- Per-question ratio --------------------------------------------------------
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a c_percent_dict.pkl that comes from a trusted source.
pickle_dir = r'../input/feather-data/c_percent_dict.pkl'
with open(pickle_dir, 'rb') as fo:
    temp_question_dict = pickle.load(fo)

# content_dict[content_id] = value[0] / value[1].
# Presumably (correct answers / attempts) per question -- TODO confirm against
# the code that produced the pickle. A zero value[1] would raise ZeroDivisionError.
content_dict = {key: value[0] / value[1] for key, value in temp_question_dict.items()}

user_mmr_dict = {}

users = np.unique(tdf['user_id'])
questions = np.unique(tdf['content_id'])

# --- Initial user ratings ------------------------------------------------------
# The TrueSkill environment is identical for every user, so configure the global
# environment once here instead of once per user inside the loop.
rating_env = trueskill.setup(mu=0.3,
                             sigma=0.164486,
                             beta=0.05, tau=0.00164,
                             draw_probability=0)
# create_rating() with no arguments uses the environment's mu and sigma, so every
# user starts from a separate Rating(mu=0.3, sigma=0.164486) object.
user_ratings = [rating_env.create_rating() for _ in users]

# (disabled) per-question rating objects:
# question_ratings = []
# for question in questions:
#     rating_object= Rating()
#     question_ratings.append(rating_object)

user_dict = dict(zip(users, user_ratings))
#question_dict= dict(zip(questions, question_ratings))

# Plain numpy arrays of the three columns that the rating loop iterates over.
answers = tdf['answered_correctly'].values
temp_user = tdf['user_id'].values
temp_question = tdf['content_id'].values

def win_probability(team1, team2, beta=0.05):
    """Return the probability that ``team1`` outperforms ``team2``.

    Computes ``cdf(delta_mu / sqrt(2 * beta**2 + sigma1**2 + sigma2**2))`` using
    the ``cdf`` of the current global TrueSkill environment.

    Args:
        team1: Object exposing ``mu`` and ``sigma`` (e.g. a ``trueskill.Rating``).
        team2: Object exposing ``mu`` and ``sigma``.
        beta: Performance-noise parameter used in the denominator. Defaults to
            0.05, the value that used to be hard-coded here; pass a different
            value if the environment is configured with another beta.

    Returns:
        The value of ``cdf`` at the scaled skill difference.
    """
    delta_mu = team1.mu - team2.mu
    sum_sigma = sum([team1.sigma ** 2, team2.sigma ** 2])
    size = 2  # one player on each side
    # beta * beta (not beta ** 2) keeps the arithmetic identical to the former
    # literal 0.05 * 0.05 when the default is used.
    denom = math.sqrt(size * (beta * beta) + sum_sigma)
    ts = trueskill.global_env()
    return ts.cdf(delta_mu / denom)

In [None]:
# tdf2 = tdf[23999970:23999980]
# tdf2

In [None]:
# Replay every interaction in order. For each row:
#   1. record the user's rating *before* the answer and the predicted win
#      probability against the question;
#   2. update the user's rating from the observed outcome.
# Question ratings are rebuilt from content_dict on every row and never stored.
QUESTION_SIGMA = 0.164486

# rate_1vs1 reads beta / tau / draw_probability from the global environment.
# These never change, so configure the environment once rather than on every row
# (the environment's own mu is only a default for new ratings and is not used
# below because every Rating is built with an explicit mu).
trueskill.setup(mu=0.3,
                sigma=QUESTION_SIGMA,
                beta=0.05, tau=0.00164,
                draw_probability=0)

total_rows = len(answers)
count = 0
winning_prob = []
user_current_mu = []
user_current_sigma = []
for count, (user_id, content_id, answer) in enumerate(
        zip(temp_user, temp_question, answers), start=1):
    prev_user_rating = user_dict[user_id]
    # Question "skill" is 1 - content_dict[content_id], with a fixed sigma.
    prev_question_rating = trueskill.Rating(mu=1 - content_dict[content_id],
                                            sigma=QUESTION_SIGMA)

    winning_prob.append(win_probability(prev_user_rating, prev_question_rating))
    user_current_mu.append(prev_user_rating.mu)
    user_current_sigma.append(prev_user_rating.sigma)

    # rate_1vs1(winner, loser) returns (new_winner, new_loser).
    if answer == 1:
        new_user_rating, new_question_rating = rate_1vs1(prev_user_rating, prev_question_rating)
    elif answer == 0:
        new_question_rating, new_user_rating = rate_1vs1(prev_question_rating, prev_user_rating)
    else:
        # Previously such a row silently reused the rating computed for the
        # preceding row (or raised NameError on the first row).
        raise ValueError(
            "answered_correctly must be 0 or 1, got %r at row %d" % (answer, count - 1))
    user_dict[user_id] = new_user_rating

    if count % 1000000 == 0:
        # Fraction of this run's rows processed so far.
        print(count / total_rows, "done")

In [None]:
# Attach the per-row values collected by the rating loop (one entry per row of tdf).
tdf["trueskill_probability"] = winning_prob
tdf["mu"] = user_current_mu
tdf["sigma"] = user_current_sigma

# One output frame per feature, each carrying the identifying columns.
# reset_index(drop=True) guarantees the default RangeIndex that to_feather needs.
id_columns = ['row_id', 'user_id', 'content_id']
tdf_trueskill = tdf[id_columns + ['trueskill_probability']].reset_index(drop=True)
tdf_mu = tdf[id_columns + ['mu']].reset_index(drop=True)
tdf_sigma = tdf[id_columns + ['sigma']].reset_index(drop=True)

tdf_trueskill.to_feather('trueskill_1.feather')
tdf_mu.to_feather('mu_1.feather')
tdf_sigma.to_feather('sigma_1.feather')

# Persist the final per-user ratings. The context manager closes the file
# (and flushes it to disk) even if pickling fails.
with open('global_mean_trueskill_mu_sigma.pkl', 'wb') as ratings_file:
    dump(user_dict, ratings_file)
#dump(question_dict, open('/content/drive/MyDrive/all_trueskill/question_dict.pkl', 'wb'))

# del users
# del questions
# del user_ratings
# del question_ratings
gc.collect()

# Show a small preview rather than the repr of the whole frame.
tdf.head()