# *DATA PREPARATION*

In [None]:
import os
import sys
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import random
from pickle import dump, load
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
pd.options.mode.chained_assignment = None

In [None]:
# Explicit per-column dtypes for the training interactions table.
# Not referenced elsewhere in the visible code; presumably meant to be passed as
# `dtype=` when reading train.csv (TODO confirm).
TRAIN_DTYPES = {
    'row_id': np.uint64,
    'timestamp': np.int64,
    'user_id': np.uint64,
    'content_id': np.uint16,
    'content_type_id': np.int8,
    'task_container_id': np.uint16,
    'user_answer': np.int8,
    'answered_correctly': np.int8,
    'prior_question_elapsed_time': np.float32,
    'prior_question_had_explanation': 'boolean'
}

# Per-column dtypes used when reading questions.csv.
QUESTION_DTYPES = {
    'question_id': np.uint16,
    'bundle_id': np.uint16,
    'correct_answer': np.int8,
    'part': np.int8,
    'tags': str
}

# Per-column dtypes for a lectures table.
# Not referenced elsewhere in the visible code.
LECTURE_DTYPES = {
    'lecture_id': np.uint16,
    'tag': np.uint16,
    'part': np.int8,
    'type_of':str
}

# Integer -> integer lookup table; the values are the consecutive codes 0..187 in
# listing order. Presumably maps a question tag id to a re-ordered tag index
# (TODO confirm). Not referenced elsewhere in the visible code.
T_MASK = {24: 0, 23: 1, 58: 2, 134: 3, 52: 4, 124: 5, 44: 6, 123: 7, 67: 8, 167: 9, 161: 10, 43: 11, 80: 12,
          46: 13, 28: 14, 103: 15, 94: 16, 186: 17, 26: 18, 180: 19, 50: 20, 182: 21, 31: 22, 6: 23, 15: 24,
          11: 25, 108: 26, 47: 27, 76: 28, 165: 29, 174: 30, 48: 31, 152: 32, 132: 33, 170: 34, 49: 35,
          181: 36, 159: 37, 145: 38, 73: 39, 64: 40, 1: 41, 7: 42, 16: 43, 57: 44, 21: 45, 95: 46, 72: 47,
          91: 48, 125: 49, 157: 50, 96: 51, 156: 52, 53: 53, 55: 54, 45: 55, 4: 56, 133: 57, 136: 58, 75: 59,
          39: 60, 89: 61, 65: 62, 117: 63, 173: 64, 83: 65, 8: 66, 166: 67, 25: 68, 168: 69, 79: 70, 3: 71,
          97: 72, 60: 73, 128: 74, 179: 75, 14: 76, 151: 77, 164: 78, 112: 79, 116: 80, 42: 81, 22: 82, 0: 83,
          127: 84, 160: 85, 147: 86, 19: 87, 32: 88, 183: 89, 12: 90, 9: 91, 86: 92, 109: 93, 175: 94, 10: 95,
          115: 96, 78: 97, 171: 98, 148: 99, 113: 100, 27: 101, 35: 102, 169: 103, 92: 104, 122: 105, 54: 106,
          114: 107, 18: 108, 17: 109, 56: 110, 107: 111, 90: 112, 163: 113, 126: 114, 29: 115, 66: 116,
          106: 117, 135: 118, 2: 119, 87: 120, 138: 121, 71: 122, 100: 123, 41: 124, 30: 125, 154: 126,
          102: 127, 84: 128, 81: 129, 37: 130, 146: 131, 185: 132, 155: 133, 176: 134, 143: 135, 121: 136,
          85: 137, 162: 138, 184: 139, 104: 140, 38: 141, 140: 142, 82: 143, 120: 144, 20: 145, 88: 146,
          141: 147, 119: 148, 139: 149, 150: 150, 98: 151, 62: 152, 33: 153, 144: 154, 158: 155, 74: 156,
          13: 157, 61: 158, 110: 159, 69: 160, 137: 161, 111: 162, 34: 163, 118: 164, 153: 165, 129: 166,
          178: 167, 105: 168, 177: 169, 36: 170, 172: 171, 142: 172, 63: 173, 101: 174, 59: 175, 5: 176,
          131: 177, 99: 178, 93: 179, 51: 180, 77: 181, 40: 182, 70: 183, 149: 184, 68: 185, 187: 186, 130: 187}

In [None]:
def read_csv(file_name = "train.csv", dtype = None, skiprows = None, nrows = None, usecols = None):
    """Load a CSV file into a DataFrame, taking the first line as the header.

    Args:
        file_name: Path or file-like object handed to ``pd.read_csv``.
        dtype: Optional column -> dtype mapping.
        skiprows: Optional rows to skip, as understood by ``pd.read_csv``.
        nrows: Optional maximum number of data rows to read.
        usecols: Optional subset of columns to load.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    reader_options = dict(
        dtype=dtype,
        skiprows=skiprows,
        nrows=nrows,
        usecols=usecols,
        header=0,
        low_memory=True,
    )
    return pd.read_csv(file_name, **reader_options)

def read_feather(file_name = "../input/feather-data/train.feather"):
    """Load a feather file into a DataFrame.

    Args:
        file_name: Path of the feather file; defaults to the training table.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_feather``.
    """
    return pd.read_feather(file_name)

In [None]:
## Compute ts_delta (timestamp gap to the previous question row) from the train feather file and save it to 'ts_delta.feather'
def get_ts_delta():
    """Compute per-row timestamp gaps for question rows and write them to 'ts_delta.feather'.

    Loads the default feather file through ``read_feather()``, keeps the rows
    with ``content_type_id == 0`` and subtracts from every timestamp the
    timestamp of the row directly before it (row 0 is compared with the last
    row). Negative gaps are replaced by -1; gaps equal to 0 are then
    overwritten with the gap found at progressively earlier rows until no 0
    is left.

    Returns:
        Always 0; the result is the one-column ('ts_delta') feather file.
    """
    df = read_feather()
    # Question rows only.
    df = df[df['content_type_id'] == 0]
    tsdf = df['timestamp'].astype(np.int64)
    del df
    # Build the positional index rotated by one: [n-1, 0, 1, ..., n-2].
    oidx = list(range(tsdf.shape[0]))
    last = [oidx[-1]]
    oidx = oidx[:-1]
    last.extend(oidx)
    del oidx
    gc.collect()
    tsdf.reset_index(drop = True, inplace = True)
    # After reindex + reset, retsdf[i] holds tsdf[i-1] (and retsdf[0] the last timestamp).
    retsdf = tsdf.reindex(index=last)
    retsdf.reset_index(drop = True, inplace = True)
    delta_tsdf = tsdf - retsdf
    # NOTE(review): a negative gap presumably marks the first row of the next user
    # (timestamps restart) — confirm; such rows get the sentinel -1.
    delta_tsdf[delta_tsdf < 0] = -1
    del tsdf
    del retsdf
    gc.collect()
    # The Series is still named 'timestamp', so the frame gets that column name first.
    delta_tsdf = pd.DataFrame(delta_tsdf, dtype = np.int64)
    delta_tsdf.rename(columns = {'timestamp': 'ts_delta'}, inplace = True)
    # retsdf starts out as the very same object as delta_tsdf; the reindex inside the
    # loop returns a new frame, so from the first iteration on it is an independent copy
    # that is rotated by one more row on every pass.
    retsdf = delta_tsdf
    # Loop while at least one gap is exactly 0.
    # NOTE(review): if the zeros can never be replaced by a non-zero value (e.g. every
    # gap is 0) this loop does not terminate — confirm the input rules that out.
    while delta_tsdf[delta_tsdf == 0].notna().max()['ts_delta']:
        retsdf = retsdf.reindex(index=last)
        retsdf.reset_index(drop = True, inplace = True)
        # Copy the value from k rows earlier (k = iteration number) into each remaining 0.
        delta_tsdf[delta_tsdf == 0] = retsdf[delta_tsdf == 0]
    delta_tsdf.to_feather('ts_delta.feather')
    return 0

In [None]:
def ordered_encode_python(values, uniques=None, encode=False):
    """Find the distinct labels of ``values`` in first-appearance order and optionally encode them.

    Args:
        values: NumPy array of hashable labels (its ``dtype`` is reused for
            the array of uniques).
        uniques: Optional precomputed array of labels; the position of a
            label in this array is its code. Computed from ``values`` when
            ``None``.
        encode: When true, also return the integer code of every element.

    Returns:
        ``uniques`` when ``encode`` is false, otherwise the tuple
        ``(uniques, encoded)`` where ``encoded`` is an integer array with one
        code per element of ``values``.

    Raises:
        ValueError: If ``encode`` is true and ``values`` contains a label
            that is missing from ``uniques``.
    """
    if uniques is None:
        # dict keys keep insertion order, so this de-duplicates while
        # preserving the order in which labels first appear.
        uniques = np.array(list(dict.fromkeys(values)), dtype=values.dtype)
    if not encode:
        return uniques

    code_of = {label: code for code, label in enumerate(uniques)}
    try:
        # An explicit integer dtype keeps the codes integral even for empty
        # input (np.array([]) would otherwise default to float64).
        encoded = np.array([code_of[v] for v in values], dtype=np.intp)
    except KeyError as e:
        raise ValueError(f"y contains previously unseen labels: {e}") from e
    return uniques, encoded

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import column_or_1d

class OrderedLabelEncoder(LabelEncoder):
    """LabelEncoder variant whose codes follow first-appearance order.

    ``classes_`` holds the distinct labels in the order they first occur in
    the fitted data (not sorted), and a label's code is its position in
    ``classes_``.
    """

    def fit(self, y):
        """Learn the ordered label set of ``y`` and return ``self``."""
        y = column_or_1d(y, warn=True)
        self.classes_ = ordered_encode_python(y)
        # Return self so that calls can be chained, e.g. fit(y).transform(y).
        return self

    def fit_transform(self, y):
        """Learn the ordered label set of ``y`` and return the codes of ``y``."""
        y = column_or_1d(y, warn=True)
        self.classes_, y = ordered_encode_python(y, encode=True)
        return y

    def transform(self, y):
        """Encode ``y`` with the labels learned by ``fit`` / ``fit_transform``.

        Overridden because ``classes_`` is not sorted here.
        NOTE(review): the inherited implementation presumably looks labels up
        assuming a sorted ``classes_`` — confirm against the installed
        scikit-learn version.

        Raises:
            ValueError: If ``y`` contains a label that was not seen when fitting.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self)
        y = column_or_1d(y, warn=True)
        _, codes = ordered_encode_python(y, uniques=self.classes_, encode=True)
        return codes

In [None]:
# `tdf` was never assigned before this cell, so a fresh top-to-bottom run stopped here
# with a NameError. Load the training interactions first.
# NOTE(review): assumes the default read_feather() file (the same one get_ts_delta()
# reads) is the intended source — confirm.
tdf = read_feather()
# Keep question rows only and renumber them 0..n-1.
tdf = tdf[tdf.content_type_id == 0].reset_index(drop = True)

# Attach each question's `part` from the raw questions table.
qdf = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv', dtype = QUESTION_DTYPES)
tdf = pd.merge(tdf, qdf[['question_id', 'part']], left_on = 'content_id', right_on = 'question_id', how = 'left')
del qdf
tdf.drop(['question_id'], axis = 1, inplace = True)
gc.collect()

# Attach the first tag column (t1) and the per-question tag count from the
# pre-processed questions table.
qdf = pd.read_feather('../input/feather-data/questions_processed.feather')
# NOTE(review): ids in this file are presumably shifted by one relative to content_id — confirm.
qdf.question_id = qdf.question_id - 1
# Number of tag columns whose value is greater than 1, per question.
qdf['num_tag'] = (qdf[['t1','t2','t3','t4','t5','t6']] > 1).sum(axis = 1)
tdf = pd.merge(tdf, qdf[['question_id', 't1', 'num_tag']], left_on = 'content_id', right_on = 'question_id', how = 'left')
del qdf
# Drop the columns that are not used as features from here on.
tdf.drop(['row_id', 'content_type_id', 'user_answer', 'question_id'], axis = 1, inplace = True)

In [None]:
# Sorted distinct user ids and question ids present in the training rows.
users, questions = (np.unique(tdf[column]) for column in ('user_id', 'content_id'))

In [None]:
# One fresh default TrueSkill rating per user and per question, in the same
# order as `users` / `questions`.
user_ratings = [Rating() for _ in users]
question_ratings = [Rating() for _ in questions]

In [None]:
# id -> current rating lookups; the entries are replaced as ratings get updated.
user_dict = {uid: rating for uid, rating in zip(users, user_ratings)}
question_dict = {qid: rating for qid, rating in zip(questions, question_ratings)}

In [None]:
# Pull the three columns the rating loop iterates over out of the frame once.
temp_user, temp_question, answers = (
    tdf[column].values for column in ('user_id', 'content_id', 'answered_correctly')
)

In [None]:
def win_probability(team1, team2, beta=0.05):
    """Return the probability that ``team1`` beats ``team2`` in a 1-vs-1 match.

    Args:
        team1: Rating object exposing ``mu`` and ``sigma`` (here: the user).
        team2: Rating object exposing ``mu`` and ``sigma`` (here: the question).
        beta: Performance-noise term added to the rating variances. Defaults
            to 0.05, the value that used to be hard-coded in this function.
            NOTE(review): this differs from the environment used by
            ``rate_1vs1`` unless that environment was configured with the
            same beta — confirm it is intentional.

    Returns:
        The normal CDF (taken from the global TrueSkill environment) of the
        mean difference scaled by the combined uncertainty.
    """
    delta_mu = team1.mu - team2.mu
    sum_sigma = team1.sigma ** 2 + team2.sigma ** 2
    size = 2  # two players in total
    denom = math.sqrt(size * (beta * beta) + sum_sigma)
    return trueskill.global_env().cdf(delta_mu / denom)

In [None]:
# Replay every interaction in row order as a user-vs-question match: record the
# user's win probability *before* the outcome is applied, then update both ratings.
count = 0
winning_prob = []
for user_id, content_id, answer in zip(temp_user, temp_question, answers):
    count += 1
    prev_user_rating = user_dict[user_id]
    prev_question_rating = question_dict[content_id]
    winning_prob.append(win_probability(prev_user_rating, prev_question_rating))
    # rate_1vs1 takes (winner, loser) and returns their new ratings in that order.
    if answer == 1:
        user_dict[user_id], question_dict[content_id] = rate_1vs1(prev_user_rating, prev_question_rating)
    elif answer == 0:
        question_dict[content_id], user_dict[user_id] = rate_1vs1(prev_question_rating, prev_user_rating)
    # Any other answer value leaves both ratings untouched. Previously such a row
    # wrote back the ratings computed for an earlier row (or failed on the first row).
    if count % 1000000 == 0:
        print("10^6 done")

In [None]:
# Store the pre-update win probability of every row as a feature column.
# NOTE(review): the column name is misspelled ("probaility"); it is left unchanged
# because other code may read the column by this exact name — confirm before renaming.
tdf["trueskill_probaility"]= winning_prob

In [None]:
# Original features: fill missing values and shrink dtypes.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series, which does not update `tdf` under pandas Copy-on-Write.
tdf['t1'] = tdf['t1'].fillna(0).astype(np.uint8)
# Every remaining missing value becomes the sentinel -1.
tdf.fillna(-1, inplace = True)
tdf['prior_question_had_explanation'] = tdf['prior_question_had_explanation'].astype(np.int8)
# Divide by 1000 before the integer cast (presumably milliseconds -> seconds; TODO confirm).
tdf['prior_question_elapsed_time'] = (tdf['prior_question_elapsed_time']/1000).astype(np.int16)
gc.collect()

In [None]:
# Tag counts are small integers; store them as int8.
tdf['num_tag'] = tdf['num_tag'].astype(np.int8)

In [None]:
# Number of earlier rows in which the same user saw the same question.
# Cap at the uint8 maximum before casting so that counts above 255 saturate
# instead of silently wrapping around to small values.
tdf['u_attempt_c'] = (
    tdf.groupby(['user_id', 'content_id']).cumcount()
    .clip(upper = np.iinfo(np.uint8).max)
    .astype(np.uint8)
)

In [None]:
# Attach each question's bundle_id from the pre-processed questions table.
# NOTE(review): this path differs from the questions_processed.feather path read
# earlier in the file — confirm both point at the same data.
qdf = pd.read_feather('../input/riiid-data-processing4/questions_processed.feather')
qdf['question_id'] = qdf['question_id'] - 1
tdf = tdf.merge(
    qdf[['question_id', 'bundle_id']],
    how = 'left',
    left_on = 'content_id',
    right_on = 'question_id',
)
del qdf
tdf = tdf.drop(columns = ['question_id'])
gc.collect()
tdf['bundle_id'] = tdf['bundle_id'].astype(np.int16)

In [None]:
# user_id -> index labels of that user's rows.
u_dict = tdf.groupby('user_id').groups
gc.collect()

In [None]:
# Load the precomputed per-row timestamp gaps.
# NOTE(review): presumably the file produced by get_ts_delta(), moved into the
# input dataset — confirm its rows line up one-to-one with `tdf`.
ts_delta_df = read_feather("../input/feather-data/ts_delta.feather")

In [None]:
# Put ts_delta in front of the feature columns (joined on the row index), then
# integer-divide it by 1000 and store it as int32.
tdf = pd.concat((ts_delta_df, tdf), axis = 1)
del ts_delta_df
tdf['ts_delta'] = tdf['ts_delta'].floordiv(1000).astype(np.int32)
gc.collect()

In [None]:
# Seed the columns that are turned into per-user running features afterwards.
tdf['total_explained'] = tdf['prior_question_had_explanation']
tdf['task_container_id_sorted'] = tdf['task_container_id']
tdf['10_recent_correctness'] = (tdf['answered_correctly']).astype(np.float16)
# Copy ts_delta with the -1 sentinel replaced by 0. Built in a single expression
# instead of the chained assignment tdf[col][mask] = 0, which does not write
# through to `tdf` under pandas Copy-on-Write.
tdf['10_recent_mean_gap'] = tdf['ts_delta'].where(tdf['ts_delta'] != -1, 0)
tdf['mean_elapsed'] = tdf['prior_question_elapsed_time']
gc.collect()

In [None]:
# Per-user running features, computed for all users at once with groupby.
# The previous per-user loop wrote into chained slices (tdf[col][begin:end] = ...),
# which relies on implicit dtype upcasts and does not write through to `tdf`
# under pandas Copy-on-Write.
user_key = tdf['user_id']
# Number of earlier rows of the same user (0 for a user's first row).
rows_before = tdf.groupby('user_id', sort = False).cumcount()

# 10_recent_correctness: mean of the (up to) 10 previous answers of the user,
# excluding the current row; NaN on a user's first row (0 / 0).
answers_f64 = tdf['10_recent_correctness'].astype(np.float64)
earlier_sum = answers_f64.groupby(user_key, sort = False).cumsum() - answers_f64
# Sum of the answers that lie more than 10 rows back (0 while fewer than 10 exist).
too_old_sum = earlier_sum.groupby(user_key, sort = False).shift(10).fillna(0)
# Kept in the float16 dtype this column was created with.
tdf['10_recent_correctness'] = (
    (earlier_sum - too_old_sum) / rows_before.clip(upper = 10)
).astype(np.float16)
del answers_f64, earlier_sum, too_old_sum

# 10_recent_mean_gap: mean of the user's last (up to) 10 gaps, current row included.
gap_running = tdf['10_recent_mean_gap'].astype(np.int64).groupby(user_key, sort = False).cumsum()
gap_too_old = gap_running.groupby(user_key, sort = False).shift(10).fillna(0)
tdf['10_recent_mean_gap'] = (gap_running - gap_too_old) / (rows_before + 1).clip(upper = 10)
del gap_running, gap_too_old, rows_before

# total_explained: running sum of the explanation flag within the user, plus 1.
tdf['total_explained'] = (
    tdf['total_explained'].astype(np.int64).groupby(user_key, sort = False).cumsum() + 1
)

# task_container_id_sorted: 0-based ordinal of the container within the user,
# numbered in order of first appearance (pd.factorize codes).
tdf['task_container_id_sorted'] = (
    tdf['task_container_id_sorted']
    .groupby(user_key, sort = False)
    .transform(lambda container_ids: pd.factorize(container_ids)[0])
    .astype(np.int64)
)

# mean_elapsed: running sum of the elapsed time divided by the container ordinal.
# The ordinal is 0 for the user's first container, so those rows become NaN/inf.
tdf['mean_elapsed'] = (
    tdf['mean_elapsed'].astype(np.int64).groupby(user_key, sort = False).cumsum()
    / tdf['task_container_id_sorted']
)
del user_key
gc.collect()

In [None]:
# Replace the missing values of the running features with 0 and fix their dtypes.
# Each column is assigned back explicitly; tdf[col].fillna(..., inplace=True) acts
# on a temporary Series and does not update `tdf` under pandas Copy-on-Write.
tdf['total_explained'] = tdf['total_explained'].fillna(0).astype(np.int16)
tdf['10_recent_mean_gap'] = tdf['10_recent_mean_gap'].fillna(0).astype(np.float32)
tdf['10_recent_correctness'] = tdf['10_recent_correctness'].fillna(0).astype(np.float32)
# NOTE(review): only NaN is handled here; an infinite mean_elapsed would reach the
# integer cast unchanged — confirm it cannot occur.
tdf['mean_elapsed'] = tdf['mean_elapsed'].fillna(0).astype(np.int16)
gc.collect()

In [None]:
# mean_gap: timestamp divided by the per-user container ordinal, then by 1000.
# Division by an ordinal of 0 gives inf/NaN; both are mapped to 0. The column is
# built in one expression instead of tdf[col].fillna(..., inplace=True), which
# does not update `tdf` under pandas Copy-on-Write.
tdf['mean_gap'] = (
    (tdf['timestamp'] / tdf['task_container_id_sorted'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
tdf['mean_gap'] = (tdf['mean_gap'] / 1000).astype(np.uint32)
tdf.drop(['task_container_id_sorted'], axis = 1, inplace = True)
gc.collect()

In [None]:
# Sort key: user_id / 50 plus timestamp / 1000, rounded to a whole number.
# NOTE(review): presumably a synthetic "absolute time" that staggers users' start
# times — confirm.
tdf['abs_time'] = (tdf['user_id'] / 50 + tdf['timestamp'] / 1000).round().astype(np.int32)
tdf = tdf.drop(columns = ['timestamp'])
gc.collect()

In [None]:
# Order the rows by abs_time (ties broken by user_id), renumber them, and drop the
# helper column once it has served as the sort key.
tdf = tdf.sort_values(by = ['abs_time', 'user_id'], ignore_index = True)
tdf = tdf.drop(columns = ['abs_time'])
gc.collect()

In [None]:
# Append the precomputed bundle elapsed-time columns, joined on the row index.
# NOTE(review): assumes that file's rows are in the same (sorted) order as `tdf` — confirm.
tdf = pd.concat(
    (tdf, read_feather('../input/feather-data/bundle_elapsed_time_mean.feather')),
    axis = 1,
)
gc.collect()

In [None]:
# Cut the prepared table into four consecutive row ranges (25M rows each, the last
# one taking the remainder) and write each range to its own feather file.
tdf1, tdf2, tdf3, tdf4 = (
    tdf[lower:upper]
    for lower, upper in (
        (None, 25000000),
        (25000000, 50000000),
        (50000000, 75000000),
        (75000000, None),
    )
)
del tdf
gc.collect()
# The first part already starts at index 0; the others are renumbered from 0
# before being written.
for later_part in (tdf2, tdf3, tdf4):
    later_part.reset_index(drop = True, inplace = True)
for part_frame, part_file in (
    (tdf1, 'trainlgb_prep_1.feather'),
    (tdf2, 'trainlgb_prep_2.feather'),
    (tdf3, 'trainlgb_prep_3.feather'),
    (tdf4, 'trainlgb_prep_4.feather'),
):
    part_frame.to_feather(part_file)