# *DATA PREPARATION*

In [None]:
import os
import sys
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import random
from pickle import dump, load

In [None]:
# Column -> dtype mappings, one per input table. They are shaped to be passed
# as the ``dtype`` argument of ``read_csv`` (presumably for train.csv,
# questions.csv and lectures.csv respectively -- confirm against the callers).

# Interaction log columns.
TRAIN_DTYPES = {
    # identifiers / timing
    'row_id': np.uint64,
    'timestamp': np.int64,
    'user_id': np.uint64,
    'content_id': np.uint16,
    'content_type_id': np.int8,
    'task_container_id': np.uint16,
    # answers (signed types)
    'user_answer': np.int8,
    'answered_correctly': np.int8,
    # information about the previous question
    'prior_question_elapsed_time': np.float32,
    # string alias of a pandas extension dtype, not a numpy type
    'prior_question_had_explanation': 'boolean',
}

# Question metadata columns.
QUESTION_DTYPES = {
    'question_id': np.uint16,
    'bundle_id': np.uint16,
    'correct_answer': np.int8,
    'part': np.int8,
    'tags': str,
}

# Lecture metadata columns.
LECTURE_DTYPES = {
    'lecture_id': np.uint16,
    'tag': np.uint16,
    'part': np.int8,
    'type_of': str,
}

In [None]:
def read_csv(file_name="train.csv", dtype=None, skiprows=None, nrows=None, usecols=None):
    """Load a CSV file into a DataFrame.

    The first line of the file is always treated as the header row and the
    file is parsed with ``low_memory=True``. ``dtype``, ``skiprows``,
    ``nrows`` and ``usecols`` are forwarded unchanged to ``pandas.read_csv``.

    Returns the resulting ``pandas.DataFrame``.
    """
    parser_options = {
        'dtype': dtype,
        'skiprows': skiprows,
        'nrows': nrows,
        'usecols': usecols,
        'header': 0,
        'low_memory': True,
    }
    return pd.read_csv(file_name, **parser_options)

def read_feather(file_name="../input/feather-data/train.feather"):
    """Load a Feather file from ``file_name`` and return it as a DataFrame."""
    return pd.read_feather(file_name)

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
def _new_user_stats():
    """Return the initial per-user accumulator.

    'cor'/'tot' are overall correct/total counts; the integer keys 1..7 and
    the 'lis'/'read' keys each hold a ``[correct, total]`` pair.
    """
    return {'cor': 0, 'tot': 0, 1: [0, 0], 2: [0, 0], 3: [0, 0], 4: [0, 0],
            5: [0, 0], 6: [0, 0], 7: [0, 0], 'lis': [0, 0], 'read': [0, 0],
            'mean_gap': 0, 'total_explained': 0}


def _safe_ratio(correct, total):
    """Return ``correct / total``, or -1 when ``total`` is zero."""
    return correct / total if total else -1


def _as_count_column(counts):
    """Convert a list of non-negative counts to the narrowest unsigned array.

    uint16 is used whenever every value fits (the dtype this column always
    had); wider types are chosen only when a count would otherwise wrap
    around silently.
    """
    largest = max(counts) if counts else 0
    for candidate in (np.uint16, np.uint32):
        if largest <= np.iinfo(candidate).max:
            return np.asarray(counts, dtype=candidate)
    return np.asarray(counts, dtype=np.uint64)


def feat_en_baseline(df, i, c_percent_dict, u_percent_dict, t_percent_dict):
    """Add running accuracy/attempt features to one chunk and save it.

    Rows are processed in their stored order. For every row the features are
    taken from the accumulators *before* that row's own outcome is added, so
    each feature only reflects earlier rows (including rows from chunks that
    were processed previously with the same dictionaries).

    Args:
        df: DataFrame with the columns ``content_id``, ``t1``, ``user_id``,
            ``part``, ``answered_correctly``, ``mean_gap`` and
            ``total_explained``. It is modified in place.
            NOTE(review): ``t1`` looks like a per-question tag id -- confirm.
            ``part`` must be one of 1..7, otherwise a ``KeyError`` is raised.
        i: Chunk number; the output file is numbered ``i - 1``.
        c_percent_dict: ``content_id -> [correct, total]``; updated in place.
        u_percent_dict: ``user_id -> stats dict`` (see ``_new_user_stats``);
            updated in place.
        t_percent_dict: ``t1 -> [correct, total]``; updated in place.

    Added columns (in this order): ``u_chance``, ``u_attempts``, ``c_chance``,
    ``c_attempts``, ``u_part_chance``, ``u_part_attempts``,
    ``u_skill_chance``, ``u_skill_attempts``, ``t_chance``, ``t_attempts``.
    A ``*_chance`` value is -1 when there are no earlier attempts.
    ``*_attempts`` columns are uint16 unless a count exceeds 65535, in which
    case a wider unsigned dtype is used instead of wrapping around.

    Side effects:
        Writes ``trainlgb_feat-en_{i-1}.feather`` to the working directory.

    Returns:
        The tuple ``(c_percent_dict, u_percent_dict, t_percent_dict)``.
    """
    features = {
        'u_chance': [], 'u_attempts': [],
        'c_chance': [], 'c_attempts': [],
        'u_part_chance': [], 'u_part_attempts': [],
        'u_skill_chance': [], 'u_skill_attempts': [],
        't_chance': [], 't_attempts': [],
    }

    # Read each column once and walk the rows positionally. This avoids a
    # label lookup per cell and keeps the features aligned with the rows even
    # if the index is not 0..n-1.
    rows = zip(
        df['content_id'].to_numpy(),
        df['t1'].to_numpy(),
        df['user_id'].to_numpy(),
        df['part'].to_numpy(),
        df['answered_correctly'].to_numpy(),
        df['mean_gap'].to_numpy(),
        df['total_explained'].to_numpy(),
    )

    for cid, tid, uid, part, correct, mean_gap, total_explained in tqdm(rows, total=df.shape[0]):
        # Parts above 4 are bucketed as 'read', the rest as 'lis'.
        skill = 'read' if part > 4 else 'lis'

        if uid not in u_percent_dict:
            u_percent_dict[uid] = _new_user_stats()
        if cid not in c_percent_dict:
            c_percent_dict[cid] = [0, 0]
        if tid not in t_percent_dict:
            t_percent_dict[tid] = [0, 0]

        user = u_percent_dict[uid]
        content = c_percent_dict[cid]
        tag = t_percent_dict[tid]
        user_part = user[part]
        user_skill = user[skill]

        # Emit features from the state *before* this row is counted.
        features['u_attempts'].append(user['tot'])
        features['c_attempts'].append(content[1])
        features['t_attempts'].append(tag[1])
        features['u_part_attempts'].append(user_part[1])
        features['u_skill_attempts'].append(user_skill[1])

        features['u_chance'].append(_safe_ratio(user['cor'], user['tot']))
        features['c_chance'].append(_safe_ratio(content[0], content[1]))
        features['t_chance'].append(_safe_ratio(tag[0], tag[1]))
        features['u_part_chance'].append(_safe_ratio(user_part[0], user_part[1]))
        features['u_skill_chance'].append(_safe_ratio(user_skill[0], user_skill[1]))

        # Remember the latest per-user values carried by this row.
        user['mean_gap'] = mean_gap
        user['total_explained'] = total_explained

        # Now fold this row's outcome into the accumulators.
        content[1] += 1
        tag[1] += 1
        user['tot'] += 1
        user_part[1] += 1
        user_skill[1] += 1
        if correct:
            user['cor'] += 1
            content[0] += 1
            user_part[0] += 1
            user_skill[0] += 1
            tag[0] += 1

    # Move the lists into the frame one at a time, dropping each list as soon
    # as it has been stored to keep peak memory down.
    for name in list(features):
        values = features.pop(name)
        if name.endswith('_attempts'):
            df[name] = _as_count_column(values)
        else:
            df[name] = values
        del values
    gc.collect()

    # NOTE: the output file number is deliberately kept as ``i - 1``.
    df.to_feather('trainlgb_feat-en_{}.feather'.format(i-1))
    gc.collect()
    return c_percent_dict, u_percent_dict, t_percent_dict

def create_baseline_featen(chunk_ids=(1, 2, 3, 4)):
    """Run ``feat_en_baseline`` over every prepared chunk and save the stats.

    The chunks are processed in the given order with the same three
    accumulator dictionaries, so statistics carry over from one chunk to the
    next. When all chunks are done the dictionaries are pickled to
    ``c_percent_dict.pkl``, ``t_percent_dict.pkl`` and ``u_percent_dict.pkl``
    in the working directory.

    Args:
        chunk_ids: Numbers substituted into
            ``../input/riiid-data-processing-lgbm/trainlgb_prep_{}.feather``.
            Defaults to the four chunks 1..4.

    Returns:
        Always 0.
    """
    c_percent_dict = {}
    u_percent_dict = {}
    t_percent_dict = {}

    for i in chunk_ids:
        tdf = read_feather('../input/riiid-data-processing-lgbm/trainlgb_prep_{}.feather'.format(i))
        c_percent_dict, u_percent_dict, t_percent_dict = feat_en_baseline(
            tdf, i, c_percent_dict, u_percent_dict, t_percent_dict)
        # Drop the chunk before loading the next one to limit peak memory.
        del tdf
        gc.collect()

    # Use context managers so each pickle file is flushed and closed before
    # the next one is written; free each dictionary once it is on disk.
    with open('c_percent_dict.pkl', 'wb') as out_file:
        dump(c_percent_dict, out_file)
    del c_percent_dict
    gc.collect()

    with open('t_percent_dict.pkl', 'wb') as out_file:
        dump(t_percent_dict, out_file)
    del t_percent_dict
    gc.collect()

    with open('u_percent_dict.pkl', 'wb') as out_file:
        dump(u_percent_dict, out_file)
    return 0

In [None]:
# Run the whole feature-engineering pass. This reads the prepared feather
# chunks and writes the feature feather files plus three pickle files into the
# current working directory.
create_baseline_featen()