# *DATA PREPARATION*

In [None]:
import os
import sys
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import random
from pickle import dump, load

In [None]:
# Column name -> dtype mapping for the train table, in the shape accepted by
# read_csv(dtype=...). Not referenced by the other code in this section, which
# loads the train data from feather instead.
TRAIN_DTYPES = {
    'row_id': np.uint64,
    'timestamp': np.int64,
    'user_id': np.uint64,
    'content_id': np.uint16,
    'content_type_id': np.int8,
    'task_container_id': np.uint16,
    'user_answer': np.int8,
    'answered_correctly': np.int8,
    'prior_question_elapsed_time': np.float32,
    'prior_question_had_explanation': 'boolean'
}

# Column name -> dtype mapping passed to read_csv when loading questions.csv.
# 'tags' is read as a string and split on whitespace later by load_q_content.
QUESTION_DTYPES = {
    'question_id': np.uint16,
    'bundle_id': np.uint16,
    'correct_answer': np.int8,
    'part': np.int8,
    'tags': str
}

# Column name -> dtype mapping passed to read_csv when loading lectures.csv.
LECTURE_DTYPES = {
    'lecture_id': np.uint16,
    'tag': np.uint16,
    'part': np.int8,
    'type_of':str
}

# Lookup table that remaps a raw tag id (key) to a replacement id (value).
# load_q_content and load_lecture look tags up here and then add 1 to the result.
# NOTE(review): how this particular ordering was derived is not shown in this file.
T_MASK = {24: 0, 23: 1, 58: 2, 134: 3, 52: 4, 124: 5, 44: 6, 123: 7, 67: 8, 167: 9, 161: 10, 43: 11, 80: 12,
          46: 13, 28: 14, 103: 15, 94: 16, 186: 17, 26: 18, 180: 19, 50: 20, 182: 21, 31: 22, 6: 23, 15: 24,
          11: 25, 108: 26, 47: 27, 76: 28, 165: 29, 174: 30, 48: 31, 152: 32, 132: 33, 170: 34, 49: 35,
          181: 36, 159: 37, 145: 38, 73: 39, 64: 40, 1: 41, 7: 42, 16: 43, 57: 44, 21: 45, 95: 46, 72: 47,
          91: 48, 125: 49, 157: 50, 96: 51, 156: 52, 53: 53, 55: 54, 45: 55, 4: 56, 133: 57, 136: 58, 75: 59,
          39: 60, 89: 61, 65: 62, 117: 63, 173: 64, 83: 65, 8: 66, 166: 67, 25: 68, 168: 69, 79: 70, 3: 71,
          97: 72, 60: 73, 128: 74, 179: 75, 14: 76, 151: 77, 164: 78, 112: 79, 116: 80, 42: 81, 22: 82, 0: 83,
          127: 84, 160: 85, 147: 86, 19: 87, 32: 88, 183: 89, 12: 90, 9: 91, 86: 92, 109: 93, 175: 94, 10: 95,
          115: 96, 78: 97, 171: 98, 148: 99, 113: 100, 27: 101, 35: 102, 169: 103, 92: 104, 122: 105, 54: 106,
          114: 107, 18: 108, 17: 109, 56: 110, 107: 111, 90: 112, 163: 113, 126: 114, 29: 115, 66: 116,
          106: 117, 135: 118, 2: 119, 87: 120, 138: 121, 71: 122, 100: 123, 41: 124, 30: 125, 154: 126,
          102: 127, 84: 128, 81: 129, 37: 130, 146: 131, 185: 132, 155: 133, 176: 134, 143: 135, 121: 136,
          85: 137, 162: 138, 184: 139, 104: 140, 38: 141, 140: 142, 82: 143, 120: 144, 20: 145, 88: 146,
          141: 147, 119: 148, 139: 149, 150: 150, 98: 151, 62: 152, 33: 153, 144: 154, 158: 155, 74: 156,
          13: 157, 61: 158, 110: 159, 69: 160, 137: 161, 111: 162, 34: 163, 118: 164, 153: 165, 129: 166,
          178: 167, 105: 168, 177: 169, 36: 170, 172: 171, 142: 172, 63: 173, 101: 174, 59: 175, 5: 176,
          131: 177, 99: 178, 93: 179, 51: 180, 77: 181, 40: 182, 70: 183, 149: 184, 68: 185, 187: 186, 130: 187}

In [None]:
def read_csv(file_name = "train.csv", dtype = None, skiprows = None, nrows = None, usecols = None):
    """Read a CSV file into a DataFrame, taking the first line as the header.

    Args:
        file_name: Path of the CSV file.
        dtype: Optional column -> dtype mapping forwarded to pandas.
        skiprows: Optional rows to skip, forwarded to pandas.
        nrows: Optional maximum number of data rows to read.
        usecols: Optional subset of columns to load.

    Returns:
        The parsed pandas DataFrame.
    """
    reader_options = {
        'dtype': dtype,
        'skiprows': skiprows,
        'nrows': nrows,
        'usecols': usecols,
        'header': 0,
        'low_memory': True,
    }
    return pd.read_csv(file_name, **reader_options)

def read_feather(file_name = "../input/feather-data/train.feather"):
    """Load a feather file into a DataFrame.

    Args:
        file_name: Path of the feather file; defaults to the train table.

    Returns:
        The pandas DataFrame stored in the file.
    """
    return pd.read_feather(file_name)

In [None]:
## Compute ts_delta (timestamp difference to the previous question row) for the train data and save it to feather
def get_ts_delta():
    """Compute each question row's timestamp difference to the previous row and save it.

    Reads the default train feather file, keeps the rows with
    content_type_id == 0, and writes a single-column frame named 'ts_delta'
    to 'ts_delta.feather'. Negative differences are stored as -1, and a
    difference of 0 is replaced by the value found one or more rows earlier.

    Returns:
        0 once the feather file has been written.

    NOTE(review): raises IndexError if no rows remain after filtering
    (oidx[-1] on an empty list), and the while loop would not terminate if
    every difference were 0 - confirm neither can occur with the real data.
    """
    df = read_feather()
    df = df[df['content_type_id'] == 0]
    tsdf = df['timestamp'].astype(np.int64)
    del df
    # Build the positional order [n-1, 0, 1, ..., n-2]; reindexing with it and
    # then resetting the index yields the series shifted down by one row, with
    # the final value wrapped around into row 0.
    oidx = list(range(tsdf.shape[0]))
    last = [oidx[-1]]
    oidx = oidx[:-1]
    last.extend(oidx)
    del oidx
    gc.collect()
    tsdf.reset_index(drop = True, inplace = True)
    retsdf = tsdf.reindex(index=last)
    retsdf.reset_index(drop = True, inplace = True)
    # Difference between each row's timestamp and the previous row's timestamp.
    delta_tsdf = tsdf - retsdf
    # Any negative difference collapses to the sentinel -1
    # (presumably where a new user's history starts - TODO confirm).
    delta_tsdf[delta_tsdf < 0] = -1
    del tsdf
    del retsdf
    gc.collect()
    delta_tsdf = pd.DataFrame(delta_tsdf, dtype = np.int64)
    delta_tsdf.rename(columns = {'timestamp': 'ts_delta'}, inplace = True)
    # retsdf starts as an alias of delta_tsdf; the first reindex below makes it
    # a separate, shifted snapshot, so later writes to delta_tsdf do not alter it.
    retsdf = delta_tsdf
    # Loop while at least one cell is still exactly 0.
    while delta_tsdf[delta_tsdf == 0].notna().max()['ts_delta']:
        # Shift the snapshot one more row down each pass, so pass k exposes the
        # snapshot value from k rows earlier.
        retsdf = retsdf.reindex(index=last)
        retsdf.reset_index(drop = True, inplace = True)
        # Overwrite only the cells that are still 0 with the shifted values.
        delta_tsdf[delta_tsdf == 0] = retsdf[delta_tsdf == 0]
    delta_tsdf.to_feather('ts_delta.feather')
    return 0

In [None]:
## Load the train data, add bucketed timestamp-delta features, and split off answered_correctly as the label
from sklearn.preprocessing import RobustScaler, LabelEncoder
def ts_delta_feature():
    """Turn the saved ts_delta column into three capped, offset bucket features.

    Each feature is ts_delta floor-divided by a unit size, capped at a maximum,
    then shifted by +2: 'minute_delta' (unit 60000, cap 1440, int16),
    'day_delta' (unit 86400000, cap 30, int8) and 'month_delta'
    (unit 2592000000, cap 6, int8). The unit sizes match minutes, days and
    30-day months if ts_delta is in milliseconds - TODO confirm the unit.

    Returns:
        DataFrame with columns minute_delta, day_delta, month_delta (in that order).
    """
    raw_delta = read_feather('../input/feather-data/ts_delta.feather')

    def bucketize(unit_size, cap, dtype, column_name):
        # Floor-divide, clamp the upper end, shift by 2, cast, then rename.
        units = raw_delta // unit_size
        units[units > cap] = cap
        shifted = (units + 2).astype(dtype)
        return shifted.rename(columns = {'ts_delta': column_name})

    minute_part = bucketize(60000, 1440, np.int16, 'minute_delta')
    day_part = bucketize(86400000, 30, np.int8, 'day_delta')
    month_part = bucketize(2592000000, 6, np.int8, 'month_delta')
    return pd.concat([minute_part, day_part, month_part], axis = 1)

def load_train(skiprows= None, nrows = None):
    """Prepare the question rows of the train table and write features and labels.

    Keeps rows with content_type_id == 0, drops 'user_answer' and
    'content_type_id', fills and casts the two prior_question_* columns, adds 1
    to four id/feature columns, and prepends row_id plus the bucketed
    timestamp-delta features from ts_delta_feature().

    Writes 'train_scaled.feather' (features) and 'label_nolec.feather'
    (the 'answered_correctly' column).

    Args:
        skiprows: Unused; kept so existing callers keep working.
        nrows: Unused; kept so existing callers keep working.

    Returns:
        0 once both feather files have been written.
    """
    df = read_feather("../input/feather-data/train.feather")
    # Build a fresh frame instead of mutating a filtered slice in place.
    df = df[df['content_type_id'] == 0].reset_index(drop = True)
    gc.collect()
    df = df.drop(columns = ['user_answer', 'content_type_id'])  # 'timestamp' is kept

    # Assign the filled column back: `df[col].fillna(..., inplace=True)` acts on a
    # temporary and silently does nothing under pandas Copy-on-Write.
    had_explanation = df['prior_question_had_explanation'].fillna(0)
    df['prior_question_had_explanation'] = had_explanation.astype(np.int8)
    elapsed = df['prior_question_elapsed_time'].fillna(23000)  # mean: 25300 med: 21000
    df['prior_question_elapsed_time'] = (elapsed // 1000).astype(np.int16)

    # Shift these columns up by one.
    # NOTE(review): presumably this reserves 0 for padding downstream - confirm.
    for col in ('content_id', 'task_container_id',
                'prior_question_elapsed_time', 'prior_question_had_explanation'):
        df[col] = df[col] + 1

    label_df = df.pop('answered_correctly')
    row_id = pd.DataFrame(df.pop('row_id'))
    # Relies on ts_delta.feather holding one row per question row, in the same order.
    tsdf = ts_delta_feature()
    df = pd.concat([row_id, tsdf, df], axis = 1)
    df.to_feather('train_scaled.feather')
    pd.DataFrame(label_df).to_feather('label_nolec.feather')
    return 0

In [None]:
## Load questions.csv, one-hot encode part, and preprocess tags
from sklearn.preprocessing import LabelBinarizer
def load_q_content():
    """Build the per-question feature table and write 'questions_processed.feather'.

    Steps: drop 'correct_answer'; label-encode 'bundle_id' and add 1; add 1 to
    'question_id'; derive the flags p_listen (part < 5), p_read (part >= 5),
    3_mul (part == 2) and 4_mul (part != 2); one-hot encode 'part' into p1..p7
    with the pickled encoder; remap each tag through T_MASK (+1), sort the ids
    in descending order, pad to six entries with 1, and keep the first three as
    t1..t3.

    Returns:
        0 once the feather file has been written.

    NOTE(review): the padding value 1 equals the encoded id of the tag that
    T_MASK maps to 0, and missing tags are filled with the string '0' (a real
    T_MASK key) - confirm both collisions are intended.
    """
    df = read_csv(file_name = '../input/riiid-test-answer-prediction/questions.csv', dtype = QUESTION_DTYPES)
    df = df.drop(columns = 'correct_answer')
    df['bundle_id'] = LabelEncoder().fit_transform(df['bundle_id']) + 1
    df['question_id'] = df['question_id'] + 1

    # Close the pickle file deterministically instead of leaking the handle.
    # NOTE: pickle can execute code on load; only use this with trusted files.
    with open('../input/feather-data/part_enc.pkl', 'rb') as encoder_file:
        part_encoder = load(encoder_file)

    df = df.fillna(value = '0')
    tag_content = df.pop('tags')
    part_content = df.pop('part')

    df['p_listen'] = (part_content < 5).astype(np.int8)
    df['p_read'] = (part_content >= 5).astype(np.int8)
    df['3_mul'] = (part_content == 2).astype(np.int8)
    df['4_mul'] = (part_content != 2).astype(np.int8)

    part_ohe = pd.DataFrame(part_encoder.transform(part_content), dtype = np.int8,
                            columns = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7'])
    df = pd.concat([df, part_ohe], axis = 1)

    parsed_tag = []
    for raw_tags in tag_content:
        tag_ids = sorted((T_MASK[int(tag)] + 1 for tag in raw_tags.split()), reverse = True)
        tag_ids.extend([1] * (6 - len(tag_ids)))  # pad up to six slots
        parsed_tag.append(tag_ids)
    parsed_tag = pd.DataFrame(parsed_tag, dtype = np.uint8, columns = ['t1', 't2', 't3', 't4', 't5', 't6'])
    parsed_tag = parsed_tag.drop(['t4', 't5', 't6'], axis = 1)

    df = pd.concat([df, parsed_tag], axis = 1)
    df.to_feather('questions_processed.feather')
    return 0

In [None]:
## Load lectures.csv, remap its tag ids, and one-hot encode type_of
from sklearn.preprocessing import LabelBinarizer
def load_lecture():
    """Build the per-lecture feature table and write 'lectures_processed.feather'.

    Replaces 'tag' with its T_MASK value plus 1 (uint8) and replaces 'type_of'
    with four one-hot columns ty1..ty4 produced by the pickled encoder.

    Returns:
        0 once the feather file has been written.
    """
    df = read_csv(file_name = "../input/riiid-test-answer-prediction/lectures.csv", dtype = LECTURE_DTYPES)
    tag_content = df.pop('tag')
    parsed_tag = pd.DataFrame([[T_MASK[int(tag)] + 1] for tag in tag_content],
                              dtype = np.uint8, columns = ['tag'])

    # Close the pickle file deterministically instead of leaking the handle.
    # NOTE: pickle can execute code on load; only use this with trusted files.
    with open('../input/feather-data/lecty_enc.pkl', 'rb') as encoder_file:
        type_encoder = load(encoder_file)

    type_ohe = pd.DataFrame(type_encoder.transform(df['type_of']), dtype = np.int8,
                            columns = ['ty1', 'ty2', 'ty3', 'ty4'])
    df = pd.concat([df, parsed_tag, type_ohe], axis = 1)
    df = df.drop(columns = ['type_of'])
    df.to_feather('lectures_processed.feather')
    return 0

In [None]:
##Split train data into 7 folds, group by uid
from sklearn.model_selection import GroupKFold
def split_data(df=None):
    """Split the prepared train rows and labels into 7 folds grouped by user_id.

    For each fold k the held-out rows are written to 'train_scaled_k.feather'
    and their labels to 'label_data_k.feather', both with a fresh 0..n-1 index.

    Args:
        df: Prepared train frame; loaded from 'train_scaled.feather' when None.

    Returns:
        0 once all fold files have been written.
    """
    if df is None:
        df = read_feather('train_scaled.feather')
    splitter = GroupKFold(n_splits=7)
    labels = read_feather('label_nolec.feather')
    labels = labels.loc[labels['answered_correctly'] != -1, 'answered_correctly']
    fold_iter = splitter.split(df['row_id'], labels, df['user_id'])
    for fold, (_, held_out) in enumerate(fold_iter):
        fold_rows = df.iloc[held_out].reset_index(drop = True)
        fold_rows.to_feather('train_scaled_{}.feather'.format(fold))
        fold_labels = labels.iloc[held_out].reset_index(drop = True)
        pd.DataFrame(fold_labels).to_feather('label_data_{}.feather'.format(fold))
    return 0

In [None]:
## Create the per-user lecture dict from the lecture rows in the train data
def get_train_lecture():
    """Return the lecture rows (content_type_id == 1) of the train table.

    Only the columns not listed in `unneeded` remain, and the index is
    renumbered from 0.
    """
    unneeded = ['prior_question_elapsed_time', 'prior_question_had_explanation',
                'content_type_id', 'task_container_id', 'user_answer', 'answered_correctly']
    df = read_feather()
    df = df[df['content_type_id'] == 1]
    gc.collect()
    return df.drop(columns = unneeded).reset_index(drop = True)

def create_trainlec_dict():
    """Group every watched lecture by user and lecture part, then pickle the result.

    The result has the shape
    {user_id: {part: {'tt': [tag, ...], 'lt': [type index, ...],
                      'ts': [timestamp, ...], 'rid': [row_id, ...]}}}
    with the four lists appended in the row order of the lecture rows. 'lt' is
    the position of the largest value among the ty1..ty4 one-hot columns.

    Writes 'trainlec_dict.pkl' (the original author noted 149606 users).

    Returns:
        The nested dict described above.
    """
    df_trainlec = get_train_lecture()
    df_lec = read_feather('lectures_processed.feather')

    # Per-lecture lookups are computed once here rather than re-slicing the
    # DataFrame for every train row.
    lec_pos = {lid: pos for pos, lid in enumerate(df_lec['lecture_id'].to_numpy())}
    lec_part = df_lec['part'].to_numpy()
    lec_tag = df_lec['tag'].to_numpy()
    lec_type = np.argmax(df_lec[['ty1', 'ty2', 'ty3', 'ty4']].to_numpy(), axis = 1)

    rows = zip(df_trainlec['user_id'].to_numpy(), df_trainlec['content_id'].to_numpy(),
               df_trainlec['timestamp'].to_numpy(), df_trainlec['row_id'].to_numpy())
    trainlec_dict = {}
    for uid, lid, timestamp, row_id in tqdm(rows, total = df_trainlec.shape[0]):
        pos = lec_pos[lid]
        by_part = trainlec_dict.setdefault(uid, {})
        history = by_part.setdefault(lec_part[pos], {'tt':[], 'lt':[], 'ts':[], 'rid':[]})
        history['tt'].append(lec_tag[pos])
        history['lt'].append(lec_type[pos])
        history['ts'].append(timestamp)
        history['rid'].append(row_id)

    # Use a context manager so the pickle is flushed and closed deterministically.
    with open('trainlec_dict.pkl', 'wb') as out_file:
        dump(trainlec_dict, out_file)
    return trainlec_dict

In [None]:
##Concat train data with lecture data for each question.
from bisect import bisect
def get_train_q(fold):
    """Load the prepared train rows of one fold, print their shape, and renumber the index.

    Args:
        fold: Fold number inserted into the feather file name.

    Returns:
        The fold's DataFrame with a fresh 0..n-1 index.
    """
    fold_path = '../input/riiid-data-processing/train_scaled_{}.feather'.format(fold)
    fold_df = read_feather(fold_path)
    print(fold_df.shape)
    gc.collect()
    return fold_df.reset_index(drop = True)

def create_lecture_dep(fold):
    """Attach lecture-history features to every question row of one fold.

    For each row the function looks up the lectures the same user watched in
    the same part as the question, at or before the row's timestamp, and emits:

    * 'with'      - 2 if the row shares task_container_id with the row directly
                    before or after it, else 1.
    * 'numlect'   - 1 plus the number of such lectures seen so far (default 1).
    * 'llecty1'..'llecty4' - one-hot of the most recent such lecture's type
                    (default all 0).
    * 'hour_past' - whole hours since that lecture plus 2, capped at 723
                    (default 1). Assumes timestamps are in milliseconds - TODO confirm.

    The lecture tag is deliberately not emitted. Per-user counters reset the
    first time a user_id is seen, so each user's rows are assumed to be
    contiguous within the fold.

    Writes 'train_processed_<fold>.feather'.

    Args:
        fold: Fold number used for the input and output file names.

    Returns:
        (tdf, lect_dep_df, user_range): the fold frame with the new columns
        appended, the lecture feature frame alone, and a dict mapping each
        user_id to [first row_id, last row_id] seen in this fold.
    """
    part_df = read_csv(file_name = '../input/riiid-test-answer-prediction/questions.csv', dtype = QUESTION_DTYPES)
    # Only question_id and part are needed; keep them as arrays for fast lookup.
    question_ids = part_df['question_id'].to_numpy()
    question_parts = part_df['part'].to_numpy()
    del part_df
    gc.collect()

    tdf = get_train_q(fold)
    n_rows = tdf.shape[0]
    lect_dep_df = pd.DataFrame(columns = ['numlect', 'llecty1', 'llecty2', 'llecty3', 'llecty4',
                                          'hour_past'],
                               index = range(n_rows))
    # NOTE: pickle can execute code on load; only use this with trusted files.
    with open('../input/feather-data/trainlec_dict.pkl', 'rb') as dict_file:
        trainlec_dict = load(dict_file)

    # Pull the columns out once; tdf has a fresh 0..n-1 index, so positional
    # access matches the label-based access used per row previously.
    task_ids = tdf['task_container_id'].to_numpy()
    user_ids = tdf['user_id'].to_numpy()
    row_ids = tdf['row_id'].to_numpy()
    content_ids = tdf['content_id'].to_numpy()
    timestamps = tdf['timestamp'].to_numpy()

    user_range = {}
    num_lect = [1]*7        # per part: 1 + lectures counted so far for the current user
    prev_idx_cur = [-1]*7   # per part: index of the last lecture already counted
    with_df = []
    for i in tqdm(range(n_rows)):
        shares_prev = i >= 1 and task_ids[i] == task_ids[i-1]
        shares_next = i+1 < n_rows and task_ids[i] == task_ids[i+1]
        with_df.append(2 if (shares_prev or shares_next) else 1)

        uid = user_ids[i]
        if uid not in user_range:
            # First row of a new user: start its row range and reset the counters.
            user_range[uid] = [row_ids[i], row_ids[i]]
            prev_idx_cur = [-1]*7
            num_lect = [1]*7
        user_range[uid][1] = row_ids[i]

        if uid not in trainlec_dict:
            continue
        cid = content_ids[i] - 1  # content_id is stored shifted by +1
        assert question_ids[cid] == cid, 'hmm somethings wrong' + str(cid)
        part = question_parts[cid]
        if part not in trainlec_dict[uid]:
            continue
        lectures = trainlec_dict[uid][part]

        ts_cur = timestamps[i]
        # Index of the latest lecture with timestamp <= ts_cur, or -1 if none.
        idx_cur = bisect(lectures['ts'], ts_cur) - 1
        if idx_cur == -1:
            continue
        if idx_cur != prev_idx_cur[part-1]:
            num_lect[part-1] += idx_cur - prev_idx_cur[part-1]

        tlast = [0]*4
        tlast[lectures['lt'][idx_cur]] = 1
        tsago = min((ts_cur - lectures['ts'][idx_cur])//3600000 + 2, 723)
        lect_dep_df.loc[i] = [num_lect[part-1]] + tlast + [tsago]
        prev_idx_cur[part-1] = idx_cur

    # Fill on the frame itself: `df[col].fillna(..., inplace=True)` acts on a
    # temporary and silently does nothing under pandas Copy-on-Write.
    lect_dep_df = lect_dep_df.fillna({'hour_past': 1, 'numlect': 1}).fillna(0)
    for col in lect_dep_df.columns:
        target_dtype = np.uint16 if col == 'hour_past' else np.uint8
        lect_dep_df[col] = lect_dep_df[col].astype(target_dtype)

    with_df = pd.DataFrame(with_df, columns = ['with'], dtype = np.int8)
    tdf = pd.concat([tdf, with_df, lect_dep_df], axis = 1)
    tdf.to_feather('train_processed_{}.feather'.format(fold))
    return tdf, lect_dep_df, user_range

def create_train_lec_dep_all(x):
    """Run create_lecture_dep for every fold number in `x`.

    The values returned for each fold are discarded and a garbage collection
    is forced before moving on to the next fold.
    """
    for fold in x:
        create_lecture_dep(fold)
        gc.collect()

In [None]:
##Merge train and questions data
def merge_train_and_q(fold):
    """Left-join one fold's train rows with the processed question features.

    Rows are matched on train 'content_id' == question 'question_id'; the
    'question_id' and 'row_id' columns are then removed and the result is
    written to 'train_dat_<fold>.feather'.

    Args:
        fold: Fold number used for the input and output file names.

    Returns:
        0 once the feather file has been written.
    """
    qdf = read_feather('questions_processed.feather')
    train_path = '../input/riiid-data-processing2/train_processed_{}.feather'.format(fold)
    tdf = read_feather(train_path)
    tdf = tdf.merge(qdf, how = 'left', left_on = 'content_id', right_on = 'question_id')
    gc.collect()
    tdf = tdf.drop(columns = ['question_id', 'row_id'])
    gc.collect()
    tdf.to_feather('train_dat_{}.feather'.format(fold))
    return 0

def merge_train_q_all(x):
    """Run merge_train_and_q for every fold number in `x`, collecting garbage after each."""
    for fold in x:
        merge_train_and_q(fold)
        gc.collect()

In [None]:
def merge_train_label(x):
    """Append each fold's label column to its merged feature table.

    For every fold number in `x`, reads 'train_dat_<fold>.feather' and the
    matching label file, places them side by side, and writes
    'train_dat_<fold>_plus.feather'.
    """
    for fold in x:
        features = read_feather('train_dat_{}.feather'.format(fold))
        labels = read_feather('../input/riiid-data-processing/label_data_{}.feather'.format(fold))
        combined = pd.concat([features, labels], axis = 1)
        combined.to_feather('train_dat_{}_plus.feather'.format(fold))

In [None]:
def merge_all_dat():
    """Stack the seven per-fold '*_plus' tables into one file.

    Reads 'train_dat_0_plus.feather' .. 'train_dat_6_plus.feather',
    concatenates them row-wise with a fresh index, and writes
    'all_train_dat_plus.feather'.

    Returns:
        0 once the feather file has been written.
    """
    fold_frames = [read_feather('train_dat_{}_plus.feather'.format(fold)) for fold in range(7)]
    df = pd.concat(fold_frames, axis = 0, ignore_index = True)
    # Dropping the list is what releases the per-fold frames; `del` on a loop
    # variable would only unbind that name and free nothing.
    del fold_frames
    df.to_feather('all_train_dat_plus.feather')
    return 0

In [None]:
def get_user_dict():
    """Map every user_id to the list of its row labels and pickle the mapping.

    Reads 'all_train_dat_plus.feather', groups it by 'user_id', and writes a
    plain dict {user_id: [row label, ...]} to 'user_dict.pkl'.

    Returns:
        0 once the pickle has been written.
    """
    tdf = read_feather('all_train_dat_plus.feather')
    groups = tdf.groupby('user_id').groups
    # Build a plain dict of lists so the pickle holds no pandas Index objects.
    user_dict = {uid: list(rows) for uid, rows in tqdm(groups.items(), total = len(groups))}
    # Use a context manager so the pickle is flushed and closed deterministically.
    with open('user_dict.pkl', 'wb') as out_file:
        dump(user_dict, out_file)
    return 0

In [None]:
# Base preparation stages. Only load_q_content() is enabled; the other stages
# are commented out (presumably because their outputs are already supplied as
# input datasets - TODO confirm before re-enabling).
# get_ts_delta()
# load_train()
load_q_content()
# load_lecture()
# split_data()
# create_trainlec_dict()

In [None]:
# Fold numbers to process; split_data produces 7 folds numbered 0..6.
x = [0,1,2,3,4,5,6]
# create_train_lec_dep_all(x)
# Join question features onto each fold, then append the labels. The order
# matters: merge_train_label reads the files merge_train_q_all writes.
merge_train_q_all(x)
merge_train_label(x)

In [None]:
# Stack all folds into one table, then index its rows by user. The order
# matters: get_user_dict reads the file merge_all_dat writes.
merge_all_dat()
get_user_dict()