In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random
import feather
import riiideducation

In [2]:
data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

In [3]:
DIR = "../input/riiid-test-answer-prediction/"
#train = pd.read_feather("train.feather")
train = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', 
                       nrows=10**4,
                       usecols = data_types_dict.keys(),
                       dtype=data_types_dict, 
                       index_col = 0)
lectures = pd.read_csv(DIR+"lectures.csv")
questions = pd.read_csv(DIR+"questions.csv")
example_test = pd.read_csv(DIR+"example_test.csv")

In [4]:
lectures.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,24584,5,concept
1,100,22243,1,concept
2,185,7035,6,concept
3,192,31458,5,solving question
4,317,19653,5,solving question


In [5]:
tmp = train[train.user_id==115].copy()

In [6]:
train[train.user_id==115]

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,115,5692,0,1,3,1,,
1,56943,115,5716,0,2,2,1,36992.0,False
2,118363,115,128,0,0,0,1,55008.0,False
3,131167,115,7860,0,3,0,1,19008.0,False
4,137965,115,7922,0,4,1,1,11000.0,False
5,157063,115,156,0,5,2,1,5000.0,False
6,176092,115,51,0,6,0,1,16992.0,False
7,194190,115,50,0,7,3,1,16992.0,False
8,212463,115,7896,0,8,2,1,16000.0,False
9,230983,115,7863,0,9,0,1,16000.0,False


# Feature engineering
- reference: https://www.kaggle.com/taichin/final-my-model/notebook

In [7]:
all_mean_acc= train[train["answered_correctly"]!=-1]["answered_correctly"].sum() / train.shape[0]
all_mean_acc

0.6578

In [8]:
def feature_engineering(user_sample, test_set=False):
    #content_id_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    exp_false_count = user_sample[user_sample["prior_question_had_explanation"]==False].shape[0]
    exp_true_count = user_sample[user_sample["prior_question_had_explanation"]==True].shape[0]
    all_df = []
    features = {"user_id": 0}
    features["user_id"] = user_sample.iloc[0]["user_id"]
    features["lecture_time"] = user_sample[user_sample["answered_correctly"]==-1].shape[0]
    features["mean_accuracy"] = user_sample[user_sample["answered_correctly"]!=-1]["answered_correctly"].sum() / (user_sample.shape[0] - features["lecture_time"])
    features["timestamp_diff_ave"] = np.mean(user_sample["timestamp"].diff(1))
    features["timestamp_diff_std"] = np.std(user_sample["timestamp"].diff(1))
    features["prior_explanation_true_ratio"] = exp_true_count / (exp_true_count + exp_false_count)
    features["prior_question_elapsed_time_ave"] = np.mean(user_sample["prior_question_elapsed_time"].dropna().values)
    features["mean_acuracy_diff"] = features["mean_accuracy"] - all_mean_acc
    all_df.append(features)
    return all_df

In [9]:
def get_data(df):
    compiled_df = []

    for i, (ins_id, user_sample) in tqdm(enumerate(df.groupby('user_id', sort=False)), total=df.user_id.nunique(), desc='user_id', position=0):
        compiled_df += feature_engineering(user_sample)
    #del train
    #for ins_id, user_sample in tqdm(test.groupby('user_id', sort=False), total=test.user_id.nunique(), desc='user_id', position=0):
    #    test_data, val_data = get_data(user_sample, test_set=True)
    #    compiled_test.append(test_data)
    #    compiled_val += val_data
    #del test
    reduce_df = pd.DataFrame(compiled_df)
    #reduce_test = pd.DataFrame(compiled_test)
    #reduce_val = pd.DataFrame(compiled_val)

    #categoricals = ['session_title']
    return reduce_df
new_train = get_data(train)
new_train

HBox(children=(FloatProgress(value=0.0, description='user_id', max=21.0, style=ProgressStyle(description_width…




Unnamed: 0,user_id,lecture_time,mean_accuracy,timestamp_diff_ave,timestamp_diff_std,prior_explanation_true_ratio,prior_question_elapsed_time_ave,mean_acuracy_diff
0,115,0,0.695652,14846450.0,98224420.0,0.133333,19936.0,0.037852
1,124,0,0.233333,19700.79,26264.78,0.0,18800.0,-0.424467
2,2746,1,0.578947,43971.42,46275.72,0.578947,18048.0,-0.078853
3,5382,3,0.672,16547650.0,86589860.0,0.889764,inf,0.0142
4,8623,3,0.642202,7768817.0,35600770.0,0.864865,inf,-0.015598
5,8701,0,0.588235,98205.69,223997.6,0.5625,21376.0,-0.069565
6,12741,6,0.573585,16538840.0,192586300.0,0.925926,inf,-0.084215
7,13134,7,0.706356,14509240.0,223835800.0,0.98719,inf,0.048556
8,24418,181,0.690275,2203889.0,20785350.0,0.967198,inf,0.032475
9,24600,0,0.34,31649.61,40704.03,0.387755,22720.0,-0.3178


# modelling

In [10]:
def modelling_lgb(new_train, target):
    
    X_train = new_train.copy()
    y_train = target_train[target].copy()
    
    params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5} 
        
    n_folds=5
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    pred_value = np.zeros(X_test.shape[0])
    valid = np.zeros([X_train.shape[0]])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=10000,early_stopping_rounds=25,verbose_eval = 0) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
        #pred_value += clf.predict(X_test, num_iteration = clf.best_iteration) / n_folds
            
    score = roc_auc_score(y_train, valid)
            
    return valid, pred_value, score

# prediction