- First modelling attempt.
- `num_boost_round` is set to 1000 because training was time-consuming.

In [1]:
import sys
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import lightgbm as lgb
from functools import partial
import json
import copy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from hyperopt import hp, tpe, Trials, fmin, space_eval
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")
import random
import feather
import riiideducation

In [2]:
# Competition data directory; the trailing slash is relied on when building paths.
DIR = "../input/riiid-test-answer-prediction/"

# The training table is read from a separate feather dataset.
train = pd.read_feather("../input/riid-train/train.feather")

# Auxiliary tables shipped with the competition data.
lectures = pd.read_csv(f"{DIR}lectures.csv")
questions = pd.read_csv(f"{DIR}questions.csv")
example_test = pd.read_csv(f"{DIR}example_test.csv")

# Preprocessing

In [3]:
# Drop rows whose target is -1 (presumably lecture rows -- confirm against the
# dataset description) and renumber the index.
has_answer = train['answered_correctly'] != -1
train = train.loc[has_answer].reset_index(drop=True)
train = train.drop(columns=['timestamp', 'content_type_id'])

# Missing "had explanation" flags are treated as False.
explanation_flag = train['prior_question_had_explanation'].fillna(value=False)
train['prior_question_had_explanation'] = explanation_flag.astype(bool)

# Per-user mean of the target and number of answered rows, indexed by user_id.
user_answers_df = train.groupby('user_id')['answered_correctly'].agg(['mean', 'count'])
user_answers_df.columns = ['mean_user_accuracy', 'questions_answered']

# Per-content mean of the target and number of rows, indexed by content_id.
content_answers_df = train.groupby('content_id')['answered_correctly'].agg(['mean', 'count'])
content_answers_df.columns = ['mean_accuracy', 'question_asked']

# user_content_answers_df = train.groupby(['user_id', 'content_id']).agg({'answered_correctly': ['mean', 'count']}).copy()
# user_content_answers_df.columns = ['mean_user_content_accuracy', 'content_questions_answered']

In [4]:
# Fit the encoder on the boolean flag, then replace the column with its integer codes.
# `le` is kept at module level because the prediction cell uses it again.
le = preprocessing.LabelEncoder()
le.fit(train["prior_question_had_explanation"])
train["prior_question_had_explanation"] = le.transform(train["prior_question_had_explanation"])

In [5]:
# Positional row offset at which the modelling subset starts. Rows before this
# offset are discarded from `train`; they still contribute to the aggregate
# tables (user_answers_df / content_answers_df), which were computed earlier.
TRAIN_START_ROW = 90000000
train = train.iloc[TRAIN_START_ROW:, :]

# Feature engineering

In [6]:
def fe(df, user_stats=None, content_stats=None):
    """Attach per-user and per-content accuracy statistics to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``content_id`` columns. It is not
        modified; a new frame is returned.
    user_stats : pandas.DataFrame, optional
        Table keyed by ``user_id`` that provides ``mean_user_accuracy``.
        Defaults to the module-level ``user_answers_df``.
    content_stats : pandas.DataFrame, optional
        Table keyed by ``content_id`` that provides ``mean_accuracy``.
        Defaults to the module-level ``content_answers_df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with both tables plus a ``mean_diff1`` column
        (``mean_user_accuracy - mean_accuracy``). Keys missing from a
        statistics table yield NaN in the joined columns.
    """
    # Fall back to the notebook-level tables so existing ``fe(df)`` calls keep working.
    if user_stats is None:
        user_stats = user_answers_df
    if content_stats is None:
        content_stats = content_answers_df

    # Left joins keep every input row, including unseen users or contents.
    df = df.merge(user_stats, how='left', on='user_id')
    df = df.merge(content_stats, how='left', on='content_id')
    # df = df.merge(user_content_answers_df, how = 'left', on = ['user_id', 'content_id'])
    df['mean_diff1'] = df['mean_user_accuracy'] - df['mean_accuracy']
    # df['mean_diff2'] = df['mean_accuracy'] - df['mean_user_content_accuracy']
    return df
        
# Join the per-user and per-content statistics onto the training rows.
train = fe(train)

In [7]:
# Fill every remaining missing value with 0.5, drop the row identifier,
# and order the rows by user.
train = (
    train
    .fillna(value=0.5)
    .drop(columns="row_id")
    .sort_values(['user_id'])
)

In [8]:
# Name of the label column.
target = "answered_correctly"

# Feature columns passed to the model, in this order.
columns = [
    "mean_user_accuracy",
    "questions_answered",
    "mean_accuracy",
    "question_asked",
    "mean_diff1",
]

# Modelling

In [9]:
def modelling_lgb(X_train, y_train, n_folds=5, num_boost_round=1000,
                  early_stopping_rounds=10, verbose_eval=100, params=None):
    """Train one LightGBM binary classifier per stratified CV fold.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Binary target, positionally aligned with ``X_train``.
    n_folds : int, default 5
        Number of stratified, shuffled folds (fixed ``random_state=0``).
    num_boost_round : int, default 1000
        Maximum number of boosting iterations per fold.
    early_stopping_rounds : int, default 10
        Passed through to ``lgb.train``.
    verbose_eval : int, default 100
        Passed through to ``lgb.train``.
    params : dict, optional
        LightGBM parameters that override the defaults below.

    Returns
    -------
    list
        The fitted boosters, one per fold, in fold order.

    Notes
    -----
    Prints the ROC AUC of the out-of-fold predictions over all rows.
    """
    # 'eval_metric' is not a lgb.train parameter (the metric is set via 'metric'),
    # so it is not included here.
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'tree_learner': 'serial',
        'learning_rate': 0.01,
        'num_leaves': 10,
        'random_seed': 44,
        'max_depth': 5,
    }
    if params:
        lgb_params.update(params)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    models = []
    # Out-of-fold predictions: each row is predicted by the model that did not train on it.
    valid = np.zeros(X_train.shape[0])
    for train_index, test_index in skf.split(X_train, y_train):
        X_train2 = X_train.iloc[train_index, :]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index, :]
        y_valid2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
        # lgb.train in the LightGBM version this notebook ran on; newer releases
        # expect callbacks instead -- confirm before upgrading.
        clf = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=num_boost_round,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose_eval)

        valid[test_index] = clf.predict(X_valid2, num_iteration=clf.best_iteration)
        models.append(clf)

    score = roc_auc_score(y_train, valid)
    print(score)

    return models
# Fit the per-fold models on the selected feature columns and the target column.
models = modelling_lgb(train[columns], train[target])

Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.751529	valid_1's auc: 0.750954
[200]	training's auc: 0.753748	valid_1's auc: 0.753252
[300]	training's auc: 0.754974	valid_1's auc: 0.754504
[400]	training's auc: 0.755897	valid_1's auc: 0.755447
[500]	training's auc: 0.756447	valid_1's auc: 0.756002
[600]	training's auc: 0.756756	valid_1's auc: 0.756314
[700]	training's auc: 0.756941	valid_1's auc: 0.7565
[800]	training's auc: 0.757071	valid_1's auc: 0.75663
[900]	training's auc: 0.757174	valid_1's auc: 0.756732
[1000]	training's auc: 0.75725	valid_1's auc: 0.756806
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.75725	valid_1's auc: 0.756806
Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.751324	valid_1's auc: 0.751994
[200]	training's auc: 0.75354	valid_1's auc: 0.754118
[300]	training's auc: 0.754775	valid_1's auc: 0.755319
[400]	training's auc: 0.75572	valid_1's auc: 0.756231
[500]	trai

# Prediction

In [10]:
# The prediction cell below does not reference the training frame, so drop the name.
del train

In [11]:
env = riiideducation.make_env()
iter_test = env.iter_test()

# Each iteration yields one test batch and a sample-prediction frame (unused here).
for (test_df, sample_prediction_df) in iter_test:
    test_df = fe(test_df)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    # Keep only rows with content_type_id == 0; predictions are submitted for these rows.
    test_df = test_df.loc[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df.fillna(value = 0.5, inplace = True)
    # Reuse the encoder fitted on the training data. Re-fitting per batch would
    # give a batch containing a single value a different code than in training.
    test_df["prior_question_had_explanation"] = le.transform(test_df["prior_question_had_explanation"])

    # Average the predictions of the per-fold models.
    fold_preds = [
        model.predict(test_df[columns], num_iteration=model.best_iteration)
        for model in models
    ]
    y_preds = np.mean(fold_preds, axis=0)
    test_df['answered_correctly'] = y_preds

    # test_df was already filtered to content_type_id == 0 above, so no second filter is needed.
    env.predict(test_df[['row_id', 'answered_correctly']])