- add user_id
- change learning rate

In [1]:
import sys
import copy
import random
import feather
import warnings
import numpy as np
import pandas as pd
from time import time
import lightgbm as lgb
from collections import Counter
from sklearn import preprocessing
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt 
# `tqdm._tqdm_notebook` is deprecated in favour of the public `tqdm.notebook`
# module; importing from the public path avoids the deprecation warning.
from tqdm.notebook import tqdm_notebook

import riiideducation
# NOTE(review): this silences every warning for the rest of the notebook
# (including pandas' chained-assignment warnings) — consider narrowing it.
warnings.filterwarnings("ignore")
# Registers `progress_apply` on pandas objects with the given bar description.
tqdm_notebook.pandas(desc="progress: ")

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`


In [2]:
# Root folder of the competition CSVs; the training log itself is read from a
# separate location where it is stored in feather format.
DIR = "../input/riiid-test-answer-prediction/"
train = pd.read_feather("../input/riid-train/train.feather")
lectures = pd.read_csv(f"{DIR}lectures.csv")
questions = pd.read_csv(f"{DIR}questions.csv")

In [3]:
# Distinct lecture ids, captured before the frame is one-hot encoded.
lecture_id = list(lectures["lecture_id"].unique())

# Replace the one label that contains a space so the resulting dummy column
# gets an underscore name, then one-hot encode the lecture type.
lectures["type_of"] = lectures["type_of"].replace("solving question", "solving_question")
lectures = pd.get_dummies(lectures, columns=["type_of"])

# Names of the dummy columns created above.
types_columns = lectures.columns[lectures.columns.str.startswith("type_of_")].tolist()

In [4]:
# Sanity check: (rows, columns) of the training frame as loaded.
train.shape

(101230332, 10)

# preprocess

In [5]:
# executed in advance
# Order each user's rows by timestamp; the per-user shift/cumulative features
# and the `tail(n)` selections further down depend on this ordering.
train = train.sort_values(['user_id', 'timestamp']).reset_index(drop=True)

# Rows logged per user, used at inference time to offset the per-user counter.
# NOTE(review): this is computed before the `answered_correctly == -1` rows
# are filtered out, so those rows are included in the count.
user_try_count = (
    train.groupby("user_id")["row_id"]
    .count()
    .rename("max_count")
    .reset_index()
)

# `row_id` was only needed for the count above; drop it together with the
# other columns that are not used as features.
train.drop(columns=["user_answer", "timestamp", "row_id", "task_container_id"], inplace=True)

In [6]:
print("consider lecture data")
# Disabled experiment: per-user lecture ratio and lecture-type features.
#user_lec_df = train.groupby('user_id').agg({'content_type_id': ['mean']}).copy()
#user_lec_df.columns = ["lec_ratio"]

#train["lec"] = train.groupby(["user_id"])['content_type_id'].progress_apply(lambda x: x.shift(1))
#train.drop(["content_type_id"], axis=1, inplace=True)

#train = train.merge(lectures[["lecture_id"]+types_columns], how="left", left_on=['content_id'], right_on=['lecture_id']).drop(columns='lecture_id')
#user_lec_type = train.groupby("user_id").agg("mean")[types_columns]
#user_lec_type = user_lec_type.fillna(0)

print("remove lecture data")
# Rows with answered_correctly == -1 carry no label to learn from.
train = train.loc[train['answered_correctly'] != -1].reset_index(drop=True)

print("merge question data")
# https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names
# Join on the key columns only. Passing `right_index=True` together with
# `right_on` is contradictory: depending on the pandas version it either raises
# a MergeError or silently joins against the positional index of `questions`,
# which is only correct while that index happens to equal `question_id`.
train = train.merge(
    questions[["question_id", "part"]],
    how="left",
    left_on="content_id",
    right_on="question_id",
).reset_index(drop=True)
train.drop(["question_id"], axis=1, inplace=True)

print("groupby calculation")
# Overall per-user accuracy, indexed by user_id; merged into test batches by
# `get_data(..., test_set=True)` as the `acc_before_ans` feature.
user_answers_df = train.groupby('user_id')['answered_correctly'].mean().to_frame("acc_before_ans")

# Disabled experiments: accuracy over the last 2/3 answers, per-part accuracy,
# and lecture ratio.
#user_answers_past2 = pd.DataFrame(train.groupby("user_id").tail(2).groupby("user_id").agg("mean")["answered_correctly"])
#user_answers_past2.columns = ["mean_of_last_2_answered_correctly_by_user_id"]
#user_answers_past3 = pd.DataFrame(train.groupby("user_id").tail(3).groupby("user_id").agg("mean")["answered_correctly"])
#user_answers_past3.columns = ["mean_of_last_3_answered_correctly_by_user_id"]

#user_part_accuracy = train.groupby(['user_id', 'part']).agg({'answered_correctly': ['mean']}).copy().unstack()
#user_part_accuracy.columns = ["part1_acc", "part2_acc", "part3_acc", "part4_acc", "part5_acc", "part6_acc", "part7_acc"]

#user_lec_ratio = train.groupby(['user_id']).agg({'lec': ['mean']}).copy()
#user_lec_ratio.columns = ["lec"]

# Number of remaining (labelled) rows per user; later mapped onto `user_id`.
freq = train["user_id"].value_counts()

# Hard-coded accuracies, one value per part (7 entries); not referenced by the
# code visible in this notebook.
mean_accuracy_by_part = np.array([0.74503238, 0.70869406, 0.70145593, 0.63099758, 0.61008792, 0.66938835, 0.65960112])

consider lecture data
remove lecture data
merge question data
groupby calculation


In [7]:
# Mean correctness per content item; items answered correctly less than 30%
# of the time are flagged as "difficult".
content_accuracy = train.groupby("content_id")["answered_correctly"].mean()
difficult_content = list(content_accuracy[content_accuracy < 0.3].index)
del content_accuracy

In [8]:
#train = train.iloc[90007298:,:]
#train['task_container_id'] = train.groupby('user_id')['task_container_id'].transform(lambda x: pd.factorize(x)[0]).astype('int16')
#train["lec"] = train["lec"].fillna(0)
# Frame shape and number of distinct users after preprocessing.
print(f"{train.shape} {train['user_id'].nunique(dropna=False)}")
train.head(10)

(99271300, 7) 393656


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part
0,115,5692,0,1,4294967295,,5
1,115,5716,0,1,37000,False,5
2,115,128,0,1,55000,False,1
3,115,7860,0,1,19000,False,1
4,115,7922,0,1,11000,False,1
5,115,156,0,1,5000,False,1
6,115,51,0,1,17000,False,1
7,115,50,0,1,17000,False,1
8,115,7896,0,1,16000,False,1
9,115,7863,0,1,16000,False,1


# Feature engineering

In [9]:
%%time
def get_data(df, test_set=False):
    """Add the engineered model features to ``df`` and return the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction rows. Must contain ``user_id``, ``content_id`` and
        ``prior_question_had_explanation``; the training branch also needs
        ``answered_correctly`` and the inference branch needs ``timestamp``.
        Rows are expected to already be in per-user chronological order.
    test_set : bool, default False
        ``False`` builds features from the labels in ``df`` itself (training);
        ``True`` looks features up from the module-level aggregates
        ``user_answers_df``, ``user_try_count`` and ``freq`` (inference).

    Returns
    -------
    pandas.DataFrame
        Training branch: the same object, mutated in place. Inference branch:
        a new frame in which ``user_id`` has been replaced by the user's
        frequency from ``freq`` (1 for unseen users) and ``timestamp`` has
        been dropped.

    Notes
    -----
    Reads the module-level ``difficult_content`` collection in both branches.
    Every remaining NaN, in every column, is filled with 0.5 at the end.
    """
    if not test_set:
        # Running accuracy over the user's *previous* answers: shift by one
        # within each user so a row never sees its own label.
        previous_answer = df.groupby('user_id')["answered_correctly"].shift()
        previous_by_user = previous_answer.groupby(df['user_id'])
        # A user's first row is NaN / 0 -> NaN and is filled with 0.5 below.
        df['acc_before_ans'] = previous_by_user.cumsum() / previous_by_user.cumcount()

        df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(value = False).astype(bool)
        # 1-based position of the row within its user's history.
        df["count"] = df.groupby("user_id").cumcount() + 1

    else:
        # `user_answers_df` is indexed by user_id, hence right_index=True here.
        df = df.merge(user_answers_df, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True)
        # `user_try_count` keeps user_id as a regular column, so join on that
        # column. Combining `on=` with `right_index=True` would match user ids
        # against its positional index (or raise MergeError on newer pandas).
        df = df.merge(user_try_count, how="left", on="user_id").reset_index(drop=True)
        df["max_count"] = df["max_count"].fillna(0)
        # Continue the per-user counter from where the training data stopped.
        df["count"] = df.groupby("user_id").cumcount() + 1
        df["count"] += df["max_count"]
        # Replace the raw id by the user's frequency; unseen users get 1.
        df["user_id"] = df["user_id"].map(freq)
        df["user_id"] = df["user_id"].fillna(1)
        df.drop(["timestamp","max_count"], axis=1, inplace=True)

    # Vectorised membership test instead of a per-row Python scan of the list.
    df["difficult_content"] = df["content_id"].isin(difficult_content).astype("int64")
    # NOTE(review): this fills NaN in *every* column with 0.5, not only in
    # acc_before_ans — confirm that is intended for the other features.
    df.fillna(value = 0.5, inplace = True) #0?
    return df
# Build the training features (test_set defaults to False, i.e. the training branch).
train = get_data(train)

CPU times: user 6min 52s, sys: 15.9 s, total: 7min 8s
Wall time: 7min 9s


In [10]:
# Label column and the feature columns fed to the model.
target = 'answered_correctly'
columns = [
    "acc_before_ans",
    'content_id',
    'part',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
    "difficult_content",
    "count",
    "user_id",
]

# Everything that is neither a feature nor the label is removed in place.
drop_columns = [name for name in train.columns if name not in (*columns, target)]
train.drop(columns=drop_columns, inplace=True)

In [11]:
# Keep only the last 24 rows of every user (rows were sorted by timestamp
# during preprocessing), then hold out each user's final 6 rows for validation.
train = train.groupby('user_id').tail(24).reset_index(drop=True)
# `.copy()` makes `valid` own its data, so the column assignment below is not
# a write into a slice of `train` (a chained-assignment hazard that the global
# warnings filter would otherwise hide).
valid = train.groupby('user_id').tail(6).copy()
train.drop(valid.index, inplace=True)

# Replace the raw user id by that user's row count from `freq`, mirroring what
# `get_data(..., test_set=True)` does at inference time.
train["user_id"] = train["user_id"].map(freq)
valid["user_id"] = valid["user_id"].map(freq)

In [12]:
# (rows, columns) of the training and validation frames after the split.
train.shape, valid.shape

((6536675, 9), (2360984, 9))

In [13]:
# Columns remaining in the training frame.
train.columns

Index(['user_id', 'content_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation', 'part',
       'acc_before_ans', 'count', 'difficult_content'],
      dtype='object')

# modelling

In [14]:
# Features LightGBM should treat as categorical rather than ordinal.
categoricals = ["content_id", "part", "difficult_content"]
# 'eval_metric' was removed from the dict: it is an argument of the
# scikit-learn wrapper's fit(), not a native LightGBM parameter, so lgb.train
# ignored it. The evaluation metric is selected by "metric".
params = {
    'objective': 'binary', "metric": 'auc', 'boosting_type': 'gbdt',
    'tree_learner': 'serial', 'learning_rate': 0.09, "num_leaves": 80,
}

features_list = list(columns)
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])

X_train = train[columns]
y_train = train[target]

X_valid = valid[columns]
y_valid = valid[target]

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Up to 550 boosting rounds; stop once the validation AUC has not improved for
# 50 rounds, logging the metric every 50 rounds.
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval],
               num_boost_round=550, early_stopping_rounds=50, verbose_eval=50, categorical_feature=categoricals)
# Importances are returned in the column order of X_train, i.e. `columns`.
feature_importance_df["Importance"] = model.feature_importance()

valid_predict = model.predict(X_valid, num_iteration=model.best_iteration)

feature_importance_df = feature_importance_df.sort_values("Importance", ascending=False).reset_index(drop=True)

score = roc_auc_score(y_valid, valid_predict)
print("Overall ROC AUC: ", score)

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.750803	valid_1's auc: 0.729521
[100]	training's auc: 0.759325	valid_1's auc: 0.735028
[150]	training's auc: 0.762916	valid_1's auc: 0.735941
[200]	training's auc: 0.765152	valid_1's auc: 0.735993
Early stopping, best iteration is:
[180]	training's auc: 0.764358	valid_1's auc: 0.736058
Overall ROC AUC:  0.7360577756628861


In [15]:
# Feature importances, sorted in descending order by the modelling cell.
feature_importance_df

Unnamed: 0,Feature,Importance
0,content_id,12632
1,acc_before_ans,470
2,count,269
3,prior_question_elapsed_time,260
4,user_id,259
5,part,179
6,difficult_content,77
7,prior_question_had_explanation,74


# prediction

In [16]:
env = riiideducation.make_env()
iter_test = env.iter_test()

# The iterator yields one batch of test rows at a time; a prediction must be
# submitted through env.predict for each batch.
for (test_df, sample_prediction_df) in iter_test:
    # Join on the key columns only. Passing `right_index=True` together with
    # `right_on` either raises a MergeError or silently matches against the
    # positional index of `questions`, depending on the pandas version.
    test_df = test_df.merge(
        questions[["question_id", "part"]],
        how="left",
        left_on="content_id",
        right_on="question_id",
    ).reset_index(drop=True)
    test_df.drop(["question_id"], axis=1, inplace=True)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df = get_data(test_df, test_set=True)

    y_pred = model.predict(test_df[columns], num_iteration=model.best_iteration)

    test_df['answered_correctly'] = y_pred
    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])