In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py


In [2]:
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
used_data_types_dict = {
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id':'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

target = 'answered_correctly'

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       usecols = used_data_types_dict.keys(),
                       dtype=used_data_types_dict, nrows=10**6)

train_df = train_df[train_df['answered_correctly'] != -1].reset_index(drop=True)

print(train_df.dtypes)
print(train_df.shape)
train_df.head(10)

user_id                             int32
content_id                          int16
content_type_id                      int8
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(980093, 6)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,115,5692,0,1,,
1,115,5716,0,1,36992.0,False
2,115,128,0,1,55008.0,False
3,115,7860,0,1,19008.0,False
4,115,7922,0,1,11000.0,False
5,115,156,0,1,5000.0,False
6,115,51,0,1,16992.0,False
7,115,50,0,1,16992.0,False
8,115,7896,0,1,16000.0,False
9,115,7863,0,1,16000.0,False


row_id：行に対するid<br>
timestamp：ユーザーが新規登録してから、該当イベント完了までにかかる時間<br>
user_id：ユーザーのid<br>
content_id：コンテンツのid<br>
content_type_id：イベントが問題が投げていれば0、ユーザーがそのイベントを見ていれば1<br>
task_container_id：問題または講義のバッチのid<br>
user_answer：問題に対するユーザの解答した数字（講義の場合は、-1をnullとする）<br>
answered_correctly：ユーザが正しく応答したもの（講義の場合は、-1をnullとする）<br>
prior_question_elapsed_time：前の問題の回答にかかった時間<br>
prior_question_had_explanation：前の問題の解説を見たかどうか

In [4]:
train_df['prior_question_had_explanation'].fillna(False, inplace=True)

print(train_df.isnull().sum())
train_df.head()

user_id                              0
content_id                           0
content_type_id                      0
answered_correctly                   0
prior_question_elapsed_time       3816
prior_question_had_explanation       0
dtype: int64


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,115,5692,0,1,,False
1,115,5716,0,1,36992.0,False
2,115,128,0,1,55008.0,False
3,115,7860,0,1,19008.0,False
4,115,7922,0,1,11000.0,False


In [5]:
train_df['lag'] = train_df.groupby('user_id')[target].shift()

print(train_df.shape)
train_df.head()

(980093, 7)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lag
0,115,5692,0,1,,False,
1,115,5716,0,1,36992.0,False,1.0
2,115,128,0,1,55008.0,False,1.0
3,115,7860,0,1,19008.0,False,1.0
4,115,7922,0,1,11000.0,False,1.0


In [6]:
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])

print(cum.shape)
cum.head()

(980093, 2)


Unnamed: 0,cumsum,cumcount
0,,0
1,1.0,1
2,2.0,2
3,3.0,3
4,4.0,4


In [7]:
train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_df.drop(columns=['lag'], inplace=True)

print(train_df.isnull().sum())
print(train_df.shape)
train_df.head()

user_id                              0
content_id                           0
content_type_id                      0
answered_correctly                   0
prior_question_elapsed_time       3816
prior_question_had_explanation       0
user_correctness                  3824
dtype: int64
(980093, 7)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness
0,115,5692,0,1,,False,
1,115,5716,0,1,36992.0,False,1.0
2,115,128,0,1,55008.0,False,1.0
3,115,7860,0,1,19008.0,False,1.0
4,115,7922,0,1,11000.0,False,1.0


In [8]:
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv', usecols=[0, 3],
                           dtype={'question_id': 'int16', 'part': 'int8'})

questions_df.columns = ['question_id', 'question_part']

print(questions_df.shape)
questions_df.head(10)

(13523, 2)


Unnamed: 0,question_id,question_part
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


question_id：コンテンツが問題ならば0、train・testデータの外部キー、小問番号<br>
bundle_id：問題が一緒に出される番号、大問番号<br>
correct_answer：問題へ解答した番号、train.csvの「user_answer」と照らし合わせて、正解したかどうかが分かるもの<br>
part：TOEICにおける問題の各セクション<br>
tags：質問をまとめるタグ

In [9]:
train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left')
train_df.drop(columns=['question_id'], inplace=True)

print(train_df.isnull().sum())
print(train_df.shape)
train_df.head()

user_id                              0
content_id                           0
content_type_id                      0
answered_correctly                   0
prior_question_elapsed_time       3816
prior_question_had_explanation       0
user_correctness                  3824
question_part                        0
dtype: int64
(980093, 8)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness,question_part
0,115,5692,0,1,,False,,5
1,115,5716,0,1,36992.0,False,1.0,5
2,115,128,0,1,55008.0,False,1.0,1
3,115,7860,0,1,19008.0,False,1.0,1
4,115,7922,0,1,11000.0,False,1.0,1


In [10]:
lectures_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/lectures.csv")

print(lectures_df.dtypes)
print(lectures_df.shape)
lectures_df.head(10)

lecture_id     int64
tag            int64
part           int64
type_of       object
dtype: object
(418, 4)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question
5,335,114,2,concept
6,484,179,5,concept
7,641,134,6,solving question
8,761,93,1,concept
9,814,80,5,solving question


lecture_id：コンテンツタイプが講義なら1、train・testデータにおけるcontent_idの外部キー<br>
tag：講義をまとめるタグ<br>
part：講義のカテゴリーコード（識別するもの）<br>
type_of：講義の目的の説明

In [11]:
lectures_df['type_of'] = lectures_df['type_of'].replace('solving question', 'solving_question')
lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])

part_lectures_columns = [column for column in lectures_df.columns if column.startswith('part')]
types_of_lectures_columns = [column for column in lectures_df.columns if column.startswith('type_of_')]

print(part_lectures_columns)
print(types_of_lectures_columns)
lectures_df.head()

['part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7']
['type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter']


Unnamed: 0,lecture_id,tag,part_1,part_2,part_3,part_4,part_5,part_6,part_7,type_of_concept,type_of_intention,type_of_solving_question,type_of_starter
0,89,159,0,0,0,0,1,0,0,1,0,0,0
1,100,70,1,0,0,0,0,0,0,1,0,0,0
2,185,45,0,0,0,0,0,1,0,1,0,0,0
3,192,79,0,0,0,0,1,0,0,0,0,1,0
4,317,156,0,0,0,0,1,0,0,0,0,1,0


下記のように2分割するまでに、train_df全体としての特徴量を増やしておきたい。

In [12]:
features_df = train_df.iloc[:int(9/10 * len(train_df))]
train_df = train_df.iloc[int(9/10 * len(train_df)):]

print(features_df.shape)
print(train_df.shape)

(882083, 8)
(98010, 8)


In [13]:
grouped_by_user_df = features_df.groupby('user_id')

user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

user_answers_df.columns = ['mean_user_accuracy', 
                           'questions_answered', 
                           'std_user_accuracy', 
                           'median_user_accuracy', 
                           'skew_user_accuracy']

print(user_answers_df.dtypes)
print(user_answers_df.shape)
user_answers_df.head()

mean_user_accuracy      float64
questions_answered        int64
std_user_accuracy       float64
median_user_accuracy    float64
skew_user_accuracy      float64
dtype: object
(3458, 5)


Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619


In [14]:
grouped_by_content_df = features_df.groupby('content_id')

content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

content_answers_df.columns = ['mean_accuracy', 
                              'question_asked', 
                              'std_accuracy', 
                              'median_accuracy', 
                              'skew_accuracy']

print(content_answers_df.dtypes)
print(content_answers_df.shape)
content_answers_df.head()

mean_accuracy      float64
question_asked       int64
std_accuracy       float64
median_accuracy    float64
skew_accuracy      float64
dtype: object
(12986, 5)


Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.857143,70,0.352454,1.0,-2.086215
1,0.918367,49,0.276642,1.0,-3.153323
2,0.555276,398,0.497561,1.0,-0.223312
3,0.784091,176,0.412625,1.0,-1.392819
4,0.60076,263,0.490676,1.0,-0.413848


In [15]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [16]:
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
content_type_id                      int8
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
user_correctness                  float64
question_part                        int8
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
(98010, 18)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness,question_part,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,18789336,7796,0,0,1400.0,True,0.310865,7,0.310865,2255.0,0.46295,0.0,0.817812,0.962963,54.0,0.190626,1.0,-5.044113
1,18789336,7798,0,0,1400.0,True,0.310727,7,0.310865,2255.0,0.46295,0.0,0.817812,0.87037,54.0,0.33905,1.0,-2.268786
2,18789336,2908,0,0,1600.0,True,0.310589,4,0.310865,2255.0,0.46295,0.0,0.817812,0.676768,99.0,0.470091,1.0,-0.767564
3,18789336,2909,0,0,1600.0,True,0.310452,4,0.310865,2255.0,0.46295,0.0,0.817812,0.616162,99.0,0.488794,1.0,-0.485099
4,18789336,2907,0,0,1600.0,True,0.310314,4,0.310865,2255.0,0.46295,0.0,0.817812,0.454545,99.0,0.500464,0.0,0.185395


In [17]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'user_correctness',
    'skew_accuracy'
]

target = 'answered_correctly'

print(train_df.isnull().sum())

user_id                               0
content_id                            0
content_type_id                       0
answered_correctly                    0
prior_question_elapsed_time         366
prior_question_had_explanation        0
user_correctness                    366
question_part                         0
mean_user_accuracy                97881
questions_answered                97881
std_user_accuracy                 97881
median_user_accuracy              97881
skew_user_accuracy                97881
mean_accuracy                       100
question_asked                      100
std_accuracy                        231
median_accuracy                     100
skew_accuracy                       325
dtype: int64


In [18]:
train_df = train_df.fillna(value=0.5)
train_df.isnull().sum()

user_id                           0
content_id                        0
content_type_id                   0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
user_correctness                  0
question_part                     0
mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
skew_accuracy                     0
dtype: int64

In [19]:
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('int')

# Modeling

In [20]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

print(train_df.dtypes)
print(train_df.shape)
print(test_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
content_type_id                      int8
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation      int64
user_correctness                  float64
question_part                        int8
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
(78408, 18)
(19602, 18)


Unnamed: 0,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness,question_part,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
1273,18802864,10940,0,1,26336.0,1,0.470588,6,0.5,0.5,0.5,0.5,0.5,0.833333,42.0,0.377195,1.0,-1.855802
5034,18824449,350,0,1,16992.0,1,0.666667,2,0.5,0.5,0.5,0.5,0.5,0.733813,139.0,0.443562,1.0,-1.069643
37526,19492810,8742,0,1,15000.0,1,0.689655,5,0.5,0.5,0.5,0.5,0.5,0.763636,55.0,0.428764,1.0,-1.276157
29573,19444654,5810,0,0,7000.0,1,0.57307,5,0.5,0.5,0.5,0.5,0.5,0.428571,21.0,0.507093,0.0,0.311373
31622,19489973,8460,0,0,20992.0,1,0.667297,5,0.5,0.5,0.5,0.5,0.5,0.710227,176.0,0.454951,1.0,-0.934798


In [21]:
def create_model(trial):
    booster = 'gbtree'
    n_estimators = trial.suggest_int("n_estimators", 50, 3000)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_weight = trial.suggest_int('min_child_weight', 0, 2)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    colsample_bytree = 1,
    subsample = 1
    
    model = xgb.XGBClassifier(
        booster = 'gbtree',
        n_estimators = n_estimators, 
        max_depth=max_depth, 
        min_child_weight = min_child_weight, 
        learning_rate=learning_rate,
        colsample_bytree = 1,
        subsample = 1,
        random_state=666
    )
    return model

In [22]:
def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(
        test_df[target].values, 
        model.predict_proba(test_df[features])[:,1]
    )
    return score

In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)
params = study.best_params

print(params)

[32m[I 2020-12-28 00:16:26,336][0m A new study created in memory with name: no-name-bc8b5c10-dec5-4537-ab1f-7e1721fc47ad[0m
[32m[I 2020-12-28 00:17:28,557][0m Trial 0 finished with value: 0.6710996080856886 and parameters: {'n_estimators': 1452, 'max_depth': 6, 'min_child_weight': 0, 'learning_rate': 0.5175211647825423}. Best is trial 0 with value: 0.6710996080856886.[0m
[32m[I 2020-12-28 00:19:06,148][0m Trial 1 finished with value: 0.6620231065894209 and parameters: {'n_estimators': 1701, 'max_depth': 8, 'min_child_weight': 0, 'learning_rate': 0.8069406779931929}. Best is trial 0 with value: 0.6710996080856886.[0m
[32m[I 2020-12-28 00:19:18,797][0m Trial 2 finished with value: 0.6693821081926203 and parameters: {'n_estimators': 229, 'max_depth': 7, 'min_child_weight': 1, 'learning_rate': 0.9171530145578296}. Best is trial 0 with value: 0.6710996080856886.[0m
[32m[I 2020-12-28 00:20:19,768][0m Trial 3 finished with value: 0.664370361240579 and parameters: {'n_estimators'

{'n_estimators': 806, 'max_depth': 4, 'min_child_weight': 0, 'learning_rate': 0.022672908469859535}


In [24]:
%%time

model = xgb.XGBClassifier(**params)

model.fit(train_df[features], train_df[target])
print('XGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

XGB score:  0.7366344750691804
CPU times: user 1min 35s, sys: 862 ms, total: 1min 36s
Wall time: 25.2 s


# Submit

In [25]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.dtypes)
print(test_df.shape)
test_df.head()

row_id                              int64
group_num                           int64
timestamp                           int64
user_id                             int64
content_id                          int64
content_type_id                     int64
task_container_id                   int64
prior_question_elapsed_time       float64
prior_question_had_explanation     object
prior_group_answers_correct        object
prior_group_responses              object
dtype: object
(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [26]:
for (test_df, sample_prediction_df) in iter_test:
    test_df['prior_question_had_explanation'].fillna(False, inplace=True)
    test_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype('int')
    test_df.fillna(value = 0.5, inplace = True)

    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])