In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py


In [2]:
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [3]:
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

# Data

In [4]:
used_data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       usecols = used_data_types_dict.keys(),
                       dtype=used_data_types_dict,
                       index_col = 0,
                       nrows=10**6)

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(1000000, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,115,5692,1,,
56943,115,5716,1,36992.0,False
118363,115,128,1,55008.0,False
131167,115,7860,1,19008.0,False
137965,115,7922,1,11000.0,False


In [5]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.shape)
test_df.head()

(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [6]:
submission_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv")

print(submission_df.shape)
submission_df.head()

(104, 3)


Unnamed: 0,row_id,answered_correctly,group_num
0,0,0.5,0
1,1,0.5,0
2,2,0.5,0
3,3,0.5,0
4,4,0.5,0


# Feature Engineering

In [7]:
features_df = train_df.iloc[:int(9/10 * len(train_df))]
train_df = train_df.iloc[int(9/10 * len(train_df)):]

In [8]:
print(features_df.dtypes)
print(features_df.shape)
features_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(900000, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,115,5692,1,,
56943,115,5716,1,36992.0,False
118363,115,128,1,55008.0,False
131167,115,7860,1,19008.0,False
137965,115,7922,1,11000.0,False


In [9]:
print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(100000, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23452298654,18789336,8144,0,4248.0,True
23452298654,18789336,8146,0,4248.0,True
23452320119,18789336,10166,0,2000.0,True
23452320119,18789336,10165,0,2000.0,True
23452320119,18789336,10167,1,2000.0,True


In [10]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]

grouped_by_user_df = train_questions_only_df.groupby('user_id')

user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

user_answers_df.columns = ['mean_user_accuracy', 
                           'questions_answered', 
                           'std_user_accuracy', 
                           'median_user_accuracy', 
                           'skew_user_accuracy']

print(user_answers_df.dtypes)
print(user_answers_df.shape)
user_answers_df.head()

mean_user_accuracy      float64
questions_answered        int64
std_user_accuracy       float64
median_user_accuracy    float64
skew_user_accuracy      float64
dtype: object
(3458, 5)


Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619


In [11]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

content_answers_df.columns = ['mean_accuracy', 
                              'question_asked', 
                              'std_accuracy', 
                              'median_accuracy', 
                              'skew_accuracy']

print(content_answers_df.dtypes)
print(content_answers_df.shape)
content_answers_df.head()

mean_accuracy      float64
question_asked       int64
std_accuracy       float64
median_accuracy    float64
skew_accuracy      float64
dtype: object
(12986, 5)


Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.857143,70,0.352454,1.0,-2.086215
1,0.918367,49,0.276642,1.0,-3.153323
2,0.555276,398,0.497561,1.0,-0.223312
3,0.784091,176,0.412625,1.0,-1.392819
4,0.60076,263,0.490676,1.0,-0.413848


In [12]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [13]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'skew_accuracy'
]

target = 'answered_correctly'

In [14]:
train_df = train_df[train_df[target] != -1]

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(97953, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23452298654,18789336,8144,0,4248.0,True
23452298654,18789336,8146,0,4248.0,True
23452320119,18789336,10166,0,2000.0,True
23452320119,18789336,10165,0,2000.0,True
23452320119,18789336,10167,1,2000.0,True


In [15]:
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
(97953, 15)


Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,18789336,8144,0,4248.0,True,0.309689,2312.0,0.462465,0.0,0.823743,0.454545,33.0,0.50565,0.0,0.191386
1,18789336,8146,0,4248.0,True,0.309689,2312.0,0.462465,0.0,0.823743,0.69697,33.0,0.466694,1.0,-0.898565
2,18789336,10166,0,2000.0,True,0.309689,2312.0,0.462465,0.0,0.823743,0.916667,48.0,0.27931,1.0,-3.11326
3,18789336,10165,0,2000.0,True,0.309689,2312.0,0.462465,0.0,0.823743,0.416667,48.0,0.498224,0.0,0.349066
4,18789336,10167,1,2000.0,True,0.309689,2312.0,0.462465,0.0,0.823743,0.770833,48.0,0.424744,1.0,-1.330724


# Preprocess

In [16]:
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
train_df = train_df.fillna(value=0.5)

print(train_df.isnull().sum())

user_id                           0
content_id                        0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
skew_accuracy                     0
dtype: int64


In [17]:
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0.5)

print(train_df.isnull().sum())

mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
prior_question_elapsed_time       0
prior_question_had_explanation    0
skew_accuracy                     0
answered_correctly                0
dtype: int64


In [18]:
print(train_df.dtypes)
print(train_df.shape)
train_df.head()

mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
prior_question_elapsed_time       float16
prior_question_had_explanation       bool
skew_accuracy                     float64
answered_correctly                   int8
dtype: object
(97953, 13)


Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
0,0.309689,2312.0,0.462465,0.0,0.823743,0.454545,33.0,0.50565,0.0,4248.0,True,0.191386,0
1,0.309689,2312.0,0.462465,0.0,0.823743,0.69697,33.0,0.466694,1.0,4248.0,True,-0.898565,0
2,0.309689,2312.0,0.462465,0.0,0.823743,0.916667,48.0,0.27931,1.0,2000.0,True,-3.11326,0
3,0.309689,2312.0,0.462465,0.0,0.823743,0.416667,48.0,0.498224,0.0,2000.0,True,0.349066,0
4,0.309689,2312.0,0.462465,0.0,0.823743,0.770833,48.0,0.424744,1.0,2000.0,True,-1.330724,1


In [19]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

print(train_df.shape)
print(test_df.shape)
train_df.head()

(78362, 13)
(19591, 13)


Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
20721,0.5,0.5,0.5,0.5,0.5,0.61086,221.0,0.488662,1.0,25328.0,True,-0.457871,1
488,0.5,0.5,0.5,0.5,0.5,0.573427,429.0,0.495157,1.0,15000.0,True,-0.297968,0
54213,0.5,0.5,0.5,0.5,0.5,0.622951,183.0,0.485977,1.0,20000.0,True,-0.511585,0
62049,0.5,0.5,0.5,0.5,0.5,0.71167,437.0,0.453504,1.0,26000.0,True,-0.93778,0
51788,0.5,0.5,0.5,0.5,0.5,0.761905,42.0,0.431081,1.0,27008.0,True,-1.275864,1


In [20]:
def create_model(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    
    model = xgb.XGBClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        learning_rate=learning_rate,
        random_state=666
    )
    return model

# Modeling

In [21]:
def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(
        test_df[target].values, 
        model.predict_proba(test_df[features])[:,1]
    )
    return score

In [22]:
%%time

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)
params = study.best_params

print(params)

[32m[I 2020-12-10 07:13:48,013][0m A new study created in memory with name: no-name-635d2ac8-cb3e-4f01-b276-e7a43591fcb9[0m
[32m[I 2020-12-10 07:13:54,945][0m Trial 0 finished with value: 0.6803988662845539 and parameters: {'n_estimators': 155, 'max_depth': 8, 'learning_rate': 0.5376115839881924}. Best is trial 0 with value: 0.6803988662845539.[0m
[32m[I 2020-12-10 07:14:02,225][0m Trial 1 finished with value: 0.6697670765199295 and parameters: {'n_estimators': 187, 'max_depth': 7, 'learning_rate': 0.7696572327801647}. Best is trial 0 with value: 0.6803988662845539.[0m
[32m[I 2020-12-10 07:14:13,005][0m Trial 2 finished with value: 0.6565687690916998 and parameters: {'n_estimators': 228, 'max_depth': 8, 'learning_rate': 0.7888752014161987}. Best is trial 0 with value: 0.6803988662845539.[0m
[32m[I 2020-12-10 07:14:19,078][0m Trial 3 finished with value: 0.7102468086662047 and parameters: {'n_estimators': 175, 'max_depth': 6, 'learning_rate': 0.17728422832658178}. Best is 

{'n_estimators': 275, 'max_depth': 3, 'learning_rate': 0.036391785550043756}
CPU times: user 24min 14s, sys: 11.7 s, total: 24min 25s
Wall time: 6min 19s


In [23]:
%%time

model = xgb.XGBClassifier(**params)

model.fit(train_df[features], train_df[target])
print('XGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

XGB score:  0.7154813559933868
CPU times: user 20.4 s, sys: 124 ms, total: 20.5 s
Wall time: 5.29 s


# Submit

In [24]:
for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    test_df.fillna(value = 0.5, inplace = True)

    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])