In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py


In [2]:
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [3]:
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

# Data

In [4]:
used_data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       usecols = used_data_types_dict.keys(),
                       dtype=used_data_types_dict,
                       index_col = 0,
                       nrows=70**6)

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

  mask |= (ar1 == a)


user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(101230332, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,115,5692,1,,
56943,115,5716,1,36992.0,False
118363,115,128,1,55008.0,False
131167,115,7860,1,19008.0,False
137965,115,7922,1,11000.0,False


In [5]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.shape)
test_df.head()

(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [6]:
submission_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv")

print(submission_df.shape)
submission_df.head()

(104, 3)


Unnamed: 0,row_id,answered_correctly,group_num
0,0,0.5,0
1,1,0.5,0
2,2,0.5,0
3,3,0.5,0
4,4,0.5,0


# Feature Engineering

In [7]:
features_df = train_df.iloc[:int(9/10 * len(train_df))]
train_df = train_df.iloc[int(9/10 * len(train_df)):]

In [8]:
print(features_df.dtypes)
print(features_df.shape)
features_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(91107298, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,115,5692,1,,
56943,115,5716,1,36992.0,False
118363,115,128,1,55008.0,False
131167,115,7860,1,19008.0,False
137965,115,7922,1,11000.0,False


In [9]:
print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(10123034, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8231964660,1933715875,11259,0,13000.0,True
8232002976,1933715875,4957,1,44000.0,True
8232096407,1933715875,5113,1,22000.0,True
8232119872,1933715875,4699,1,inf,True
8232142930,1933715875,11430,1,9000.0,True


In [10]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]

grouped_by_user_df = train_questions_only_df.groupby('user_id')

user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

user_answers_df.columns = ['mean_user_accuracy', 
                           'questions_answered', 
                           'std_user_accuracy', 
                           'median_user_accuracy', 
                           'skew_user_accuracy']

print(user_answers_df.dtypes)
print(user_answers_df.shape)
user_answers_df.head()

mean_user_accuracy      float64
questions_answered        int64
std_user_accuracy       float64
median_user_accuracy    float64
skew_user_accuracy      float64
dtype: object
(354308, 5)


Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619


In [11]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

content_answers_df.columns = ['mean_accuracy', 
                              'question_asked', 
                              'std_accuracy', 
                              'median_accuracy', 
                              'skew_accuracy']

print(content_answers_df.dtypes)
print(content_answers_df.shape)
content_answers_df.head()

mean_accuracy      float64
question_asked       int64
std_accuracy       float64
median_accuracy    float64
skew_accuracy      float64
dtype: object
(13519, 5)


Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.908595,6236,0.288207,1.0,-2.836339
1,0.891682,6684,0.310805,1.0,-2.521185
2,0.554656,40499,0.49701,1.0,-0.219949
3,0.779348,20734,0.414696,1.0,-1.347371
4,0.613226,28549,0.48702,1.0,-0.465009


In [12]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [13]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'skew_accuracy'
]

target = 'answered_correctly'

In [14]:
train_df = train_df[train_df[target] != -1]

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(9926951, 5)


Unnamed: 0_level_0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8231964660,1933715875,11259,0,13000.0,True
8232002976,1933715875,4957,1,44000.0,True
8232096407,1933715875,5113,1,22000.0,True
8232119872,1933715875,4699,1,inf,True
8232142930,1933715875,11430,1,9000.0,True


In [15]:
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
(9926951, 15)


Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,1933715875,11259,0,13000.0,True,0.779843,5219.0,0.414392,1.0,-1.351136,0.532146,1291.0,0.499159,1.0,-0.128999
1,1933715875,4957,1,44000.0,True,0.779843,5219.0,0.414392,1.0,-1.351136,0.584772,2548.0,0.492858,1.0,-0.344273
2,1933715875,5113,1,22000.0,True,0.779843,5219.0,0.414392,1.0,-1.351136,0.603571,1960.0,0.48928,1.0,-0.423795
3,1933715875,4699,1,inf,True,0.779843,5219.0,0.414392,1.0,-1.351136,0.694888,2504.0,0.460547,1.0,-0.847011
4,1933715875,11430,1,9000.0,True,0.779843,5219.0,0.414392,1.0,-1.351136,0.765869,1922.0,0.423565,1.0,-1.256695


# Preprocess

In [16]:
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
train_df = train_df.fillna(value=0.5)

print(train_df.isnull().sum())

user_id                           0
content_id                        0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
skew_accuracy                     0
dtype: int64


In [17]:
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0.5)

print(train_df.isnull().sum())

mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
prior_question_elapsed_time       0
prior_question_had_explanation    0
skew_accuracy                     0
answered_correctly                0
dtype: int64


In [18]:
print(train_df.dtypes)
print(train_df.shape)
train_df.head()

mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
prior_question_elapsed_time       float16
prior_question_had_explanation       bool
skew_accuracy                     float64
answered_correctly                   int8
dtype: object
(9926951, 13)


Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
0,0.779843,5219.0,0.414392,1.0,-1.351136,0.532146,1291.0,0.499159,1.0,13000.0,True,-0.128999,0
1,0.779843,5219.0,0.414392,1.0,-1.351136,0.584772,2548.0,0.492858,1.0,44000.0,True,-0.344273,1
2,0.779843,5219.0,0.414392,1.0,-1.351136,0.603571,1960.0,0.48928,1.0,22000.0,True,-0.423795,1
3,0.779843,5219.0,0.414392,1.0,-1.351136,0.694888,2504.0,0.460547,1.0,0.5,True,-0.847011,1
4,0.779843,5219.0,0.414392,1.0,-1.351136,0.765869,1922.0,0.423565,1.0,9000.0,True,-1.256695,1


In [19]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

print(train_df.shape)
print(test_df.shape)
train_df.head()

(7941560, 13)
(1985391, 13)


Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
7195423,0.5,0.5,0.5,0.5,0.5,0.751375,28179.0,0.432224,1.0,19008.0,True,-1.163255,0
507380,0.5,0.5,0.5,0.5,0.5,0.362581,5687.0,0.480788,0.0,25744.0,True,0.571841,0
3855082,0.5,0.5,0.5,0.5,0.5,0.492388,58326.0,0.499946,0.0,46016.0,True,0.030454,0
742455,0.5,0.5,0.5,0.5,0.5,0.691253,9626.0,0.462001,1.0,44000.0,True,-0.828105,0
7013908,0.5,0.5,0.5,0.5,0.5,0.661801,14728.0,0.473113,1.0,26000.0,True,-0.684076,1


In [20]:
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    
    model = lgb.LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=666
    )
    return model

# Modeling

In [21]:
def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(
        test_df[target].values, 
        model.predict_proba(test_df[features])[:,1]
    )
    return score

In [22]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)
params = study.best_params

print(params)

[32m[I 2020-12-08 23:03:33,254][0m A new study created in memory with name: no-name-b1b1facc-6815-44d1-a7d8-b771ead65e12[0m
[32m[I 2020-12-08 23:04:28,672][0m Trial 0 finished with value: 0.7198165891469471 and parameters: {'num_leaves': 2, 'n_estimators': 136, 'max_depth': 4, 'min_child_samples': 336, 'learning_rate': 0.7891960380158618, 'min_data_in_leaf': 7, 'bagging_fraction': 0.6061515990034877, 'feature_fraction': 0.700437785011895}. Best is trial 0 with value: 0.7198165891469471.[0m
[32m[I 2020-12-08 23:05:33,992][0m Trial 1 finished with value: 0.7212493862979616 and parameters: {'num_leaves': 7, 'n_estimators': 162, 'max_depth': 3, 'min_child_samples': 963, 'learning_rate': 0.7843658599232078, 'min_data_in_leaf': 49, 'bagging_fraction': 0.5269505656716831, 'feature_fraction': 0.2230621733725758}. Best is trial 1 with value: 0.7212493862979616.[0m
[32m[I 2020-12-08 23:07:23,992][0m Trial 2 finished with value: 0.720813860892448 and parameters: {'num_leaves': 23, 'n_e

{'num_leaves': 24, 'n_estimators': 275, 'max_depth': 5, 'min_child_samples': 1021, 'learning_rate': 0.2917353070190297, 'min_data_in_leaf': 54, 'bagging_fraction': 0.672031123804561, 'feature_fraction': 0.8873705167954823}


In [23]:
model = lgb.LGBMClassifier(**params)

model.fit(train_df[features], train_df[target])
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

LGB score:  0.7218414484295962


# Submit

In [24]:
for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    test_df.fillna(value = 0.5, inplace = True)

    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])