In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py


In [2]:
import optuna
import lightgbm as lgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import riiideducation

# Data

In [3]:
%%time

used_data_types_dict = {
    'row_id': 'int32',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       usecols = used_data_types_dict.keys(),
                       dtype=used_data_types_dict,
                       index_col = 0,
                       nrows=10**7)

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

  mask |= (ar1 == a)


timestamp                           int64
user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(10000000, 6)
CPU times: user 23.7 s, sys: 638 ms, total: 24.3 s
Wall time: 29.1 s


Unnamed: 0_level_0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,115,5692,1,,
1,56943,115,5716,1,36992.0,False
2,118363,115,128,1,55008.0,False
3,131167,115,7860,1,19008.0,False
4,137965,115,7922,1,11000.0,False


In [4]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.shape)
test_df.head()

(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [5]:
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

print(questions.shape)
questions.head()

(13523, 5)


Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [6]:
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

print(lectures.shape)
lectures.head()

(418, 4)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


# Feature Engineering

### user_df

In [7]:
user_df = train_df[train_df.answered_correctly != -1].groupby('user_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()

user_df.columns = ['user_id', 'user_questions', 'user_mean']

print(user_df.shape)
user_df.head()

(39491, 3)


Unnamed: 0,user_id,user_questions,user_mean
0,115,46,0.695652
1,124,30,0.233333
2,2746,19,0.578947
3,5382,125,0.672
4,8623,109,0.642202


In [8]:
user_lect = train_df.groupby(["user_id", "answered_correctly"]).size().unstack()

# columns:-1, 0, 1
user_lect.columns = ['Lecture', 'Wrong', 'Right']

print(user_lect.shape)
user_lect.head()

(39491, 3)


Unnamed: 0_level_0,Lecture,Wrong,Right
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115,,14.0,32.0
124,,23.0,7.0
2746,1.0,8.0,11.0
5382,3.0,41.0,84.0
8623,3.0,39.0,70.0


In [9]:
user_lect['Lecture'] = user_lect['Lecture'].fillna(0)
user_lect = user_lect.astype('Int64')
user_lect['watches_lecture'] = np.where(user_lect.Lecture > 0, 1, 0)
user_lect = user_lect.reset_index()
user_lect = user_lect[['user_id', 'watches_lecture']]

print(user_lect.shape)
user_lect.head()

(39491, 2)


Unnamed: 0,user_id,watches_lecture
0,115,0
1,124,0
2,2746,1
3,5382,1
4,8623,1


In [10]:
user_df = user_df.merge(user_lect, on = "user_id", how = "left")
del user_lect

print(user_df.shape)
user_df.head()

(39491, 4)


Unnamed: 0,user_id,user_questions,user_mean,watches_lecture
0,115,46,0.695652,0
1,124,30,0.233333,0
2,2746,19,0.578947,1
3,5382,125,0.672,1
4,8623,109,0.642202,1


### content_df

In [11]:
content_df = train_df[train_df.answered_correctly != -1].groupby('content_id') \
.agg({'answered_correctly': ['count', 'mean']}).reset_index()

content_df.columns = ['content_id', 'content_questions', 'content_mean']

print(content_df.shape)
content_df.head()

(13500, 3)


Unnamed: 0,content_id,content_questions,content_mean
0,0,691,0.908828
1,1,726,0.893939
2,2,4501,0.562319
3,3,2276,0.780316
4,4,3153,0.627973


### train_df

In [12]:
train_df = train_df.merge(user_df, on = "user_id", how = "left")
train_df = train_df.merge(content_df, on = "content_id", how = "left")

print(train_df.isnull().sum())
print(train_df.shape)
train_df.head()

timestamp                              0
user_id                                0
content_id                             0
answered_correctly                     0
prior_question_elapsed_time       234784
prior_question_had_explanation     39388
user_questions                         0
user_mean                              0
watches_lecture                        0
content_questions                 118946
content_mean                      118946
dtype: int64
(10000000, 11)


Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_questions,user_mean,watches_lecture,content_questions,content_mean
0,0,115,5692,1,,,46,0.695652,0,3548.0,0.741545
1,56943,115,5716,1,36992.0,False,46,0.695652,0,2376.0,0.73569
2,118363,115,128,1,55008.0,False,46,0.695652,0,1914.0,0.973877
3,131167,115,7860,1,19008.0,False,46,0.695652,0,2137.0,0.955545
4,137965,115,7922,1,11000.0,False,46,0.695652,0,1878.0,0.952609


In [13]:
features_df = train_df.iloc[:int(9/10 * len(train_df))]
train_df = train_df.iloc[int(9/10 * len(train_df)):]

In [14]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]

grouped_by_user_df = train_questions_only_df.groupby('user_id')

user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

user_answers_df.columns = ['mean_user_accuracy', 
                           'questions_answered', 
                           'std_user_accuracy', 
                           'median_user_accuracy', 
                           'skew_user_accuracy']

print(user_answers_df.dtypes)
print(user_answers_df.shape)
user_answers_df.head()

mean_user_accuracy      float64
questions_answered        int64
std_user_accuracy       float64
median_user_accuracy    float64
skew_user_accuracy      float64
dtype: object
(35683, 5)


Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619


In [15]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count','std','median','skew']}).copy()

content_answers_df.columns = ['mean_accuracy', 
                              'question_asked', 
                              'std_accuracy', 
                              'median_accuracy', 
                              'skew_accuracy']

print(content_answers_df.dtypes)
print(content_answers_df.shape)
content_answers_df.head()

mean_accuracy      float64
question_asked       int64
std_accuracy       float64
median_accuracy    float64
skew_accuracy      float64
dtype: object
(13497, 5)


Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.901274,628,0.298532,1.0,-2.696906
1,0.889571,652,0.313665,1.0,-2.491635
2,0.558317,4064,0.496649,1.0,-0.234957
3,0.77957,2046,0.414638,1.0,-1.34982
4,0.627244,2841,0.483623,1.0,-0.526582


In [16]:
del(features_df)

train_df = train_df[train_df.answered_correctly != -1]

train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

print(train_df.dtypes)
print(train_df.shape)

timestamp                           int64
user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
user_questions                      int64
user_mean                         float64
watches_lecture                     int64
content_questions                 float64
content_mean                      float64
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
(981094, 21)


In [17]:
print(train_df.isnull().sum())
train_df.head()

timestamp                              0
user_id                                0
content_id                             0
answered_correctly                     0
prior_question_elapsed_time         3803
prior_question_had_explanation      3803
user_questions                         0
user_mean                              0
watches_lecture                        0
content_questions                      0
content_mean                           0
mean_user_accuracy                981019
questions_answered                981019
std_user_accuracy                 981019
median_user_accuracy              981019
skew_user_accuracy                981019
mean_accuracy                          3
question_asked                         3
std_accuracy                          14
median_accuracy                        3
skew_accuracy                         19
dtype: int64


Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_questions,user_mean,watches_lecture,content_questions,...,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,9871729962,196122333,3992,1,19008.0,True,348,0.623563,1,431.0,...,0.611722,273.0,0.488254,1.0,-0.461015,0.867188,384.0,0.339815,1.0,-2.172419
1,9871801320,196122333,9510,0,15000.0,True,348,0.623563,1,475.0,...,0.611722,273.0,0.488254,1.0,-0.461015,0.5338,429.0,0.499439,1.0,-0.135984
2,9871852731,196122333,5214,1,13000.0,True,348,0.623563,1,1151.0,...,0.611722,273.0,0.488254,1.0,-0.461015,0.715385,1040.0,0.451448,1.0,-0.956033
3,9872436636,196122333,5334,0,16000.0,True,348,0.623563,1,2512.0,...,0.611722,273.0,0.488254,1.0,-0.461015,0.674593,2271.0,0.46863,1.0,-0.745776
4,9872617214,196122333,3722,1,16000.0,True,348,0.623563,1,1000.0,...,0.611722,273.0,0.488254,1.0,-0.461015,0.719101,890.0,0.449691,1.0,-0.976647


In [18]:
mean_prior = train_df.prior_question_elapsed_time.mean()
label_enc = preprocessing.LabelEncoder()

train_df['content_questions'] = train_df['content_questions'].fillna(0)
train_df['content_mean'] = train_df['content_mean'].fillna(0.5)
train_df['user_questions'] = train_df['user_questions'].fillna(0)
train_df['user_mean'] = train_df['user_mean'].fillna(0.5)
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(mean_prior)
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
train_df['prior_question_had_explanation'] = label_enc.fit_transform(train_df['prior_question_had_explanation'])
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype(int)
train_df[['content_questions', 'user_questions']] = train_df[['content_questions', 'user_questions']].astype(int)
train_df = train_df.fillna(0.5)

print(train_df.dtypes)
print(train_df.isnull().sum())
train_df.head()

timestamp                           int64
user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation      int64
user_questions                      int64
user_mean                         float64
watches_lecture                     int64
content_questions                   int64
content_mean                      float64
mean_user_accuracy                float64
questions_answered                float64
std_user_accuracy                 float64
median_user_accuracy              float64
skew_user_accuracy                float64
mean_accuracy                     float64
question_asked                    float64
std_accuracy                      float64
median_accuracy                   float64
skew_accuracy                     float64
dtype: object
timestamp                         0
user_id                           0
content_id                      

Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_questions,user_mean,watches_lecture,content_questions,...,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,9871729962,196122333,3992,1,19008.0,1,348,0.623563,1,431,...,0.611722,273.0,0.488254,1.0,-0.461015,0.867188,384.0,0.339815,1.0,-2.172419
1,9871801320,196122333,9510,0,15000.0,1,348,0.623563,1,475,...,0.611722,273.0,0.488254,1.0,-0.461015,0.5338,429.0,0.499439,1.0,-0.135984
2,9871852731,196122333,5214,1,13000.0,1,348,0.623563,1,1151,...,0.611722,273.0,0.488254,1.0,-0.461015,0.715385,1040.0,0.451448,1.0,-0.956033
3,9872436636,196122333,5334,0,16000.0,1,348,0.623563,1,2512,...,0.611722,273.0,0.488254,1.0,-0.461015,0.674593,2271.0,0.46863,1.0,-0.745776
4,9872617214,196122333,3722,1,16000.0,1,348,0.623563,1,1000,...,0.611722,273.0,0.488254,1.0,-0.461015,0.719101,890.0,0.449691,1.0,-0.976647


In [19]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

print(train_df.shape)
print(test_df.shape)

(784875, 21)
(196219, 21)


In [20]:
features = [col for col in train_df.columns if col != 'answered_correctly']
target = 'answered_correctly'

print(features)
print(target)

['timestamp', 'user_id', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'user_questions', 'user_mean', 'watches_lecture', 'content_questions', 'content_mean', 'mean_user_accuracy', 'questions_answered', 'std_user_accuracy', 'median_user_accuracy', 'skew_user_accuracy', 'mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy']
answered_correctly


In [21]:
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    
    model = lgb.LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=666
    )
    return model
    
def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(
        test_df[target].values, 
        model.predict_proba(test_df[features])[:,1]
    )
    return score
    
    
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)
params = study.best_params

print(params)

[32m[I 2021-01-04 12:27:32,049][0m A new study created in memory with name: no-name-a6f28c8c-0cd5-4357-a6b9-992d0bd7ea6c[0m
[32m[I 2021-01-04 12:27:41,230][0m Trial 0 finished with value: 0.7669094005932638 and parameters: {'num_leaves': 24, 'n_estimators': 299, 'max_depth': 6, 'min_child_samples': 406, 'learning_rate': 0.5090017194568264, 'min_data_in_leaf': 42, 'bagging_fraction': 0.6774112484542419, 'feature_fraction': 0.16837817348192277}. Best is trial 0 with value: 0.7669094005932638.[0m
[32m[I 2021-01-04 12:27:48,606][0m Trial 1 finished with value: 0.7630853001088651 and parameters: {'num_leaves': 8, 'n_estimators': 279, 'max_depth': 7, 'min_child_samples': 690, 'learning_rate': 0.7824251865205841, 'min_data_in_leaf': 46, 'bagging_fraction': 0.10526737818187129, 'feature_fraction': 0.10710174654685117}. Best is trial 0 with value: 0.7669094005932638.[0m
[32m[I 2021-01-04 12:27:52,249][0m Trial 2 finished with value: 0.7581054566840624 and parameters: {'num_leaves': 3

{'num_leaves': 23, 'n_estimators': 271, 'max_depth': 6, 'min_child_samples': 303, 'learning_rate': 0.26914653564625524, 'min_data_in_leaf': 45, 'bagging_fraction': 0.9999980347158899, 'feature_fraction': 0.6753534078777056}


In [22]:
'''
params = {
'num_leaves': 13,
'n_estimators': 267,
'max_depth': 5,
'min_child_samples': 108,
'learning_rate': 0.1432743218178815,
'min_data_in_leaf': 34,
'bagging_fraction': 0.9288184187681869,
'feature_fraction': 0.7423767848466962}

'''

# score: 0.8008159759884459

"\nparams = {\n'num_leaves': 13,\n'n_estimators': 267,\n'max_depth': 5,\n'min_child_samples': 108,\n'learning_rate': 0.1432743218178815,\n'min_data_in_leaf': 34,\n'bagging_fraction': 0.9288184187681869,\n'feature_fraction': 0.7423767848466962}\n\n"

In [23]:
model = lgb.LGBMClassifier(**params)

model.fit(train_df[features], train_df[target])
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

LGB score:  0.7703803124320775


# Submit

In [24]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.dtypes)
print(test_df.shape)
test_df.head()

row_id                              int64
group_num                           int64
timestamp                           int64
user_id                             int64
content_id                          int64
content_type_id                     int64
task_container_id                   int64
prior_question_elapsed_time       float64
prior_question_had_explanation     object
prior_group_answers_correct        object
prior_group_responses              object
dtype: object
(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [25]:
env = riiideducation.make_env()
iter_test = env.iter_test()

In [26]:
for (test_df, sample_prediction_df) in iter_test:
    
    test_df = test_df.merge(user_df, on = "user_id", how = "left")
    test_df = test_df.merge(content_df, on = "content_id", how = "left")
    
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    
    mean_prior = test_df.prior_question_elapsed_time.mean()
    
    test_df['content_questions'] = test_df['content_questions'].fillna(0)
    test_df['content_mean'] = test_df['content_mean'].fillna(0.5)
    test_df['user_questions']= test_df['user_questions'].fillna(0)
    test_df['user_mean'] = test_df['user_mean'].fillna(0.5)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(mean_prior)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)
    test_df['prior_question_had_explanation'] = label_enc.fit_transform(test_df['prior_question_had_explanation'])
    test_df[['content_questions', 'user_questions']] = test_df[['content_questions', 'user_questions']].astype(int)
    test_df = test_df.fillna(0.5)
    
    # target
    test_df[target] =  model.predict(test_df[features])
    
    # submit_data
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', target]])

In [27]:
submission_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv")

print(submission_df.dtypes)
print(submission_df.shape)
submission_df.head()

row_id                  int64
answered_correctly    float64
group_num               int64
dtype: object
(104, 3)


Unnamed: 0,row_id,answered_correctly,group_num
0,0,0.5,0
1,1,0.5,0
2,2,0.5,0
3,3,0.5,0
4,4,0.5,0
