In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py


In [2]:
import optuna
import lightgbm as lgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

# Data

In [3]:
%%time

used_data_types_dict = {
    'row_id': 'int32',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       usecols = used_data_types_dict.keys(),
                       dtype=used_data_types_dict,
                       index_col = 0,
                       nrows=10**6)

print(train_df.dtypes)
print(train_df.shape)
train_df.head()

timestamp                           int64
user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation    boolean
dtype: object
(1000000, 6)
CPU times: user 2.51 s, sys: 59.3 ms, total: 2.57 s
Wall time: 2.89 s


Unnamed: 0_level_0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,115,5692,1,,
1,56943,115,5716,1,36992.0,False
2,118363,115,128,1,55008.0,False
3,131167,115,7860,1,19008.0,False
4,137965,115,7922,1,11000.0,False


In [4]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.shape)
test_df.head()

(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [5]:
submission_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv")

print(submission_df.shape)
submission_df.head()

(104, 3)


Unnamed: 0,row_id,answered_correctly,group_num
0,0,0.5,0
1,1,0.5,0
2,2,0.5,0
3,3,0.5,0
4,4,0.5,0


In [6]:
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

print(questions.shape)
questions.head()

(13523, 5)


Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [7]:
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

print(lectures.shape)
lectures.head()

(418, 4)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


# Feature Engineering

### user_df

In [8]:
user_df = train_df[train_df.answered_correctly != -1].groupby('user_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()

user_df.columns = ['user_id', 'user_questions', 'user_mean']

print(user_df.shape)
user_df.head()

(3824, 3)


Unnamed: 0,user_id,user_questions,user_mean
0,115,46,0.695652
1,124,30,0.233333
2,2746,19,0.578947
3,5382,125,0.672
4,8623,109,0.642202


In [9]:
user_lect = train_df.groupby(["user_id", "answered_correctly"]).size().unstack()

# columns:-1, 0, 1
user_lect.columns = ['Lecture', 'Wrong', 'Right']

print(user_lect.shape)
user_lect.head()

(3824, 3)


Unnamed: 0_level_0,Lecture,Wrong,Right
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115,,14.0,32.0
124,,23.0,7.0
2746,1.0,8.0,11.0
5382,3.0,41.0,84.0
8623,3.0,39.0,70.0


In [10]:
user_lect['Lecture'] = user_lect['Lecture'].fillna(0)
user_lect = user_lect.astype('Int64')
user_lect['watches_lecture'] = np.where(user_lect.Lecture > 0, 1, 0)
user_lect = user_lect.reset_index()
user_lect = user_lect[['user_id', 'watches_lecture']]

print(user_lect.shape)
user_lect.head()

(3824, 2)


Unnamed: 0,user_id,watches_lecture
0,115,0
1,124,0
2,2746,1
3,5382,1
4,8623,1


In [11]:
user_df = user_df.merge(user_lect, on = "user_id", how = "left")
del user_lect

print(user_df.shape)
user_df.head()

(3824, 4)


Unnamed: 0,user_id,user_questions,user_mean,watches_lecture
0,115,46,0.695652,0
1,124,30,0.233333,0
2,2746,19,0.578947,1
3,5382,125,0.672,1
4,8623,109,0.642202,1


### content_df

In [12]:
content_df = train_df[train_df.answered_correctly != -1].groupby('content_id') \
.agg({'answered_correctly': ['count', 'mean']}).reset_index()

content_df.columns = ['content_id', 'content_questions', 'content_mean']

print(content_df.shape)
content_df.head()

(13076, 3)


Unnamed: 0,content_id,content_questions,content_mean
0,0,73,0.863014
1,1,55,0.927273
2,2,444,0.560811
3,3,199,0.798995
4,4,307,0.602606


### train_df

In [13]:
train_df = train_df.merge(user_df, on = "user_id", how = "left")
train_df = train_df.merge(content_df, on = "content_id", how = "left")

print(train_df.isnull().sum())
print(train_df.shape)
train_df.head()

timestamp                             0
user_id                               0
content_id                            0
answered_correctly                    0
prior_question_elapsed_time       23723
prior_question_had_explanation     3816
user_questions                        0
user_mean                             0
watches_lecture                       0
content_questions                 12319
content_mean                      12319
dtype: int64
(1000000, 11)


Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_questions,user_mean,watches_lecture,content_questions,content_mean
0,0,115,5692,1,,,46,0.695652,0,337.0,0.721068
1,56943,115,5716,1,36992.0,False,46,0.695652,0,242.0,0.768595
2,118363,115,128,1,55008.0,False,46,0.695652,0,181.0,0.961326
3,131167,115,7860,1,19008.0,False,46,0.695652,0,203.0,0.926108
4,137965,115,7922,1,11000.0,False,46,0.695652,0,171.0,0.959064


In [14]:
train_df = train_df[train_df.answered_correctly != -1]
mean_prior = train_df.prior_question_elapsed_time.astype("float64").mean()
label_enc = preprocessing.LabelEncoder()

train_df['content_questions'].fillna(0, inplace = True)
train_df['content_mean'].fillna(0.5, inplace = True)
train_df['user_questions'].fillna(0, inplace = True)
train_df['user_mean'].fillna(0.5, inplace = True)
train_df['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
train_df['prior_question_had_explanation'].fillna(False, inplace = True)
train_df['prior_question_had_explanation'] = label_enc.fit_transform(train_df['prior_question_had_explanation'])
train_df[['content_questions', 'user_questions']] = train_df[['content_questions', 'user_questions']].astype(int)

print(train_df.dtypes)
print(train_df.shape)
print('total null:', train_df.isnull().sum().sum())
train_df.head()

timestamp                           int64
user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float16
prior_question_had_explanation      int64
user_questions                      int64
user_mean                         float64
watches_lecture                     int64
content_questions                   int64
content_mean                      float64
dtype: object
(980093, 11)
total null: 0


Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_questions,user_mean,watches_lecture,content_questions,content_mean
0,0,115,5692,1,inf,0,46,0.695652,0,337,0.721068
1,56943,115,5716,1,36992.0,0,46,0.695652,0,242,0.768595
2,118363,115,128,1,55008.0,0,46,0.695652,0,181,0.961326
3,131167,115,7860,1,19008.0,0,46,0.695652,0,203,0.926108
4,137965,115,7922,1,11000.0,0,46,0.695652,0,171,0.959064


In [15]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

print(train_df.shape)
print(test_df.shape)

(784074, 11)
(196019, 11)


In [16]:
features = [col for col in train_df.columns if col != 'answered_correctly']
target = 'answered_correctly'

print(features)
print(target)

['timestamp', 'user_id', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'user_questions', 'user_mean', 'watches_lecture', 'content_questions', 'content_mean']
answered_correctly


In [17]:
'''
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    
    model = lgb.LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=666
    )
    return model
    
def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(
        test_df[target].values, 
        model.predict_proba(test_df[features])[:,1]
    )
    return score
    
    
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)
params = study.best_params

print(params)

'''

'\ndef create_model(trial):\n    num_leaves = trial.suggest_int("num_leaves", 2, 31)\n    n_estimators = trial.suggest_int("n_estimators", 50, 300)\n    max_depth = trial.suggest_int(\'max_depth\', 3, 8)\n    min_child_samples = trial.suggest_int(\'min_child_samples\', 100, 1200)\n    learning_rate = trial.suggest_uniform(\'learning_rate\', 0.0001, 0.99)\n    min_data_in_leaf = trial.suggest_int(\'min_data_in_leaf\', 5, 90)\n    bagging_fraction = trial.suggest_uniform(\'bagging_fraction\', 0.0001, 1.0)\n    feature_fraction = trial.suggest_uniform(\'feature_fraction\', 0.0001, 1.0)\n    \n    model = lgb.LGBMClassifier(\n        num_leaves=num_leaves,\n        n_estimators=n_estimators, \n        max_depth=max_depth, \n        min_child_samples=min_child_samples, \n        min_data_in_leaf=min_data_in_leaf,\n        learning_rate=learning_rate,\n        feature_fraction=feature_fraction,\n        random_state=666\n    )\n    return model\n    \ndef objective(trial):\n    model = creat

In [18]:
params = {
'num_leaves': 24,
'n_estimators': 299,
'max_depth': 5,
'min_child_samples': 189,
'learning_rate': 0.23952703210876622,
'min_data_in_leaf': 62,
'bagging_fraction': 0.3263951638326585,
'feature_fraction': 0.7424117808052549}

# score: 0.7721987996629855

In [19]:
model = lgb.LGBMClassifier(**params)

model.fit(train_df[features], train_df[target])
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

LGB score:  0.7721987996629855


# Submit

In [20]:
test_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

print(test_df.dtypes)
print(test_df.shape)
test_df.head()

row_id                              int64
group_num                           int64
timestamp                           int64
user_id                             int64
content_id                          int64
content_type_id                     int64
task_container_id                   int64
prior_question_elapsed_time       float64
prior_question_had_explanation     object
prior_group_answers_correct        object
prior_group_responses              object
dtype: object
(104, 11)


Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [21]:
for (test_df, sample_prediction_df) in iter_test:
    
    test_df = test_df.merge(user_df, on = "user_id", how = "left")
    test_df = test_df.merge(content_df, on = "content_id", how = "left")
    
    mean_prior = test_df.prior_question_elapsed_time.astype("float64").mean()
    
    test_df['content_questions'].fillna(0, inplace = True)
    test_df['content_mean'].fillna(0.5, inplace = True)
    test_df['user_questions'].fillna(0, inplace = True)
    test_df['user_mean'].fillna(0.5, inplace = True)
    test_df['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
    test_df['prior_question_had_explanation'].fillna(False, inplace = True)
    test_df['prior_question_had_explanation'] = label_enc.fit_transform(test_df['prior_question_had_explanation'])
    test_df[['content_questions', 'user_questions']] = test_df[['content_questions', 'user_questions']].astype(int)
    
    # target
    test_df['answered_correctly'] =  model.predict(test_df[features])
    
    # submit_data
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [22]:
submission_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv")

print(submission_df.dtypes)
print(submission_df.shape)
submission_df.head()

row_id                  int64
answered_correctly    float64
group_num               int64
dtype: object
(104, 3)


Unnamed: 0,row_id,answered_correctly,group_num
0,0,0.5,0
1,1,0.5,0
2,2,0.5,0
3,3,0.5,0
4,4,0.5,0
