# Riiid Kaggle Challenge

Based on historical student answers, the model predicts how students will perform.

I preprocess the data and use LightGBM (LGBM) with Optuna to build the model.

This notebook achieves a score of 0.743.

[You can read more about it here](https://www.kaggle.com/c/riiid-test-answer-prediction)

In [None]:
%%time

import sys

# Regular Libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
import gc
from scipy.stats import pearsonr

import copy
import re

import warnings

# Load Datasets

In [None]:
%%time

# Load the training log. Only 8 CSV columns are read (selected by position through
# `usecols`) and each one gets an explicit compact dtype.
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    # Positional column selection. Presumably this skips row_id (0) and user_answer (6),
    # the two columns of the commented-out listing below that have no dtype here - TODO confirm.
    usecols=[1, 2, 3, 4, 5, 7, 8, 9],
    dtype={
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'answered_correctly':'int8',
        'prior_question_elapsed_time': 'float32',
        # pandas nullable boolean dtype, so missing values can be represented
        'prior_question_had_explanation': 'boolean'
    }
)

# Alternative (disabled) loading path that reads a pre-converted HDF5 copy and then
# drops the row_id / user_answer columns.
# # Read in data
# dtypes = {
#     "row_id": "int64",
#     "timestamp": "int64",
#     "user_id": "int32",
#     "content_id": "int16",
#     "content_type_id": "boolean",
#     "task_container_id": "int16",
#     "user_answer": "int8",
#     "answered_correctly": "int8",
#     "prior_question_elapsed_time": "float32", 
#     "prior_question_had_explanation": "boolean"
# }

# train = pd.read_hdf("../input/riiid-train-data-multiple-formats/riiid_train.h5", "riiid_train")

# # # Drop column as it doesn't give any information
# train.drop(columns = ["row_id"], axis=1, inplace=True)
# train.drop(columns = ["user_answer"], axis=1, inplace=True)

# Preview the first two rows.
train.head(2)

In [None]:
# Load the question metadata table.
# NOTE(review): this read uses a relative '../input' path, while train.csv and
# lectures.csv are read through the absolute '/kaggle/input' path.
questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

# Preview the first two rows.
questions.head(2)

In [None]:
# Load the lecture metadata table and preview its first two rows.
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
lectures_df.head(2)

# Features from Lectures data

In [None]:
# Give the one label that contains a space an identifier-like spelling, so the dummy
# column generated from it is named 'type_of_solving_question'.
lectures_df['type_of'] = lectures_df['type_of'].replace({'solving question': 'solving_question'})

# One-hot encode the lecture part and the lecture type.
lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])

# Collect the column names by prefix for the per-user aggregation that follows.
part_lectures_columns = list(filter(lambda name: name.startswith('part'), lectures_df.columns))
types_of_lectures_columns = list(filter(lambda name: name.startswith('type_of_'), lectures_df.columns))

In [None]:
# Keep only the rows flagged as lectures (content_type_id == True) and attach the
# one-hot lecture columns by matching content_id against lecture_id.
train_lectures = train[train.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')

In [None]:
# Per-user sums of every lecture part / lecture type dummy column; the result is indexed by user_id.
user_lecture_stats_part = train_lectures.groupby('user_id')[part_lectures_columns + types_of_lectures_columns].sum()

In [None]:
# For every summed lecture column add a 0/1 companion column '<name>_boolean' telling
# whether the sum is positive. The names are snapshotted first so the freshly added
# flag columns are not themselves processed.
summed_columns = list(user_lecture_stats_part.columns)
for summed_name in summed_columns:
    user_lecture_stats_part[summed_name + '_boolean'] = user_lecture_stats_part[summed_name].gt(0).astype(int)

user_lecture_stats_part.head()

In [None]:
import gc

# The merged lecture rows are not needed after the per-user aggregation:
# drop the name and trigger a garbage collection pass.
del train_lectures
gc.collect()

# Final features from Train data

In [None]:
# Removing lectures
# Removing lectures: keep question rows only, order them by timestamp and renumber
# the index from zero.
train = (
    train.loc[train['content_type_id'] == False]
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Mean of prior_question_elapsed_time over the remaining rows; used later as a fill value.
elapsed_mean = train['prior_question_elapsed_time'].mean()

## Task container

In [None]:
# user_id values grouped by task container (question rows only).
users_by_container = (
    train.loc[train['content_type_id'] == False, ['task_container_id', 'user_id']]
    .groupby('task_container_id')['user_id']
)

# Number of rows in each container divided by the number of distinct users in it.
task_container_id_features = (
    users_by_container.count() / users_by_container.nunique()
).to_frame('avg_questions')

del users_by_container
gc.collect()

# Running total of avg_questions in task_container_id order.
task_container_id_features['avg_questions_seen'] = task_container_id_features['avg_questions'].cumsum()
task_container_id_features.head()

## Users

In [None]:
%%time
# --- STUDENT ANSWERS ---
# Group by student
def calculate_matrics_for_student(data):
    """Summarise `answered_correctly` per user.

    Only rows whose `content_type_id` equals False are used.

    Parameters
    ----------
    data : DataFrame with `user_id`, `content_type_id` and `answered_correctly` columns.

    Returns
    -------
    DataFrame with one row per user and the columns `user_id`, `user_mean`,
    `user_count`, `user_std`, `user_median`, `user_var`, `user_skew`.
    """
    question_rows = data.loc[data['content_type_id'] == False, ['user_id', 'answered_correctly']]

    per_user = question_rows.groupby('user_id')['answered_correctly'].agg(
        ['mean', 'count', 'std', 'median', 'var', 'skew']
    )
    per_user.columns = ['user_mean', 'user_count', 'user_std',
                        'user_median', 'user_var', 'user_skew']

    return per_user.reset_index()


# Per-user answer statistics over the whole (lecture-free) training log; these are the
# user features merged into the test batches in the submission loop.
results_user_final = calculate_matrics_for_student(train)
# Disabled experiment: per-user mean of prior_question_had_explanation.
# results_user_final['explanation_mean_user'] = train.loc[train.content_type_id == False, ['user_id','prior_question_had_explanation']].groupby(['user_id']).agg(['mean'])
# results_user_final
# results_user_final.explanation_mean_user.isna().sum()

## question 

In [None]:
%%time
# --- CONTENT ID ANSWERS ---
# Group by content
# Calculate metrics 
def calculate_metrics_for_content(data):
    """Summarise `answered_correctly` per content item.

    Rows whose `answered_correctly` equals -1 are left out.

    Parameters
    ----------
    data : DataFrame with `content_id` and `answered_correctly` columns.

    Returns
    -------
    DataFrame with one row per content_id and the columns `content_id`,
    `content_mean`, `content_count`, `content_std`, `content_median`,
    `content_var`, `content_skew`.
    """
    answered_rows = data.loc[data['answered_correctly'] != -1]

    per_content = answered_rows.groupby('content_id')['answered_correctly'].agg(
        ['mean', 'count', 'std', 'median', 'var', 'skew']
    )
    per_content.columns = ['content_mean', 'content_count', 'content_std',
                           'content_median', 'content_var', 'content_skew']

    return per_content.reset_index()
    
# Per-content answer statistics over the training log.
# NOTE(review): results_question_final is not referenced anywhere else in this file.
results_question_final = calculate_metrics_for_content(train)

# Features from question data

## Tags

In [None]:
%%time

# Work on the string form of the tag column; missing entries become the text "nan".
questions['tags'] = questions['tags'].astype(str)

# Every distinct whitespace-separated tag, ignoring the "nan" placeholder rows.
tags = {
    tag
    for tag_text in questions.loc[questions['tags'] != "nan", 'tags'].values
    for tag in tag_text.split()
}
print(f'There are {len(tags)} different tags')

In [None]:
%%time

# split tags
# split tags: each question's tag string becomes a list of tag strings
questions['tags'] = questions['tags'].str.split()

# count right and wrong answers per content_id (rows with answered_correctly == -1 excluded)
correct = (
    train.loc[train['answered_correctly'] != -1]
    .groupby(['content_id', 'answered_correctly'])
    .size()
    .unstack('answered_correctly')
)
correct.columns = ['Wrong', 'Right']
# Outcomes that never occurred for a question count as zero.
correct = correct.fillna(0).astype({'Wrong': int, 'Right': int})
questions = questions.merge(correct, left_on="question_id", right_on="content_id", how="left")

questions.head()

In [None]:
%%time

tags = list(tags)

# Build one row per (question, tag) pair and aggregate once. The previous version
# scanned every question for every tag and grew frames with DataFrame.append, which
# is quadratic and is no longer available in pandas 2.x.
# Tags are de-duplicated per question first so a question is counted once per tag,
# exactly like a plain membership test on its tag list.
unique_question_tags = questions['tags'].map(lambda tag_list: list(dict.fromkeys(tag_list)))
question_tag_pairs = questions[['Wrong', 'Right']].assign(tag=unique_question_tags).explode('tag')

# Only the known tags are aggregated (this drops the "nan" placeholder).
question_tag_pairs = question_tag_pairs[question_tag_pairs['tag'].isin(tags)]

# Missing Wrong/Right values (questions that never appear in train) are skipped by sum().
tags_df = question_tag_pairs.groupby('tag')[['Wrong', 'Right']].sum().rename_axis('tag').reset_index()
tags_df = tags_df[['Wrong', 'Right', 'tag']]

# Every row keeps the 'sum' row label that the per-tag aggregation used to produce,
# so label/position based lookups on this frame behave exactly as before.
tags_df.index = ['sum'] * len(tags_df)

tags_df['question_percent_correct'] = tags_df.Right/(tags_df.Right + tags_df.Wrong)
tags_df = tags_df.sort_values(by = "question_percent_correct")

tags_df.head()

In [None]:
%%time

# Build the tag -> share-of-correct-answers lookup once. The previous version filtered
# tags_df for every tag of every question and read the hit with `series[0]`, which
# depends on the deprecated integer-position fallback of a label-indexed Series.
tag_percent_correct = (
    tags_df.drop_duplicates(subset='tag')
    .set_index('tag')['question_percent_correct']
    .to_dict()
)

# Mean share of correct answers over the known tags of each question;
# 0.5 when none of the question's tags is known.
tag_mean_procent = []
for question_tags in questions['tags']:
    known_values = [
        float(tag_percent_correct[str(tag_id)])
        for tag_id in question_tags
        if str(tag_id) in tag_percent_correct
    ]
    if known_values:
        tag_mean_procent.append(sum(known_values, 0.0) / float(len(known_values)))
    else:
        tag_mean_procent.append(0.5)


questions['tags_mean_correct'] = tag_mean_procent
# Share of correct answers and total number of answers per question.
questions['question_percent_correct'] = questions.Right/(questions.Right + questions.Wrong)
questions['question_asked_time'] = (questions.Right + questions.Wrong)
questions.head()

## Part

In [None]:
%%time

# Total wrong / right answers for each value of `part`, with flat column names.
part = questions.groupby('part')[['Wrong', 'Right']].sum().reset_index()
part['part_percent_correct'] = part['Right'] / (part['Right'] + part['Wrong'])

# Keep only the merge key and the new feature. Selecting the columns replaces the
# earlier `drop('Wrong', 1)` calls, which passed `axis` positionally - a form that
# pandas 2.0 no longer accepts.
part = part[['part', 'part_percent_correct']]

questions = questions.merge(part, how = 'left', on = 'part')
questions.head()

## Questions finish 

In [None]:
# Drop the columns that are not model inputs. `columns=` is used because passing the
# axis positionally (`drop(name, 1)`) is rejected by pandas 2.0.
# 'part' is kept on purpose: it is one of the feature columns selected later.
questions = questions.drop(columns=['bundle_id', 'correct_answer', 'tags', 'Wrong', 'Right'])

# Use the same key name as the interaction data so the table can be merged on content_id.
questions = questions.rename(columns={"question_id": "content_id"})
# Persist the question features next to the notebook.
questions.to_parquet('question_features_data.parquet')

questions.head()

In [None]:
# The rows were already ordered by timestamp when lectures were removed;
# the column itself is dropped from train here (in place).
train.drop(['timestamp'], axis=1, inplace=True)

# Splitting data

## Validation set

In [None]:
# Hold out the last 5 rows of every user as the validation set ...
validation = train.groupby('user_id').tail(5)
# ... and remove exactly those rows (matched by index label) from train.
train = train[~train.index.isin(validation.index)]

### calculate feature values for train without validation data

In [None]:
# Per-user answer statistics computed from the validation rows.
# NOTE(review): the input here is `validation` itself, so these statistics include the
# answered_correctly values that validation is later scored on - confirm this is intended.
results_user_validation = calculate_matrics_for_student(validation)

## test set

In [None]:
# Of the remaining rows, the last 18 of every user become X (the frame the models are fitted on) ...
X = train.groupby('user_id').tail(18)
# ... and are removed from train (matched by index label).
train = train[~train.index.isin(X.index)]
# Displayed value: combined row count of the three pieces.
len(X) + len(train) + len(validation)

### calculate feature values for test set

In [None]:
# Per-user answer statistics computed from the rows of X.
# NOTE(review): results_user_X is not referenced anywhere else in this file.
results_user_X = calculate_matrics_for_student(X)

# Cleaning

In [None]:
# The remaining training log is not used below; drop the name and collect garbage.
del(train)
gc.collect()

# Question preprocessing

In [None]:
# Overall mean of the per-question share of correct answers (also used later as a fill value).
content_mean = questions['question_percent_correct'].mean()

# set mean for question asked less then 3 times
asked_under_3 = questions['question_asked_time'] < 3
questions.loc[asked_under_3, 'question_percent_correct'] = content_mean

# For questions asked fewer than 21 times, pull extreme shares into the [.2, .95] range.
asked_under_21 = questions['question_asked_time'] < 21
questions.loc[asked_under_21 & (questions['question_percent_correct'] < .2), 'question_percent_correct'] = .2
questions.loc[asked_under_21 & (questions['question_percent_correct'] > .95), 'question_percent_correct'] = .95

# Adding features to the dataset (merging)

In [None]:
def combine_features(data=None, 
                     user_features=None, 
                     question_features=questions, 
                     lectures_features=user_lecture_stats_part,
                     task_container_id_features=task_container_id_features):
    """Left-join all feature tables onto `data` and return the merged frame.

    Parameters
    ----------
    data : interaction rows with `user_id`, `content_id` and `task_container_id`.
    user_features : per-user statistics, joined on `user_id`.
    question_features : per-question features, joined on `content_id`.
    lectures_features : per-user lecture statistics, joined on `user_id`.
    task_container_id_features : per-container features, joined from
        `task_container_id` onto the table's index.

    The three table defaults are the objects bound to those notebook names at the
    moment this function is defined.
    """
    # Add "past" information: every join is a left join, so each row of `data` is kept.
    return (
        data
        .merge(user_features, how='left', on='user_id')
        .merge(question_features, how='left', on='content_id')
        .merge(lectures_features, how='left', on='user_id')
        .merge(task_container_id_features, how='left', left_on=['task_container_id'], right_index=True)
    )

In [None]:
## Adding features to validation dataset
# Both frames receive the user statistics that were computed from the validation rows.

validation = combine_features(validation ,user_features=results_user_validation)

# NOTE(review): X is merged with results_user_validation; results_user_X is computed
# earlier in this file but is not used here - confirm which table is intended.
X          = combine_features(X ,user_features=results_user_validation)

# Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder


def encode_categorical_values(data):
    """Add a 0/1 integer column for `prior_question_had_explanation`.

    Missing values are treated as False. The frame is modified in place (the source
    column is filled and `prior_question_had_explanation_enc` is added) and returned.

    The mapping is fixed (False -> 0, True -> 1). Fitting a fresh label encoder on
    every call made the code depend on which values happened to be present: a batch
    containing only True rows was encoded as 0.

    Parameters
    ----------
    data : DataFrame with a `prior_question_had_explanation` column.

    Returns
    -------
    The same DataFrame object.
    """
    # Assign the filled column back explicitly; an in-place fillna on the column
    # accessor is not guaranteed to write through to the frame.
    had_explanation = data["prior_question_had_explanation"].fillna(False)
    data["prior_question_had_explanation"] = had_explanation
    data["prior_question_had_explanation_enc"] = had_explanation.astype(bool).astype(int)
    return data

In [None]:
# Add the encoded explanation flag to both frames.
validation = encode_categorical_values(validation)
X = encode_categorical_values(X)

# Displayed value: the columns available after merging and encoding.
X.columns

# Split data for X and Y

In [None]:
# Separate the target column from the inputs of the fitting frame ...
Y = X['answered_correctly']
X = X.drop(['answered_correctly'], axis=1)

# ... and of the validation frame.
Y_val = validation['answered_correctly']
X_val = validation.drop(['answered_correctly'], axis=1)

X.head()

## Filter columns. Keep only the feature columns

In [None]:
#user_id content_id	task_container_id prior_question_had_explanation 

# Ordered list of model input columns. The same list is used to slice the test
# batches in the submission loop.
# Note: 'type_of_concept_boolean' is listed while the plain 'type_of_concept' count is not.
features_columns_names = ['prior_question_elapsed_time', 
                          'user_mean', 'user_count', 'user_std', 'user_median', 'user_var', 'user_skew', 'part',
                          'tags_mean_correct', 'question_percent_correct', 'question_asked_time', 'part_percent_correct',
                          'part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
                          'type_of_intention', 'type_of_solving_question', 'type_of_starter',
                          'part_1_boolean', 'part_2_boolean', 'part_3_boolean', 'part_4_boolean', 'part_5_boolean', 'part_6_boolean', 'part_7_boolean',
                          'type_of_concept_boolean', 'type_of_intention_boolean',
                          'type_of_solving_question_boolean', 'type_of_starter_boolean',
                          'avg_questions_seen', 'prior_question_had_explanation_enc'
                         ]


# Restrict both frames to the feature columns, in the listed order.
X = X[features_columns_names]
X_val = X_val[features_columns_names]

X.head()

# Remove nans

In [None]:
def remove_nans(data):
    """Fill missing feature values with fixed defaults, in place, and return the frame.

    Each listed column must exist in `data` (a missing one raises KeyError).
    `prior_question_elapsed_time` is filled with the notebook-level `elapsed_mean`
    and `question_percent_correct` with the notebook-level `content_mean`; both are
    looked up when the function is called. All other defaults are constants.

    The filled column is assigned back to the frame explicitly. Calling
    `data[col].fillna(..., inplace=True)` only fills the Series returned by the
    lookup, which is not guaranteed to write through to `data`.

    Parameters
    ----------
    data : DataFrame holding the feature columns.

    Returns
    -------
    The same DataFrame object.
    """
    part_ids = range(1, 8)
    lecture_types = ['type_of_intention', 'type_of_solving_question', 'type_of_starter']

    # Columns whose missing values mean "nothing observed".
    zero_filled = (
        ['user_count', 'question_asked_time']
        + [f'part_{part_id}' for part_id in part_ids]
        + lecture_types
        + [f'part_{part_id}_boolean' for part_id in part_ids]
        + ['type_of_concept_boolean']
        + [f'{lecture_type}_boolean' for lecture_type in lecture_types]
        + ['prior_question_had_explanation_enc']
    )

    fill_values = {
        'prior_question_elapsed_time': elapsed_mean,
        'user_mean': 0.65,
        'user_std': 0.42,
        'user_median': 1,
        'user_var': 0.21,
        'user_skew': -0.16,
        'tags_mean_correct': 0.65,
        'question_percent_correct': content_mean,
        'part_percent_correct': 0.65,
        'part': 4,
        'avg_questions_seen': 1,
    }
    fill_values.update(dict.fromkeys(zero_filled, 0))

    for column, default in fill_values.items():
        data[column] = data[column].fillna(default)
    return data

In [None]:
# Fill the missing feature values of the fitting and validation inputs.
X = remove_nans(X)
X_val = remove_nans(X_val)

X.head()

# Data scaling

In [None]:
# Maybe I will use it for other models
def scale_data(data=None, columns_to_scale=None):
    """Standardise the given columns of `data` with a StandardScaler fitted on them.

    The scaled values are written back into `data` itself, and the same object is
    returned; the caller's frame is therefore modified.

    Parameters
    ----------
    data : DataFrame containing the columns to scale.
    columns_to_scale : list of column names to standardise.

    Returns
    -------
    The same DataFrame object, with `columns_to_scale` replaced by scaled values.
    """
    # Imported locally: StandardScaler is not imported anywhere else in this notebook,
    # so the function used to fail with NameError when called.
    from sklearn.preprocessing import StandardScaler

    scaled_features = data

    features = scaled_features[columns_to_scale]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)

    scaled_features[columns_to_scale] = features

    return scaled_features

# Models

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
%%time

# Fixed LightGBM hyper-parameters for the baseline model fitted on all feature columns.
params = {
    'num_leaves': 31, 
    'n_estimators': 200, 
    'max_depth': 8, 
    'min_child_samples': 356, 
    'learning_rate': 0.2982483634778906, 
    'min_data_in_leaf': 82, 
    'bagging_fraction': 0.6545628633239445, 
    'feature_fraction': 0.9164482379289846,
    'random_state': 666
}

# LightGBM baseline; full_model is used again in the submission loop.
full_model = LGBMClassifier(**params)
full_model.fit(X, Y)

# Probability of the positive class on the validation rows.
preds = full_model.predict_proba(X_val)[:,1]
print('LGB roc auc', roc_auc_score(Y_val, preds))

# XGBoost baseline with library defaults apart from the seed; also used in the submission loop.
full_xgb = XGBClassifier(random_state=666)
full_xgb.fit(X, Y)

# `preds` is reused for the XGBoost validation probabilities.
preds = full_xgb.predict_proba(X_val)[:,1]
print('XGB roc auc', roc_auc_score(Y_val, preds))

In [None]:
import optuna
from optuna.samplers import TPESampler

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

In [None]:
%%time


# Recursive feature elimination driven by a decision tree, keeping 14 features.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=666), n_features_to_select=14)
rfe.fit(X, Y)
# From here on X and X_val hold only the selected features (output of rfe.transform).
# The baseline models above were fitted before this reduction, on all feature columns.
X = rfe.transform(X)
X_val = rfe.transform(X_val)

In [None]:
%%time


# Seeded TPE sampler, passed to the Optuna study created below.
sampler = TPESampler(seed=666)

def create_model(trial):
    """Build an LGBMClassifier from hyper-parameters suggested by an Optuna trial.

    Every sampled value is forwarded to the classifier. `bagging_fraction` used to be
    sampled but never passed on, so the trials did not evaluate the configuration
    that `study.best_params` later hands to the final model.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    An unfitted LGBMClassifier.
    """
    sampled = {
        'num_leaves': trial.suggest_int("num_leaves", 2, 31),
        'n_estimators': trial.suggest_int("n_estimators", 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 1200),
        # suggest_float is the current spelling of the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.99),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 90),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0001, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.0001, 1.0),
    }
    model = LGBMClassifier(random_state=666, **sampled)
    return model

def objective(trial):
    """Optuna objective: validation ROC AUC of a model built for `trial`.

    The model is fitted on the notebook-level X / Y and scored on X_val / Y_val.
    """
    candidate = create_model(trial)
    candidate.fit(X, Y)
    positive_probability = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(Y_val, positive_probability)

# run optuna
# Maximise the validation ROC AUC over 350 trials using the seeded sampler.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=350)
# `params` (previously the baseline settings) now holds the best trial's parameters plus the seed.
params = study.best_params
params['random_state'] = 666

# Disabled: a hard-coded parameter set that can replace the search.
# params = {
#     'num_leaves': 28, 
#     'n_estimators': 295, 
#     'max_depth': 8, 
#     'min_child_samples': 1178, 
#     'learning_rate': 0.2379173491475032, 
#     'min_data_in_leaf': 35, 
#     'bagging_fraction': 0.8389723511600549, 
#     'feature_fraction': 0.9606189400533491,
#     'random_state': 666
# }

# Refit with the chosen parameters and display the validation ROC AUC.
model = LGBMClassifier(**params)
model.fit(X, Y)

preds = model.predict_proba(X_val)[:,1]
roc_auc_score(Y_val, preds)

In [None]:
%%time


# Wrap the feature matrices and targets in DataFrames; the cross-validation loop
# below indexes them through `.values`.
X = pd.DataFrame(X)
X_val = pd.DataFrame(X_val)

Y = pd.DataFrame(Y)
Y_val = pd.DataFrame(Y_val)

In [None]:
%%time


# Fit one LightGBM model per fold of a seeded, shuffled 5-fold split. Each model is
# kept in `models`, and its validation-set probabilities are kept in `preds`.
models, preds = [], []
fold_splitter = KFold(n_splits=5, random_state=666, shuffle=True)
for fold_number, (fit_rows, holdout_rows) in enumerate(fold_splitter.split(Y)):
    print(f'Fold {fold_number}')
    model = LGBMClassifier(**params)
    model.fit(X.values[fit_rows], Y.values[fit_rows])

    # Positive-class probabilities on the validation rows, averaged later.
    preds.append(model.predict_proba(X_val)[:, 1])

    # Score on the fold's held-out rows.
    holdout_probability = model.predict_proba(X.values[holdout_rows])[:, 1]
    print('Fold roc auc:', roc_auc_score(Y.values[holdout_rows], holdout_probability))
    models.append(model)

In [None]:
%%time


# Average the per-fold validation predictions.
# np.mean builds a new array, so the entries of `preds` stay untouched. The earlier
# `predictions = preds[0]` followed by `+=` / `/=` accumulated into preds[0] itself,
# so re-running the cell changed the result. The number of folds is taken from
# `preds` instead of a literal 5.
predictions = np.mean(preds, axis=0)

print('ROC AUC', roc_auc_score(Y_val, predictions))

# Submission

In [None]:
# Import library and create environment
import riiideducation
# Create the competition environment object used to fetch test batches and submit predictions.
env = riiideducation.make_env()

In [None]:
# Iterator over the test batches; consumed by the submission loop.
iter_test = env.iter_test()

In [None]:
# Score every test batch and submit the blended prediction.
for (test_df, sample_prediction_df) in iter_test:
    # Cap task_container_id at 9999 before joining the per-container features.
    # NOTE(review): presumably this keeps ids inside the range seen in training - confirm.
    test_df['task_container_id'] = test_df.task_container_id.mask(test_df.task_container_id > 9999, 9999)

    # Same feature pipeline as for the fitting data, with user statistics from the full training log.
    test_df = combine_features(test_df ,user_features=results_user_final)
    test_df = encode_categorical_values(test_df)
    test_df = remove_nans(test_df)

    # Slice the feature columns once and reuse the slice for every model.
    batch_features = test_df[features_columns_names]

    # Baseline models use all feature columns.
    full_preds = full_model.predict_proba(batch_features)[:, 1]
    full_preds_xgb = full_xgb.predict_proba(batch_features)[:, 1]

    # The fold models use the RFE-selected subset.
    X_test = rfe.transform(batch_features)
    preds = [model.predict_proba(X_test)[:,1] for model in models]

    # Mean over however many fold models exist (previously hard-coded as 5).
    predictions = np.mean(preds, axis=0)

    # Blend: 60% fold average, 20% LightGBM baseline, 20% XGBoost baseline.
    test_df['answered_correctly'] =  predictions * 0.6 + full_preds * 0.2 + full_preds_xgb * 0.2
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])