In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count
from tqdm.notebook import tqdm
import gc
import os
import sys
import joblib
submit_kaggle = True

In [2]:
%%time
cols_to_load = ['user_id', 'answered_correctly', 'content_id', 'prior_question_had_explanation', 'prior_question_elapsed_time']
FEATURES = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')[cols_to_load]
FEATURES['prior_question_had_explanation'] = FEATURES['prior_question_had_explanation'].astype('boolean')
# questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
# lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
dtype={'num_user_answered_questions':np.int16, 
       'num_user_correctly_questions':np.int16, 
       'mean_user_accuracy':np.float32,
       'watches_lecture':'boolean',
       'part':np.int8,
       'content_questions':np.int32,
       'mean_content_accuracy':np.float32,
       'mean_prior_question_had_explanation':np.float32,
       'mean_prior_question_elapsed_time':np.float32,
      }
user_df = pd.read_csv('../input/data_io/user_df.csv', index_col=0,dtype=dtype)
content_df = pd.read_csv('../input/data_io/content_df.csv', index_col=0,dtype=dtype)

CPU times: user 11 s, sys: 3.8 s, total: 14.8 s
Wall time: 17.8 s


In [3]:
FEATURES = FEATURES[FEATURES['answered_correctly'] != -1]
FEATURES['answered_user'] = FEATURES[['user_id','content_id']].groupby('user_id').cumcount().astype(np.uint16)
FEATURES['answered_correctly_user'] = FEATURES[['answered_correctly', 'user_id']].groupby('user_id').cumsum().astype(np.uint16)
FEATURES['answered_correctly_user'] = FEATURES[['answered_correctly_user', 'user_id']].groupby('user_id').shift(1)
FEATURES['answered_correctly_user'] = FEATURES['answered_correctly_user'].fillna(0)
FEATURES['answered_correctly_user'] = FEATURES['answered_correctly_user'].astype(np.uint16)
FEATURES['attempt'] = FEATURES.groupby(['user_id', 'content_id']).content_id.transform('cumcount').astype(np.uint8)

In [4]:
def make_features(FEATURES):
    # merge with all features
    FEATURES = FEATURES[FEATURES['answered_correctly'] != -1]
    FEATURES['answered_user'] = FEATURES[['user_id','content_id']].groupby('user_id').cumcount().astype(np.uint16)
    FEATURES['answered_correctly_user'] = FEATURES[['answered_correctly', 'user_id']].groupby('user_id').cumsum().astype(np.uint16)
    FEATURES['answered_correctly_user'] = FEATURES[['answered_correctly_user', 'user_id']].groupby('user_id').shift(1)
    FEATURES['answered_correctly_user'] = FEATURES['answered_correctly_user'].fillna(0)
    FEATURES['answered_correctly_user'] = FEATURES['answered_correctly_user'].astype(np.uint16)
    FEATURES = FEATURES.merge(user_df, how="left", on="user_id")
    FEATURES = FEATURES.merge(content_df, how='left', on='content_id')
    # add harmonic mean
    FEATURES['hmean_user_content_accuracy'] = 2 * (
        (FEATURES['mean_user_accuracy'] * FEATURES['mean_content_accuracy']) /
        (FEATURES['mean_user_accuracy'] + FEATURES['mean_content_accuracy'])
    )
    for c in categorical_feature:
        FEATURES[c] = FEATURES[c].astype('category')
    return FEATURES

In [5]:
categorical_feature = [
                     'part',
                     ]
features =['prior_question_elapsed_time',
 'mean_user_accuracy',
 'answered_correctly_user',
 'answered_user',
 'mean_content_accuracy',
 'part',
 'hmean_user_content_accuracy',
 'attempt']
target = 'answered_correctly'

In [6]:
FEATURES = make_features(FEATURES)

In [7]:
col = features + [target]
FEATURES = FEATURES[col]

In [8]:
%%time
if submit_kaggle:
    lgb_train = lgb.Dataset(FEATURES[features], FEATURES[target], categorical_feature = categorical_feature)
    del FEATURES
    gc.collect()
else:
    X_train, X_valid, y_train, y_valid = train_test_split(FEATURES[features], FEATURES[target], test_size=0.2, random_state=1)
    del FEATURES
    gc.collect()
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = categorical_feature)
    lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature = categorical_feature)
    del X_train, y_train, X_valid, y_valid
    gc.collect()

CPU times: user 579 ms, sys: 192 ms, total: 771 ms
Wall time: 769 ms


In [9]:
params = {'objective': 'binary',
          'metric': 'auc',
          'seed': 1,
          'learning_rate': 0.1, #default
          "boosting_type": "gbdt" #default
         }
METRICS = ['auc']

In [10]:
evals_result = {}
if submit_kaggle:
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        valid_sets = lgb_train,
        verbose_eval = 50,
        num_boost_round = 300,
        early_stopping_rounds = 10,
        categorical_feature = categorical_feature,
        feature_name = features,
        evals_result = evals_result,
    )
else:
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        valid_sets = [lgb_train,lgb_eval]
        verbose_eval = 50,
        num_boost_round = 300,
        early_stopping_rounds = 10,
        categorical_feature = categorical_feature,
        feature_name = features,
        evals_result = evals_result,
    )

SyntaxError: invalid syntax (<ipython-input-10-d36ed1b2e82e>, line 19)

In [None]:
def plot_history(evals_result):
    for metric in METRICS:
        plt.figure(figsize=(20,8))
        
        for key in evals_result.keys():
            history_len = len(evals_result.get(key)[metric])
            history = evals_result.get(key)[metric]
            x_axis = np.arange(1, history_len + 1)
            plt.plot(x_axis, history, label=key)
        
        x_ticks = list(filter(lambda e: (e % (history_len // 100 * 10) == 0) or e == 1, x_axis))
        plt.xticks(x_ticks, fontsize=12)
        plt.yticks(fontsize=12)

        plt.title(f'{metric.upper()} History of training', fontsize=18);
        plt.xlabel('EPOCH', fontsize=16)
        plt.ylabel(metric.upper(), fontsize=16)
        
        if metric in ['auc']:
            plt.legend(loc='upper left', fontsize=14)
        else:
            plt.legend(loc='upper right', fontsize=14)
        plt.grid()
        plt.show()

plot_history(evals_result)

In [None]:
lgb.plot_importance(model)
plt.show()

In [None]:
joblib.dump(model, '../input/data_io/state1_2.pkl')

---

# Predict

In [None]:
# model = joblib.load('../input/data_io/model_Riiid_Competition_Baseline_1.pkl')