- Second modelling iteration
- Add a new feature by merging in questions.csv
- Add stratified group k-fold cross-validation

In [1]:
import sys
import copy
import random
import feather
import warnings
import numpy as np
import pandas as pd
from time import time
import lightgbm as lgb
from collections import Counter
from sklearn import preprocessing
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import riiideducation

# Display options: render every column and up to 1000 rows of a frame.
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
# Print NumPy floats with 8 digits of precision.
np.set_printoptions(precision=8)
# Silence all warnings for the whole notebook.
# NOTE(review): this also hides pandas / LightGBM warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the competition CSV files.
DIR = "../input/riiid-test-answer-prediction/"

# The training interactions are read from a feather file in a separate input dataset.
train = pd.read_feather("../input/riid-train/train.feather")

# Metadata tables; `questions` is later joined onto the interactions in `fe`.
lectures, questions, example_test = (
    pd.read_csv(f"{DIR}{csv_name}")
    for csv_name in ("lectures.csv", "questions.csv", "example_test.csv")
)

# Preprocessing

In [3]:
# Drop rows whose answered_correctly is -1 (presumably lecture rows — confirm
# against the data description), renumber the index, and remove the two
# columns that are not used anywhere below.
train = (
    train[train['answered_correctly'].ne(-1)]
    .reset_index(drop=True)
    .drop(columns=["row_id", "timestamp"])
)
# A missing explanation flag is treated as "no explanation".
train['prior_question_had_explanation'] = (
    train['prior_question_had_explanation'].fillna(False).astype(bool)
)

# Per-user accuracy and number of answered questions, indexed by user_id.
# NOTE(review): these statistics are computed over every answered row, including
# the rows later used for training/validation, so each mean contains the row's
# own label.
user_answers_df = (
    train.groupby('user_id')['answered_correctly']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'mean_user_accuracy', 'count': 'questions_answered'})
)

# Per-question accuracy and number of times it was asked, indexed by content_id.
content_answers_df = (
    train.groupby('content_id')['answered_correctly']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'mean_accuracy', 'count': 'question_asked'})
)

# Disabled experiment: per (user, content) statistics.
# user_content_answers_df = train.groupby(['user_id', 'content_id']).agg({'answered_correctly': ['mean', 'count']}).copy()
# user_content_answers_df.columns = ['mean_user_content_accuracy', 'content_questions_answered']

In [4]:
# Integer-encode the boolean explanation flag. LabelEncoder assigns codes in
# sorted class order, so False -> 0 and True -> 1 when both values occur.
# The fitted encoder `le` is kept for the prediction cell.
le = preprocessing.LabelEncoder()
le.fit(train["prior_question_had_explanation"])
train["prior_question_had_explanation"] = le.transform(train["prior_question_had_explanation"])

In [5]:
# Keep only the rows from position 90,000,000 onward; everything before is
# discarded for modelling (the aggregate tables above were already built from
# the full frame). Presumably done to bound memory/training time — confirm.
train = train.iloc[90_000_000:]

In [6]:
train.head()

Unnamed: 0,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
90000000,1947785375,5581,0,234,2,1,12000,1
90000001,1947785375,5600,0,235,0,1,17000,1
90000002,1947785375,5591,0,236,3,1,18000,1
90000003,1947785375,8426,0,237,2,0,19000,1
90000004,1947785375,4299,0,238,0,0,19000,1


# Feature engineering

In [7]:
def fe(df):
    """Join the precomputed statistics and the question part onto `df`.

    Uses the module-level tables `user_answers_df` (keyed by user_id),
    `content_answers_df` (keyed by content_id) and `questions`.
    All joins are left joins, so every row of `df` is kept; rows without a
    match get NaN in the joined columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least `user_id` and `content_id` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns `mean_user_accuracy`,
        `questions_answered`, `mean_accuracy`, `question_asked`,
        `question_id`, `part` and `mean_diff1` (user accuracy minus
        content accuracy).
    """
    enriched = (
        df.merge(user_answers_df, how='left', on='user_id')
        .merge(content_answers_df, how='left', on='content_id')
        # https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names
        .merge(
            questions[["question_id", "part"]],
            how="left",
            left_on=['content_id'],
            right_on=['question_id'],
        )
    )
    # Disabled experiment:
    # df = df.merge(user_content_answers_df, how = 'left', on = ['user_id', 'content_id'])
    enriched['mean_diff1'] = enriched['mean_user_accuracy'] - enriched['mean_accuracy']
    # df['mean_diff2'] = df['mean_accuracy'] - df['mean_user_content_accuracy']
    return enriched

train = fe(train)

In [8]:
# Replace every remaining NaN with 0.5, then order the rows by user_id so each
# user's rows are adjacent.
train = train.fillna(0.5).sort_values('user_id')

In [9]:
train.head()

Unnamed: 0,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,questions_answered,mean_accuracy,question_asked,question_id,part,mean_diff1
0,1947785375,5581,0,234,2,1,12000,1,0.703226,1395,0.571516,50646,5581,5,0.13171
752,1947785375,3919,0,821,3,0,20000,1,0.703226,1395,0.619549,4839,3919,5,0.083676
753,1947785375,8701,0,822,3,1,15000,1,0.703226,1395,0.769857,4419,8701,5,-0.066632
754,1947785375,6429,0,823,3,1,17000,1,0.703226,1395,0.526094,38841,6429,5,0.177132
755,1947785375,5618,0,824,2,1,9000,1,0.703226,1395,0.582133,16936,5618,5,0.121093


In [10]:
# Name of the label column the model is trained to predict.
target = 'answered_correctly'
# Feature columns passed to the model; all of them are produced by `fe`.
columns = ['mean_user_accuracy', 'questions_answered', 'mean_accuracy', 'question_asked','mean_diff1', 'part']

In [11]:
# Keep only the model features, the label, and user_id (needed as the grouping
# key for cross-validation); everything else is dropped.
keep_columns = set(columns) | {target, "user_id"}
drop_columns = [name for name in train.columns if name not in keep_columns]
train = train.drop(columns=drop_columns)

# Modelling

In [12]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test splits that keep groups intact and labels balanced.

    Groups are assigned greedily, one at a time, to the fold that minimises the
    mean (over labels) standard deviation of each label's share across folds.
    Groups with the most uneven label counts are placed first.

    Parameters
    ----------
    X : any
        Unused; kept so the signature mirrors scikit-learn style splitters.
    y : iterable of int
        Non-negative integer class labels (they are used as array indices).
    groups : re-iterable of hashable
        Group id for each sample, aligned with ``y``. It is iterated several
        times, so it must not be a one-shot generator.
    k : int
        Number of folds.
    seed : int or None
        Seed for the shuffle that breaks ties between equally uneven groups.

    Yields
    ------
    (list of int, list of int)
        Positional indices of the training and the test samples of one fold.
        No group appears on both sides of a split.
    """
    labels_num = int(np.max(y)) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Only labels that actually occur can be balanced. A label value in
    # 0..max(y) that never appears has a total count of 0; dividing by it
    # would turn every fold score into NaN, and since NaN comparisons are
    # always False every group would silently end up in fold 0.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        """Score the label balance if ``y_counts`` were added to ``fold``."""
        # Tentatively add the group, measure the spread, then undo the change.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[f][label] / y_distr[label] for f in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # sorted() is stable, so the shuffle above decides the order of ties.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for fold in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, fold)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = fold
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        train_groups = all_groups - groups_per_fold[fold]
        test_groups = groups_per_fold[fold]

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [13]:
def modelling_lgb(X_train, y_train, n_folds=5):
    """Train one LightGBM binary classifier per stratified group fold.

    Folds come from `stratified_group_k_fold`, grouped by `user_id`, so no user
    appears in both the training and the validation part of a fold. Prints the
    fold number while training and the out-of-fold ROC AUC at the end.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature columns plus a `user_id` column. `user_id` is used only as the
        grouping key and is removed before fitting. The frame is not modified.
    y_train : pandas.Series
        Binary labels, positionally aligned with `X_train`.
    n_folds : int, default 5
        Number of cross-validation folds (and therefore of returned models).

    Returns
    -------
    list
        The boosters returned by `lgb.train`, one per fold.
    """
    params = {'objective': 'binary', "metric": 'auc','eval_metric':'auc', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5} 

    groups = np.array(X_train.user_id.values)

    models = []
    # Out-of-fold predictions, written at the positions of each validation fold.
    valid = np.zeros(X_train.shape[0])
    splits = stratified_group_k_fold(X_train, y_train, groups, k=n_folds, seed=0)
    for i, (train_index, test_index) in enumerate(splits):
        print("Fold {}".format(i+1))
        # Build new frames without user_id instead of dropping the column in
        # place on an iloc slice of the caller's frame.
        X_train2 = X_train.iloc[train_index].drop(columns="user_id")
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index].drop(columns="user_id")
        y_valid2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect callbacks
        # instead — confirm against the installed version.
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=1000,early_stopping_rounds=10,verbose_eval = 100) 

        valid[test_index] = clf.predict(X_valid2, num_iteration = clf.best_iteration)

        models.append(clf)

    score = roc_auc_score(y_train, valid)
    print("Overall ROC AUC: ", score)

    return models
models = modelling_lgb(train[columns + ["user_id"]], train[target])

Fold 1
Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.751762	valid_1's auc: 0.750278
[200]	training's auc: 0.753901	valid_1's auc: 0.752517
[300]	training's auc: 0.755087	valid_1's auc: 0.753783
[400]	training's auc: 0.756096	valid_1's auc: 0.754795
[500]	training's auc: 0.756743	valid_1's auc: 0.755424
[600]	training's auc: 0.75716	valid_1's auc: 0.75583
[700]	training's auc: 0.757438	valid_1's auc: 0.756095
[800]	training's auc: 0.757639	valid_1's auc: 0.756272
[900]	training's auc: 0.757791	valid_1's auc: 0.756399
[1000]	training's auc: 0.757918	valid_1's auc: 0.756518
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.757918	valid_1's auc: 0.756518
Fold 2
Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.751566	valid_1's auc: 0.75131
[200]	training's auc: 0.753681	valid_1's auc: 0.75344
[300]	training's auc: 0.75487	valid_1's auc: 0.754701
[400]	training's auc: 0.755882	valid_1's auc: 0.7

# Prediction

In [14]:
# Release the training frame before inference: `fe` reads only the aggregate
# tables and `questions`, and the fitted models are held in `models`.
del train

In [15]:
env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    # Apply the same feature joins that were applied to the training data.
    test_df = fe(test_df)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    # Only rows with content_type_id == 0 are predicted and submitted.
    test_df = test_df.loc[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = test_df.fillna(value = 0.5)
    # Reuse the encoder fitted on the training data. Refitting on each batch
    # (fit_transform) would map a batch containing a single flag value to 0
    # regardless of whether that value is True or False.
    test_df["prior_question_had_explanation"] = le.transform(test_df["prior_question_had_explanation"])

    # Average the predictions of the per-fold models.
    fold_preds = [
        model.predict(test_df[columns], num_iteration=model.best_iteration)
        for model in models
    ]
    test_df['answered_correctly'] = sum(fold_preds) / len(fold_preds)
    env.predict(test_df[['row_id', 'answered_correctly']])