- 3rd modelling
- add new features
- remove answered_correctly -1

In [1]:
import sys
import copy
import random
import feather
import warnings
import numpy as np
import pandas as pd
from time import time
import lightgbm as lgb
from collections import Counter
from sklearn import preprocessing
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import riiideducation

pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",2000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
DIR = "../input/riiid-test-answer-prediction/"
train = pd.read_feather("../input/riid-train/train.feather")
lectures = pd.read_csv(DIR+"lectures.csv")
questions = pd.read_csv(DIR+"questions.csv")
example_test = pd.read_csv(DIR+"example_test.csv")

# preprocess

In [3]:
train.drop(["row_id", 'user_answer'], axis=1, inplace=True)
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(value = False).astype(bool)

user_answers_df = train.groupby('user_id').agg({'answered_correctly': ['mean']}).copy()
user_answers_df.columns = ['mean_user_accuracy']

content_answers_df = train.groupby('content_id').agg({'answered_correctly': ['mean']}).copy()
content_answers_df.columns = ['mean_accuracy_by_content']

#user_content_answers_df = train.groupby(['user_id', 'content_id']).agg({'answered_correctly': ['mean', 'count']}).copy()
#user_content_answers_df.columns = ['mean_user_content_accuracy', 'content_questions_answered']

In [4]:
le = preprocessing.LabelEncoder()
train["prior_question_had_explanation"] = le.fit_transform(train["prior_question_had_explanation"])

In [5]:
train = train.iloc[90000000:,:]
train.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
90000000,9699003707,1911793149,1797,0,817,1,13000,1
90000001,9699209307,1911793149,1899,0,818,0,15666,1
90000002,9699209307,1911793149,1900,0,818,1,15666,1
90000003,9699209307,1911793149,1898,0,818,1,15666,1
90000004,9699262735,1911793149,2128,0,819,1,14666,1
90000005,9699262735,1911793149,2127,0,819,0,14666,1
90000006,9699262735,1911793149,2126,0,819,1,14666,1
90000007,9699589757,1911793149,19184,1,567,-1,4294967295,0
90000008,9699677797,1911793149,3251,0,820,1,12000,1
90000009,9699677797,1911793149,3250,0,820,0,12000,1


# Feature engineering

In [6]:
def fe(df):
    df = df.merge(user_answers_df, how = 'left', on = 'user_id')
    df = df.merge(content_answers_df, how = 'left', on = 'content_id')
    
    # https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names
    #df = df.merge(questions[["question_id", "part", "tags"]], how="left", left_on=['content_id'], right_on=['question_id'])
    #df.drop("question_id", axis=1, inplace=True)
    return df
        
train = fe(train)

In [7]:
train = train.sort_values(['user_id', 'timestamp'])
train.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,mean_accuracy_by_content
0,9699003707,1911793149,1797,0,817,1,13000,1,0.616754,0.876318
1,9699209307,1911793149,1899,0,818,0,15666,1,0.616754,0.670221
2,9699209307,1911793149,1900,0,818,1,15666,1,0.616754,0.806936
3,9699209307,1911793149,1898,0,818,1,15666,1,0.616754,0.4786
4,9699262735,1911793149,2128,0,819,1,14666,1,0.616754,0.915989
5,9699262735,1911793149,2127,0,819,0,14666,1,0.616754,0.847007
6,9699262735,1911793149,2126,0,819,1,14666,1,0.616754,0.492114
7,9699589757,1911793149,19184,1,567,-1,4294967295,0,0.616754,-1.0
8,9699677797,1911793149,3251,0,820,1,12000,1,0.616754,0.785269
9,9699677797,1911793149,3250,0,820,0,12000,1,0.616754,0.726427


In [8]:
#https://www.kaggle.com/taichin/final-my-model
def get_data(df, test_set=False):
    
    timediff = np.array([])
    lec_num = np.array([])
    
    if test_set == False:
        for i, (ins_id, user_sample) in tqdm(enumerate(df.groupby('user_id', sort=False)), total=train.user_id.nunique(), desc='user_id', position=0):
            timediff = np.concatenate([timediff, user_sample["timestamp"].diff(1).values])
            lec_num = np.concatenate([lec_num, user_sample["content_type_id"].cumsum().values])
    else:
        timediff = np.concatenate([timediff, df["timestamp"].diff(1).values])
        lec_num = np.concatenate([lec_num, df["content_type_id"].cumsum().values])  
        
    df["timediff"] = timediff
    df["lec_num"] = lec_num
    df = df.loc[df['answered_correctly'] != -1].reset_index(drop=True)
    df.drop(["timestamp","content_type_id"], axis=1, inplace=True)
    df.fillna(value = 0.5, inplace = True)
    return df
train = get_data(train)

HBox(children=(FloatProgress(value=0.0, description='user_id', max=43524.0, style=ProgressStyle(description_wi…




In [9]:
train.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,mean_accuracy_by_content,timediff,lec_num
0,1911793149,1797,817,1,13000,1,0.616754,0.876318,0.5,0.0
1,1911793149,1899,818,0,15666,1,0.616754,0.670221,205600.0,0.0
2,1911793149,1900,818,1,15666,1,0.616754,0.806936,0.0,0.0
3,1911793149,1898,818,1,15666,1,0.616754,0.4786,0.0,0.0
4,1911793149,2128,819,1,14666,1,0.616754,0.915989,53428.0,0.0


In [10]:
target = 'answered_correctly'
columns = ['mean_user_accuracy', 'mean_accuracy_by_content', 'prior_question_had_explanation', 'timediff', 'lec_num']

In [11]:
drop_columns = [i for i in train.columns if i not in columns + [target] + ["user_id"]] 
train.drop(drop_columns, axis=1, inplace=True)

# modelling

In [12]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [13]:
def modelling_lgb(X_train, y_train):
    params = {'objective': 'binary', "metric": 'auc','eval_metric':'auc', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5} 
    
    groups = np.array(X_train.user_id.values)
    n_folds=5
    
    models = []
    valid = np.zeros([X_train.shape[0]])
    for i , (train_index, test_index) in enumerate(stratified_group_k_fold(X_train, y_train, groups, k=n_folds, seed=0)):
        print("Fold {}".format(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2.drop("user_id", axis=1, inplace=True)

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        X_valid2.drop("user_id", axis=1, inplace=True)

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=1000,early_stopping_rounds=10,verbose_eval = 100) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
        
        models.append(clf) 
            
    score = roc_auc_score(y_train, valid)
    print("Overall ROC AUC: ", score)
        
    return models
models = modelling_lgb(train[columns + ["user_id"]], train[target])

Fold 1
Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.748179	valid_1's auc: 0.749378
[200]	training's auc: 0.753205	valid_1's auc: 0.75431
[300]	training's auc: 0.75588	valid_1's auc: 0.756953
[400]	training's auc: 0.757577	valid_1's auc: 0.75867
[500]	training's auc: 0.758447	valid_1's auc: 0.759557
[600]	training's auc: 0.75896	valid_1's auc: 0.760031
[700]	training's auc: 0.759365	valid_1's auc: 0.760415
[800]	training's auc: 0.759649	valid_1's auc: 0.760666
[900]	training's auc: 0.759885	valid_1's auc: 0.760861
[1000]	training's auc: 0.760077	valid_1's auc: 0.760999
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.760077	valid_1's auc: 0.760999
Fold 2
Training until validation scores don't improve for 10 rounds
[100]	training's auc: 0.748469	valid_1's auc: 0.7488
[200]	training's auc: 0.753389	valid_1's auc: 0.753747
[300]	training's auc: 0.75613	valid_1's auc: 0.756463
[400]	training's auc: 0.757771	valid_1's auc: 0.758

# prediction

In [14]:
del train

In [15]:
env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    y_preds = []
    test_df = fe(test_df)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df = get_data(test_df, test_set=True)
    test_df["prior_question_had_explanation"] = le.fit_transform(test_df["prior_question_had_explanation"])

    for model in models:
        y_pred = model.predict(test_df[columns], num_iteration=model.best_iteration)
        y_preds.append(y_pred)

    y_preds = sum(y_preds) / len(y_preds)
    test_df['answered_correctly'] = y_preds
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

KeyError: 'answered_correctly'