- preparation to extract test data

In [1]:
import sys
import copy
import random
import feather
import warnings
import numpy as np
import pandas as pd
from time import time
import lightgbm as lgb
from collections import Counter
from sklearn import preprocessing
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import riiideducation

pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",2000)
np.set_printoptions(precision=8)
warnings.filterwarnings("ignore")

In [2]:
DIR = "../input/riiid-test-answer-prediction/"
train = pd.read_feather("../input/riid-train/train.feather")
lectures = pd.read_csv(DIR+"lectures.csv")
questions = pd.read_csv(DIR+"questions.csv")
example_test = pd.read_csv(DIR+"example_test.csv")

In [3]:
test = pd.read_csv(DIR+"example_test.csv")
test.head()

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


# preprocess

In [4]:
train.drop(["row_id", 'user_answer'], axis=1, inplace=True)
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(value = False).astype(bool)

user_answers_df = train.groupby('user_id').agg({'answered_correctly': ['mean']}).copy()
user_answers_df.columns = ['mean_user_accuracy']

content_answers_df = train.groupby('content_id').agg({'answered_correctly': ['mean']}).copy()
content_answers_df.columns = ['mean_accuracy_by_content']

#user_content_answers_df = train.groupby(['user_id', 'content_id']).agg({'answered_correctly': ['mean', 'count']}).copy()
#user_content_answers_df.columns = ['mean_user_content_accuracy', 'content_questions_answered']

In [5]:
le = preprocessing.LabelEncoder()
train["prior_question_had_explanation"] = le.fit_transform(train["prior_question_had_explanation"])

In [6]:
train = train.iloc[100000000:,:]
train.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
100000000,4870792027,2122503776,117,0,137,1,23000,1
100000001,4870826073,2122503776,10588,0,138,1,24000,1
100000002,4870873332,2122503776,67,0,139,0,25000,1
100000003,4870911284,2122503776,97,0,140,1,29000,1
100000004,4870944944,2122503776,7883,0,141,1,20000,1
100000005,4870977342,2122503776,10546,0,142,1,26000,1
100000006,4871005961,2122503776,158,0,143,1,25000,1
100000007,4871036085,2122503776,42,0,144,1,21000,1
100000008,4871068852,2122503776,162,0,145,1,22000,1
100000009,4871104763,2122503776,10393,0,146,1,25000,1


In [7]:
train.shape, len(train.user_id.unique())

((1230332, 8), 4688)

# Feature engineering

In [8]:
def fe(df):
    df = df.merge(user_answers_df, how = 'left', on = 'user_id')
    df = df.merge(content_answers_df, how = 'left', on = 'content_id')
    
    # https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names
    #df = df.merge(questions[["question_id", "part", "tags"]], how="left", left_on=['content_id'], right_on=['question_id'])
    #df.drop("question_id", axis=1, inplace=True)
    return df
        
train = fe(train)

In [9]:
train = train.sort_values(['user_id', 'timestamp'])
train.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,mean_accuracy_by_content
0,4870792027,2122503776,117,0,137,1,23000,1,0.672065,0.934104
1,4870826073,2122503776,10588,0,138,1,24000,1,0.672065,0.818406
2,4870873332,2122503776,67,0,139,0,25000,1,0.672065,0.905934
3,4870911284,2122503776,97,0,140,1,29000,1,0.672065,0.898646
4,4870944944,2122503776,7883,0,141,1,20000,1,0.672065,0.930138
5,4870977342,2122503776,10546,0,142,1,26000,1,0.672065,0.861505
6,4871005961,2122503776,158,0,143,1,25000,1,0.672065,0.94589
7,4871036085,2122503776,42,0,144,1,21000,1,0.672065,0.974038
8,4871068852,2122503776,162,0,145,1,22000,1,0.672065,0.868617
9,4871104763,2122503776,10393,0,146,1,25000,1,0.672065,0.641516


In [10]:
#https://www.kaggle.com/taichin/final-my-model
def get_data(df, test_set=False):
    
    timediff = np.array([])
    lec_num = np.array([])
    
    #if test_set == False:
    #    for i, (ins_id, user_sample) in tqdm(enumerate(df.groupby('user_id', sort=False)), total=train.user_id.nunique(), desc='user_id', position=0):
            #timediff = np.concatenate([timediff, user_sample["timestamp"].diff(1).values])
    #        lec_num = np.concatenate([lec_num, user_sample["content_type_id"].cumsum().values])   
    #else:
        #timediff = np.concatenate([timediff, df["timestamp"].diff(1).values])
    #    lec_num = np.concatenate([lec_num, df["content_type_id"].cumsum().values])  

    #df["timediff"] = timediff
    #df["lec_num"] = lec_num
    if test_set == False:
        df = df.loc[df['answered_correctly'] != -1].reset_index(drop=True)
    df.drop(["timestamp"], axis=1, inplace=True)
    df.fillna(value = 0.5, inplace = True)
    return df
train = get_data(train)

In [11]:
train.head()

Unnamed: 0,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,mean_accuracy_by_content
0,2122503776,117,0,137,1,23000,1,0.672065,0.934104
1,2122503776,10588,0,138,1,24000,1,0.672065,0.818406
2,2122503776,67,0,139,0,25000,1,0.672065,0.905934
3,2122503776,97,0,140,1,29000,1,0.672065,0.898646
4,2122503776,7883,0,141,1,20000,1,0.672065,0.930138


In [12]:
target = 'answered_correctly'
columns = ['mean_user_accuracy', 'content_id', 'mean_accuracy_by_content', 'prior_question_had_explanation']#, 'lec_num']

In [13]:
drop_columns = [i for i in train.columns if i not in columns + [target] + ["user_id"]] 
train.drop(drop_columns, axis=1, inplace=True)

# modelling

In [14]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [15]:
categoricals = ["content_id"]
def modelling_lgb(X_train, y_train):
    params = {'objective': 'binary', "metric": 'auc','eval_metric':'auc', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5} 
    
    groups = np.array(X_train.user_id.values)
    n_folds=5
    
    models = []
    valid = np.zeros([X_train.shape[0]])
    for i , (train_index, test_index) in enumerate(stratified_group_k_fold(X_train, y_train, groups, k=n_folds, seed=0)):
        print("Fold {}".format(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        X_train2.drop("user_id", axis=1, inplace=True)

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        X_valid2.drop("user_id", axis=1, inplace=True)

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=1,early_stopping_rounds=10,verbose_eval = 100, categorical_feature=categoricals) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
        
        models.append(clf) 
            
    score = roc_auc_score(y_train, valid)
    print("Overall ROC AUC: ", score)
        
    return models
models = modelling_lgb(train[columns + ["user_id"]], train[target])

Fold 1
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.731462	valid_1's auc: 0.733424
Fold 2
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.732656	valid_1's auc: 0.730428
Fold 3
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.732956	valid_1's auc: 0.726387
Fold 4
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.732145	valid_1's auc: 0.727525
Fold 5
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.731453	valid_1's auc: 0.733089
Overall ROC AUC:  0.7309785406759743


# prediction

In [16]:
del train

In [17]:
env = riiideducation.make_env()
iter_test = env.iter_test()

count = 0
for (test_df, sample_prediction_df) in iter_test:
    ##########################################
    if count == 0:
        test = test_df.copy()
        sub = sample_prediction_df.copy()
    else:
        test = pd.concat([test, test_df])
        sub = pd.concat([sub, sample_prediction_df])
    count += 1
    ###########################################
    
    y_preds = []
    test_df = fe(test_df)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df = get_data(test_df, test_set=True)
    test_df["prior_question_had_explanation"] = le.fit_transform(test_df["prior_question_had_explanation"])

    for model in models:
        y_pred = model.predict(test_df[columns], num_iteration=model.best_iteration)
        y_preds.append(y_pred)

    y_preds = sum(y_preds) / len(y_preds)
    test_df['answered_correctly'] = y_preds
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [18]:
test_df.to_csv("test.csv", index=False)
sub.to_csv("sub.csv", index=False)