- add try count

In [1]:
import sys
import copy
import random
import feather
import warnings
import numpy as np
import pandas as pd
from time import time
import lightgbm as lgb
from collections import Counter
from sklearn import preprocessing
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt 
from tqdm._tqdm_notebook import tqdm_notebook

import riiideducation
warnings.filterwarnings("ignore")
tqdm_notebook.pandas(desc="progress: ")

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`


In [2]:
DIR = "../input/riiid-test-answer-prediction/"
train = pd.read_feather("../input/riid-train/train.feather")
lectures = pd.read_csv(DIR+"lectures.csv")
questions = pd.read_csv(DIR+"questions.csv")

In [3]:
lecture_id = list(lectures.lecture_id.unique())
lectures['type_of'] = lectures['type_of'].replace('solving question', 'solving_question')
lectures = pd.get_dummies(lectures, columns=['type_of'])

types_columns = [column for column in lectures.columns if column.startswith('type_of_')]

In [4]:
train.shape

(101230332, 10)

# preprocess

In [5]:
# executed in advance
train = train.sort_values(['user_id', 'timestamp']).reset_index(drop=True)
train.drop(["row_id", "user_answer", "timestamp"], axis=1, inplace=True)

In [6]:
print("consider lecture data")
user_lec_df = train.groupby('user_id').agg({'content_type_id': ['mean']}).copy()
user_lec_df.columns = ["lec_ratio"]

train["lec"] = train.groupby(["user_id"])['content_type_id'].progress_apply(lambda x: x.shift(1))
train.drop(["content_type_id"], axis=1, inplace=True)

#train = train.merge(lectures[["lecture_id"]+types_columns], how="left", left_on=['content_id'], right_on=['lecture_id']).drop(columns='lecture_id')
#user_lec_type = train.groupby("user_id").agg("mean")[types_columns]
#user_lec_type = user_lec_type.fillna(0)

print("remove lecture data")
train = train.loc[train['answered_correctly'] != -1].reset_index(drop=True)

print("merge question data")
# https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names
train = train.merge(questions[["question_id", "part"]], how="left", left_on=['content_id'], right_on=['question_id'], right_index=True).reset_index(drop=True)
train.drop(["question_id"], axis=1, inplace=True)

print("groupby calculation")
user_answers_df = train.groupby('user_id').agg({'answered_correctly': ['mean']}).copy()
user_answers_df.columns = ["acc_before_ans"]

user_answers_past2 = pd.DataFrame(train.groupby("user_id").tail(2).groupby("user_id").agg("mean")["answered_correctly"])
user_answers_past2.columns = ["mean_of_last_2_answered_correctly_by_user_id"]
user_answers_past3 = pd.DataFrame(train.groupby("user_id").tail(3).groupby("user_id").agg("mean")["answered_correctly"])
user_answers_past3.columns = ["mean_of_last_3_answered_correctly_by_user_id"]

user_part_accuracy = train.groupby(['user_id', 'part']).agg({'answered_correctly': ['mean']}).copy().unstack()
user_part_accuracy.columns = ["part1_acc", "part2_acc", "part3_acc", "part4_acc", "part5_acc", "part6_acc", "part7_acc"]

user_lec_ratio = train.groupby(['user_id']).agg({'lec': ['mean']}).copy()
user_lec_ratio.columns = ["lec"]

user_try_count = train.groupby("user_id")["row_id"].count().reset_index()
user_try_count.columns = ["user_id", "max_count"]

mean_accuracy_by_part = np.array([0.74503238, 0.70869406, 0.70145593, 0.63099758, 0.61008792, 0.66938835, 0.65960112])

consider lecture data


HBox(children=(FloatProgress(value=0.0, description='progress: ', max=393656.0, style=ProgressStyle(descriptio…


remove lecture data
merge question data
groupby calculation


KeyError: 'Column not found: row_id'

In [7]:
tmp = train.groupby("content_id")["answered_correctly"].mean().reset_index()
difficult_content = list(tmp[tmp.answered_correctly<0.3]["content_id"])
del tmp

In [8]:
train = train.iloc[90007298:,:]
train['task_container_id'] = train.groupby('user_id')['task_container_id'].transform(lambda x: pd.factorize(x)[0]).astype('int16')
train["lec"] = train["lec"].fillna(0)
print(train.shape, len(train.user_id.unique()))
train.head(10)

(9264002, 8) 36706


Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lec,part
90007298,1947975793,6603,0,1,20500,True,0.0,6
90007299,1947975793,6601,0,1,20500,True,0.0,6
90007300,1947975793,6602,0,1,20500,True,0.0,6
90007301,1947975793,10076,1,1,6000,True,0.0,6
90007302,1947975793,10077,1,0,6000,True,0.0,6
90007303,1947975793,10075,1,0,6000,True,0.0,6
90007304,1947975793,10074,1,0,6000,True,0.0,6
90007305,1947975793,10075,2,0,22750,True,1.0,6
90007306,1947975793,10076,2,1,22750,True,0.0,6
90007307,1947975793,10074,2,0,22750,True,0.0,6


# Feature engineering

In [9]:
%%time
def get_data(df, test_set=False):
    if test_set == False:
        
        # cumsum correct answer by user_id
        tmp1 = df.groupby(['user_id'])['answered_correctly'].apply(lambda x: pd.concat([pd.DataFrame([0]), x.cumsum()[:-1]])).reset_index(drop=True).values
        tmp1 = np.array(tmp1, dtype=float)
        # cumsum total try by user_id
        tmp2 = df.groupby(['user_id']).cumcount().values
        tmp2 = np.array(tmp2, dtype=float).reshape(-1,1)
        # https://stackoverflow.com/questions/26248654/how-to-return-0-with-divide-by-zero
        correct_rate = np.divide(tmp1, tmp2, out=np.ones_like(tmp1)*-1, where=tmp2!=0)
        
        # cumsum lec count by user_id 
        tmp1 = df.groupby(['user_id'])['lec'].apply(lambda x: x.cumsum()).reset_index(drop=True).values
        tmp1 = np.array(tmp1, dtype=float).reshape(-1,1)
        lec_rate = np.divide(tmp1, tmp2, out=np.zeros_like(tmp1), where=tmp2!=0)
        
        # cumsum of each part trial by user
        part1_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part2_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==2).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part3_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==3).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part4_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==4).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part5_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==5).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part6_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==6).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part7_cumsum = df.groupby(["user_id"])['part'].apply(lambda x: pd.DataFrame([0] + (x==7).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values

        # logical bool if part is matched by user
        part1_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==1))
        part2_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==2))
        part3_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==3))
        part4_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==4))
        part5_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==5))
        part6_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==6))
        part7_g = pd.DataFrame(df.groupby(["user_id"])['part'].apply(lambda x: x==7))
        # logical bool if answer is correct by user
        acc_g = pd.DataFrame(df.groupby(["user_id"])['answered_correctly'].apply(lambda x: x==1))

        # cumsum of correct num per part by user
        part1_acc_cumsum = np.logical_and(acc_g.values ,part1_g.values)
        part2_acc_cumsum = np.logical_and(acc_g.values ,part2_g.values)
        part3_acc_cumsum = np.logical_and(acc_g.values ,part3_g.values)
        part4_acc_cumsum = np.logical_and(acc_g.values ,part4_g.values)
        part5_acc_cumsum = np.logical_and(acc_g.values ,part5_g.values)
        part6_acc_cumsum = np.logical_and(acc_g.values ,part6_g.values)
        part7_acc_cumsum = np.logical_and(acc_g.values ,part7_g.values)

        df["part1_and_ok"] = part1_acc_cumsum
        df["part2_and_ok"] = part2_acc_cumsum
        df["part3_and_ok"] = part3_acc_cumsum
        df["part4_and_ok"] = part4_acc_cumsum 
        df["part5_and_ok"] = part5_acc_cumsum
        df["part6_and_ok"] = part6_acc_cumsum
        df["part7_and_ok"] = part7_acc_cumsum
        
        part1_acc_cumsum = df.groupby(["user_id"])['part1_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part2_acc_cumsum = df.groupby(["user_id"])['part2_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part3_acc_cumsum = df.groupby(["user_id"])['part3_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part4_acc_cumsum = df.groupby(["user_id"])['part4_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part5_acc_cumsum = df.groupby(["user_id"])['part5_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part6_acc_cumsum = df.groupby(["user_id"])['part6_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values
        part7_acc_cumsum = df.groupby(["user_id"])['part7_and_ok'].apply(lambda x: pd.DataFrame([0] + (x==1).cumsum().tolist()[:-1])).reset_index(drop=True)[0].values

        part1_acc_cumsum = np.array(part1_acc_cumsum, dtype=float)
        part2_acc_cumsum = np.array(part2_acc_cumsum, dtype=float)
        part3_acc_cumsum = np.array(part3_acc_cumsum, dtype=float)
        part4_acc_cumsum = np.array(part4_acc_cumsum, dtype=float)
        part5_acc_cumsum = np.array(part5_acc_cumsum, dtype=float)
        part6_acc_cumsum = np.array(part6_acc_cumsum, dtype=float)
        part7_acc_cumsum = np.array(part7_acc_cumsum, dtype=float)
        
        part1_acc_rate = np.divide(part1_acc_cumsum, part1_cumsum, out=np.ones_like(part1_acc_cumsum)*-1, where=part1_cumsum!=0)
        part2_acc_rate = np.divide(part2_acc_cumsum, part2_cumsum, out=np.ones_like(part2_acc_cumsum)*-1, where=part2_cumsum!=0)
        part3_acc_rate = np.divide(part3_acc_cumsum, part3_cumsum, out=np.ones_like(part3_acc_cumsum)*-1, where=part3_cumsum!=0)
        part4_acc_rate = np.divide(part4_acc_cumsum, part4_cumsum, out=np.ones_like(part4_acc_cumsum)*-1, where=part4_cumsum!=0)
        part5_acc_rate = np.divide(part5_acc_cumsum, part5_cumsum, out=np.ones_like(part5_acc_cumsum)*-1, where=part5_cumsum!=0)
        part6_acc_rate = np.divide(part6_acc_cumsum, part6_cumsum, out=np.ones_like(part6_acc_cumsum)*-1, where=part6_cumsum!=0)
        part7_acc_rate = np.divide(part7_acc_cumsum, part7_cumsum, out=np.ones_like(part7_acc_cumsum)*-1, where=part7_cumsum!=0)
                    
        df["acc_before_ans"] = correct_rate
        df["part1_acc"] = part1_acc_rate
        df["part2_acc"] = part2_acc_rate
        df["part3_acc"] = part3_acc_rate
        df["part4_acc"] = part4_acc_rate
        df["part5_acc"] = part5_acc_rate
        df["part6_acc"] = part6_acc_rate
        df["part7_acc"] = part7_acc_rate  
        df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(value = False).astype(bool)
        df["lec"] = lec_rate
        df["count"] = 1
        df["count"] = df.groupby("user_id")["count"].cumsum()
        
    else:
        df = df.merge(user_answers_df, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True) 
        df = df.merge(user_part_accuracy, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True)
        df = df.merge(user_answers_past2, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True)
        df = df.merge(user_answers_past3, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True)
        df = df.merge(user_lec_ratio, how = 'left', left_on = 'user_id', right_index=True).reset_index(drop=True)
        df = df.merge(user_try_count, how="left", on="user_id", right_index=True).reset_index(drop=True)
        df["max_count"] = df["max_count"].fillna(0)
        df["count"] = 1
        df["count"] = df.groupby("user_id")["count"].cumsum()
        df["count"] += df["max_count"]
        df.drop("max_count", axis=1, inplace=True)
        df.drop(["timestamp"], axis=1, inplace=True)
    
    df["difficult_content"] = df["content_id"].apply(lambda x: 1 if x in difficult_content else 0)
    df.fillna(value = 0.5, inplace = True) #0?
    return df
train = get_data(train)

CPU times: user 13min 42s, sys: 11.9 s, total: 13min 53s
Wall time: 13min 50s


In [10]:
# target encoding + moving average not to cause target leakage
AGGS = [
    ('answered_correctly', 2, ['user_id'], ['mean']),
    ('answered_correctly', 3, ['user_id'], ['mean']),
]

for on, lag, by, hows in AGGS:
    agg = train.groupby(by)[on].apply(lambda x: (
        x.shift(1).rolling(window=lag, min_periods=1).agg(hows)
    ))
    agg = agg.rename(columns={
        how: f'{how}_of_last_{lag}_{on}_by_' + '_and_'.join(by)
        for how in hows
    })

    train = train.join(agg)

In [11]:
target = 'answered_correctly'
columns = ["acc_before_ans", 'content_id', 'part', 'prior_question_had_explanation', 'prior_question_elapsed_time',
          "part1_acc",  "part2_acc", "part3_acc", "part4_acc", "part5_acc", "part6_acc", "part7_acc",
           "mean_of_last_2_answered_correctly_by_user_id", "mean_of_last_3_answered_correctly_by_user_id", "lec",
          "difficult_content", "count"] 
others = ["user_id", 'task_container_id']

drop_columns = [i for i in train.columns if i not in columns + [target] + others]
train.drop(drop_columns, axis=1, inplace=True)

In [12]:
train.shape

(9264002, 20)

In [13]:
train.columns

Index(['user_id', 'content_id', 'task_container_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation', 'lec',
       'part', 'acc_before_ans', 'part1_acc', 'part2_acc', 'part3_acc',
       'part4_acc', 'part5_acc', 'part6_acc', 'part7_acc', 'count',
       'difficult_content', 'mean_of_last_2_answered_correctly_by_user_id',
       'mean_of_last_3_answered_correctly_by_user_id'],
      dtype='object')

# modelling

In [14]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)
    
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
        
    @property
    def before_iteration(self):
        # コールバックは各イテレーションの後に実行する
        return False

In [15]:
categoricals = ["content_id", "part", "difficult_content"]
params = {'objective': 'binary', "metric": 'auc','eval_metric':'auc', 'boosting_type': 'gbdt', 
          'tree_learner': 'serial', 'learning_rate': 0.1,  "num_leaves": 80 }
n_folds=5

groups = np.array(train.user_id.values)
    
models = []

valid = np.array([])
real = np.array([])
features_list = [i for i in columns]
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
        
for i , (train_index, test_index) in enumerate(stratified_group_k_fold(train[columns + others], train[target], groups, k=n_folds, seed=0)):
    print("Fold {}".format(i+1))
    X_train = train[columns + others].iloc[train_index,:]
    y_train = train[target].iloc[train_index]

    X_valid = train[columns + others].iloc[test_index,:]
        
    # https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-count-in-groups-using-groupby
    max_idx = X_valid.groupby(['user_id'])['task_container_id'].transform(max) == X_valid['task_container_id']
    min_idx = X_valid.groupby(['user_id'])['task_container_id'].transform(min) == X_valid['task_container_id']
    X_valid = X_valid[max_idx|min_idx]
        
    y_valid = train[target].iloc[test_index]
    y_valid = y_valid[max_idx|min_idx]
    X_train.drop(others, axis=1, inplace=True)
    X_valid.drop(others, axis=1, inplace=True)

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
    clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=550,early_stopping_rounds=10,verbose_eval = 50, categorical_feature=categoricals,)
    feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

    valid_predict = clf.predict(X_valid, num_iteration = clf.best_iteration)
        
    valid = np.concatenate([valid, valid_predict])
    real = np.concatenate([real, y_valid])
        
    models.append(clf) 
    
    del max_idx, min_idx
    del X_train, X_valid, y_train, y_valid
        
feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
feature_importance_df = feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True)
        
score = roc_auc_score(real, valid)
print("Overall ROC AUC: ", score)

Fold 1
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.751131	valid_1's auc: 0.749532
[100]	training's auc: 0.76462	valid_1's auc: 0.759202
[150]	training's auc: 0.771264	valid_1's auc: 0.760928
Early stopping, best iteration is:
[147]	training's auc: 0.770972	valid_1's auc: 0.761012
Fold 2
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.750987	valid_1's auc: 0.750055
[100]	training's auc: 0.764793	valid_1's auc: 0.759569
[150]	training's auc: 0.771183	valid_1's auc: 0.761765
[200]	training's auc: 0.77555	valid_1's auc: 0.762624
Early stopping, best iteration is:
[210]	training's auc: 0.776439	valid_1's auc: 0.762793
Fold 3
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.75163	valid_1's auc: 0.745916
[100]	training's auc: 0.765446	valid_1's auc: 0.755553
[150]	training's auc: 0.771788	valid_1's auc: 0.757299
Early stopping, best iteration is:
[153]	training's auc: 0.772064	valid_

# prediction

In [16]:
env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    y_preds = []
    test_df = test_df.merge(questions[["question_id", "part"]], how="left", left_on=['content_id'], right_on=['question_id'], right_index=True).reset_index(drop=True)
    test_df.drop(["question_id"], axis=1, inplace=True)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df = get_data(test_df, test_set=True)

    for model in models:
        y_pred = model.predict(test_df[columns], num_iteration=model.best_iteration)
        y_preds.append(y_pred)

    y_preds = sum(y_preds) / len(y_preds)
    test_df['answered_correctly'] = y_preds
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

NameError: name 'user_try_count' is not defined