In [1]:
import datatable as dt
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

import lightgbm as lgb
from fastprogress import progress_bar
from collections import defaultdict
from sklearn.model_selection import GroupKFold

#import riiideducation

In [2]:
data_path = '../input/riiid-test-answer-prediction/train.csv'
questions_path = '../input/riiid-test-answer-prediction/questions.csv'
data_types_dict = {
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'bool'
}
target = 'answered_correctly'

In [3]:
train_df = dt.fread(data_path, columns=set(data_types_dict.keys())).to_pandas()
questions_df = pd.read_csv(
    questions_path,
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'}
)

In [None]:
def add_features(df, 
                 answered_correctly_u_count, 
                 answered_correctly_u_sum, 
                 elapsed_time_u_sum, 
                 explanation_u_sum, 
                 timestamp_u, 
                 timestamp_u_incorrect, 
                 answered_correctly_q_count, 
                 answered_correctly_q_sum, 
                 elapsed_time_q_sum, 
                 explanation_q_sum, 
                 answered_correctly_uq, 
                 update=True):
    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = np.zeros(len(df), dtype = np.float32)
    elapsed_time_u_avg = np.zeros(len(df), dtype = np.float32)
    explanation_u_avg = np.zeros(len(df), dtype = np.float32)
    timestamp_u_recency_1 = np.zeros(len(df), dtype = np.float32)
    timestamp_u_recency_2 = np.zeros(len(df), dtype = np.float32)
    timestamp_u_recency_3 = np.zeros(len(df), dtype = np.float32)
    timestamp_u_incorrect_recency = np.zeros(len(df), dtype = np.float32)
    # -----------------------------------------------------------------------
    # Question features
    answered_correctly_q_avg = np.zeros(len(df), dtype = np.float32)
    elapsed_time_q_avg = np.zeros(len(df), dtype = np.float32)
    explanation_q_avg = np.zeros(len(df), dtype = np.float32)
    # -----------------------------------------------------------------------
    # User Question
    answered_correctly_uq_count = np.zeros(len(df), dtype = np.int32)
    # -----------------------------------------------------------------------
    
    for num, row in enumerate(df[['user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp']].values):
        
        # Client features assignation
        # ------------------------------------------------------------------
        if answered_correctly_u_count[row[0]] != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[row[0]] / answered_correctly_u_count[row[0]]
            elapsed_time_u_avg[num] = elapsed_time_u_sum[row[0]] / answered_correctly_u_count[row[0]]
            explanation_u_avg[num] = explanation_u_sum[row[0]] / answered_correctly_u_count[row[0]]
        else:
            answered_correctly_u_avg[num] = np.nan
            elapsed_time_u_avg[num] = np.nan
            explanation_u_avg[num] = np.nan
            
        if len(timestamp_u[row[0]]) == 0:
            timestamp_u_recency_1[num] = np.nan
            timestamp_u_recency_2[num] = np.nan
            timestamp_u_recency_3[num] = np.nan
        elif len(timestamp_u[row[0]]) == 1:
            timestamp_u_recency_1[num] = row[5] - timestamp_u[row[0]][0]
            timestamp_u_recency_2[num] = np.nan
            timestamp_u_recency_3[num] = np.nan
        elif len(timestamp_u[row[0]]) == 2:
            timestamp_u_recency_1[num] = row[5] - timestamp_u[row[0]][1]
            timestamp_u_recency_2[num] = row[5] - timestamp_u[row[0]][0]
            timestamp_u_recency_3[num] = np.nan
        elif len(timestamp_u[row[0]]) == 3:
            timestamp_u_recency_1[num] = row[5] - timestamp_u[row[0]][2]
            timestamp_u_recency_2[num] = row[5] - timestamp_u[row[0]][1]
            timestamp_u_recency_3[num] = row[5] - timestamp_u[row[0]][0]
        
        if len(timestamp_u_incorrect[row[0]]) == 0:
            timestamp_u_incorrect_recency[num] = np.nan
        else:
            timestamp_u_incorrect_recency[num] = row[5] - timestamp_u_incorrect[row[0]][0]
            
        # ------------------------------------------------------------------
        # Question features assignation
        if answered_correctly_q_count[row[2]] != 0:
            answered_correctly_q_avg[num] = answered_correctly_q_sum[row[2]] / answered_correctly_q_count[row[2]]
            elapsed_time_q_avg[num] = elapsed_time_q_sum[row[2]] / answered_correctly_q_count[row[2]]
            explanation_q_avg[num] = explanation_q_sum[row[2]] / answered_correctly_q_count[row[2]]
        else:
            answered_correctly_q_avg[num] = np.nan
            elapsed_time_q_avg[num] = np.nan
            explanation_q_avg[num] = np.nan
        # ------------------------------------------------------------------
        # Client Question assignation
        answered_correctly_uq_count[num] = answered_correctly_uq[row[0]][row[2]]
        # ------------------------------------------------------------------
        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_count[row[0]] += 1
        elapsed_time_u_sum[row[0]] += row[3]
        explanation_u_sum[row[0]] += int(row[4])
        if len(timestamp_u[row[0]]) == 3:
            timestamp_u[row[0]].pop(0)
            timestamp_u[row[0]].append(row[5])
        else:
            timestamp_u[row[0]].append(row[5])
        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_count[row[2]] += 1
        elapsed_time_q_sum[row[2]] += row[3]
        explanation_q_sum[row[2]] += int(row[4])
        # ------------------------------------------------------------------
        # Client Question updates
        answered_correctly_uq[row[0]][row[2]] += 1
        # ------------------------------------------------------------------
        # Flag for training and inference
        if update:
            # ------------------------------------------------------------------
            # Client features updates
            answered_correctly_u_sum[row[0]] += row[1]
            if row[1] == 0:
                if len(timestamp_u_incorrect[row[0]]) == 1:
                    timestamp_u_incorrect[row[0]].pop(0)
                    timestamp_u_incorrect[row[0]].append(row[5])
                else:
                    timestamp_u_incorrect[row[0]].append(row[5])
            
            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[row[2]] += row[1]
            # ------------------------------------------------------------------
             
            
    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg, 'elapsed_time_u_avg': elapsed_time_u_avg, 'explanation_u_avg': explanation_u_avg, 
                            'answered_correctly_q_avg': answered_correctly_q_avg, 'elapsed_time_q_avg': elapsed_time_q_avg, 'explanation_q_avg': explanation_q_avg, 
                            'answered_correctly_uq_count': answered_correctly_uq_count, 'timestamp_u_recency_1': timestamp_u_recency_1, 'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3, 'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency})
    
    df = pd.concat([df, user_df], axis = 1)
    return df
        
def update_features(df, 
                    answered_correctly_u_sum, 
                    answered_correctly_q_sum, 
                    timestamp_u_incorrect):
    for row in df[['user_id', 'answered_correctly', 'content_id', 'content_type_id', 'timestamp']].values:
        if row[3] == 0:
            # ------------------------------------------------------------------
            # Client features updates
            answered_correctly_u_sum[row[0]] += row[1]
            if row[1] == 0:
                if len(timestamp_u_incorrect[row[0]]) == 1:
                    timestamp_u_incorrect[row[0]].pop(0)
                    timestamp_u_incorrect[row[0]].append(row[4])
                else:
                    timestamp_u_incorrect[row[0]].append(row[4])
            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[row[2]] += row[1]
            # ------------------------------------------------------------------
            
    return

def read_and_preprocess(feature_engineering=False):
    
    train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
    valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
    question_file = '../input/riiid-test-answer-prediction/questions.csv'
    
    # Read data
    feld_needed = ['timestamp', 
                   'user_id', 
                   'answered_correctly', 
                   'content_id', 
                   'content_type_id', 
                   'prior_question_elapsed_time', 
                   'prior_question_had_explanation']
    train = pd.read_pickle(train_pickle)[feld_needed]
    valid = pd.read_pickle(valid_pickle)[feld_needed]
    # Delete some trianing data to don't have ram problems
    if feature_engineering:
        train = train.iloc[-40000000:]
    
    # Filter by content_type_id to discard lectures
    train = train.loc[train.content_type_id == False].reset_index(drop = True)
    valid = valid.loc[valid.content_type_id == False].reset_index(drop = True)
    
    # Changing dtype to avoid lightgbm error
    train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')
    
    # Fill prior question elapsed time with the mean
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().mean()
    train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)
    valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)
    
    # Merge with question dataframe
    questions_df = pd.read_csv(question_file)
    questions_df['part'] = questions_df['part'].astype(np.int32)
    questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)
    
    train = pd.merge(train, 
                     questions_df[['question_id', 'part']], 
                     left_on='content_id', 
                     right_on='question_id', 
                     how='left')
    valid = pd.merge(valid, 
                     questions_df[['question_id', 'part']], 
                     left_on='content_id', 
                     right_on='question_id', 
                     how='left')
    
    # Client dictionaries
    answered_correctly_u_count = defaultdict(int)
    answered_correctly_u_sum = defaultdict(int)
    elapsed_time_u_sum = defaultdict(int)
    explanation_u_sum = defaultdict(int)
    timestamp_u = defaultdict(list)
    timestamp_u_incorrect = defaultdict(list)
    
    # Question dictionaries
    answered_correctly_q_count = defaultdict(int)
    answered_correctly_q_sum = defaultdict(int)
    elapsed_time_q_sum = defaultdict(int)
    explanation_q_sum = defaultdict(int)
    
    # Client Question dictionary
    answered_correctly_uq = defaultdict(lambda: defaultdict(int))
    
    print('User feature calculation started...')
    print('\n')
    train = add_features(train, 
                         answered_correctly_u_count, 
                         answered_correctly_u_sum, 
                         elapsed_time_u_sum, 
                         explanation_u_sum, 
                         timestamp_u, 
                         timestamp_u_incorrect, 
                         answered_correctly_q_count, 
                         answered_correctly_q_sum, 
                         elapsed_time_q_sum, 
                         explanation_q_sum, 
                         answered_correctly_uq)
    valid = add_features(valid, 
                         answered_correctly_u_count, 
                         answered_correctly_u_sum, 
                         elapsed_time_u_sum, 
                         explanation_u_sum, 
                         timestamp_u, 
                         timestamp_u_incorrect, 
                         answered_correctly_q_count, 
                         answered_correctly_q_sum, 
                         elapsed_time_q_sum, 
                         explanation_q_sum, 
                         answered_correctly_uq)
    gc.collect()
    print('User feature calculation completed...')
    print('\n')
    
    features_dicts = {
        'answered_correctly_u_count': answered_correctly_u_count,
        'answered_correctly_u_sum': answered_correctly_u_sum,
        'elapsed_time_u_sum': elapsed_time_u_sum,
        'explanation_u_sum': explanation_u_sum,
        'answered_correctly_q_count': answered_correctly_q_count,
        'answered_correctly_q_sum': answered_correctly_q_sum,
        'elapsed_time_q_sum': elapsed_time_q_sum,
        'explanation_q_sum': explanation_q_sum,
        'answered_correctly_uq': answered_correctly_uq,
        'timestamp_u': timestamp_u,
        'timestamp_u_incorrect': timestamp_u_incorrect
    }
    
    return train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts

In [4]:
# preprocess
train_df = train_df[train_df[target] != -1].reset_index(drop=True)
train_df['prior_question_had_explanation'].fillna(False, inplace=True)
train_df = train_df.astype(data_types_dict)

train_df['lag'] = train_df.groupby('user_id')[target].shift()
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_df.drop(columns=['lag'], inplace=True)

user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])

train_df = train_df.groupby('user_id').tail(30).reset_index(drop=True)

train_df = pd.merge(train_df, 
                    questions_df,
                    left_on='content_id', 
                    right_on='question_id', 
                    how='left')
train_df.drop(columns=['question_id'], inplace=True)

train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_id'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])

train_df

Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness,part,content_count
0,115,0.784906,1,21000.0,False,0.812500,1,21307
1,115,0.730689,0,20000.0,False,0.823529,1,31057
2,115,0.766426,1,18000.0,False,0.777778,1,22708
3,115,0.727708,1,17000.0,False,0.789474,1,36314
4,115,0.613215,0,29000.0,False,0.800000,1,31736
...,...,...,...,...,...,...,...,...
10746450,2147482888,0.741063,1,18000.0,True,0.500000,5,4364
10746451,2147482888,0.527789,1,14000.0,True,0.521739,5,10220
10746452,2147482888,0.616202,1,14000.0,True,0.541667,5,31415
10746453,2147482888,0.661683,0,22000.0,True,0.560000,5,5752


In [5]:
gkf = GroupKFold(5)
for n, fold_idx in enumerate(gkf.split(train_df, groups=train_df.user_id)):
    print(n, fold_idx)

0 (array([       0,        1,        2, ..., 10746452, 10746453, 10746454]), array([     109,      110,      111, ..., 10746335, 10746336, 10746337]))
1 (array([       0,        1,        2, ..., 10746452, 10746453, 10746454]), array([     156,      157,      158, ..., 10746305, 10746306, 10746307]))
2 (array([       0,        1,        2, ..., 10746452, 10746453, 10746454]), array([     139,      140,      141, ..., 10746365, 10746366, 10746367]))
3 (array([       0,        1,        2, ..., 10746452, 10746453, 10746454]), array([      30,       31,       32, ..., 10746041, 10746042, 10746043]))
4 (array([      30,       31,       32, ..., 10746365, 10746366, 10746367]), array([       0,        1,        2, ..., 10746452, 10746453, 10746454]))


In [6]:
features = [
    'content_id', 
    'prior_question_elapsed_time',
    'prior_question_had_explanation', 
    'user_correctness', 
    'part',
    'content_count'
]

trn_idx, val_idx = fold_idx

trn_data = (train_df.loc[trn_idx][features], train_df.loc[trn_idx][target])
val_data = (train_df.loc[val_idx][features], train_df.loc[val_idx][target])

# 1.

In [23]:
# 2.
params = {
    'objective': 'binary',
    'num_rounds': 1000,
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.1,
    'force_row_wise': True,
    
    'max_depth': -1,
    'num_leaves': 1000,
    
    'min_child_samples': 100,
      
}

        
model = lgb.LGBMClassifier(**params)
model.fit(
    *trn_data,
    eval_set=[trn_data, val_data], 
    verbose=50,
    early_stopping_rounds=50,
)



Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.75706	valid_1's auc: 0.735243
Early stopping, best iteration is:
[1]	training's auc: 0.747795	valid_1's auc: 0.747033


Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.758993	valid_1's auc: 0.748098
Early stopping, best iteration is:
[10]	training's auc: 0.752384	valid_1's auc: 0.749445


Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.752884	valid_1's auc: 0.750062
[100]	training's auc: 0.755495	valid_1's auc: 0.75028
Early stopping, best iteration is:
[95]	training's auc: 0.755374	valid_1's auc: 0.75029


Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.750837	valid_1's auc: 0.749346
[100]	training's auc: 0.75292	valid_1's auc: 0.750134
[150]	training's auc: 0.75443	valid_1's auc: 0.750303
[200]	training's auc: 0.755632	valid_1's auc: 0.75035
[250]	training's auc: 0.756651	valid_1's auc: 0.750328
Early stopping, best iteration is:
[206]	training's auc: 0.755756	valid_1's auc: 0.750352


Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.749033	valid_1's auc: 0.748184
[100]	training's auc: 0.749528	valid_1's auc: 0.748554
[150]	training's auc: 0.749974	valid_1's auc: 0.74885
[200]	training's auc: 0.750401	valid_1's auc: 0.749104
[250]	training's auc: 0.750846	valid_1's auc: 0.749355
[300]	training's auc: 0.751284	valid_1's auc: 0.749566
[350]	training's auc: 0.751711	valid_1's auc: 0.749752
[400]	training's auc: 0.752131	valid_1's auc: 0.749916
[450]	training's auc: 0.752524	valid_1's auc: 0.750044
[500]	training's auc: 0.75292	valid_1's auc: 0.750152
[550]	training's auc: 0.753264	valid_1's auc: 0.75022
[600]	training's auc: 0.753603	valid_1's auc: 0.750275
[650]	training's auc: 0.753921	valid_1's auc: 0.750314
[700]	training's auc: 0.75419	valid_1's auc: 0.750334
[750]	training's auc: 0.754454	valid_1's auc: 0.750349
[800]	training's auc: 0.754704	valid_1's auc: 0.75036
[850]	training's auc: 0.75496	valid_1's auc: 0.750368
[900]	train

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.748653	valid_1's auc: 0.747865
[100]	training's auc: 0.749032	valid_1's auc: 0.748184
[150]	training's auc: 0.749295	valid_1's auc: 0.74839
[200]	training's auc: 0.749529	valid_1's auc: 0.748552
[250]	training's auc: 0.749752	valid_1's auc: 0.748705
[300]	training's auc: 0.749973	valid_1's auc: 0.748853
[350]	training's auc: 0.750186	valid_1's auc: 0.748982
[400]	training's auc: 0.750398	valid_1's auc: 0.749101
[450]	training's auc: 0.750623	valid_1's auc: 0.74923
[500]	training's auc: 0.750844	valid_1's auc: 0.74935
[550]	training's auc: 0.751057	valid_1's auc: 0.749458
[600]	training's auc: 0.751284	valid_1's auc: 0.749566
[650]	training's auc: 0.751506	valid_1's auc: 0.74967
[700]	training's auc: 0.751719	valid_1's auc: 0.749758
[750]	training's auc: 0.751913	valid_1's auc: 0.749826
[800]	training's auc: 0.752116	valid_1's auc: 0.749897
[850]	training's auc: 0.752319	valid_1's auc: 0.74997
[900]	trai

[[{'learning_rate': 1.5}, {'max_depth': -1}, 0.74703312099168],
 [{'learning_rate': 0.5}, {'max_depth': -1}, 0.74703312099168],
 [{'learning_rate': 0.1}, {'max_depth': -1}, 0.74703312099168],
 [{'learning_rate': 0.05}, {'max_depth': -1}, 0.74703312099168],
 [{'learning_rate': 0.01}, {'max_depth': -1}, 0.74703312099168],
 [{'learning_rate': 0.005}, {'max_depth': -1}, 0.74703312099168]]

In [None]:
#lgb.plot_importance(model, importance_type='gain')

In [64]:
score = model.evals_result_['valid_1']['auc'][0]
search_cv.append([{p0[0]: i}, {p1[0]: j}, score])
if score > best:
    best = score
    best_param = [{p0[0]: i}, {p1[0]: j}, score]

0.7467622224838212

In [None]:
#save_path = f'lgb_fold{n}.txt'
#model.save_model(save_path)

In [None]:
#a = lgb.Booster(model_file=save_path)

In [None]:
#a.predict(train_df.loc[val_idx][features])

In [None]:
#model.predict(train_df.loc[val_idx][features])

# inference