- return to original cv
- add bundle_id from ver 78
- early stopping to 10

In [1]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import gc
import random
import pickle
import numpy as np
import pandas as pd
from numba import jit
import riiideducation
import datatable as dt
import lightgbm as lgb
from bitarray import bitarray
from functools import partial
from collections import defaultdict

_ = np.seterr(divide='ignore', invalid='ignore')
pd.set_option("max_rows", 100)
pd.set_option("max_columns", 100)

random.seed(1)

In [3]:
def make_bitarray():
    a = bitarray(32737, endian='little')
    a.setall(True)   
    return a

def clear_mem():
    %reset -f out
    %reset -f in
    gc.collect()

In [4]:
FULL_TRAIN = False
CV_SCHEME = "original" #"time"

# Preprocess

In [5]:
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
}
target = 'answered_correctly'

In [6]:
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()

In [7]:
@jit
def time_from_lec(A, T):
    ans = np.zeros(len(T))
    prev = 0
    flg = False
    for i in range(len(A)):
        if A[i] == -1:
            flg = True
            prev = T[i]
            
        if flg:
            ans[i] = T[i]-prev
        else:
            ans[i] = -1
            
    return ans

time_from_lec_array = train_df.groupby("user_id").apply(lambda x: time_from_lec(x["answered_correctly"].values, x["timestamp"].values))
time_from_lec_array = np.hstack(time_from_lec_array)
train_df["time_from_lec"] = time_from_lec_array
del time_from_lec_array

user_last_lec_time_dict = train_df[train_df[target] == -1].groupby("user_id").tail(1)["timestamp"].to_dict(defaultdict(lambda: -1))

In [8]:
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

time_from_lec_array = train_df["time_from_lec"].values
del train_df["time_from_lec"]

train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False).astype('int8')
train_df['prior_question_elapsed_time'].fillna(0, inplace=True)

train_df = train_df.astype(data_types_dict)

In [9]:
if CV_SCHEME=="time":
    max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
    max_timestamp_u.columns = ['user_id', 'max_time_stamp']
    MAX_TIME_STAMP = max_timestamp_u.max_time_stamp.max()

    def rand_time(max_time_stamp):
        interval = MAX_TIME_STAMP - max_time_stamp
        rand_time_stamp = random.randint(0,interval)
        return rand_time_stamp

    max_timestamp_u['rand_time_stamp'] = max_timestamp_u.max_time_stamp.apply(rand_time)
    train_df = train_df.merge(max_timestamp_u, on='user_id', how='left')
    train_df['viretual_time_stamp'] = train_df.timestamp + train_df['rand_time_stamp']

    del train_df['max_time_stamp']
    del train_df['rand_time_stamp']
    del max_timestamp_u

    train_index = list(train_df['viretual_time_stamp'].nlargest(10000000).index)

else:
    if FULL_TRAIN:
        train_size = 200
    else:
        train_size = 24
        valid_size = 6
    
    train_index = list(train_df.groupby('user_id').tail(train_size).index)

In [10]:
time_from_lec_array = time_from_lec_array[train_index]

In [11]:
@jit
def count_by_bundle(T):
    ans = np.zeros(len(T))
    prev_time = -1
    count = 0
    for i in range(len(T)):
        if i == 0:
            ans[i] = 0
        elif prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = count          
        prev_time = T[i]
        count += 1
    return ans

count_array = train_df.groupby("user_id").apply(lambda x: count_by_bundle(x["timestamp"].values))
count_array = np.hstack(count_array)
count_array = count_array[train_index]
count_zero_index = count_array == 0

In [12]:
@jit
def timediff_by_bundle(T):
    ans = np.zeros(len(T))
    prev_time = 0
    for i in range(len(T)): 
        if i == 0:
            ans[i] = np.nan
        elif prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = T[i] - prev_time
        prev_time = T[i]
    return ans

timediff_array = train_df.groupby("user_id").apply(lambda x: timediff_by_bundle(x["timestamp"].values))
timediff_array = np.hstack(timediff_array)
timediff_array = timediff_array[train_index]

@jit
def timediff2_by_bundle(T):
    ans = np.zeros(len(T))
    prev_time = [-1, -1]
    for i in range(len(T)): 
        if i == 0 or i == 1:
            ans[i] = np.nan     
        elif prev_time[1] == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = T[i]-prev_time[0]
            
        if prev_time[0] == -1:
            prev_time[0] = T[i]
        elif prev_time[0] != T[i] and prev_time[1] == -1:
            prev_time[1] = T[i]
        elif T[i] not in prev_time:
            prev_time[0] = prev_time[1]
            prev_time[1] = T[i]
    return ans

timediff2_array = train_df.groupby("user_id").apply(lambda x: timediff2_by_bundle(x["timestamp"].values))
timediff2_array = np.hstack(timediff2_array)
timediff2_array = timediff2_array[train_index]

@jit
def timediff3_by_bundle(T):
    ans = np.zeros(len(T))
    prev_time = [-1, -1, -1]
    for i in range(len(T)): 
        if i == 0 or i == 1 or i ==2:
            ans[i] = np.nan
        elif prev_time[2] == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = T[i]-prev_time[0]
            
        if prev_time[0] == -1:
            prev_time[0] = T[i]
        elif prev_time[0] != T[i] and prev_time[1] == -1:
            prev_time[1] = T[i]
        elif prev_time[1] != T[i] and prev_time[2] == -1:
            prev_time[2] = T[i]
        elif T[i] not in prev_time:
            prev_time[0] = prev_time[1]
            prev_time[1] = prev_time[2]
            prev_time[2] = T[i]
    return ans

timediff3_array = train_df.groupby("user_id").apply(lambda x: timediff3_by_bundle(x["timestamp"].values))
timediff3_array = np.hstack(timediff3_array)
timediff3_array = timediff3_array[train_index]

@jit
def timediff4_by_bundle(T):
    ans = np.zeros(len(T))
    prev_time = [-1, -1, -1, -1]
    for i in range(len(T)): 
        if i == 0 or i == 1 or i ==2 or i == 3:
            ans[i] = np.nan     
        elif prev_time[3] == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = T[i]- prev_time[0]
            
        if prev_time[0] == -1:
            prev_time[0] = T[i]
        elif prev_time[0] != T[i] and prev_time[1] == -1:
            prev_time[1] = T[i]
        elif prev_time[1] != T[i] and prev_time[2] == -1:
            prev_time[2] = T[i]
        elif prev_time[2] != T[i] and prev_time[3] == -1:
            prev_time[3] = T[i]
        elif T[i] not in prev_time:
            prev_time[0] = prev_time[1]
            prev_time[1] = prev_time[2]
            prev_time[2] = prev_time[3]
            prev_time[3] = T[i]
    return ans

timediff4_array = train_df.groupby("user_id").apply(lambda x: timediff4_by_bundle(x["timestamp"].values))
timediff4_array = np.hstack(timediff4_array)
timediff4_array = timediff4_array[train_index]

user_timestamp_max_dict = train_df.groupby("user_id")["timestamp"].apply(lambda x: x.drop_duplicates()[-4:].values).to_dict(defaultdict(partial(np.ndarray, 0, dtype="int64")))

time_dd_array = timediff2_array - timediff_array
timediff_array = np.nan_to_num(timediff_array, nan=-1)
timediff2_array = np.nan_to_num(timediff2_array, nan=-1)
timediff3_array = np.nan_to_num(timediff3_array, nan=-1)
timediff4_array = np.nan_to_num(timediff4_array, nan=-1)
time_dd_array = np.nan_to_num(time_dd_array, nan=-1)

In [13]:
@jit
def elapsed_time_mean_by_bundle(E, T):
    ans = np.zeros(len(T))
    count = 0
    prev_time = -1 
    for i in range(len(T)): 
        if prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = count
        prev_time = T[i]
        count = count + E[i]
    return ans

prior_question_elapsed_time_mean_array = train_df.groupby("user_id").apply(lambda x: 
                                                                         elapsed_time_mean_by_bundle(x["prior_question_elapsed_time"].values, 
                                                                                                     x["timestamp"].values))
prior_question_elapsed_time_mean_array = np.hstack(prior_question_elapsed_time_mean_array)
prior_question_elapsed_time_mean_array = prior_question_elapsed_time_mean_array[train_index]
prior_question_elapsed_time_mean_array = prior_question_elapsed_time_mean_array / count_array
prior_question_elapsed_time_mean_array[count_zero_index] = -1

user_prior_question_elapsed_time_sum_agg = train_df.groupby('user_id')["prior_question_elapsed_time"].agg(['sum'])
user_prior_question_elapsed_time_sum_dict = user_prior_question_elapsed_time_sum_agg['sum'].astype('int32').to_dict(defaultdict(int))
del user_prior_question_elapsed_time_sum_agg

prior_question_elapsed_time_array = train_df.prior_question_elapsed_time.values
train_df.drop("prior_question_elapsed_time", axis =1, inplace=True)
prior_question_elapsed_time_array = prior_question_elapsed_time_array[train_index]

In [14]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 1, 3], 
    dtype={'question_id': 'int16', 'bundle_id': 'int16', 'part': 'int8'} 
)

additional_q_df = pd.read_csv('../input/riiid-question-clustering/question_cmnts.csv')
questions_df["community"] = additional_q_df["community"].astype('int8')
del additional_q_df 
    
train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
train_df.drop(columns=['question_id'], inplace=True)

In [15]:
# bundle_agg = train_df.groupby('bundle_id')[target].agg(['sum', 'count'])
# bundle_sum_dict = bundle_agg['sum'].astype('int32').to_dict(defaultdict(int))
# bundle_count_dict = bundle_agg['count'].astype('int32').to_dict(defaultdict(int))
# del bundle_agg

# bundle_correctness_array = train_df['bundle_id'].map(bundle_agg['sum'] / bundle_agg['count']).values
# bundle_correctness_array = bundle_correctness_array[train_index]
# bundle_count_array = train_df['bundle_id'].map(bundle_agg['count']).astype('int32').values
# bundle_count_array = bundle_count_array[train_index]

bundle_id_array = train_df["bundle_id"].values
del train_df["bundle_id"]
bundle_id_array = bundle_id_array[train_index]

In [16]:
###
community_num = len(questions_df.community.unique())
print(community_num)

@jit
def tag_accuracy(A, C):
    ans = []
    community_count = [0] * community_num
    community_correct = [0] * community_num
    for i in range(len(C)):
        if community_count[C[i]]==0:
            ans.append(-1)
        else:
            ans.append(community_correct[C[i]]/community_count[C[i]])
        community_count[C[i]] +=1
        community_correct[C[i]] += A[i]
    return np.array(ans)

@jit
def tag_correct_last(A, C):
    community_correct = [0] * community_num
    for i in range(len(C)):
        community_correct[C[i]] += A[i]
    return np.array(community_correct)

@jit
def tag_count_last(A, C):
    community_count = [0] * community_num
    for i in range(len(C)):
        community_count[C[i]] +=1
    return np.array(community_count)

def init_dict():
    ans = [0] * community_num
    return np.array(ans)

76


In [17]:
###
tag_acc_array = train_df.groupby("user_id").apply(lambda x: tag_accuracy(x["answered_correctly"].values, x["community"].values))
tag_acc_array = np.hstack(tag_acc_array)
tag_acc_array = tag_acc_array[train_index]

user_community_count_dict = train_df.groupby("user_id").apply(lambda x: tag_count_last(x["answered_correctly"].values, x["community"].values)).to_dict(defaultdict(init_dict))
user_community_correct_dict = train_df.groupby("user_id").apply(lambda x: tag_correct_last(x["answered_correctly"].values, x["community"].values)).to_dict(defaultdict(init_dict))
    
#
community_agg = train_df.groupby('community')[target].agg(['count'])
community_count_dict = community_agg['count'].astype('int32').to_dict(defaultdict(int))
community_count_array = train_df['community'].map(community_agg['count']).astype('int32').values
del community_agg
community_count_array = community_count_array[train_index]
    
community_array = train_df["community"].values
community_array = community_array[train_index]
train_df.drop('community', axis=1, inplace=True)

In [18]:
@jit
def user_correctness_by_bundle(A, T):
    ans = np.zeros(len(T))
    count = np.zeros(len(T))
    correct = np.zeros(len(T))
    prev_time = 0
    for i in range(len(T)): 
        if i == 0:
            ans[i] = -1
        elif prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            if count[i-1] == 0:
                ans[i] = -1
            else:
                ans[i] = correct[i-1] / count[i-1]
        count[i] = count[i-1] + 1
        correct[i] = correct[i-1] + A[i]
        prev_time = T[i]
    return ans

user_correctness_array = train_df.groupby("user_id").apply(lambda x: user_correctness_by_bundle(x["answered_correctly"].values,
                                                                                                x["timestamp"].values))
user_correctness_array = np.hstack(user_correctness_array)
user_correctness_array = user_correctness_array[train_index]

In [19]:
@jit
def part_count_by_bundle(P, T):
    ans = np.zeros(len(T))
    part_count = [0] * 8
    prev_time = -1
    for i in range(len(T)):
        if i == 0:
            ans[i] = 0
        elif prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = part_count[P[i]]            
        prev_time = T[i]
        part_count[P[i]] += 1
    return ans 

@jit
def part_count_dict_calc(P):
    part_count = [0] * 8
    for i in range(len(P)):
        part_count[P[i]] += 1
    return np.array(part_count)

def part_dict_init():
    ans = [0] * 8
    return np.array(ans)

part_count_array = train_df.groupby("user_id").apply(lambda x: part_count_by_bundle(x["part"].values,
                                                                                    x["timestamp"].values))
part_count_array = np.hstack(part_count_array)
part_count_array = part_count_array[train_index]
part_ratio_array = part_count_array / count_array
part_ratio_array[count_zero_index] = -1

user_part_count_dict = train_df.groupby("user_id").apply(lambda x: part_count_dict_calc(x["part"].values)).to_dict(defaultdict(part_dict_init))

part_array = train_df.part.values
train_df.drop("part", axis=1, inplace=True)
part_array = part_array[train_index]

In [20]:
@jit
def had_explanation_mean_by_bundle(E, T):
    ans = np.zeros(len(T))
    count = 0
    prev_time = -1 
    for i in range(len(T)): 
        if prev_time == T[i]:
            ans[i] = ans[i-1]
        else:
            ans[i] = count
        prev_time = T[i]
        count = count + E[i]
    return ans

prior_question_had_explanation_mean_array = train_df.groupby("user_id").apply(lambda x: 
                                                                         had_explanation_mean_by_bundle(x["prior_question_had_explanation"].values, 
                                                                                                        x["timestamp"].values))
prior_question_had_explanation_mean_array = np.hstack(prior_question_had_explanation_mean_array)
prior_question_had_explanation_mean_array = prior_question_had_explanation_mean_array[train_index]
prior_question_had_explanation_mean_array = prior_question_had_explanation_mean_array / count_array
prior_question_had_explanation_mean_array[count_zero_index] = -1

user_prior_question_had_explanation_sum_agg = train_df.groupby('user_id')["prior_question_had_explanation"].agg(['sum'])
user_prior_question_had_explanation_sum_dict = user_prior_question_had_explanation_sum_agg['sum'].astype('int32').to_dict(defaultdict(int))
del user_prior_question_had_explanation_sum_agg

prior_question_had_explanation_array = train_df.prior_question_had_explanation.values
train_df.drop('prior_question_had_explanation', axis=1, inplace=True)
prior_question_had_explanation_array = prior_question_had_explanation_array[train_index]

In [21]:
first_attempt_df = pd.read_csv("../input/riiidpremadedatabundle/content_first_attempt.csv")
first_attempt_array = first_attempt_df.first_attempt.values
train_df["first_attempt"] = first_attempt_array

unique_attempt_array= train_df.groupby("user_id")["first_attempt"].cumsum().values
train_df["unique_attempt"] = unique_attempt_array
user_unique_agg = train_df.groupby('user_id')["unique_attempt"].agg(['max'])
user_unique_dict = user_unique_agg['max'].astype('int32').to_dict(defaultdict(int))

first_attempt_array = first_attempt_array[train_index]
unique_attempt_array = unique_attempt_array[train_index]
train_df.drop(['first_attempt', 'unique_attempt'], axis=1, inplace=True)
del first_attempt_df, user_unique_agg

clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [22]:
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
del user_agg['sum']
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
del user_agg['count']
clear_mem()

#
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

content_count_array = train_df['content_id'].map(content_agg['count']).astype('int32').values
content_id_array = train_df['content_id'].map(content_agg['sum'] / content_agg['count']).values
del content_agg
clear_mem()
print(min(content_id_array))

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history
0.0


In [23]:
with open('../input/riiidpremadedatabundle/got_point_array.pickle', 'rb') as f:
    got_point_array = pickle.load(f)
got_point_array = got_point_array[train_index]
got_point_array = got_point_array / count_array
got_point_array[count_zero_index] = -1
    
with open('../input/riiidpremadedatabundle/user_point_sum_dict.pickle', 'rb') as f: 
    user_point_sum_dict = pickle.load(f)

content_id_array = content_id_array[train_index]
content_count_array = content_count_array[train_index]

In [24]:
train_df.drop(["content_id"], axis=1, inplace=True)

with open('../input/riiidpremadedatabundle/user_content_dict.pickle', 'rb') as f: 
    user_content_dict = pickle.load(f)

In [25]:
def init_last7():
    ans = [-1] * 7
    return np.array(ans)

with open('../input/riiidpremadedatabundle/answered_correctly_last7_array.pickle','rb') as f:
    answered_correctly_last7_array = pickle.load(f)
answered_correctly_last7_array = answered_correctly_last7_array[train_index]
    
with open('../input/riiidpremadedatabundle/user_last_count7_dict.pickle','rb') as f:
    user_last_count7_dict = pickle.load(f)

with open('../input/riiidpremadedatabundle/user_last_correct7_dict.pickle','rb') as f:
    user_last_correct7_dict = pickle.load(f)

In [26]:
with open('../input/riiidpremadedatabundle/continuous_correct_array.pickle','rb') as f:
    continuous_correct_array = pickle.load(f)
continuous_correct_array=continuous_correct_array[train_index]
    
with open('../input/riiidpremadedatabundle/continuous_correct_dict.pickle','rb') as f:
    continuous_correct_dict = pickle.load(f)
    
with open('../input/riiidpremadedatabundle/continuous_incorrect_array.pickle','rb') as f:
    continuous_incorrect_array = pickle.load(f)
continuous_incorrect_array=continuous_incorrect_array[train_index]
    
with open('../input/riiidpremadedatabundle/continuous_incorrect_dict.pickle','rb') as f:
    continuous_incorrect_dict = pickle.load(f)

In [27]:
answered_correctly_array = train_df[target].values
train_df.drop(target, axis=1, inplace=True)
answered_correctly_array = answered_correctly_array[train_index]

# data formation

In [28]:
if not FULL_TRAIN:
    train_df = train_df[train_df.index.isin(train_index)].reset_index(drop=True)
    if CV_SCHEME == "original":
        valid_index = list(train_df.groupby('user_id').tail(valid_size).index)
    else:
        valid_index = list(train_df['viretual_time_stamp'].nlargest(2500000).index)
    train_index = list(train_df[~train_df.index.isin(valid_index)].index)
del train_df

In [29]:
features_dict = {
    'content_id': content_id_array,
    'prior_question_elapsed_time': prior_question_elapsed_time_array,
    'prior_question_had_explanation':  prior_question_had_explanation_array,
    'user_correctness': user_correctness_array,
    'part': part_array,
    'content_count': content_count_array,
    'count': count_array,
    'first_attempt': first_attempt_array,
    'unique_attempt': unique_attempt_array,
    'part_count': part_count_array,
    'part_ratio': part_ratio_array,
    'prior_question_had_explanation_mean': prior_question_had_explanation_mean_array,
    'got_point': got_point_array,
    'answered_correctly_last7': answered_correctly_last7_array,   
    'timediff': timediff_array,
    'timediff2': timediff2_array,
    'timediff3': timediff3_array,
    'timediff4': timediff4_array,
    'community': community_array,
    'tag_acc': tag_acc_array,
    'community_count': community_count_array,
    'time_dd': time_dd_array,
    'continuous_correct': continuous_correct_array,
    'continuous_incorrect': continuous_incorrect_array,
    'prior_question_elapsed_time_mean': prior_question_elapsed_time_mean_array,
    'time_from_lec': time_from_lec_array,
    'bundle_id': bundle_id_array,
}

features = list(features_dict.keys())
print(len(features))

del content_id_array, prior_question_elapsed_time_array, prior_question_had_explanation_array,
del user_correctness_array, part_array,
del content_count_array, count_array, first_attempt_array, unique_attempt_array,
del part_ratio_array, part_count_array,
del prior_question_had_explanation_mean_array, prior_question_elapsed_time_mean_array, got_point_array, 
del answered_correctly_last7_array, timediff_array, timediff2_array, timediff3_array, timediff4_array,
del community_array,
del tag_acc_array, community_count_array, time_dd_array, continuous_correct_array, continuous_incorrect_array
del time_from_lec_array, bundle_id_array

27


In [30]:
if FULL_TRAIN:
    print(len(train_index), len(features)+1)
else:
    print((len(train_index), len(features)+1), (len(valid_index), len(features)+1))

(6536675, 28) (2360984, 28)


# Train

In [31]:
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 600,
    'num_leaves': 80,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 1,
}

In [32]:
if FULL_TRAIN:
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)

    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array.astype(np.float32)
    tr_data = lgb.Dataset(X_train, label=y_train)
else:
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)
    X_valid = np.ndarray(shape=(len(valid_index), len(features)), dtype=np.float32)

    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature][train_index].astype(np.float32).reshape(-1)
        X_valid[:,idx] = features_dict[feature][valid_index].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array[train_index].astype(np.float32)
    y_valid = answered_correctly_array[valid_index].astype(np.float32)

    tr_data = lgb.Dataset(X_train, label=y_train)
    va_data = lgb.Dataset(X_valid, label=y_valid)

In [33]:
print("training starts")
if FULL_TRAIN:
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=3000,
        valid_sets=None, 
        )
    del X_train, y_train
else:
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round= 2500,
        valid_sets=[tr_data, va_data], 
        early_stopping_rounds=10, #in original cv, 5 in time cv
        verbose_eval=50
        )
    del X_train, y_train, X_valid, y_valid

training starts
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.766009	valid_1's auc: 0.752416
[100]	training's auc: 0.770948	valid_1's auc: 0.758493
[150]	training's auc: 0.773438	valid_1's auc: 0.761061
[200]	training's auc: 0.774898	valid_1's auc: 0.762351
[250]	training's auc: 0.7759	valid_1's auc: 0.763157
[300]	training's auc: 0.776614	valid_1's auc: 0.763648
[350]	training's auc: 0.777211	valid_1's auc: 0.764022
[400]	training's auc: 0.777755	valid_1's auc: 0.764318
[450]	training's auc: 0.778251	valid_1's auc: 0.764557
[500]	training's auc: 0.77872	valid_1's auc: 0.764782
[550]	training's auc: 0.77914	valid_1's auc: 0.76497
[600]	training's auc: 0.779535	valid_1's auc: 0.765125
[650]	training's auc: 0.779899	valid_1's auc: 0.765261
[700]	training's auc: 0.780288	valid_1's auc: 0.765426
[750]	training's auc: 0.780663	valid_1's auc: 0.765579
[800]	training's auc: 0.781017	valid_1's auc: 0.765676
[850]	training's auc: 0.781384	valid_1's auc: 0.7

# Inference

In [34]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

In [35]:
%%time
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        timestamps = prior_test_df['timestamp'].values
                 
        for (user_id, content_id, answered_correctly, first_attempt_ornot, prior_explanation,
             prior_point, prior_community, prior_timestamp, prior_part, prior_elapsed) in zip(user_ids, 
                                                            content_ids, 
                                                            targets, 
                                                            prior_f_attempt_arrays,
                                                            p_prior_question_had_explanation,
                                                            prior_point_array,
                                                            prior_community_arrays,
                                                            timestamps,
                                                            prior_part_array,
                                                            p_prior_question_elapsed_time):
            
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            user_unique_dict[user_id] += first_attempt_ornot
            user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation
            user_prior_question_elapsed_time_sum_dict[user_id] += prior_elapsed
            user_point_sum_dict[user_id] += prior_point * answered_correctly
            user_part_count_dict[user_id][prior_part] += 1
            user_community_correct_dict[user_id][prior_community] += answered_correctly
            user_community_count_dict[user_id][prior_community] += 1
            community_count_dict[prior_community] += 1  
            
            if np.sum(user_timestamp_max_dict[user_id] == prior_timestamp) == 0:
                if len(user_timestamp_max_dict[user_id]) <= 3: 
                    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[prior_timestamp]])
                else:
                    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[prior_timestamp]])[1:]  
                user_last_count7_dict[user_id] = np.concatenate([user_last_count7_dict[user_id],[1]])[1:]
                user_last_correct7_dict[user_id] = np.concatenate([user_last_correct7_dict[user_id],[answered_correctly]])[1:]
            else:
                user_last_count7_dict[user_id][-1] += 1
                user_last_correct7_dict[user_id][-1] += answered_correctly
                    
            if answered_correctly == 1:
                continuous_correct_dict[user_id] += 1
                continuous_incorrect_dict[user_id] = 0
            else:
                continuous_correct_dict[user_id] = 0
                continuous_incorrect_dict[user_id] += 1
                    
    prior_test_df = test_df.copy()
    
    time_from_last_lec_array = np.zeros(len(test_df), dtype = np.int64)
    for i, (user_id, content_type_id, timestamp) in enumerate(zip(test_df['user_id'].values, test_df['content_type_id'].values,
                                                             test_df['timestamp'].values)):
        if content_type_id == 1:
            time_from_last_lec_array[i] = 0
            user_last_lec_time_dict[user_id] = timestamp
        else:
            if user_last_lec_time_dict[user_id] != -1:
                time_from_last_lec_array[i] = timestamp - user_last_lec_time_dict[user_id]
            else:
                time_from_last_lec_array[i] = -1
    test_df['time_from_lec'] = time_from_last_lec_array 
           
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('int8')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)

    p_prior_question_elapsed_time = test_df['prior_question_elapsed_time'].values
    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    prior_community_arrays = test_df['community'].values
    prior_part_array = test_df['part'].values
   
    first_attempt_values = []
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part_count = np.zeros(len(test_df), dtype=np.int32)
    part_ratio_array = np.zeros(len(test_df), dtype=np.float32)
    user_correctness_array = np.zeros(len(test_df), dtype=np.float32)
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_mean_array = np.zeros(len(test_df), dtype=np.float32)
    user_prior_question_elapsed_time_mean_array = np.zeros(len(test_df), dtype=np.float32)
    got_point_array = np.zeros(len(test_df), dtype=np.float32)
    user_last7_accuracy_array = np.zeros(len(test_df), dtype=np.float32)
    timediff_array = np.zeros(len(test_df), dtype = np.int64)
    timediff2_array = np.zeros(len(test_df), dtype = np.int64)
    timediff3_array = np.zeros(len(test_df), dtype = np.int64)
    timediff4_array = np.zeros(len(test_df), dtype = np.int64)
    tag_acc_array = np.zeros(len(test_df), dtype=np.float32)
    community_count_array = np.zeros(len(test_df), dtype=np.int32)
    time_dd_array = np.zeros(len(test_df), dtype = np.int64)
    user_con_correct_array = np.zeros(len(test_df), dtype=np.int16)
    user_con_incorrect_array = np.zeros(len(test_df), dtype=np.int16)
    
    for i, (user_id, content_id, timestamp, community, part) in enumerate(zip(test_df['user_id'].values, 
                                                             test_df['content_id'].values,
                                                             test_df['timestamp'].values,
                                                             test_df['community'].values,
                                                             test_df['part'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        part_count[i] = user_part_count_dict[user_id][part]
        first_attempt_values.append(user_content_dict[user_id][content_id])
        user_content_dict[user_id][content_id] = False             
        user_unique_count[i] = user_unique_dict[user_id]
        if user_count[i] != 0:
            user_correctness_array[i] = user_sum[i] / user_count[i]
            part_ratio_array[i] = part_count[i] / user_count[i]
            user_prior_question_had_explanation_mean_array[i] = user_prior_question_had_explanation_sum_dict[user_id] / user_count[i]
            user_prior_question_elapsed_time_mean_array[i] = user_prior_question_elapsed_time_sum_dict[user_id] / user_count[i]
        else:
            user_correctness_array[i] = -1
            part_ratio_array[i] = -1
            user_prior_question_had_explanation_mean_array[i] = -1
            user_prior_question_elapsed_time_mean_array[i] = -1
        
        if np.sum(user_last_count7_dict != -1)==7:
            user_last7_accuracy_array[i] = user_last_correct7_dict[user_id].sum() / user_last_count7_dict[user_id].sum()
        else:
            user_last7_accuracy_array[i] = np.nan

        if len(user_timestamp_max_dict[user_id]) ==0:
            timediff_array[i] = -1
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = -1
            
        elif len(user_timestamp_max_dict[user_id]) ==1:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = -1
            
        elif len(user_timestamp_max_dict[user_id]) ==2:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
            
        elif len(user_timestamp_max_dict[user_id]) ==3:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff4_array[i] = -1
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
    
        else:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][3]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff4_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
            
        if user_community_count_dict[user_id][community] == 0:
            tag_acc_array[i] = -1
        else:
            tag_acc_array[i] = user_community_correct_dict[user_id][community] / user_community_count_dict[user_id][community]

        got_point_array[i] = user_point_sum_dict[user_id]
        community_count_array[i] = community_count_dict[community]
        user_con_correct_array[i] = continuous_correct_dict[user_id]
        user_con_incorrect_array[i] = continuous_incorrect_dict[user_id]
     
    test_df['count'] = user_count
    test_df['user_correctness'] = user_correctness_array
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['part_count'] = part_count
    test_df['part_ratio'] = part_ratio_array
    test_df["first_attempt"] = first_attempt_values
    test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    test_df["unique_attempt"] += user_unique_count
    test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_mean_array
    
    test_df['prior_question_elapsed_time_mean'] = user_prior_question_elapsed_time_mean_array
    test_df['got_point'] = got_point_array / user_count
    test_df['answered_correctly_last7'] = user_last7_accuracy_array
    test_df['timediff'] = timediff_array
    test_df['timediff2'] = timediff2_array
    test_df['timediff3'] = timediff3_array
    test_df['timediff4'] = timediff4_array
    test_df['tag_acc'] = tag_acc_array
    test_df['community_count'] = community_count_array
    test_df['time_dd'] = time_dd_array
    test_df['continuous_correct'] = user_con_correct_array
    test_df['continuous_incorrect'] = user_con_incorrect_array

    prior_f_attempt_arrays = test_df['first_attempt'].values
    prior_point_array = 1 / (test_df.content_id.values + 0.1)
    
    test_df[target] = model.predict(test_df[features])
    env.predict(test_df[['row_id', target]])

CPU times: user 1.56 s, sys: 82.1 ms, total: 1.64 s
Wall time: 957 ms
