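- Full-training run (`FULL_TRAIN = True`): keeps the last 300 rows per user (`train_size = 300`) and trains for 3000 boosting rounds (`num_boost_round = 3000`); based on ver. 64.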

In [1]:
# Install datatable from a wheel file stored under ../input; stdout and stderr are discarded.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from bitarray import bitarray
from functools import partial
import pickle
import math 

from tqdm._tqdm_notebook import tqdm_notebook
from numba import jit
import random

# Register tqdm's pandas integration (progress_apply etc.), labelled "progress: ".
tqdm_notebook.pandas(desc="progress: ")

# Ratio features in this notebook divide by counts that can be zero (e.g. user_sum / user_count);
# silence the resulting NumPy divide/invalid warnings instead of printing one per batch.
_ = np.seterr(divide='ignore', invalid='ignore')
# Use the fully qualified option names: a bare "max_rows" / "max_columns" pattern is ambiguous on
# pandas versions that also define styler.render.max_rows / styler.render.max_columns.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Seed Python's random module (used by the "time" CV scheme's random offsets).
random.seed(1)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  
  from pandas import Panel


In [3]:
def make_bitarray():
    """Build a little-endian bitarray of 32737 bits with every bit set to True.

    NOTE(review): the length 32737 is hard-coded; what it corresponds to in the
    data is not visible from this function - confirm against the code that
    produced user_content_dict.
    """
    bits = bitarray(32737, endian='little')
    bits.setall(True)
    return bits

def clear_mem():
    # Flush IPython's output cache and input history, then run the garbage collector.
    # The two %reset lines are IPython magics, so this function is only valid inside
    # a notebook / IPython session, not in a plain Python module.
    %reset -f out
    %reset -f in
    gc.collect()
    
@jit
def last4(A):
    """Rolling mean over the five entries preceding each position of ``A``.

    Positions 0..6 yield NaN; from position 7 onward the value is
    ``mean(A[pos-5:pos])`` (the current entry is excluded).

    NOTE(review): the function name says 4, the warm-up threshold is 7 and the
    window is 5 entries wide - confirm which of the three was intended.

    Returns a NumPy array with one value per element of ``A``.
    """
    means = []
    for pos in range(len(A)):
        window_mean = np.nan if pos < 7 else np.mean(A[pos - 5:pos])
        means.append(window_mean)
    return np.array(means)

In [4]:
# True: train on every selected row with no validation split.
# False: a smaller per-user window is used and part of it is held out for validation.
FULL_TRAIN = True
# "original": select the last N rows of each user.
# "time": select rows by a per-user randomly shifted ("virtual") timestamp.
CV_SCHEME = "original" #"time"

# Preprocess

In [5]:
# Columns read from train.csv and the dtype each one is cast to after cleaning.
# The keys double as the column selection passed to the CSV reader.
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
}
# Name of the label column.
target = 'answered_correctly'

In [6]:
# Read only the columns listed in data_types_dict with datatable, then convert to a pandas DataFrame.
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()

In [7]:
# Drop rows whose target is -1 (presumably lecture rows, which carry no answer - confirm).
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Assign the filled columns back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is not guaranteed to write through to train_df
# (it is a no-op under pandas copy-on-write), which would leave NaNs for astype() below.
# This also matches how the inference loop fills the same two columns.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(0)

# Cast every column to its compact dtype.
train_df = train_df.astype(data_types_dict)

In [8]:
if CV_SCHEME=="time":
    # Last observed timestamp of every user, and the largest of those over all users.
    max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
    max_timestamp_u.columns = ['user_id', 'max_time_stamp']
    MAX_TIME_STAMP = max_timestamp_u.max_time_stamp.max()

    def rand_time(max_time_stamp):
        """Draw a random offset in [0, MAX_TIME_STAMP - max_time_stamp] for one user."""
        interval = MAX_TIME_STAMP - max_time_stamp
        rand_time_stamp = random.randint(0,interval)
        return rand_time_stamp

    # Shift each user's timeline by its own random offset to obtain a "virtual" timestamp.
    max_timestamp_u['rand_time_stamp'] = max_timestamp_u.max_time_stamp.apply(rand_time)
    train_df = train_df.merge(max_timestamp_u, on='user_id', how='left')
    train_df['viretual_time_stamp'] = train_df.timestamp + train_df['rand_time_stamp']

    del train_df['max_time_stamp']
    del train_df['rand_time_stamp']
    del max_timestamp_u

    # nlargest() returns index labels ordered by value, not by row position. Every feature
    # array is sliced with train_index and, when FULL_TRAIN is False, later split using row
    # positions of train_df filtered in its original order, so the labels must be ascending
    # for the two to stay aligned.
    train_index = sorted(train_df['viretual_time_stamp'].nlargest(10000000).index)

else:
    # Number of most recent rows kept per user (and held out per user when validating).
    if FULL_TRAIN:
        train_size = 300
    else:
        train_size = 24
        valid_size = 6
    
    # groupby().tail() keeps the rows in their original order, so these labels are ascending.
    train_index = list(train_df.groupby('user_id').tail(train_size).index)

In [9]:
# 1-based running row number of each interaction within its user,
# restricted to the rows selected for training.
count_array = (train_df.groupby("user_id").cumcount() + 1).values[train_index]

In [10]:
# Gap between each row's timestamp and the same user's previous row (NaN on a user's first row).
timediff_array = train_df.groupby("user_id")["timestamp"].diff().values
timediff_array = timediff_array[train_index]
clear_mem()

# Same gap measured against the row two positions back.
timediff2_array = train_df.groupby("user_id")["timestamp"].diff(2).values
timediff2_array = timediff2_array[train_index]
clear_mem()

# ... three positions back.
timediff3_array = train_df.groupby("user_id")["timestamp"].diff(3).values
timediff3_array = timediff3_array[train_index]
clear_mem()

# ... four positions back.
timediff4_array = train_df.groupby("user_id")["timestamp"].diff(4).values
timediff4_array = timediff4_array[train_index]
clear_mem()

# Last (up to) four timestamps of every user, kept so the inference loop can compute the same
# gaps; users missing from the dict get an empty int64 array from the defaultdict factory.
user_timestamp_max_dict = train_df.groupby("user_id")["timestamp"].apply(lambda x: x[-4:].values).to_dict(defaultdict(partial(np.ndarray, 0, dtype="int64")))

train_df.drop("timestamp", axis=1, inplace=True)

# (t[i] - t[i-2]) - (t[i] - t[i-1]) = t[i-1] - t[i-2], i.e. the gap before the previous row.
time_dd_array = timediff2_array - timediff_array
# Rows without enough history are NaN; encode them as -1, the same sentinel the inference loop writes.
timediff_array = np.nan_to_num(timediff_array, nan=-1)
timediff2_array = np.nan_to_num(timediff2_array, nan=-1)
timediff3_array = np.nan_to_num(timediff3_array, nan=-1)
timediff4_array = np.nan_to_num(timediff4_array, nan=-1)
time_dd_array = np.nan_to_num(time_dd_array, nan=-1)

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [11]:
# Move the column out of train_df (freeing it from the frame) and keep only the training rows.
prior_question_elapsed_time_array = train_df.pop("prior_question_elapsed_time").values[train_index]

In [12]:
# Question metadata: only the first and fourth CSV columns are read
# (question_id and part, per the dtype mapping).
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 3], 
    dtype={'question_id': 'int16', 'part': 'int8'} 
)

# Attach a precomputed "community" label per question.
# The assignment is by row label, so it assumes question_cmnts.csv has the same row order as
# questions.csv - TODO confirm.
additional_q_df = pd.read_csv('../input/riiid-question-clustering/question_cmnts.csv')
questions_df["community"] = additional_q_df["community"].astype('int8')
del additional_q_df 
    
# NOTE(review): both right_on and right_index=True are passed. With right_index=True the join
# appears to match content_id against questions_df's row index, which only equals question_id
# if the file is ordered 0..N-1 - confirm, and confirm the installed pandas accepts both arguments.
train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
train_df.drop(columns=['question_id'], inplace=True)

In [13]:
# Number of distinct community labels; the helper functions below size their per-community
# counters with it and index them by label, so labels are presumably 0..community_num-1 - verify.
community_num = len(questions_df.community.unique())
print(community_num)

@jit
def tag_accuracy(A, C):
    """Running per-community accuracy for one user's answer sequence.

    A: answers (added to the per-community correct counter), C: community label
    of each row, used as an index into counters of length ``community_num``.

    For every row the result holds correct/count over the *earlier* rows with
    the same community label, or -1 when there is no such earlier row. The
    current row is added to the counters only after its value is emitted.
    """
    seen = [0] * community_num
    right = [0] * community_num
    history = []
    for pos in range(len(C)):
        group = C[pos]
        if seen[group] == 0:
            history.append(-1)
        else:
            history.append(right[group] / seen[group])
        seen[group] += 1
        right[group] += A[pos]
    return np.array(history)

@jit
def tag_correct_last(A, C):
    """Sum the answers in ``A`` per community label in ``C`` over the whole sequence.

    Returns an array of length ``community_num`` indexed by community label.
    """
    totals = [0] * community_num
    for pos in range(len(C)):
        group = C[pos]
        totals[group] += A[pos]
    return np.array(totals)

@jit
def tag_count_last(A, C):
    """Count the rows per community label in ``C`` over the whole sequence.

    ``A`` is accepted for signature parity with ``tag_correct_last`` but is not read.
    Returns an array of length ``community_num`` indexed by community label.
    """
    tallies = [0] * community_num
    for group in C:
        tallies[group] += 1
    return np.array(tallies)

def init_dict():
    """Return a fresh all-zero array with one slot per community (length ``community_num``)."""
    return np.array([0] * community_num)

76


In [14]:
#
# Per-row running accuracy of the user on the row's community.
# groupby().apply() yields one array per user in group-key order and hstack concatenates them;
# this lines up with train_df's rows only if train_df is already ordered by user_id - TODO confirm.
tag_acc_array = train_df.groupby("user_id").apply(lambda x: tag_accuracy(x["answered_correctly"].values, x["community"].values))
tag_acc_array = np.hstack(tag_acc_array)
tag_acc_array = tag_acc_array[train_index]

# Final per-user, per-community totals, carried into inference; users missing from the dicts
# get a zero array from init_dict.
user_community_count_dict = train_df.groupby("user_id").apply(lambda x: tag_count_last(x["answered_correctly"].values, x["community"].values)).to_dict(defaultdict(init_dict))
user_community_correct_dict = train_df.groupby("user_id").apply(lambda x: tag_correct_last(x["answered_correctly"].values, x["community"].values)).to_dict(defaultdict(init_dict))
    
# Number of rows per community over the whole of train_df, both as a lookup dict and as a per-row feature.
community_agg = train_df.groupby('community')[target].agg(['count'])
community_count_dict = community_agg['count'].astype('int32').to_dict(defaultdict(int))
community_count_array = train_df['community'].map(community_agg['count']).astype('int32').values
del community_agg
community_count_array = community_count_array[train_index]
    
# Raw community label as a feature; the column is then dropped from train_df.
community_array = train_df["community"].values
community_array = community_array[train_index]
train_df.drop('community', axis=1, inplace=True)

In [15]:
# Running accuracy of each user over the rows *before* the current one:
# 'lag' is the previous row's target, its cumulative sum is divided by the 0-based row number.
train_df['lag'] = train_df.groupby('user_id')[target].shift()
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
user_correctness_array = np.array(cum['cumsum'] / cum['cumcount'])
user_correctness_array = user_correctness_array[train_index]
train_df.drop(columns=['lag'], inplace=True)
del cum

In [16]:
@jit
def part_count_calc(P):
    """Running count of rows per part for one user's sequence of part labels.

    For every row the result holds how many rows so far (current row included)
    carry the same part label. Labels index a counter list of length 8.
    """
    tally = [0] * 8
    running = []
    for part in P:
        tally[part] += 1
        running.append(tally[part])
    return np.array(running)

@jit
def part_count_dict_calc(P):
    """Total number of rows per part label in ``P``, as an array of length 8 indexed by label."""
    tally = [0] * 8
    for part in P:
        tally[part] += 1
    return np.array(tally)

def part_dict_init():
    """Return a fresh all-zero integer array of length 8 (one slot per part index)."""
    return np.array([0] * 8)

# Per-row running count of the user's rows in the current row's part (current row included).
# The per-user arrays are concatenated in group-key order; assumes train_df is ordered by user_id - TODO confirm.
part_count_array = train_df.groupby("user_id").apply(lambda x: part_count_calc(x["part"].values))
part_count_array = np.hstack(part_count_array)
part_count_array = part_count_array[train_index]
# Share of the user's rows so far that fall in this part.
part_ratio_array = part_count_array / count_array

# Final per-user, per-part totals, carried into inference; users missing from the dict get zeros.
user_part_count_dict = train_df.groupby("user_id").apply(lambda x: part_count_dict_calc(x["part"].values)).to_dict(defaultdict(part_dict_init))

# Raw part label as a feature; the column is then dropped from train_df.
part_array = train_df.part.values
train_df.drop("part", axis=1, inplace=True)
part_array = part_array[train_index]

In [17]:
# prior_question_had_explanation_mean
# Running mean of the flag over each user's *previous* rows.
# shift() leaves a missing value on every user's first row, and astype(bool) turns a missing
# value into True, which added 1 to every running sum. Filling the shifted-in slot with False
# makes the training feature equal the inference formula
# (sum of the flag over earlier rows / number of earlier rows).
train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift(fill_value=False).astype(bool)
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
prior_question_had_explanation_mean_array = np.array(cum['cumsum'] / cum['cumcount'])
prior_question_had_explanation_mean_array = prior_question_had_explanation_mean_array[train_index]

# Per-user total of the flag over all rows, carried into inference as the starting running sum.
user_prior_question_had_explanation_sum_agg = train_df.groupby('user_id')["prior_question_had_explanation"].agg(['sum'])
user_prior_question_had_explanation_sum_dict = user_prior_question_had_explanation_sum_agg['sum'].astype('int32').to_dict(defaultdict(int))
train_df.drop(columns=['lag'], inplace=True)
del cum, user_prior_question_had_explanation_sum_agg

# Raw flag as a feature; the column is then dropped from train_df.
prior_question_had_explanation_array = train_df.prior_question_had_explanation.values
train_df.drop('prior_question_had_explanation', axis=1, inplace=True)
prior_question_had_explanation_array = prior_question_had_explanation_array[train_index]

In [18]:
# Precomputed first-attempt flag per row.
# It is assigned positionally, so the CSV is assumed to have exactly one row per row of the
# filtered train_df, in the same order - TODO confirm.
first_attempt_df = pd.read_csv("../input/riiid-additional-data/content_first_attempt.csv")
first_attempt_array = first_attempt_df.first_attempt.values
train_df["first_attempt"] = first_attempt_array

# Running per-user sum of the flag (current row included), and its final value per user,
# which seeds the running count used at inference.
unique_attempt_array= train_df.groupby("user_id")["first_attempt"].cumsum().values
train_df["unique_attempt"] = unique_attempt_array
user_unique_agg = train_df.groupby('user_id')["unique_attempt"].agg(['max'])
user_unique_dict = user_unique_agg['max'].astype('int32').to_dict(defaultdict(int))

# Keep only the training rows and drop the helper columns again.
first_attempt_array = first_attempt_array[train_index]
unique_attempt_array = unique_attempt_array[train_index]
train_df.drop(['first_attempt', 'unique_attempt'], axis=1, inplace=True)
del first_attempt_df, user_unique_agg

clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [19]:
# Per-user totals of the target (sum) and of rows (count); these seed the running
# counters that the inference loop keeps updating. Missing users default to 0.
# NOTE(review): values are cast to int16 - confirm no user exceeds 32767 rows.
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
del user_agg['sum']
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
del user_agg['count']
clear_mem()

# Per-content totals of the target and of rows, as lookup dicts for inference.
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

# Per-row features: how often the content appears, and its mean target (sum / count).
# Both are computed over the whole of train_df, the current row included.
content_count_array = train_df['content_id'].map(content_agg['count']).astype('int32').values
content_id_array = train_df['content_id'].map(content_agg['sum'] / content_agg['count']).values
del content_agg
clear_mem()

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [20]:
# benefit of solving difficult questions
#point_array = 1 / (content_id_array + 0.1)
# Restrict the per-content features to the training rows.
content_id_array = content_id_array[train_index]
content_count_array = content_count_array[train_index]

# Precomputed per-row "got point" feature, sliced to the training rows.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../input/riiid-premade-data/got_point_array.pickle','rb') as f:
    got_point_array = pickle.load(f)
got_point_array = got_point_array[train_index]
    
# Precomputed per-user running point sum; the inference loop keeps adding to it.
with open('../input/riiid-premade-data/user_point_sum_dict.pickle','rb') as f:
    user_point_sum_dict = pickle.load(f)

In [21]:
%%time
# content_id is no longer needed as a raw column (its per-row features were extracted earlier).
train_df.drop(["content_id"], axis=1, inplace=True)

# Precomputed per-user structure indexed by content_id; the inference loop reads
# user_content_dict[user_id][content_id] as the first-attempt flag and then sets it to False.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../input/riiid-premade-data/user_content_dict.pickle','rb') as f:
    user_content_dict = pickle.load(f)

CPU times: user 4.36 s, sys: 2.49 s, total: 6.85 s
Wall time: 23 s


In [22]:
# Precomputed per-row "last 7 answers" feature, sliced to the training rows.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../input/riiid-premade-data/answered_correctly_last7_array.pickle','rb') as f:
    answered_correctly_last7_array = pickle.load(f)
answered_correctly_last7_array = answered_correctly_last7_array[train_index]
    
# Per-user array of most recent answers; the inference loop keeps it at a maximum length of 7.
with open('../input/riiid-premade-data/user_last7_answer_dict.pickle','rb') as f:
    user_last7_answer_dict = pickle.load(f)

In [23]:
@jit
def continuous_correct(A):
    """Length of the run of consecutive 1s immediately before each position of ``A``.

    The value for a row is emitted before that row is inspected, so the first
    value is always 0; any entry other than 1 resets the run.
    """
    streaks = []
    run = 0
    for answer in A:
        streaks.append(run)
        run = run + 1 if answer == 1 else 0
    return np.array(streaks)

@jit
def continuous_incorrect(A):
    """Length of the run of consecutive 0s immediately before each position of ``A``.

    The value for a row is emitted before that row is inspected, so the first
    value is always 0; any entry other than 0 resets the run.
    """
    streaks = []
    run = 0
    for answer in A:
        streaks.append(run)
        run = run + 1 if answer == 0 else 0
    return np.array(streaks)

@jit
def continuous_correct_dict_calc(A):
    """Length of the run of consecutive 1s at the end of ``A`` (0 if the last entry is not 1)."""
    run = 0
    for answer in A:
        run = run + 1 if answer == 1 else 0
    return run

@jit
def continuous_incorrect_dict_calc(A):
    """Length of the run of consecutive 0s at the end of ``A`` (0 if the last entry is not 0)."""
    run = 0
    for answer in A:
        run = run + 1 if answer == 0 else 0
    return run

#continuous_correct_array = train_df.groupby("user_id").apply(lambda x: continuous_correct(x["answered_correctly"].values))
#continuous_correct_array = np.hstack(continuous_correct_array)
#continuous_correct_array = continuous_correct_array[train_index]

#continuous_incorrect_array = train_df.groupby("user_id").apply(lambda x: continuous_incorrect(x["answered_correctly"].values))
#continuous_incorrect_array = np.hstack(continuous_incorrect_array)
#continuous_incorrect_array = continuous_incorrect_array[train_index]

#continuous_incorrect_dict = train_df.groupby("user_id").apply(lambda x: continuous_incorrect_dict_calc(x["answered_correctly"].values)).to_dict(defaultdict(int))
#continuous_correct_dict = train_df.groupby("user_id").apply(lambda x: continuous_correct_dict_calc(x["answered_correctly"].values)).to_dict(defaultdict(int))

In [24]:
# Move the label column out of train_df and keep only the training rows.
answered_correctly_array = train_df.pop(target).values[train_index]

# data formation

In [25]:
if not FULL_TRAIN:
    # Keep only the selected rows and renumber them 0..n-1. From here on train_index and
    # valid_index are positions into the feature arrays that were already sliced with the
    # old train_index; this assumes that old train_index was in ascending order.
    train_df = train_df[train_df.index.isin(train_index)].reset_index(drop=True)
    if CV_SCHEME == "original":
        # Validation = the last valid_size rows of every user.
        valid_index = list(train_df.groupby('user_id').tail(valid_size).index)
    else:
        # Validation = the 2,500,000 rows with the largest virtual timestamps.
        valid_index = list(train_df['viretual_time_stamp'].nlargest(2500000).index)
    # Everything that is not validation is used for training.
    train_index = list(train_df[~train_df.index.isin(valid_index)].index)
del train_df

In [26]:
# Feature name -> per-row array (already sliced to the selected rows).
# The key order defines the column order of the training matrix and of the
# frame passed to model.predict at inference time.
features_dict = {
    'content_id': content_id_array,
    'prior_question_elapsed_time': prior_question_elapsed_time_array,
    'prior_question_had_explanation':  prior_question_had_explanation_array,
    'user_correctness': user_correctness_array,
    'part': part_array,
    'content_count': content_count_array,
    'count': count_array,
    'first_attempt': first_attempt_array,
    'unique_attempt': unique_attempt_array,
    'part_count': part_count_array,
    'part_ratio': part_ratio_array,
    'prior_question_had_explanation_mean': prior_question_had_explanation_mean_array,
    'got_point': got_point_array,
    'answered_correctly_last7': answered_correctly_last7_array,
    'timediff': timediff_array,
    'timediff2': timediff2_array,
    'timediff3': timediff3_array,
    'timediff4': timediff4_array,
    'community': community_array,
    'tag_acc': tag_acc_array,
    'community_count': community_count_array,
    'time_dd': time_dd_array,
    #'continuous_correct': continuous_correct_array,
    #'continuous_incorrect': continuous_incorrect_array,
}

features = list(features_dict.keys())
print(len(features))

# Drop the module-level names so that features_dict holds the only reference to each array;
# removing an entry from the dict while the training matrix is built then actually frees it.
# timediff3_array and timediff4_array were missing from this list and stayed alive during training.
del content_id_array, prior_question_elapsed_time_array, prior_question_had_explanation_array
del user_correctness_array, part_array
del content_count_array, count_array, first_attempt_array, unique_attempt_array
del part_ratio_array, part_count_array
del prior_question_had_explanation_mean_array, got_point_array
del answered_correctly_last7_array, timediff_array, timediff2_array, timediff3_array, timediff4_array
del community_array, tag_acc_array, community_count_array, time_dd_array#, continuous_correct_array, continuous_incorrect_array

22


In [27]:
# Report the data size as (rows, feature columns + 1 for the label).
if FULL_TRAIN:
    print(len(train_index), len(features)+1)
else:
    print((len(train_index), len(features)+1), (len(valid_index), len(features)+1))

39633251 23


# Train

In [28]:
# LightGBM parameters for the final model: binary objective, evaluated with AUC.
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 800,
    'num_leaves': 80
}

In [29]:
if FULL_TRAIN:
    # Allocate the float32 matrix once (uninitialised; every column is overwritten below).
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)

    # Copy one feature at a time and drop it from the dict straight away to limit peak memory.
    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array.astype(np.float32)
    tr_data = lgb.Dataset(X_train, label=y_train)
else:
    # Same procedure, but each feature array is split by train_index / valid_index.
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)
    X_valid = np.ndarray(shape=(len(valid_index), len(features)), dtype=np.float32)

    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature][train_index].astype(np.float32).reshape(-1)
        X_valid[:,idx] = features_dict[feature][valid_index].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array[train_index].astype(np.float32)
    y_valid = answered_correctly_array[valid_index].astype(np.float32)

    tr_data = lgb.Dataset(X_train, label=y_train)
    va_data = lgb.Dataset(X_valid, label=y_valid)

In [30]:
import optuna
from optuna import Trial
from sklearn.metrics import roc_auc_score

def objective(trial: Trial, fast_check=True, target_meter=0, return_info=False):
    """Optuna objective: train LightGBM with a sampled learning rate and return validation AUC.

    Reads the module-level ``tr_data``, ``va_data``, ``X_valid`` and ``y_valid``, which are
    only created when ``FULL_TRAIN`` is False.

    Parameters
    ----------
    trial : optuna Trial used to sample ``learning_rate`` uniformly from [0.01, 0.1].
    fast_check, target_meter, return_info : accepted but not used in the body.

    Returns
    -------
    ROC AUC of the best iteration's predictions on the validation set.
    """
    # 'learning_rate' used to appear twice in this literal; the later (sampled) entry silently
    # replaced the fixed 0.05, so only the sampled one is kept.
    params = {
        'objective': 'binary',
        'seed': 42,
        'metric': 'auc',
        'max_bin': 800,
        #'num_leaves': 80
        #'num_leaves': trial.suggest_int('num_leaves', 2, 128),
        #'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
        #'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        #'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        #"bagging_freq": 5,
        #"bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        #"feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
    }
    
    model = lgb.train(params, tr_data ,valid_sets=[tr_data, va_data],
           num_boost_round=1950,early_stopping_rounds=15,verbose_eval = 50)
    valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
    valid_score = roc_auc_score(y_valid, valid_predict)
    
    return valid_score

#study = optuna.create_study(direction='maximize') #maximize or minimize
#study.optimize(objective, n_trials=3)
#print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

In [31]:
print("training starts")
if FULL_TRAIN:
    # Fixed number of rounds on all selected rows; no validation set, so no early stopping.
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=3000,
        valid_sets=None, 
        )
    del X_train, y_train
else:
    # Validate on va_data and stop once validation AUC has not improved for 15 rounds.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments are believed to
    # have been replaced by callbacks in LightGBM 4.x - confirm against the installed version.
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round= 1900,
        valid_sets=[tr_data, va_data], 
        early_stopping_rounds=15,
        verbose_eval=50
        )
    del X_train, y_train, X_valid, y_valid

training starts


# Inference

In [32]:
# Competition environment and its batch iterator.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Previous batch; None until the first batch has been processed.
prior_test_df = None

In [33]:
%%time
# Main inference loop: for every test batch, (1) fold the previous batch's now-revealed answers
# into the running state dictionaries, (2) build the same features as in training from that
# state, (3) predict and submit.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The first row of the current batch carries the answers of the previous batch as text.
        # NOTE(review): eval() executes that text as Python; ast.literal_eval or json.loads would
        # parse a list literal without executing code - confirm the format and replace.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        # Drop rows answered with -1 so that the remaining rows line up with the prior_* arrays
        # saved from the previous batch's question-only frame (presumably -1 marks lectures - verify).
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
         
        for user_id, content_id, answered_correctly, first_attempt_ornot, prior_explanation, prior_point, prior_community in zip(user_ids, content_ids, 
                                                            targets, 
                                                            prior_f_attempt_arrays,
                                                            p_prior_question_had_explanation,
                                                            prior_point_array,
                                                            prior_community_arrays):
            
            # Running per-user and per-content totals.
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            user_unique_dict[user_id] += first_attempt_ornot
            user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation
            # Points are earned only for correct answers.
            user_point_sum_dict[user_id] += prior_point * answered_correctly
            # Sliding window of the user's most recent answers, capped at 7 entries.
            if len(user_last7_answer_dict[user_id])==7:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])[1:]
            else:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])
            
            # Per-user / per-community totals and the global community count.
            user_community_correct_dict[user_id][prior_community] += answered_correctly
            user_community_count_dict[user_id][prior_community] += 1
            community_count_dict[prior_community] += 1            
            #if answered_correctly == 1:
            #    continuous_correct_dict[user_id] += 1
            #    continuous_incorrect_dict[user_id] = 0
            #else:
            #    continuous_correct_dict[user_id] = 0
            #    continuous_incorrect_dict[user_id] += 1
            
    # Keep an unfiltered copy of this batch; its answers arrive with the next batch.
    prior_test_df = test_df.copy()
           
    # Only rows with content_type_id == 0 are scored; attach part / community from questions_df.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)

    # Saved for the state update performed at the start of the next iteration.
    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    prior_community_arrays = test_df['community'].values
    
    # Per-row buffers for the state-derived features of this batch.
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part_count = np.zeros(len(test_df), dtype=np.int32)
    first_attempt_values = []
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    got_point_array = np.zeros(len(test_df), dtype=np.float32)
    user_last7_accuracy_array = np.zeros(len(test_df), dtype=np.float32)
    timediff_array = np.zeros(len(test_df), dtype = np.int64)
    timediff2_array = np.zeros(len(test_df), dtype = np.int64)
    timediff3_array = np.zeros(len(test_df), dtype = np.int64)
    timediff4_array = np.zeros(len(test_df), dtype = np.int64)
    tag_acc_array = np.zeros(len(test_df), dtype=np.float32)
    community_count_array = np.zeros(len(test_df), dtype=np.int32)
    time_dd_array = np.zeros(len(test_df), dtype = np.int64)
    #user_con_correct_array = np.zeros(len(test_df), dtype=np.int16)
    #user_con_incorrect_array = np.zeros(len(test_df), dtype=np.int16)
    
    for i, (user_id, content_id, timestamp, community, part) in enumerate(zip(test_df['user_id'].values, 
                                                             test_df['content_id'].values,
                                                             test_df['timestamp'].values,
                                                             test_df['community'].values,
                                                             test_df['part'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        # Part counts and the first-attempt flag do not depend on the answer, so their state is
        # updated immediately; the part count includes the current row.
        part_count[i] = user_part_count_dict[user_id][part] + 1
        user_part_count_dict[user_id][part] += 1
        first_attempt_values.append(user_content_dict[user_id][content_id])
        user_content_dict[user_id][content_id] = False             
        user_unique_count[i] = user_unique_dict[user_id]
        user_prior_question_had_explanation_sum[i] = user_prior_question_had_explanation_sum_dict[user_id]
        got_point_array[i] = user_point_sum_dict[user_id]
        
        # Mean of the last 7 answers; NaN until the user has a full window.
        if len(user_last7_answer_dict[user_id])==7:
            user_last7_accuracy_array[i] = user_last7_answer_dict[user_id].mean()
        else:
            user_last7_accuracy_array[i] = np.nan
            
        # Time gaps to the user's previous 1..4 timestamps. The stored history holds the most
        # recent timestamps, oldest first; gaps without enough history are set to -1.
        if len(user_timestamp_max_dict[user_id]) ==0:
            timediff_array[i] = -1
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        elif len(user_timestamp_max_dict[user_id]) ==1:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        elif len(user_timestamp_max_dict[user_id]) ==2:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])  
            
        elif len(user_timestamp_max_dict[user_id]) ==3:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff4_array[i] = -1
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]]) 
    
        else:
            # Full history: compute all four gaps, then drop the oldest timestamp after appending.
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][3]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff4_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            time_dd_array[i] = timediff2_array[i] - timediff_array[i]
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])[1:]  
            
        # User's accuracy on this community so far; -1 when the user has no history in it.
        if user_community_count_dict[user_id][community] == 0:
            tag_acc_array[i] = -1
        else:
            tag_acc_array[i] = user_community_correct_dict[user_id][community] / user_community_count_dict[user_id][community]
            
        community_count_array[i] = community_count_dict[community]
        
        #user_con_correct_array[i] = continuous_correct_dict[user_id]
        #user_con_incorrect_array[i] = continuous_incorrect_dict[user_id]
        
    # Assemble the feature columns under the same names used as keys of features_dict.
    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    # From here on 'content_id' holds the content's mean target, not the raw id.
    test_df['content_id'] = content_sum / content_count
    # Running row number = position within this batch for the user + rows seen before the batch.
    test_df['count'] = 1
    test_df['count'] = test_df.groupby("user_id")["count"].cumsum()
    test_df['count'] += user_count
    test_df['part_count'] = part_count
    test_df['part_ratio'] = part_count / test_df['count'].values
    test_df["first_attempt"] = first_attempt_values
    test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    test_df["unique_attempt"] += user_unique_count
    test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_sum / user_count
    test_df['got_point'] = got_point_array / user_count
    test_df['answered_correctly_last7'] = user_last7_accuracy_array
    test_df['timediff'] = timediff_array
    test_df['timediff2'] = timediff2_array
    test_df['timediff3'] = timediff3_array
    test_df['timediff4'] = timediff4_array
    test_df['tag_acc'] = tag_acc_array
    test_df['community_count'] = community_count_array
    test_df['time_dd'] = time_dd_array
    #test_df['continuous_correct'] = user_con_correct_array
    #test_df['continuous_incorrect'] = user_con_incorrect_array
    
    # Saved for the state update at the start of the next iteration:
    # the first-attempt flags and the points on offer, 1 / (content mean target + 0.1).
    prior_f_attempt_arrays = test_df['first_attempt'].values
    prior_point_array = 1 / (test_df.content_id.values + 0.1)
    
    # Predict on the feature columns in training order and submit the batch.
    test_df[target] = model.predict(test_df[features])
    env.predict(test_df[['row_id', target]])

CPU times: user 1.35 s, sys: 53.9 ms, total: 1.4 s
Wall time: 740 ms
