- add timediff2
- original cv scheme

In [1]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from bitarray import bitarray
from functools import partial

from tqdm._tqdm_notebook import tqdm_notebook
from numba import jit
import random

tqdm_notebook.pandas(desc="progress: ")

_ = np.seterr(divide='ignore', invalid='ignore')
pd.set_option("max_rows", 100)
pd.set_option("max_columns", 100)

random.seed(1)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if sys.path[0] == '':
  from pandas import Panel


In [3]:
def make_bitarray():
    a = bitarray(32737, endian='little')
    a.setall(True)   
    return a

def clear_mem():
    %reset -f out
    %reset -f in
    gc.collect()

@jit
def target2dec(A):
    ans = [A[0]]
    for i in range(1, len(A)): 
        total = ans[-1] * 0.1 + A[i]
        ans.append(total)
    return np.array(ans)  

In [4]:
FULL_TRAIN = False
CV_SCHEME = "original" #"time"

# Preprocess

In [5]:
data_types_dict = {
    #'row_id': 'uint32',
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    #'content_type_id': 'int8',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
}
target = 'answered_correctly'

In [6]:
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

train_df['prior_question_had_explanation'].fillna(False, inplace=True)
train_df['prior_question_elapsed_time'].fillna(0, inplace=True)

train_df['day_timestamp'] = (train_df['timestamp'] // 86400000) + 1 # days from start
train_df = train_df.astype(data_types_dict)

In [7]:
if CV_SCHEME=="time":
    max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
    max_timestamp_u.columns = ['user_id', 'max_time_stamp']
    MAX_TIME_STAMP = max_timestamp_u.max_time_stamp.max()

    def rand_time(max_time_stamp):
        interval = MAX_TIME_STAMP - max_time_stamp
        rand_time_stamp = random.randint(0,interval)
        return rand_time_stamp

    max_timestamp_u['rand_time_stamp'] = max_timestamp_u.max_time_stamp.apply(rand_time)
    train_df = train_df.merge(max_timestamp_u, on='user_id', how='left')
    train_df['viretual_time_stamp'] = train_df.timestamp + train_df['rand_time_stamp']

    del train_df['max_time_stamp']
    del train_df['rand_time_stamp']
    del max_timestamp_u

    train_index = list(train_df['viretual_time_stamp'].nlargest(10000000).index)

else:
    if FULL_TRAIN:
        train_size = 250
    else:
        train_size = 24
        valid_size = 6
    
    train_index = list(train_df.groupby('user_id').tail(train_size).index)

In [8]:
timediff_array = train_df.groupby("user_id")["timestamp"].diff().values
timediff_array = timediff_array[train_index]
clear_mem()

timediff2_array = train_df.groupby("user_id")["timestamp"].diff(2).values
timediff2_array = timediff2_array[train_index]
clear_mem()

#user_timestamp_max_dict = train_df.groupby("user_id").tail(1)["timestamp"].astype('int64').to_dict(defaultdict(lambda: -1))
user_timestamp_max_dict = train_df.groupby("user_id")["timestamp"].apply(lambda x: x[-2:].values).to_dict(defaultdict(partial(np.ndarray, 0, dtype="int64")))
train_df.drop("timestamp", axis=1, inplace=True)
timediff_array = np.nan_to_num(timediff_array, nan=-1)
timediff2_array = np.nan_to_num(timediff2_array, nan=-1)

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [9]:
prior_question_elapsed_time_array = train_df.prior_question_elapsed_time.values
train_df.drop("prior_question_elapsed_time", axis =1, inplace=True)
prior_question_elapsed_time_array = prior_question_elapsed_time_array[train_index]

In [10]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 3], #4
    dtype={'question_id': 'int16', 'part': 'int8'}  #'tags': 'str'
)

#additional_q_df = pd.read_csv('../input/riiid-question-clustering/question_cmnts.csv')
#questions_df["community"] = 
    
#questions_df['tags'] = questions_df['tags'].apply(lambda ts: [int(x) for x in str(ts).split() if x != 'nan'])

train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
train_df.drop(columns=['question_id'], inplace=True)

In [11]:
train_df['lag'] = train_df.groupby('user_id')[target].shift()
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
user_correctness_array = np.array(cum['cumsum'] / cum['cumcount'])
user_correctness_array = user_correctness_array[train_index]
train_df.drop(columns=['lag'], inplace=True)
del cum

In [12]:
#
print(1)
train_df["part_count"] = train_df["part"] ==1
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part1_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part1_count"] = part1_count_array

part1_count_agg = train_df.groupby('user_id')["part1_count"].agg(['max'])
part1_count_agg['max'].fillna(0,inplace=True)
part1_count_dict = part1_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part1_count_agg
train_df.drop("part1_count", axis=1, inplace=True)
part1_count_array = part1_count_array[train_index] 

#
print(2)
train_df["part_count"] = train_df["part"] ==2
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part2_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part2_count"] = part2_count_array

part2_count_agg = train_df.groupby('user_id')["part2_count"].agg(['max'])
part2_count_agg['max'].fillna(0,inplace=True)
part2_count_dict = part2_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part2_count_agg
train_df.drop("part2_count", axis=1, inplace=True)
part2_count_array = part2_count_array[train_index] 

#
print(3)
train_df["part_count"] = train_df["part"] ==3
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part3_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part3_count"] = part3_count_array

part3_count_agg = train_df.groupby('user_id')["part3_count"].agg(['max'])
part3_count_agg['max'].fillna(0,inplace=True)
part3_count_dict = part3_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part3_count_agg
train_df.drop("part3_count", axis=1, inplace=True)
part3_count_array = part3_count_array[train_index] 

#
print(4)
train_df["part_count"] = train_df["part"] ==4
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part4_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part4_count"] = part4_count_array

part4_count_agg = train_df.groupby('user_id')["part4_count"].agg(['max'])
part4_count_agg['max'].fillna(0,inplace=True)
part4_count_dict = part4_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part4_count_agg
train_df.drop("part4_count", axis=1, inplace=True)
part4_count_array = part4_count_array[train_index] 

#
print(5)
train_df["part_count"] = train_df["part"] ==5
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part5_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part5_count"] = part5_count_array

part5_count_agg = train_df.groupby('user_id')["part5_count"].agg(['max'])
part5_count_agg['max'].fillna(0,inplace=True)
part5_count_dict = part5_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part5_count_agg
train_df.drop("part5_count", axis=1, inplace=True)
part5_count_array = part5_count_array[train_index] 

#
print(6)
train_df["part_count"] = train_df["part"] ==6
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part6_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df["part6_count"] = part6_count_array

part6_count_agg = train_df.groupby('user_id')["part6_count"].agg(['max'])
part6_count_agg['max'].fillna(0,inplace=True)
part6_count_dict = part6_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part6_count_agg
train_df.drop("part6_count", axis=1, inplace=True)
part6_count_array = part6_count_array[train_index] 

#
print(7)
train_df["part_count"] = train_df["part"] ==7
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part7_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)
train_df["part7_count"] = part7_count_array

part7_count_agg = train_df.groupby('user_id')["part7_count"].agg(['max'])
part7_count_agg['max'].fillna(0,inplace=True)
part7_count_dict = part7_count_agg['max'].astype('int32').to_dict(defaultdict(int))
del part7_count_agg
train_df.drop("part7_count", axis=1, inplace=True)
part7_count_array = part7_count_array[train_index] 

part_array = train_df.part.values
train_df.drop("part", axis=1, inplace=True)
part_array = part_array[train_index]

1
2
3
4
5
6
7


In [13]:
# prior_question_had_explanation_mean
train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift().astype(bool)
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
prior_question_had_explanation_mean_array = np.array(cum['cumsum'] / cum['cumcount'])
prior_question_had_explanation_mean_array = prior_question_had_explanation_mean_array[train_index]

user_prior_question_had_explanation_sum_agg = train_df.groupby('user_id')["prior_question_had_explanation"].agg(['sum'])
user_prior_question_had_explanation_sum_dict = user_prior_question_had_explanation_sum_agg['sum'].astype('int32').to_dict(defaultdict(int))
train_df.drop(columns=['lag'], inplace=True)
del cum, user_prior_question_had_explanation_sum_agg

prior_question_had_explanation_array = train_df.prior_question_had_explanation.values
train_df.drop('prior_question_had_explanation', axis=1, inplace=True)
prior_question_had_explanation_array = prior_question_had_explanation_array[train_index]

In [14]:
first_attempt_df = pd.read_csv("../input/riiid-additional-data/content_first_attempt.csv")
first_attempt_array = first_attempt_df.first_attempt.values
train_df["first_attempt"] = first_attempt_array

unique_attempt_array= train_df.groupby("user_id")["first_attempt"].cumsum().values
train_df["unique_attempt"] = unique_attempt_array
user_unique_agg = train_df.groupby('user_id')["unique_attempt"].agg(['max'])
user_unique_dict = user_unique_agg['max'].astype('int32').to_dict(defaultdict(int))

first_attempt_array = first_attempt_array[train_index]
unique_attempt_array = unique_attempt_array[train_index]
train_df.drop(['first_attempt', 'unique_attempt'], axis=1, inplace=True)
del first_attempt_df, user_unique_agg

clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [15]:
#
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
del user_agg

#
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

content_count_array = train_df['content_id'].map(content_agg['count']).astype('int32').values
content_id_array = train_df['content_id'].map(content_agg['sum'] / content_agg['count']).values
del content_agg

# benefit of solving difficult questions
# https://stackoverflow.com/questions/35215161/most-efficient-way-to-map-function-over-numpy-array
func = lambda t: 1/(t + 0.1)
vfunc = np.vectorize(func)
point_array = vfunc(content_id_array)
content_count_array = content_count_array[train_index]
content_id_array = content_id_array[train_index]
clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [16]:
train_df["got_point"] = point_array * train_df["answered_correctly"].values
del point_array
user_point_agg = train_df.groupby('user_id')["got_point"].agg(['sum'])
user_point_sum_dict = user_point_agg['sum'].astype('float32').to_dict(defaultdict(float))
del user_point_agg

train_df['lag'] = train_df.groupby('user_id')["got_point"].shift()
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
got_point_array = np.array(cum['cumsum'] / cum['cumcount'])
train_df.drop(columns=['lag', 'got_point'], inplace=True)
del cum
got_point_array = got_point_array[train_index]

clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [17]:
%%time
user_content_agg = train_df.groupby("user_id")["content_id"].unique().reset_index().set_index("user_id")

value = []
for j in user_content_agg.index:
    a = bitarray(32737, endian='little')
    a.setall(True)
    for i in user_content_agg.loc[j][0]:
        a[i] = 0
    value.append(a)
    
user_content_agg["content_exp"] = value
user_content_dict = user_content_agg["content_exp"].to_dict(defaultdict(make_bitarray))
del user_content_agg, value

CPU times: user 2min 30s, sys: 5 s, total: 2min 35s
Wall time: 2min 35s


In [18]:
train_df["count"] = 1

# normal cumsum
count_array = train_df.groupby("user_id")["count"].cumsum().values
count_array = count_array[train_index]
train_df.drop("count", axis=1, inplace=True)

In [19]:
train_df["count_inday"] = 1
count_inday_array = train_df.groupby(["user_id","day_timestamp"])["count_inday"].cumsum().values
train_df["count_inday"] = count_inday_array
user_count_inday_dict = train_df.groupby("user_id").tail(1)["count_inday"].to_dict(defaultdict(int))
count_inday_array = count_inday_array[train_index]
train_df.drop("count_inday", axis=1, inplace=True)

In [20]:
user_day_timestamp_dict = train_df.groupby("user_id").tail(1)["day_timestamp"].to_dict(defaultdict(int))
day_timestamp_array = train_df["day_timestamp"].values
train_df.drop("day_timestamp", axis=1, inplace=True)
day_timestamp_array = day_timestamp_array[train_index]

In [21]:
# target2dec
#target_dec_array = train_df.groupby("user_id").apply(lambda x: target2dec(x["answered_correctly"].values))
#target_dec_array = np.hstack(target_dec_array)
#train_df["target_dec"] = target_dec_array
#target_dec_dict = train_df.groupby("user_id").tail(1)["target_dec"].to_dict()
#target_dec_array = train_df.groupby("user_id")["target_dec"].shift().values
#target_dec_array = target_dec_array[train_index]
#train_df.drop("target_dec", axis=1, inplace=True)  

In [22]:
answered_correctly_last7_array = train_df.groupby("user_id")["answered_correctly"].shift().rolling(window=7).mean().values
answered_correctly_last7_array = answered_correctly_last7_array[train_index]
user_last7_answer_dict = train_df.groupby("user_id")["answered_correctly"].apply(lambda x: 
                                                                                 x[-7:].values).to_dict(defaultdict(partial(np.ndarray, 0, dtype="int8")))

In [23]:
answered_correctly_array = train_df[target].values
train_df.drop(target, axis=1, inplace=True)
answered_correctly_array = answered_correctly_array[train_index]

# data formation

In [24]:
if not FULL_TRAIN:
    train_df = train_df[train_df.index.isin(train_index)].reset_index(drop=True)
    if CV_SCHEME == "original":
        valid_index = list(train_df.groupby('user_id').tail(valid_size).index)
    else:
        valid_index = list(train_df['viretual_time_stamp'].nlargest(2500000).index)
    train_index = list(train_df[~train_df.index.isin(valid_index)].index)
del train_df

In [25]:
features_dict = {
    'content_id': content_id_array,
    'prior_question_elapsed_time': prior_question_elapsed_time_array,
    'prior_question_had_explanation':  prior_question_had_explanation_array,
    'user_correctness': user_correctness_array,
    'part': part_array,
    'content_count': content_count_array,
    'count': count_array,
    'first_attempt': first_attempt_array,
    'day_timestamp': day_timestamp_array,
    'count_inday': count_inday_array,
    'unique_attempt': unique_attempt_array,
    'part1_count': part1_count_array,
    'part2_count': part2_count_array,
    'part3_count': part3_count_array,
    'part4_count': part4_count_array,
    'part5_count': part5_count_array,
    'part6_count': part6_count_array,
    'part7_count': part7_count_array,
    'prior_question_had_explanation_mean': prior_question_had_explanation_mean_array,
    'got_point': got_point_array,
    'answered_correctly_last7': answered_correctly_last7_array,   
    'timediff': timediff_array,
    'timediff2': timediff2_array,
}

features = list(features_dict.keys())
print(len(features))

del content_count_array, content_id_array, user_correctness_array, first_attempt_array, count_array, prior_question_had_explanation_mean_array
del part1_count_array, part2_count_array, part3_count_array, part4_count_array, part5_count_array, part6_count_array, part7_count_array
del answered_correctly_last7_array, timediff_array, timediff2_array

23


In [26]:
if FULL_TRAIN:
    print(len(train_index), len(features)+1)
else:
    print((len(train_index), len(features)+1), (len(valid_index), len(features)+1))

(6536675, 24) (2360984, 24)


# Train

In [27]:
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 800,
    'num_leaves': 80
}

In [28]:
if FULL_TRAIN:
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)

    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array.astype(np.float32)
    tr_data = lgb.Dataset(X_train, label=y_train)
    print("Full training starts")
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=3750,
        valid_sets=None, 
        )
    del X_train, y_train

else:
    X_train = np.ndarray(shape=(len(train_index), len(features)), dtype=np.float32)
    X_valid = np.ndarray(shape=(len(valid_index), len(features)), dtype=np.float32)

    for idx, feature in enumerate(features):
        X_train[:,idx] = features_dict[feature][train_index].astype(np.float32).reshape(-1)
        X_valid[:,idx] = features_dict[feature][valid_index].astype(np.float32).reshape(-1)
        del features_dict[feature]
    y_train = answered_correctly_array[train_index].astype(np.float32)
    y_valid = answered_correctly_array[valid_index].astype(np.float32)

    tr_data = lgb.Dataset(X_train, label=y_train)
    va_data = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=10000,
        valid_sets=[tr_data, va_data], 
        early_stopping_rounds=50,
        verbose_eval=50
        )
    del X_train, y_train, X_valid, y_valid

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.764757	valid_1's auc: 0.750388
[100]	training's auc: 0.768992	valid_1's auc: 0.755295
[150]	training's auc: 0.77111	valid_1's auc: 0.757577
[200]	training's auc: 0.772279	valid_1's auc: 0.758581
[250]	training's auc: 0.772996	valid_1's auc: 0.759093
[300]	training's auc: 0.773539	valid_1's auc: 0.759415
[350]	training's auc: 0.773997	valid_1's auc: 0.759653
[400]	training's auc: 0.774444	valid_1's auc: 0.759915
[450]	training's auc: 0.774883	valid_1's auc: 0.760138
[500]	training's auc: 0.775281	valid_1's auc: 0.760334
[550]	training's auc: 0.775641	valid_1's auc: 0.760511
[600]	training's auc: 0.776022	valid_1's auc: 0.760674
[650]	training's auc: 0.77636	valid_1's auc: 0.760809
[700]	training's auc: 0.776716	valid_1's auc: 0.760939
[750]	training's auc: 0.777049	valid_1's auc: 0.761051
[800]	training's auc: 0.777382	valid_1's auc: 0.761184
[850]	training's auc: 0.777747	valid_1's auc: 0.761337
[900]	t

# Inference

In [29]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

In [30]:
%%time
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        timestamps = prior_test_df['timestamp'].values
        prev_user = ""
         
        for user_id, content_id, answered_correctly, part, first_attempt_ornot, prior_explanation, prior_point, timestamp in zip(user_ids, content_ids, 
                                                            targets, 
                                                            prior_part_arrays, 
                                                            prior_f_attempt_arrays,
                                                            p_prior_question_had_explanation,
                                                            prior_point_array,
                                                            timestamps):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            if part == 1:
                part1_count_dict[user_id] +=1
            elif part == 2:
                part2_count_dict[user_id] +=1
            elif part == 3:
                part3_count_dict[user_id] +=1
            elif part == 4:
                part4_count_dict[user_id] +=1
            elif part == 5:
                part5_count_dict[user_id] +=1
            elif part == 6:
                part6_count_dict[user_id] +=1
            else:
                part7_count_dict[user_id] +=1
            user_unique_dict[user_id] += first_attempt_ornot
            user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation
            user_point_sum_dict[user_id] += prior_point * answered_correctly
            if len(user_last7_answer_dict[user_id])==7:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])[1:]
            else:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])
              
            if prev_user != user_id:
                if len(user_timestamp_max_dict[user_id])==2:
                    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])[1:]
                else:
                    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
                prev_user = user_id

    prior_test_df = test_df.copy()

    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    test_df['day_timestamp'] = (test_df['timestamp'] // 86400000) + 1

    prior_part_arrays = test_df['part'].values
    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part1_count = np.zeros(len(test_df), dtype=np.int32)
    part2_count = np.zeros(len(test_df), dtype=np.int32)
    part3_count = np.zeros(len(test_df), dtype=np.int32)
    part4_count = np.zeros(len(test_df), dtype=np.int32)
    part5_count = np.zeros(len(test_df), dtype=np.int32)
    part6_count = np.zeros(len(test_df), dtype=np.int32)
    part7_count = np.zeros(len(test_df), dtype=np.int32)
    user_count_inday = np.zeros(len(test_df), dtype=np.int32)
    first_attempt_values = []
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    got_point_array = np.zeros(len(test_df), dtype=np.float32)
    user_last7_accuracy_array = np.zeros(len(test_df), dtype=np.float32)
    timediff_array = np.zeros(len(test_df), dtype = np.int64)
    timediff2_array = np.zeros(len(test_df), dtype = np.int64)

    for i, (user_id, content_id, day_timestamp, timestamp) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values,
                                                             test_df['day_timestamp'].values,test_df['timestamp'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        part1_count[i] = part1_count_dict[user_id]
        part2_count[i] = part2_count_dict[user_id]
        part3_count[i] = part3_count_dict[user_id]
        part4_count[i] = part4_count_dict[user_id]
        part5_count[i] = part5_count_dict[user_id]
        part6_count[i] = part6_count_dict[user_id]
        part7_count[i] = part7_count_dict[user_id]
        first_attempt_values.append(user_content_dict[user_id][content_id])
        user_content_dict[user_id][content_id] = False 
        if user_day_timestamp_dict[user_id] == day_timestamp:
            user_count_inday_dict[user_id] += 1
        else:
            user_count_inday_dict[user_id] = 1
            user_day_timestamp_dict[user_id] = day_timestamp
        user_count_inday[i] = user_count_inday_dict[user_id]
        user_unique_count[i] = user_unique_dict[user_id]
        user_prior_question_had_explanation_sum[i] = user_prior_question_had_explanation_sum_dict[user_id]
        got_point_array[i] = user_point_sum_dict[user_id]
        if len(user_last7_answer_dict[user_id])==7:
            user_last7_accuracy_array[i] = user_last7_answer_dict[user_id].mean()
        else:
            user_last7_accuracy_array[i] = np.nan
        
        if len(user_timestamp_max_dict[user_id]) ==0:
            timediff_array[i] = -1
            timediff2_array[i] = -1
        elif len(user_timestamp_max_dict[user_id]) ==1:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff2_array[i] = -1
        else:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
        
    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['count'] = 1
    test_df['count'] = test_df.groupby("user_id")["count"].cumsum()
    test_df['count'] += user_count
    test_df['part1_count'] = part1_count
    test_df['part2_count'] = part2_count
    test_df['part3_count'] = part3_count
    test_df['part4_count'] = part4_count
    test_df['part5_count'] = part5_count
    test_df['part6_count'] = part6_count
    test_df['part7_count'] = part7_count
    test_df["first_attempt"] = first_attempt_values
    test_df['count_inday'] = user_count_inday
    test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    test_df["unique_attempt"] += user_unique_count
    test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_sum / user_count
    test_df['got_point'] = got_point_array / user_count
    test_df['answered_correctly_last7'] = user_last7_accuracy_array
    test_df['timediff'] = timediff_array
    test_df['timediff2'] = timediff2_array
    
    prior_f_attempt_arrays = test_df['first_attempt'].values
    prior_point_array = vfunc(test_df.content_id.values)

    test_df[target] = model.predict(test_df[features])
    env.predict(test_df[['row_id', target]])

CPU times: user 3.66 s, sys: 137 ms, total: 3.79 s
Wall time: 3.13 s
