- LightGBM (ver 68) + transformer (ver 1)
- ensemble ratio 7:3 (LightGBM : transformer)
- training size 350 (last rows kept per user), boost rounds 3750
- changed training size and boost rounds relative to ver 16 *

In [1]:
# Install datatable 0.11.0 from the wheel attached as an input dataset; all pip output is discarded.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from bitarray import bitarray
from functools import partial
import pickle
import math 

from tqdm._tqdm_notebook import tqdm_notebook
from numba import jit
import random

# Hook tqdm into pandas with a "progress: " label.
# NOTE(review): no progress_* call appears in the visible cells — confirm this is still needed.
tqdm_notebook.pandas(desc="progress: ")

# Several features below are ratios whose denominator can be zero (e.g. a user
# with no history); let those become nan/inf silently instead of warning.
_ = np.seterr(divide='ignore', invalid='ignore')

# Use the fully qualified option names: the bare "max_rows"/"max_columns"
# patterns can match more than one registered option on recent pandas
# versions, which makes set_option raise OptionError.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Fix the seed of Python's built-in RNG for reproducibility.
random.seed(1)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  
  from pandas import Panel


In [3]:
def make_bitarray():
    """Build a fresh little-endian bitarray of 32737 bits with every bit set."""
    bits = bitarray(32737, endian='little')
    bits.setall(True)
    return bits

def clear_mem():
    """Flush IPython's output cache and input history, then run the garbage collector.

    Only usable inside an IPython/Jupyter kernel because the body relies on
    ``%reset`` line magics.
    """
    %reset -f out
    %reset -f in
    gc.collect()

In [4]:
# Switch read when the data shapes are printed before training.
# NOTE(review): the False branch there expects a `valid_index` that the visible cells never define.
FULL_TRAIN = True

# Preprocess

In [5]:
# Columns loaded from train.csv and the dtype each one is cast to once
# missing values have been filled.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Name of the label column.
target = 'answered_correctly'

In [6]:
# Read only the columns named in data_types_dict, then hand the frame to pandas.
wanted_columns = set(data_types_dict)
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=wanted_columns).to_pandas()

In [7]:
# Keep only rows that carry a real label (rows whose label is -1 are dropped).
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# result of train_df[col]: that chained in-place form operates on a temporary
# and does not update train_df under pandas Copy-on-Write.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(0)

# With missing values gone, every column can take its compact dtype.
train_df = train_df.astype(data_types_dict)

In [8]:
# Settings shared with the transformer cells further down.
MAX_SEQ, ACCEPTED_USER_CONTENT_SIZE = 180, 4
EMBED_SIZE, BATCH_SIZE, DROPOUT = 128, 64, 0.1

# Distinct content ids present in the labelled training rows.
skills = train_df["content_id"].unique()
n_skill = len(skills)
print("number skills", n_skill)

number skills 13523


In [9]:
# Train on at most the last `train_size` rows of every user; train_index holds
# the row labels of those rows in train_df.
train_size = 350
train_index = train_df.groupby('user_id').tail(train_size).index.tolist()

In [10]:
# Running per-user row counter (1 for a user's first row), computed through a
# temporary column of ones and restricted to the training rows.
train_df["count"] = 1
count_array = train_df.groupby("user_id")["count"].cumsum().values[train_index]
del train_df["count"]

In [11]:
def _lagged_timestamp_gap(periods):
    """Per-user difference between each timestamp and the one `periods` rows earlier, for training rows."""
    gaps = train_df.groupby("user_id")["timestamp"].diff(periods).values
    return gaps[train_index]


timediff_array = _lagged_timestamp_gap(1)
clear_mem()

timediff2_array = _lagged_timestamp_gap(2)
clear_mem()

timediff3_array = _lagged_timestamp_gap(3)
clear_mem()

timediff4_array = _lagged_timestamp_gap(4)
clear_mem()

# Last (up to) four timestamps of every user, kept for inference; unknown users
# default to an empty int64 array.
user_timestamp_max_dict = (
    train_df.groupby("user_id")["timestamp"]
    .apply(lambda x: x[-4:].values)
    .to_dict(defaultdict(partial(np.ndarray, 0, dtype="int64")))
)

del train_df["timestamp"]

# Rows without enough history have NaN gaps; encode them as -1.
timediff_array, timediff2_array, timediff3_array, timediff4_array = (
    np.nan_to_num(gaps, nan=-1)
    for gaps in (timediff_array, timediff2_array, timediff3_array, timediff4_array)
)

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [12]:
# Move the column out of the frame and keep only the training rows.
prior_question_elapsed_time_array = train_df.pop("prior_question_elapsed_time").values[train_index]

In [13]:
# Question metadata: only the id and the part columns are read.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 3], 
    dtype={'question_id': 'int16', 'part': 'int8'} 
)

# Attach the precomputed community label of every question.
# NOTE(review): assigned by row position — assumes question_cmnts.csv is in
# the same row order as questions.csv; confirm.
additional_q_df = pd.read_csv('../input/riiid-question-clustering/question_cmnts.csv')
questions_df["community"] = additional_q_df["community"].astype('int8')
del additional_q_df 

# Join on the question_id column explicitly. The previous call passed
# right_on='question_id' together with right_index=True, in which case the
# join runs against questions_df's positional index and right_on is ignored,
# so it was only correct while row i happened to describe question i.
train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left').reset_index(drop=True)
train_df.drop(columns=['question_id'], inplace=True)

In [14]:
# Number of distinct community labels; used below as the length of the
# per-community counters.
# NOTE(review): assumes labels are 0..community_num-1 — confirm.
community_num = questions_df["community"].unique().size
print(community_num)

@jit
def tag_accuracy(A, C):
    """Running per-community accuracy of one user, excluding the current row.

    Parameters
    ----------
    A : sequence of 0/1 answers, in the user's chronological row order.
    C : sequence of community ids, aligned with ``A``.

    Returns
    -------
    numpy array with one value per row: the fraction of earlier rows in the
    same community that were answered correctly, or -1.0 when the user has not
    yet seen that community.
    """
    ans = []
    community_count = [0] * community_num
    community_correct = [0] * community_num
    for i in range(len(C)):
        c = C[i]
        if community_count[c] == 0:
            # Float sentinel keeps the list (and the returned array) uniformly
            # float; an int -1 mixed with float ratios gave an int64 result for
            # users with no repeated community and an ambiguous element type.
            ans.append(-1.0)
        else:
            ans.append(community_correct[c] / community_count[c])
        # Update the counters only after the feature for this row is emitted.
        community_count[c] += 1
        community_correct[c] += A[i]
    return np.array(ans)

@jit
def tag_correct_last(A, C):
    """Return, per community id, the sum of answers ``A`` over all rows of one user."""
    correct_per_community = [0] * community_num
    for row in range(len(C)):
        community = C[row]
        correct_per_community[community] += A[row]
    return np.array(correct_per_community)

@jit
def tag_count_last(A, C):
    """Return, per community id, how many rows of one user belong to it (``A`` is unused)."""
    rows_per_community = [0] * community_num
    for row in range(len(C)):
        community = C[row]
        rows_per_community[community] += 1
    return np.array(rows_per_community)

def init_dict():
    """Default factory: a fresh all-zero integer counter with one slot per community."""
    return np.zeros(community_num, dtype=int)

76


In [15]:
def _apply_per_user_community(fn):
    """Run ``fn(answers, communities)`` on every user's rows; returns one result per user."""
    return train_df.groupby("user_id").apply(
        lambda x: fn(x["answered_correctly"].values, x["community"].values)
    )


# Per-row running community accuracy. The per-user arrays are concatenated in
# groupby order and then indexed by row position.
# NOTE(review): this relies on train_df rows being grouped by ascending user_id — confirm.
tag_acc_array = np.hstack(_apply_per_user_community(tag_accuracy))[train_index]

# Final per-user community counters carried into inference; unseen users get zeros.
user_community_count_dict = _apply_per_user_community(tag_count_last).to_dict(defaultdict(init_dict))
user_community_correct_dict = _apply_per_user_community(tag_correct_last).to_dict(defaultdict(init_dict))

# Global number of labelled rows per community.
community_agg = train_df.groupby('community')[target].agg(['count'])
community_count_dict = community_agg['count'].astype('int32').to_dict(defaultdict(int))
community_count_array = train_df['community'].map(community_agg['count']).astype('int32').values[train_index]
del community_agg

# The raw community id is itself a feature; remove the column once extracted.
community_array = train_df.pop("community").values[train_index]

In [16]:
# Accuracy of each user over their earlier rows: shift the label by one row so
# the current answer never contributes to its own feature.
train_df['lag'] = train_df.groupby('user_id')[target].shift()
running = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
user_correctness_array = np.array(running['cumsum'] / running['cumcount'])[train_index]
del running
del train_df['lag']

In [17]:
@jit
def part_count_calc(P):
    """For each row of one user, count the rows so far (current one included) with the same part."""
    seen_per_part = [0] * 8
    running = []
    for row in range(len(P)):
        part = P[row]
        seen_per_part[part] += 1
        running.append(seen_per_part[part])
    return np.array(running)

@jit
def part_count_dict_calc(P):
    """Return an 8-slot array holding how many rows of one user fall in each part."""
    rows_per_part = [0] * 8
    for row in range(len(P)):
        part = P[row]
        rows_per_part[part] += 1
    return np.array(rows_per_part)

def part_dict_init():
    """Default factory: a fresh all-zero integer counter with eight part slots."""
    return np.zeros(8, dtype=int)

# Running count of rows in the same part, per user, for the training rows.
part_count_array = np.hstack(
    train_df.groupby("user_id").apply(lambda x: part_count_calc(x["part"].values))
)[train_index]
# Share of the user's rows so far that belong to the current part.
part_ratio_array = part_count_array / count_array

# Final per-user part counters carried into inference; unseen users get zeros.
user_part_count_dict = (
    train_df.groupby("user_id")
    .apply(lambda x: part_count_dict_calc(x["part"].values))
    .to_dict(defaultdict(part_dict_init))
)

# The raw part id is itself a feature; remove the column once extracted.
part_array = train_df.pop("part").values[train_index]

In [18]:
# Running mean of prior_question_had_explanation over each user's earlier rows.
# NOTE(review): shift() leaves a missing value on a user's first row and
# astype(bool) presumably turns it into True — confirm that is intended.
train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift().astype(bool)
lag_running = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
prior_question_had_explanation_mean_array = np.array(lag_running['cumsum'] / lag_running['cumcount'])[train_index]

# Per-user total of the flag, carried into inference; unseen users get 0.
user_prior_question_had_explanation_sum_dict = (
    train_df.groupby('user_id')["prior_question_had_explanation"]
    .agg(['sum'])['sum']
    .astype('int32')
    .to_dict(defaultdict(int))
)
del train_df['lag']
del lag_running

# The raw flag is itself a feature; remove the column once extracted.
prior_question_had_explanation_array = train_df.pop('prior_question_had_explanation').values[train_index]

In [19]:
# Precomputed per-row flag, attached to train_df by position.
first_attempt_array = pd.read_csv("../input/riiid-additional-data/content_first_attempt.csv")["first_attempt"].values
train_df["first_attempt"] = first_attempt_array

# Running per-user sum of the flag.
unique_attempt_array = train_df.groupby("user_id")["first_attempt"].cumsum().values
train_df["unique_attempt"] = unique_attempt_array
# Largest running value per user, carried into inference; unseen users get 0.
user_unique_dict = (
    train_df.groupby('user_id')["unique_attempt"]
    .max()
    .astype('int32')
    .to_dict(defaultdict(int))
)

first_attempt_array = first_attempt_array[train_index]
unique_attempt_array = unique_attempt_array[train_index]
train_df.drop(columns=['first_attempt', 'unique_attempt'], inplace=True)

clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [20]:
# Per-user label sum and row count, carried into inference; each column is
# removed from the aggregate as soon as it has been turned into a dict.
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
user_sum_dict = user_agg.pop('sum').astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg.pop('count').astype('int16').to_dict(defaultdict(int))
clear_mem()

# Per-content label sum and row count.
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])
content_sum_dict, content_count_dict = (
    content_agg[stat].astype('int32').to_dict(defaultdict(int))
    for stat in ('sum', 'count')
)

# Row-level features: how often the content appears and its mean label.
content_ids = train_df['content_id']
content_count_array = content_ids.map(content_agg['count']).astype('int32').values
content_id_array = content_ids.map(content_agg['sum'] / content_agg['count']).values
del content_ids, content_agg
clear_mem()

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [21]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restrict the content-level features to the training rows.
content_id_array = content_id_array[train_index]
content_count_array = content_count_array[train_index]

# "Benefit of solving difficult questions": premade feature; the inference loop
# scores each answered row as 1 / (content accuracy + 0.1).
got_point_array = _load_pickle('../input/riiid-premade-data/got_point_array.pickle')[train_index]
user_point_sum_dict = _load_pickle('../input/riiid-premade-data/user_point_sum_dict.pickle')

In [22]:
%%time
# content_id has been fully consumed above.
del train_df["content_id"]

# Premade per-user lookup, read by the inference loop as
# user_content_dict[user_id][content_id].
with open('../input/riiid-premade-data/user_content_dict.pickle','rb') as pickled:
    user_content_dict = pickle.load(pickled)

CPU times: user 5.25 s, sys: 3.01 s, total: 8.27 s
Wall time: 27.6 s


In [23]:
# Premade row-level feature, restricted to the training rows.
with open('../input/riiid-premade-data/answered_correctly_last7_array.pickle','rb') as array_file:
    answered_correctly_last7_array = pickle.load(array_file)[train_index]

# Premade per-user answer history, carried into inference.
with open('../input/riiid-premade-data/user_last7_answer_dict.pickle','rb') as dict_file:
    user_last7_answer_dict = pickle.load(dict_file)

In [24]:
# Pull the label column out of the frame and keep only the training rows.
answered_correctly_array = train_df.pop(target).values[train_index]

# data formation

In [25]:
# Every feature has been extracted into a standalone array or dict; release the frame.
del train_df

In [26]:
# Feature name -> training array. Insertion order is the column order of the
# LightGBM matrix and of the feature list used at inference time.
features_dict = dict(
    content_id=content_id_array,
    prior_question_elapsed_time=prior_question_elapsed_time_array,
    prior_question_had_explanation=prior_question_had_explanation_array,
    user_correctness=user_correctness_array,
    part=part_array,
    content_count=content_count_array,
    count=count_array,
    first_attempt=first_attempt_array,
    unique_attempt=unique_attempt_array,
    part_count=part_count_array,
    part_ratio=part_ratio_array,
    prior_question_had_explanation_mean=prior_question_had_explanation_mean_array,
    got_point=got_point_array,
    answered_correctly_last7=answered_correctly_last7_array,
    timediff=timediff_array,
    timediff2=timediff2_array,
    timediff3=timediff3_array,
    timediff4=timediff4_array,
    community=community_array,
    tag_acc=tag_acc_array,
    community_count=community_count_array,
)

features = list(features_dict)
print(len(features))

# Drop the standalone names; features_dict now holds the only references.
del (
    content_id_array, prior_question_elapsed_time_array, prior_question_had_explanation_array,
    user_correctness_array, part_array,
    content_count_array, count_array, first_attempt_array, unique_attempt_array,
    part_ratio_array, part_count_array,
    prior_question_had_explanation_mean_array, got_point_array,
    answered_correctly_last7_array,
    timediff_array, timediff2_array, timediff3_array, timediff4_array,
    community_array, tag_acc_array, community_count_array,
)

21


In [27]:
# Report (rows, features + label) of the training data, plus the validation
# split when one is used.
n_columns = len(features) + 1
if FULL_TRAIN:
    print(len(train_index), n_columns)
else:
    # NOTE(review): valid_index is not defined in the visible cells.
    print((len(train_index), n_columns), (len(valid_index), n_columns))

42726873 22


# Train

In [28]:
# LightGBM training parameters. Values in trailing comments are earlier settings.
# NOTE(review): 'subsample' is set without a bagging frequency — check the
# LightGBM docs to confirm it has any effect on its own.
params = dict(
    objective='binary',
    seed=2020,  # 42
    metric='auc',
    learning_rate=0.05,  # 0.1
    # max_bin=800,
    num_leaves=300,  # 80
    max_depth=15,
    subsample=0.8,
)

In [29]:
# Labels as float32.
y_train = answered_correctly_array.astype(np.float32)
del answered_correctly_array

# Fill the float32 design matrix one column at a time, removing each source
# array from features_dict as soon as it has been copied.
X_train = np.empty((len(train_index), len(features)), dtype=np.float32)
for col, name in enumerate(features):
    X_train[:, col] = features_dict.pop(name).astype(np.float32).reshape(-1)

tr_data = lgb.Dataset(X_train, label=y_train)
del X_train, y_train

In [30]:
# Fit on the full training set for a fixed number of rounds; no validation set
# is passed, so there is no early stopping.
print("training starts")
NUM_BOOST_ROUND = 3750
model = lgb.train(params, tr_data, num_boost_round=NUM_BOOST_ROUND, valid_sets=None)

training starts


# Inference

In [31]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> BatchNorm1d -> Linear -> Dropout.

    ``bn_size`` is the number of BatchNorm1d features; it defaults to
    ``MAX_SEQ - 1``.
    NOTE(review): the caller feeds [bs, s_len, embed] tensors, so the sequence
    axis presumably acts as the batch-norm channel axis — confirm.
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size=MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        hidden_size = forward_expansion * state_size
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the feed-forward stack to ``x`` and return a tensor of the same shape."""
        hidden = self.bn(self.relu(self.lr1(x)))
        return self.dropout(self.lr2(hidden))
    
class TransformerBlock(nn.Module):
    """Multi-head attention followed by a feed-forward block, each with residual + LayerNorm + dropout."""

    def __init__(self, embed_dim, heads = 8, dropout = DROPOUT, forward_expansion = 1):
        super().__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)

    def forward(self, value, key, query, att_mask):
        """Run one block on [s_len, bs, embed] inputs; returns ([bs, s_len, embed], attention weights).

        The three tensors are handed to nn.MultiheadAttention positionally, and
        that module takes (query, key, value) in this order — so the argument
        named ``value`` here acts as the attention query and is also the
        tensor used for the first residual connection.
        """
        attended, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)
        residual = self.layer_normal(attended + value)
        # [s_len, bs, embed] => [bs, s_len, embed]
        batch_first = self.dropout(residual).permute(1, 0, 2)
        ffn_out = self.ffn(batch_first)
        out = self.dropout(self.layer_normal_2(ffn_out + batch_first))
        return out.squeeze(-1), att_weight
    
class Encoder(nn.Module):
    """Embeds interaction and question sequences and runs them through the transformer blocks.

    Parameters
    ----------
    n_skill : number of distinct content ids; sizes the embedding tables.
    max_seq : the positional embedding has ``max_seq - 1`` entries.
    embed_dim : embedding width.
    dropout : dropout probability for the embeddings and, now, the blocks.
    forward_expansion : hidden-size multiplier of each block's FFN.
    num_layers : number of stacked TransformerBlocks.
    heads : number of attention heads in each block.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, num_layers=1, heads = 8):
        super(Encoder, self).__init__()
        self.n_skill, self.embed_dim = n_skill, embed_dim
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        # `heads` and `dropout` used to be accepted but never reached the
        # blocks, which silently fell back to their own defaults. Forward them;
        # with the default arguments the resulting model is unchanged.
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, heads=heads, dropout=dropout, forward_expansion=forward_expansion)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, question_ids):
        """Encode a batch.

        ``x`` and ``question_ids`` are integer id tensors of shape [bs, s_len].
        Returns the encoded tensor [bs, s_len, embed] and the attention weights
        of the last block (None when there are no blocks).
        """
        device = x.device
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)
        pos_x = self.pos_embedding(pos_id)
        x = self.dropout(x + pos_x)
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = self.e_embedding(question_ids)
        e = e.permute(1, 0, 2)
        # The causal mask depends only on the sequence length: build it once.
        att_mask = future_mask(e.size(0)).to(device)
        # Defined up front so an encoder without layers does not raise NameError.
        att_weight = None
        for layer in self.layers:
            # The question embedding is the first (query) input; the
            # interaction sequence supplies the other two.
            x, att_weight = layer(e, x, x, att_mask=att_mask)
            x = x.permute(1, 0, 2)
        x = x.permute(1, 0, 2)
        return x, att_weight

def future_mask(seq_length):
    """Return a (seq_length, seq_length) bool tensor that is True strictly above the diagonal."""
    upper_triangle = np.triu(np.ones((seq_length, seq_length), dtype=bool), k=1)
    return torch.from_numpy(upper_triangle)
    
class SAKTModel(nn.Module):
    """Encoder followed by a linear head that emits one logit per sequence position."""

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, enc_layers=1, heads = 8):
        super(SAKTModel, self).__init__()
        # `heads` used to be accepted here but dropped; pass it on so the
        # argument is honoured. The default (8) keeps the model unchanged.
        self.encoder = Encoder(n_skill, max_seq, embed_dim, dropout, forward_expansion, num_layers=enc_layers, heads=heads)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return (logits of shape [bs, s_len], attention weights from the encoder)."""
        x, att_weight = self.encoder(x, question_ids)
        x = self.pred(x)
        return x.squeeze(-1), att_weight
    
class TestDataset(Dataset):
    """Builds SAKT inputs for every row of a test batch.

    Parameters
    ----------
    samples : mapping with an ``.index`` of user ids whose values are
        ``(content_ids, answers)`` array pairs (the stored history per user).
    test_df : frame with at least ``user_id`` and ``content_id`` columns.
    n_skill : offset added to a content id when it was answered correctly.
    max_seq : history window length; each item has ``max_seq - 1`` positions.
    """

    def __init__(self, samples, test_df, n_skill, max_seq=100):
        super(TestDataset, self).__init__()
        self.samples, self.user_ids, self.test_df = samples, [x for x in test_df["user_id"].unique()], test_df
        self.n_skill, self.max_seq = n_skill, max_seq

    def __len__(self):
        """One item per row of the test batch."""
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(x, questions)`` for row ``index``.

        ``x`` encodes the user's past interactions (content id, plus
        ``n_skill`` when answered correctly); ``questions`` is the content
        sequence shifted by one position with the row's own content id
        appended last. Histories shorter than ``max_seq`` are left-padded with
        zeros; unknown users and empty histories yield all-zero history.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info['user_id']
        target_id = test_info['content_id']

        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            content_id, answered_correctly = self.samples[user_id]

            seq_len = len(content_id)

            if seq_len >= self.max_seq:
                # Keep only the most recent max_seq interactions.
                content_id_seq = content_id[-self.max_seq:]
                answered_correctly_seq = answered_correctly[-self.max_seq:]
            elif seq_len > 0:
                # Right-align the history. The seq_len > 0 guard matters: with
                # an empty history `seq[-0:]` is the whole buffer, and
                # assigning an empty array to it raises a broadcast ValueError.
                content_id_seq[-seq_len:] = content_id
                answered_correctly_seq[-seq_len:] = answered_correctly

        # Copy before the in-place add so the stored history is never mutated.
        x = content_id_seq[1:].copy()
        x += (answered_correctly_seq[1:] == 1) * self.n_skill

        questions = np.append(content_id_seq[2:], [target_id])

        return x, questions

def create_model():
    """Instantiate the SAKT model with the configuration used by this notebook."""
    model_kwargs = dict(
        max_seq=MAX_SEQ,
        embed_dim=EMBED_SIZE,
        forward_expansion=1,
        enc_layers=1,
        heads=8,
        dropout=0.1,
    )
    return SAKTModel(n_skill, **model_kwargs)
    
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAKT_model = create_model()

# Load the pretrained weights on CPU first, then move the model to `device`.
SAKT_model.load_state_dict(torch.load("../input/riiid-transformer-pretrain/sakt.pth", map_location='cpu'))

SAKT_model.to(device)
# Inference only: put dropout / batch-norm layers in evaluation mode.
SAKT_model.eval()

# Per-user stored history used by TestDataset and updated in the inference
# loop. Opened with a context manager so the file handle is closed promptly
# (the bare pickle.load(open(...)) form left it open).
with open("../input/riiid-transformer-pretrain/group.pkl", "rb") as f:
    group = pickle.load(f)

In [32]:
# Create the competition environment and the iterator that yields test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Previous batch, kept so its labels (delivered with the next batch) can be
# folded into the running state; None until the first batch has been seen.
prior_test_df = None

In [33]:
%%time
# Main inference loop. For every test batch:
#   1. fold the labels of the previous batch into all running state,
#   2. score the batch with the SAKT transformer,
#   3. rebuild the LightGBM features from the running state,
#   4. blend both predictions and submit them.
# Statement order matters throughout: several dictionaries are read and then
# updated in place for the same row.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The first row of the new batch carries the previous batch's labels
        # as a string.
        # NOTE(review): eval() runs on text supplied by the environment;
        # consider ast.literal_eval.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        # Drop rows whose label is -1, as was done for the training data.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        ######## Transformer State Update
        # One (content_ids, answers) tuple per user seen in the previous batch.
        prev_group = prior_test_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            prev_group_content = prev_group[prev_user_id][0]
            prev_group_answered_correctly = prev_group[prev_user_id][1]
            if prev_user_id in group.index:
                # Known user: append the new interactions to the stored history.
                group[prev_user_id] = (np.append(group[prev_user_id][0], prev_group_content), 
                                       np.append(group[prev_user_id][1], prev_group_answered_correctly))
            else:
                group[prev_user_id] = (prev_group_content, prev_group_answered_correctly)
            
            # Keep at most the last MAX_SEQ interactions per user.
            if len(group[prev_user_id][0]) > MAX_SEQ:
                new_group_content = group[prev_user_id][0][-MAX_SEQ:]
                new_group_answered_correctly = group[prev_user_id][1][-MAX_SEQ:]
                group[prev_user_id] = (new_group_content, new_group_answered_correctly)
        ######### Transformer State Update
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
         
        # prior_f_attempt_arrays, p_prior_question_had_explanation,
        # prior_point_array and prior_community_arrays were stored further down
        # in this loop while the previous batch was being processed.
        # NOTE(review): they come from the content_type_id == 0 rows while
        # prior_test_df is filtered on label != -1; zip assumes both filters
        # keep the same rows — confirm.
        for user_id, content_id, answered_correctly, first_attempt_ornot, prior_explanation, prior_point, prior_community in zip(user_ids, content_ids, 
                                                            targets, 
                                                            prior_f_attempt_arrays,
                                                            p_prior_question_had_explanation,
                                                            prior_point_array,
                                                            prior_community_arrays):
            
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            user_unique_dict[user_id] += first_attempt_ornot
            user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation
            # Points are only earned on correct answers.
            user_point_sum_dict[user_id] += prior_point * answered_correctly
            # Sliding window of the user's last seven answers.
            if len(user_last7_answer_dict[user_id])==7:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])[1:]
            else:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])
            
            user_community_correct_dict[user_id][prior_community] += answered_correctly
            user_community_count_dict[user_id][prior_community] += 1
            community_count_dict[prior_community] += 1            
            
    # Keep the unfiltered batch; its labels arrive with the next batch.
    prior_test_df = test_df.copy()
    
    # Only rows with content_type_id == 0 are scored.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    
    ####### Transformer
    test_dataset = TestDataset(group, test_df, n_skill, max_seq=MAX_SEQ)
    # batch_size equals the number of rows, so the loader yields one batch.
    # NOTE(review): a batch with no content_type_id == 0 rows would give batch_size=0 — confirm this cannot happen.
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_df), shuffle=False)
    
    item = next(iter(test_dataloader))
    x = item[0].to(device).long()
    target_id = item[1].to(device).long()
    
    with torch.no_grad():
        output, _ = SAKT_model(x, target_id)
        
    output = torch.sigmoid(output)
    # The last sequence position corresponds to the row being predicted.
    output = output[:, -1]
    SAKT_outs = output.cpu().numpy()
    ######## Transformer
    
    # Attach part/community to each row (same merge as for the training data).
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)

    # Remembered for the state update at the top of the next iteration.
    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    prior_community_arrays = test_df['community'].values
    
    # Per-row buffers for the values looked up from the running state.
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part_count = np.zeros(len(test_df), dtype=np.int32)
    first_attempt_values = []
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    got_point_array = np.zeros(len(test_df), dtype=np.float32)
    user_last7_accuracy_array = np.zeros(len(test_df), dtype=np.float32)
    timediff_array = np.zeros(len(test_df), dtype = np.int64)
    timediff2_array = np.zeros(len(test_df), dtype = np.int64)
    timediff3_array = np.zeros(len(test_df), dtype = np.int64)
    timediff4_array = np.zeros(len(test_df), dtype = np.int64)
    tag_acc_array = np.zeros(len(test_df), dtype=np.float32)
    community_count_array = np.zeros(len(test_df), dtype=np.int32)
    
    for i, (user_id, content_id, timestamp, community, part) in enumerate(zip(test_df['user_id'].values, 
                                                             test_df['content_id'].values,
                                                             test_df['timestamp'].values,
                                                             test_df['community'].values,
                                                             test_df['part'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        # The part counter includes the current row and is advanced right
        # away, without waiting for the label.
        part_count[i] = user_part_count_dict[user_id][part] + 1
        user_part_count_dict[user_id][part] += 1
        # Read the stored flag for this (user, content) pair, then clear it so
        # a repeat of the same pair reads False.
        first_attempt_values.append(user_content_dict[user_id][content_id])
        user_content_dict[user_id][content_id] = False             
        user_unique_count[i] = user_unique_dict[user_id]
        user_prior_question_had_explanation_sum[i] = user_prior_question_had_explanation_sum_dict[user_id]
        got_point_array[i] = user_point_sum_dict[user_id]
        
        # The last-7 accuracy is only defined once seven answers are stored.
        if len(user_last7_answer_dict[user_id])==7:
            user_last7_accuracy_array[i] = user_last7_answer_dict[user_id].mean()
        else:
            user_last7_accuracy_array[i] = np.nan
            
        # user_timestamp_max_dict[user_id] holds the user's last (up to four)
        # timestamps, oldest first. timediffK is the gap to the K-th most
        # recent one, or -1 when fewer than K are stored. Every branch then
        # appends the current timestamp; the last one also drops the oldest.
        if len(user_timestamp_max_dict[user_id]) ==0:
            timediff_array[i] = -1
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        elif len(user_timestamp_max_dict[user_id]) ==1:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff2_array[i] = -1
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        elif len(user_timestamp_max_dict[user_id]) ==2:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff3_array[i] = -1
            timediff4_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])  
            
        elif len(user_timestamp_max_dict[user_id]) ==3:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            timediff4_array[i] = -1
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]]) 
    
        else:
            timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][3]
            timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
            timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
            timediff4_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
            user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])[1:]  
            
        # Per-user accuracy within this community; -1 when not seen before.
        if user_community_count_dict[user_id][community] == 0:
            tag_acc_array[i] = -1
        else:
            tag_acc_array[i] = user_community_correct_dict[user_id][community] / user_community_count_dict[user_id][community]
            
        community_count_array[i] = community_count_dict[community]
        
        
    # Assemble the feature columns under the names the model was trained on.
    # Zero denominators yield nan/inf; numpy's warnings for that were disabled.
    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    # From here on 'content_id' holds the content's mean label, not the raw id.
    test_df['content_id'] = content_sum / content_count
    # Running row counter: position within this batch plus the stored count.
    test_df['count'] = 1
    test_df['count'] = test_df.groupby("user_id")["count"].cumsum()
    test_df['count'] += user_count
    test_df['part_count'] = part_count
    test_df['part_ratio'] = part_count / test_df['count'].values
    test_df["first_attempt"] = first_attempt_values
    test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    test_df["unique_attempt"] += user_unique_count
    test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_sum / user_count
    test_df['got_point'] = got_point_array / user_count
    test_df['answered_correctly_last7'] = user_last7_accuracy_array
    test_df['timediff'] = timediff_array
    test_df['timediff2'] = timediff2_array
    test_df['timediff3'] = timediff3_array
    test_df['timediff4'] = timediff4_array
    test_df['tag_acc'] = tag_acc_array
    test_df['community_count'] = community_count_array
    
    # Remembered for the state update at the top of the next iteration.
    prior_f_attempt_arrays = test_df['first_attempt'].values
    # Points a correct answer is worth: higher when the content's mean label is low.
    prior_point_array = 1 / (test_df.content_id.values + 0.1)
    
    lgb_out = model.predict(test_df[features])
    # Blend: 30% transformer, 70% LightGBM.
    test_df[target] = SAKT_outs * 0.3 + lgb_out * 0.7
    env.predict(test_df[['row_id', target]])

CPU times: user 1.41 s, sys: 211 ms, total: 1.62 s
Wall time: 1.32 s
