- Feature engineering (FE) copied from the LightGBM (lgb) notebook, version 70
- First TabNet model, trained on 9 basic features

In [1]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
! pip install /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl

Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.1


In [3]:
import gc
import os
import random
import pickle
import torch
import numpy as np
import pandas as pd
from numba import jit
import riiideducation
import datatable as dt
from bitarray import bitarray
from functools import partial
from collections import defaultdict
from pytorch_tabnet.tab_model import TabNetClassifier

# Silence NumPy warnings for x/0 and 0/0: ratios computed later are NaN for
# rows without history and are replaced with np.nan_to_num before modelling.
_ = np.seterr(divide='ignore', invalid='ignore')

# Use the fully qualified option names. The short forms ("max_rows",
# "max_columns") are regex patterns and can match more than one option in
# newer pandas releases, which raises OptionError.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Seed Python's built-in RNG.
random.seed(1)

In [4]:
def make_bitarray():
    """Return a new little-endian bitarray of 32737 bits with every bit set."""
    bits = bitarray(32737, endian='little')
    bits.setall(True)
    return bits

def clear_mem():
    """Flush IPython's output cache and input history, then run the garbage collector.

    The body uses IPython line magics, so this only works inside an
    IPython/Jupyter kernel.
    """
    %reset -f out
    %reset -f in
    gc.collect()

# Preprocess

In [5]:
# Column name -> dtype for the train.csv columns this notebook reads.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)
# Name of the label column.
target = 'answered_correctly'

In [6]:
# Read only the columns named in data_types_dict with datatable, then convert to a pandas DataFrame.
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()

In [7]:
# Keep only rows that carry a real label; -1 marks rows without an answer
# (presumably lecture rows -- the inference cell filters on content_type_id).
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Fill missing values by plain assignment. Calling fillna(..., inplace=True) on
# a column selection is chained assignment: it may modify a temporary object
# instead of train_df, and has no effect under pandas copy-on-write.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False).astype('int8')
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(0)

# Enforce the compact dtypes declared in data_types_dict.
train_df = train_df.astype(data_types_dict)

In [8]:
# Rows kept per user (most recent ones), and how many of those are later
# held out for validation.
train_size = 24
valid_size = 6

# Row labels of the last `train_size` rows of every user.
train_index = train_df.groupby('user_id').tail(train_size).index.tolist()

In [9]:
# 1-based running number of each user's rows (1 for a user's first row),
# restricted to the rows selected by train_index.
count_array = (train_df.groupby("user_id").cumcount().values + 1)[train_index]

In [10]:
#timediff_array = train_df.groupby("user_id")["timestamp"].diff().values
#timediff_array = timediff_array[train_index]
#clear_mem()

#timediff2_array = train_df.groupby("user_id")["timestamp"].diff(2).values
#timediff2_array = timediff2_array[train_index]
#clear_mem()

#timediff3_array = train_df.groupby("user_id")["timestamp"].diff(3).values
#timediff3_array = timediff3_array[train_index]
#clear_mem()

#timediff4_array = train_df.groupby("user_id")["timestamp"].diff(4).values
#timediff4_array = timediff4_array[train_index]
#clear_mem()

#user_timestamp_max_dict = train_df.groupby("user_id")["timestamp"].apply(lambda x: x[-4:].values).to_dict(defaultdict(partial(np.ndarray, 0, dtype="int64")))

#train_df.drop("timestamp", axis=1, inplace=True)
#time_dd_array = timediff2_array - timediff_array

In [11]:
# prior_question_elapsed_time
# Detach the column from train_df and keep only the selected rows.
prior_question_elapsed_time_array = train_df.pop("prior_question_elapsed_time").values[train_index]

In [12]:
# Detach the column from train_df and keep only the selected rows.
prior_question_had_explanation_array = train_df.pop('prior_question_had_explanation').values[train_index]

In [13]:
# Question metadata: columns 0 and 3 of questions.csv (question_id and part,
# per the dtype mapping below).
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'}
)

# Attach the precomputed community id. The assignment aligns on the row index,
# so it assumes both files list the questions in the same order -- TODO confirm.
additional_q_df = pd.read_csv('../input/riiid-question-clustering/question_cmnts.csv')
questions_df["community"] = additional_q_df["community"].astype('int8')
del additional_q_df

# Join on the real key columns. The previous call passed both
# right_on='question_id' and right_index=True, which contradict each other
# (pandas then silently joins on the right frame's positional index).
# A column-on-column left merge keeps the left row order and returns a fresh
# RangeIndex, so no reset_index is required.
train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left')
train_df.drop(columns=['question_id'], inplace=True)

In [14]:
# Running accuracy of each user over *previous* answers only: shift() moves the
# label down one row within each user, so the current answer is excluded.
train_df['lag'] = train_df.groupby('user_id')[target].shift()
# cumsum -> number of earlier correct answers, cumcount -> number of earlier rows.
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
# A user's first row has no history (NaN / 0) and evaluates to NaN.
user_correctness_array = np.array(cum['cumsum'] / cum['cumcount'])
user_correctness_array = user_correctness_array[train_index]
# The helper column and the intermediate frame are no longer needed.
train_df.drop(columns=['lag'], inplace=True)
del cum

In [15]:
@jit
def part_count_calc(P):
    """Return, for every position i, how often the value P[i] has occurred in P[:i+1].

    P must hold integers usable as indices into an 8-slot counter list (0..7).
    The result is a NumPy array with one running count per element of P.
    """
    ans = []
    # One counter per possible value of P[i].
    part_count = [0] * 8
    for i in range(len(P)):
        part_count[P[i]] += 1
        # Record the count *including* the current occurrence.
        ans.append(part_count[P[i]])
    return np.array(ans)

@jit
def part_count_dict_calc(P):
    """Return an 8-element array whose k-th entry is the number of times k occurs in P.

    P must hold integers usable as indices into an 8-slot counter list (0..7).
    """
    part_count = [0] * 8
    for i in range(len(P)):
        part_count[P[i]] += 1
    return np.array(part_count)

def part_dict_init():
    """Default factory: return a fresh integer array of eight zero counters."""
    return np.array([0 for _ in range(8)])


# Per-user running count of questions answered in the same part as the current row.
part_count_array = train_df.groupby("user_id").apply(lambda x: part_count_calc(x["part"].values))
# NOTE(review): hstack concatenates the per-user arrays in groupby (sorted user_id) order;
# this only lines up with train_df rows if train_df is already grouped by ascending user_id -- confirm.
part_count_array = np.hstack(part_count_array)
part_count_array = part_count_array[train_index]
# Share of the user's rows so far that fall in the current part.
part_ratio_array = part_count_array / count_array

# Final per-user part counters; unseen users get a fresh zero array from part_dict_init.
user_part_count_dict = train_df.groupby("user_id").apply(lambda x: part_count_dict_calc(x["part"].values)).to_dict(defaultdict(part_dict_init))

# Detach the raw part column and keep only the selected rows.
part_array = train_df.part.values
train_df.drop("part", axis=1, inplace=True)
part_array = part_array[train_index]

In [16]:
# Per-user totals over all labelled rows: number of correct answers and number of answers.
# Stored as defaultdicts so unseen users start at 0 during inference.
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
del user_agg['sum']
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
del user_agg['count']
clear_mem()

# Per-question totals over all labelled rows, same layout as the user dictionaries.
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

# Row-level features: how often the question was answered, and its mean correctness.
# Note that content_id_array holds the mean correctness, not the raw id.
content_count_array = train_df['content_id'].map(content_agg['count']).astype('int32').values
content_id_array = train_df['content_id'].map(content_agg['sum'] / content_agg['count']).values
del content_agg
clear_mem()

Flushing output cache (0 entries)
Flushing input history
Flushing output cache (0 entries)
Flushing input history


In [17]:
# benefit of solving difficult questions

#point_array = 1 / (content_id_array + 0.1)
# Load the precomputed per-row "got point" feature and the per-user point totals.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files here.
with open('../input/riiid-premade-data/got_point_array.pickle','rb') as f:
    got_point_array = pickle.load(f)
got_point_array = got_point_array[train_index]
    
with open('../input/riiid-premade-data/user_point_sum_dict.pickle','rb') as f:
    user_point_sum_dict = pickle.load(f)

# Restrict the content-level features to the selected rows.
content_id_array = content_id_array[train_index]
content_count_array = content_count_array[train_index]

In [18]:
# Load the precomputed per-row "last 7 answers" feature and the per-user answer history.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files here.
with open('../input/riiid-premade-data/answered_correctly_last7_array.pickle','rb') as f:
    answered_correctly_last7_array = pickle.load(f)
answered_correctly_last7_array = answered_correctly_last7_array[train_index]
    
# Looked up with user_id keys and extended with np.concatenate in the inference loop.
with open('../input/riiid-premade-data/user_last7_answer_dict.pickle','rb') as f:
    user_last7_answer_dict = pickle.load(f)

In [19]:
# Detach the label column from train_df and keep only the selected rows.
answered_correctly_array = train_df.pop(target).values[train_index]

# data formation

In [20]:
# Shrink train_df to the selected rows and renumber them 0..n-1, so that the new
# row labels are positions into the feature arrays (already sliced by train_index).
train_df = train_df[train_df.index.isin(train_index)].reset_index(drop=True)
# Validation: the last `valid_size` rows of every user; training: everything else.
valid_index = list(train_df.groupby('user_id').tail(valid_size).index)
# From here on train_index holds positions in the reduced arrays, not labels of the full frame.
train_index = list(train_df[~train_df.index.isin(valid_index)].index)

del train_df

In [21]:
# Feature name -> per-row array. The key order defines the column order of the
# model input, both for training and for test_df[features] at inference time.
# Commented-out entries are disabled features.
features_dict = {
    'content_id': content_id_array,
    'prior_question_elapsed_time': prior_question_elapsed_time_array,
    'prior_question_had_explanation':  prior_question_had_explanation_array,
    'user_correctness': user_correctness_array,
    'part': part_array,
    'content_count': content_count_array,
    'count': count_array,
    #'first_attempt': first_attempt_array,
    #'unique_attempt': unique_attempt_array,
    #'part_count': part_count_array,
    #'part_ratio': part_ratio_array,
    #'prior_question_had_explanation_mean': prior_question_had_explanation_mean_array,
    'got_point': got_point_array,
    'answered_correctly_last7': answered_correctly_last7_array,   
    #'timediff': timediff_array,
    #'timediff2': timediff2_array,
    #'timediff3': timediff3_array,
    #'timediff4': timediff4_array,
    #'community': community_array,
    #'tag_acc': tag_acc_array,
    #'community_count': community_count_array,
    #'time_dd': time_dd_array,
}

# Ordered list of active feature names.
features = list(features_dict.keys())
print(len(features))

9


In [22]:
# Preallocate the float32 design matrices; every column is filled in the loop below.
X_train = np.empty((len(train_index), len(features)), dtype=np.float32)
X_valid = np.empty((len(valid_index), len(features)), dtype=np.float32)

for col, name in enumerate(features):
    # Take the array out of features_dict so its memory can be released
    # as soon as the column has been copied.
    column = features_dict.pop(name)
    X_train[:, col] = column[train_index].astype(np.float32).ravel()
    X_valid[:, col] = column[valid_index].astype(np.float32).ravel()
    del column

y_train = answered_correctly_array[train_index].astype(np.float32)
y_valid = answered_correctly_array[valid_index].astype(np.float32)

# Missing values (e.g. ratios for rows without history) are encoded as -1.
X_train = np.nan_to_num(X_train, nan=-1)
X_valid = np.nan_to_num(X_valid, nan=-1)

In [23]:
# Report (rows, columns) for the train and validation sets; the +1 presumably counts the label column.
print((len(train_index), len(features)+1), (len(valid_index), len(features)+1))

(6536675, 10) (2360984, 10)


# Train

In [24]:
def seed_everything(seed_value=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and torch, sets ``PYTHONHASHSEED`` and,
    when CUDA is available, seeds the CUDA generators and puts cuDNN into
    deterministic, non-benchmarking mode.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything()

In [25]:
# Batch size shared by the LR schedule below and by model.fit.
BS = 2**12

# Training for more epoch might improve the model performance
# at the cost of longer training time
MAX_EPOCH = 10

# Defining TabNet model
# All nine inputs are passed as numeric columns (cat_idxs / cat_dims are disabled).
# The OneCycleLR schedule is sized as floor(rows / BS) steps per epoch, which
# matches drop_last=True in the fit call.
# NOTE(review): with OneCycleLR the optimizer's lr=2e-2 is presumably superseded
# by the schedule derived from max_lr -- confirm against the PyTorch docs.
model = TabNetClassifier(n_d=32, n_a=32, n_steps=3, gamma=1.2,
                         n_independent=2, n_shared=2,
                         lambda_sparse=0., seed=0,
                         #cat_idxs=cat_idxs,
                         #cat_dims=cat_dims,
                         cat_emb_dim=1,
                         mask_type='entmax',
                         device_name='auto',
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2),
                         scheduler_params=dict(max_lr=0.05,
                                               steps_per_epoch=int(X_train.shape[0] / BS),
                                               epochs=MAX_EPOCH,
                                               #final_div_factor=100,
                                               is_batch_level=True),
                         scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                         verbose=1,)

Device used : cuda


In [26]:
print("training starts")
   
# Train with AUC on the held-out rows as the evaluation metric.
# drop_last=True discards the final partial batch, consistent with the
# steps_per_epoch used for the LR schedule.
model.fit(X_train=X_train, y_train=y_train,
          eval_set=[(X_valid, y_valid)],
          eval_name=["valid"],
          eval_metric=["auc"],
          batch_size=BS,
          virtual_batch_size=256,
          max_epochs=MAX_EPOCH,
          drop_last=True,
          pin_memory=True,
         )
# The training matrices are not needed after fitting; release the memory.
del X_train, y_train, X_valid, y_valid

training starts
epoch 0  | loss: 0.59093 | valid_auc: 0.74045 |  0:03:42s
epoch 1  | loss: 0.58329 | valid_auc: 0.74086 |  0:07:24s
epoch 2  | loss: 0.58269 | valid_auc: 0.73931 |  0:11:07s
epoch 3  | loss: 0.58225 | valid_auc: 0.7398  |  0:14:49s
epoch 4  | loss: 0.58191 | valid_auc: 0.74079 |  0:18:34s
epoch 5  | loss: 0.58153 | valid_auc: 0.74207 |  0:22:22s
epoch 6  | loss: 0.58095 | valid_auc: 0.74289 |  0:26:10s
epoch 7  | loss: 0.58034 | valid_auc: 0.74294 |  0:29:57s
epoch 8  | loss: 0.57975 | valid_auc: 0.74394 |  0:33:46s
epoch 9  | loss: 0.57935 | valid_auc: 0.74398 |  0:37:34s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_auc = 0.74398
Best weights from best epoch are automatically used!


In [27]:
# Print the features from least to most important (argsort is ascending).
feat_importances = model.feature_importances_
indices = np.argsort(feat_importances)
for i in indices:
    print(features[i], feat_importances[i])

part 0.027835686964623168
prior_question_had_explanation 0.029674689418849046
content_count 0.04014042527218193
count 0.06285561913021365
prior_question_elapsed_time 0.09199648946615266
answered_correctly_last7 0.10747717340587076
user_correctness 0.14732948082232145
got_point 0.19070977274626114
content_id 0.3019806627735262


In [28]:
#saving_path = "../"
#model.save_model(saving_path)

# Inference

In [29]:
# Create the competition environment and its batch iterator.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Copy of the previous test batch; None until the first batch has been processed.
prior_test_df = None

In [30]:
%%time
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
                 
        #for user_id, content_id, answered_correctly, first_attempt_ornot, prior_explanation, prior_point, prior_community in zip(user_ids, 
        #                                                    content_ids, 
        #                                                    targets, 
        #                                                    prior_f_attempt_arrays,
        #                                                    p_prior_question_had_explanation,
        #                                                    prior_point_array,
        #                                                    prior_community_arrays):
            
        for user_id, content_id, answered_correctly, prior_point in zip(user_ids, 
                                                            content_ids, 
                                                            targets, 
                                                            prior_point_array):
            
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            #user_unique_dict[user_id] += first_attempt_ornot
            #user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation
            user_point_sum_dict[user_id] += prior_point * answered_correctly
            if len(user_last7_answer_dict[user_id])==7:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])[1:]
            else:
                user_last7_answer_dict[user_id] = np.concatenate([user_last7_answer_dict[user_id],[answered_correctly]])
            
            #user_community_correct_dict[user_id][prior_community] += answered_correctly
            #user_community_count_dict[user_id][prior_community] += 1
            #community_count_dict[prior_community] += 1            

    prior_test_df = test_df.copy()
           
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('int8')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)

    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    prior_community_arrays = test_df['community'].values
    
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part_count = np.zeros(len(test_df), dtype=np.int32)
    first_attempt_values = []
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    got_point_array = np.zeros(len(test_df), dtype=np.float32)
    user_last7_accuracy_array = np.zeros(len(test_df), dtype=np.float32)
    timediff_array = np.zeros(len(test_df), dtype = np.int64)
    timediff2_array = np.zeros(len(test_df), dtype = np.int64)
    timediff3_array = np.zeros(len(test_df), dtype = np.int64)
    timediff4_array = np.zeros(len(test_df), dtype = np.int64)
    tag_acc_array = np.zeros(len(test_df), dtype=np.float32)
    community_count_array = np.zeros(len(test_df), dtype=np.int32)
    time_dd_array = np.zeros(len(test_df), dtype = np.int64)
    
    for i, (user_id, content_id, timestamp, community, part) in enumerate(zip(test_df['user_id'].values, 
                                                             test_df['content_id'].values,
                                                             test_df['timestamp'].values,
                                                             test_df['community'].values,
                                                             test_df['part'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        #part_count[i] = user_part_count_dict[user_id][part] + 1
        #user_part_count_dict[user_id][part] += 1
        #first_attempt_values.append(user_content_dict[user_id][content_id])
        #user_content_dict[user_id][content_id] = False             
        #user_unique_count[i] = user_unique_dict[user_id]
        #user_prior_question_had_explanation_sum[i] = user_prior_question_had_explanation_sum_dict[user_id]
        got_point_array[i] = user_point_sum_dict[user_id]
        
        if len(user_last7_answer_dict[user_id])==7:
            user_last7_accuracy_array[i] = user_last7_answer_dict[user_id].mean()
        else:
            user_last7_accuracy_array[i] = np.nan
            
        #if len(user_timestamp_max_dict[user_id]) ==0:
        #   timediff_array[i] = -1
        #    timediff2_array[i] = -1
        #    timediff3_array[i] = -1
        #    timediff4_array[i] = -1
        #    time_dd_array[i] = -1
        #    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        #elif len(user_timestamp_max_dict[user_id]) ==1:
        #    timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
        #    timediff2_array[i] = -1
        #    timediff3_array[i] = -1
        #    timediff4_array[i] = -1
        #    time_dd_array[i] = -1
        #    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])
            
        #elif len(user_timestamp_max_dict[user_id]) ==2:
        #    timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
        #    timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
        #    timediff3_array[i] = -1
        #    timediff4_array[i] = -1
        #    time_dd_array[i] = timediff2_array[i] - timediff_array[i]
        #    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])  
            
        #elif len(user_timestamp_max_dict[user_id]) ==3:
        #    timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
        #    timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
        #    timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
        #    timediff4_array[i] = -1
        #    time_dd_array[i] = timediff2_array[i] - timediff_array[i]
        #    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]]) 
    
        #else:
        #    timediff_array[i] = timestamp - user_timestamp_max_dict[user_id][3]
        #    timediff2_array[i] = timestamp - user_timestamp_max_dict[user_id][2]
        #    timediff3_array[i] = timestamp - user_timestamp_max_dict[user_id][1]
        #    timediff4_array[i] = timestamp - user_timestamp_max_dict[user_id][0]
        #    time_dd_array[i] = timediff2_array[i] - timediff_array[i]
        #    user_timestamp_max_dict[user_id] = np.concatenate([user_timestamp_max_dict[user_id],[timestamp]])[1:]  
            
        #if user_community_count_dict[user_id][community] == 0:
        #    tag_acc_array[i] = -1
        #else:
        #    tag_acc_array[i] = user_community_correct_dict[user_id][community] / user_community_count_dict[user_id][community]
            
        #community_count_array[i] = community_count_dict[community]
     
    test_df['count'] = 1
    test_df['count'] = test_df.groupby("user_id")["count"].cumsum()
    test_df['count'] += user_count
    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    #test_df['part_count'] = part_count
    #test_df['part_ratio'] = part_count / test_df['count'].values
    #test_df["first_attempt"] = first_attempt_values
    #test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    #test_df["unique_attempt"] += user_unique_count
    #test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_sum / user_count
    test_df['got_point'] = got_point_array / user_count
    test_df['answered_correctly_last7'] = user_last7_accuracy_array
    #test_df['timediff'] = timediff_array
    #test_df['timediff2'] = timediff2_array
    #test_df['timediff3'] = timediff3_array
    #test_df['timediff4'] = timediff4_array
    #test_df['tag_acc'] = tag_acc_array
    #test_df['community_count'] = community_count_array
    #test_df['time_dd'] = time_dd_array

    #prior_f_attempt_arrays = test_df['first_attempt'].values
    prior_point_array = 1 / (test_df.content_id.values + 0.1)
    
    X_test = np.nan_to_num(test_df[features].values, nan=-1)
    test_df[target] =  model.predict_proba(X_test)[:, -1]
    
    env.predict(test_df[['row_id', target]])

CPU times: user 296 ms, sys: 38 ms, total: 334 ms
Wall time: 707 ms
