In [1]:
import numpy as np 
import pandas as pd 
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import plotly.express as px
from matplotlib import pyplot as plt

In [2]:
# Load the training and test tables from CSV files under ./data.
TRAIN_CSV = './data/train.csv'
TEST_CSV = './data/test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Feature columns are selected by name prefix: "cat*" goes to CAT_COLS and
# "cont*" goes to NUM_COLS.
CAT_COLS = _columns_with_prefix(train_df, "cat")
NUM_COLS = _columns_with_prefix(train_df, "cont")

In [4]:
# Categories occurring fewer than this many times in the training data are
# merged into a single "low_frequency" bucket before label encoding.
LOW_FREQ_THRESH = 50

In [5]:
# Display the training frame as this cell's output.
train_df

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.328960,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,A,N,F,A,E,BU,A,AS,K,...,0.662428,0.671927,0.390566,0.145840,0.262767,0.514248,0.519340,0.617436,0.688007,0
299996,499995,A,K,A,A,G,BI,A,K,AE,...,0.821657,0.620356,0.384891,0.735879,0.547731,0.726653,0.470575,0.275743,0.638939,0
299997,499996,A,G,M,A,H,BI,C,L,F,...,0.407037,0.232436,0.832482,0.810663,0.596939,0.308821,0.373997,0.518024,0.452144,1
299998,499997,B,H,A,D,B,BI,A,AA,AX,...,0.808045,0.630708,0.346898,0.735147,0.563488,0.609836,0.680430,0.318453,0.335822,0


In [6]:
# Fit one LabelEncoder per categorical column on the training data.
# Values seen fewer than LOW_FREQ_THRESH times in train are first merged into
# a single "low_frequency" bucket, in both the train and the test frame.
encoders = {}
for cat_col in CAT_COLS:
    counts = train_df[cat_col].value_counts()
    rare_values = counts.index[counts < LOW_FREQ_THRESH]

    if len(rare_values) > 0:
        for frame in (train_df, test_df):
            frame.loc[frame[cat_col].isin(rare_values), cat_col] = "low_frequency"

    encoder = LabelEncoder()
    train_df[cat_col] = encoder.fit_transform(train_df[cat_col])
    encoders[cat_col] = encoder

In [7]:
# Encode the test set with the encoders that were fitted on the training set.
for cat_col in CAT_COLS:
    encoder = encoders[cat_col]
    code_of = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

    # Fallback code for values the encoder never saw: the "low_frequency"
    # bucket when the column has one, otherwise the most common (already
    # encoded) training value. Another fallback might make more sense.
    if "low_frequency" in code_of:
        fallback = code_of["low_frequency"]
    else:
        fallback = train_df[cat_col].mode().values[0]

    test_df[cat_col] = test_df[cat_col].apply(lambda raw: code_of.get(raw, fallback))

In [8]:
# Number of quantile bins used to discretise each numerical feature.
N_QUANTILE_BINS = 25

for num_col in NUM_COLS:
    # Clip numerical features in test set to match training set, so every test
    # value falls inside the bin edges derived from train below.
    test_df[num_col] = np.clip(test_df[num_col], train_df[num_col].min(), train_df[num_col].max())

    # Taken from https://www.kaggle.com/siavrez/kerasembeddings
    # Bin edges come from train quantiles and are reused unchanged for test.
    q_col = f'q_{num_col}'
    train_df[q_col], bins_ = pd.qcut(
        train_df[num_col],
        N_QUANTILE_BINS,
        retbins=True,
        labels=list(range(N_QUANTILE_BINS)),
    )
    test_df[q_col] = pd.cut(test_df[num_col], bins=bins_, labels=False, include_lowest=True)

    # Guard the append so that re-running this cell does not register the same
    # binned column twice (duplicates would corrupt FEATURES / cat_idxs).
    if q_col not in CAT_COLS:
        CAT_COLS.append(q_col)

# Model input columns: all categorical (including binned) features first,
# followed by the raw numerical features.
FEATURES = CAT_COLS + NUM_COLS

In [9]:
# Number of distinct values of each categorical feature in the training data.
cat_cardinalities = train_df[CAT_COLS].nunique()
cat_dims = cat_cardinalities.to_list()

# Position of each categorical feature within FEATURES.
cat_idxs = list(map(FEATURES.index, CAT_COLS))

In [10]:
# Embedding width per categorical feature: half of its cardinality, kept
# within [1, 50] and rounded up.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1, a_max=50)).astype(int).tolist()

In [11]:
# Convert the selected feature columns and the target to NumPy arrays.
X = train_df[FEATURES].to_numpy()
y = train_df["target"].to_numpy()

X_test = test_df[FEATURES].to_numpy()

In [12]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# Network settings, forwarded to the TabNet pretrainer as
# n_d / n_a / n_independent / n_shared / n_steps / mask_type / gamma.
N_D = N_A = 16
N_INDEP = N_SHARED = 2
N_STEPS = 1
MASK_TYPE = "sparsemax"
GAMMA = 1.5

# Training-loop settings.
BS = 512          # used as both batch size and virtual batch size
MAX_EPOCH = 20

# Toggle for the self-supervised pretraining step.
PRETRAIN = True

In [13]:
# https://www.guruguru.science/competitions/16/discussions/70f25f95-4dcc-4733-9f9e-f7bc6472d7c0/
if PRETRAIN:
    # Constructor arguments for the pretrainer: network shape, categorical
    # embedding layout, optimizer, and a ReduceLROnPlateau schedule.
    pretrain_params = {
        "n_d": N_D,
        "n_a": N_A,
        "n_steps": N_STEPS,
        "n_independent": N_INDEP,
        "n_shared": N_SHARED,
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": cat_emb_dims,
        "gamma": GAMMA,
        "lambda_sparse": 0.,
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2},
        "mask_type": MASK_TYPE,
        "scheduler_params": {
            "mode": "min",
            "patience": 3,
            "min_lr": 1e-5,
            "factor": 0.5,
        },
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
        "verbose": 1,
    }
    pretrainer = TabNetPretrainer(**pretrain_params)

    # The pretrainer is fit on the test features; the training features serve
    # as the evaluation set.
    pretrainer.fit(
        X_train=X_test,
        eval_set=[X],
        max_epochs=MAX_EPOCH,
        patience=25,
        batch_size=BS,
        virtual_batch_size=BS,
        num_workers=0,
        drop_last=True,
        # The bigger your pretraining_ratio the harder it is to reconstruct
        pretraining_ratio=0.5,
    )

Device used : cpu
epoch 0  | loss: 0.96492 | val_0_unsup_loss: 0.60902 |  0:00:52s
epoch 1  | loss: 0.40539 | val_0_unsup_loss: 0.34515 |  0:01:44s
epoch 2  | loss: 0.30284 | val_0_unsup_loss: 0.27205 |  0:02:32s
epoch 3  | loss: 0.25778 | val_0_unsup_loss: 0.25856 |  0:03:21s
epoch 4  | loss: 0.23388 | val_0_unsup_loss: 0.24927 |  0:04:10s
epoch 5  | loss: 0.22158 | val_0_unsup_loss: 0.22673 |  0:04:57s
epoch 6  | loss: 0.20809 | val_0_unsup_loss: 0.2244  |  0:05:45s
epoch 7  | loss: 0.19475 | val_0_unsup_loss: 0.22669 |  0:06:32s
epoch 8  | loss: 0.18764 | val_0_unsup_loss: 0.20731 |  0:07:20s
epoch 9  | loss: 0.18412 | val_0_unsup_loss: 0.21684 |  0:08:07s
epoch 10 | loss: 0.18259 | val_0_unsup_loss: 0.20244 |  0:08:55s
epoch 11 | loss: 0.17736 | val_0_unsup_loss: 0.21646 |  0:09:42s
epoch 12 | loss: 0.17506 | val_0_unsup_loss: 0.19888 |  0:10:30s
epoch 13 | loss: 0.17098 | val_0_unsup_loss: 0.20125 |  0:11:19s
epoch 14 | loss: 0.17226 | val_0_unsup_loss: 0.19586 |  0:12:06s
epoch 1