- https://medium.com/@vanillaxiangshuyang/self-supervised-learning-on-tabular-data-with-tabnet-544b3ec85cee
- https://colab.research.google.com/drive/1P8Obe07DP3VeOld08ThyT1HnChLip_LO#scrollTo=gvy9vUUNOP0W

- https://www.kaggle.com/code/sisharaneranjana/semi-supervised-pre-training-with-tabnet#%F0%9F%94%8FDescription-of-the-dataset-
- https://dacon.io/en/codeshare/3837

In [1]:
import pandas as pd
import numpy as np
import pickle

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# 원본 데이터
data = pd.read_csv("./dataset/generated/data5.csv")

# 데이터 전처리

In [3]:
df = data.copy()

# Time 을 일중 시간으로 변환
df.loc[:, "Time"] = df.loc[:, "Time"].apply(lambda x : x / 3600 % 24)

# Amount column 은 편차가 크므로 log-scale 로 변환
df['Amount'] = np.log(df.pop('Amount') + 0.001)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,5.008105
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,0.989913
2,2,0.000278,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,5.936641
3,3,0.000278,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,4.816249
4,4,0.000556,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,4.248367


# train_test_split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
features = df.values
labels = np.array(df.pop('Class'))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, random_state=0, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

print("X train shape: ", X_train.shape)
print("X validation shape: ", X_val.shape)
print("X test shape: ", X_test.shape)
print("Y train shape: ", y_train.shape)
print("Y validation shape: ", y_val.shape)
print("Y test shape: ", y_test.shape)

X train shape:  (71434, 32)
X validation shape:  (71435, 32)
X test shape:  (142870, 32)
Y train shape:  (71434,)
Y validation shape:  (71435,)
Y test shape:  (142870,)


In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
x_val= sc.transform(X_val)

# TabNetClassifier

https://github.com/dreamquark-ai/tabnet

In [9]:
# train on the whole dataset with labels
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

supervised = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [10]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

class F1_Score(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        score = f1_score(y_true, (y_score[:, 1]>0.5)*1)
        return score

In [11]:
supervised.fit(X_train, y_train,
               patience=5,
               eval_set=[(X_train, y_train), (x_val,y_val)],
               eval_metric=['logloss','f1']
)

epoch 0  | loss: 0.09028 | val_0_logloss: 0.01546 | val_0_f1: 0.64047 | val_1_logloss: 0.01578 | val_1_f1: 0.63102 |  0:00:09s
epoch 1  | loss: 0.01062 | val_0_logloss: 0.00574 | val_0_f1: 0.84034 | val_1_logloss: 0.00816 | val_1_f1: 0.80317 |  0:00:17s
epoch 2  | loss: 0.00732 | val_0_logloss: 0.00461 | val_0_f1: 0.91789 | val_1_logloss: 0.00636 | val_1_f1: 0.87798 |  0:00:26s
epoch 3  | loss: 0.00518 | val_0_logloss: 0.0042  | val_0_f1: 0.92073 | val_1_logloss: 0.00701 | val_1_f1: 0.88726 |  0:00:34s
epoch 4  | loss: 0.0031  | val_0_logloss: 0.00255 | val_0_f1: 0.94578 | val_1_logloss: 0.00421 | val_1_f1: 0.92525 |  0:00:42s
epoch 5  | loss: 0.00228 | val_0_logloss: 0.00327 | val_0_f1: 0.91277 | val_1_logloss: 0.00522 | val_1_f1: 0.89755 |  0:00:51s
epoch 6  | loss: 0.00103 | val_0_logloss: 0.00058 | val_0_f1: 0.98834 | val_1_logloss: 0.00259 | val_1_f1: 0.97181 |  0:00:59s
epoch 7  | loss: 0.00147 | val_0_logloss: 0.0005  | val_0_f1: 0.98827 | val_1_logloss: 0.00083 | val_1_f1: 0.98

In [12]:
predicted_test = supervised.predict(X_test)
score = f1_score(y_test,predicted_test)
print(score)

0.9950669485553205


In [13]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# TabNetPretrainer
unsupervised = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

In [14]:
unsupervised.fit(X_train,
                 eval_set=[X_val],
                 pretraining_ratio=0.8,
)

epoch 0  | loss: 1.34445 | val_0_unsup_loss_numpy: 52900.35546875|  0:00:07s
epoch 1  | loss: 0.98648 | val_0_unsup_loss_numpy: 257789.6875|  0:00:13s
epoch 2  | loss: 0.98376 | val_0_unsup_loss_numpy: 13466489.0|  0:00:20s
epoch 3  | loss: 0.98052 | val_0_unsup_loss_numpy: 43241892.0|  0:00:28s
epoch 4  | loss: 0.97417 | val_0_unsup_loss_numpy: 11524437.0|  0:00:34s
epoch 5  | loss: 0.9764  | val_0_unsup_loss_numpy: 273302496.0|  0:00:41s
epoch 6  | loss: 0.96565 | val_0_unsup_loss_numpy: 13362.6396484375|  0:00:49s
epoch 7  | loss: 0.96665 | val_0_unsup_loss_numpy: 3431535.0|  0:00:56s
epoch 8  | loss: 0.96067 | val_0_unsup_loss_numpy: 15332354.0|  0:01:04s
epoch 9  | loss: 0.95795 | val_0_unsup_loss_numpy: 558442560.0|  0:01:11s
epoch 10 | loss: 0.95556 | val_0_unsup_loss_numpy: 631039040.0|  0:01:20s
epoch 11 | loss: 0.95239 | val_0_unsup_loss_numpy: 3607851264.0|  0:01:27s
epoch 12 | loss: 0.95084 | val_0_unsup_loss_numpy: 1527542528.0|  0:01:34s
epoch 13 | loss: 0.94559 | val_0_u

# Pre-trained 된 모델로 test

In [15]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [16]:
clf.fit(X_train,y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['logloss','f1'],
    from_unsupervised=unsupervised
)

epoch 0  | loss: 0.0866  | train_logloss: 0.00917 | train_f1: 0.81887 | valid_logloss: 6.15292 | valid_f1: 0.02088 |  0:00:07s
epoch 1  | loss: 0.00719 | train_logloss: 0.00969 | train_f1: 0.78317 | valid_logloss: 0.03145 | valid_f1: 0.4153  |  0:00:16s
epoch 2  | loss: 0.00768 | train_logloss: 0.00262 | train_f1: 0.90045 | valid_logloss: 0.46861 | valid_f1: 0.11825 |  0:00:23s
epoch 3  | loss: 0.00237 | train_logloss: 0.00095 | train_f1: 0.98667 | valid_logloss: 0.02724 | valid_f1: 0.11443 |  0:00:35s
epoch 4  | loss: 0.00126 | train_logloss: 0.00029 | train_f1: 0.99561 | valid_logloss: 0.02992 | valid_f1: 0.15059 |  0:00:44s
epoch 5  | loss: 0.00095 | train_logloss: 0.00078 | train_f1: 0.9927  | valid_logloss: 0.03287 | valid_f1: 0.1866  |  0:00:52s
epoch 6  | loss: 0.00088 | train_logloss: 0.00016 | train_f1: 0.99854 | valid_logloss: 0.03508 | valid_f1: 0.09254 |  0:01:02s
epoch 7  | loss: 0.00074 | train_logloss: 0.00016 | train_f1: 0.99854 | valid_logloss: 0.03906 | valid_f1: 0.08

In [17]:
predicted_test=clf.predict(X_test)
score=f1_score(y_test,predicted_test)
print(score)

0.786046511627907
