- https://medium.com/@vanillaxiangshuyang/self-supervised-learning-on-tabular-data-with-tabnet-544b3ec85cee
- https://colab.research.google.com/drive/1P8Obe07DP3VeOld08ThyT1HnChLip_LO#scrollTo=gvy9vUUNOP0W

- https://www.kaggle.com/code/sisharaneranjana/semi-supervised-pre-training-with-tabnet#%F0%9F%94%8FDescription-of-the-dataset-
- https://dacon.io/en/codeshare/3837

In [1]:
import pandas as pd
import numpy as np
import pickle

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# 원본 데이터
data = pd.read_csv("./dataset/generated/data7.csv")

# 데이터 전처리

In [3]:
df = data.copy()

# Time 을 일중 시간으로 변환
df.loc[:, "Time"] = df.loc[:, "Time"].apply(lambda x : x / 3600 % 24)

# Amount column 은 편차가 크므로 log-scale 로 변환
df['Amount'] = np.log(df.pop('Amount') + 0.001)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,5.008105
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,0.989913
2,2,0.000278,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,5.936641
3,3,0.000278,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,4.816249
4,4,0.000556,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,4.248367


# train_test_split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
features = df.values
labels = np.array(df.pop('Class'))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, random_state=0, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

print("X train shape: ", X_train.shape)
print("X validation shape: ", X_val.shape)
print("X test shape: ", X_test.shape)
print("Y train shape: ", y_train.shape)
print("Y validation shape: ", y_val.shape)
print("Y test shape: ", y_test.shape)

X train shape:  (71577, 32)
X validation shape:  (71577, 32)
X test shape:  (143155, 32)
Y train shape:  (71577,)
Y validation shape:  (71577,)
Y test shape:  (143155,)


In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
x_val= sc.transform(X_val)

# TabNetClassifier

https://github.com/dreamquark-ai/tabnet

In [9]:
# train on the whole dataset with labels
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

supervised = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [10]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

class F1_Score(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        score = f1_score(y_true, (y_score[:, 1]>0.5)*1)
        return score

In [11]:
supervised.fit(X_train, y_train,
               patience=5,
               eval_set=[(X_train, y_train), (x_val,y_val)],
               eval_metric=['logloss','f1']
)

epoch 0  | loss: 0.09331 | val_0_logloss: 0.01228 | val_0_f1: 0.83333 | val_1_logloss: 0.01032 | val_1_f1: 0.82251 |  0:00:11s
epoch 1  | loss: 0.01054 | val_0_logloss: 0.00859 | val_0_f1: 0.87948 | val_1_logloss: 0.0077  | val_1_f1: 0.87349 |  0:00:24s
epoch 2  | loss: 0.01005 | val_0_logloss: 0.01171 | val_0_f1: 0.80508 | val_1_logloss: 0.01103 | val_1_f1: 0.79916 |  0:00:36s
epoch 3  | loss: 0.00872 | val_0_logloss: 0.00348 | val_0_f1: 0.94519 | val_1_logloss: 0.00465 | val_1_f1: 0.92925 |  0:00:48s
epoch 4  | loss: 0.00656 | val_0_logloss: 0.00711 | val_0_f1: 0.84958 | val_1_logloss: 0.00769 | val_1_f1: 0.8312  |  0:00:59s
epoch 5  | loss: 0.0053  | val_0_logloss: 0.00306 | val_0_f1: 0.95228 | val_1_logloss: 0.00319 | val_1_f1: 0.94583 |  0:01:10s
epoch 6  | loss: 0.00285 | val_0_logloss: 0.00164 | val_0_f1: 0.97452 | val_1_logloss: 0.00178 | val_1_f1: 0.97646 |  0:01:21s
epoch 7  | loss: 0.00164 | val_0_logloss: 0.01144 | val_0_f1: 0.89769 | val_1_logloss: 0.01308 | val_1_f1: 0.89

In [12]:
predicted_test = supervised.predict(X_test)
score = f1_score(y_test,predicted_test)
print(score)

0.9954932398597897


In [13]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# TabNetPretrainer
unsupervised = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

In [14]:
unsupervised.fit(X_train,
                 eval_set=[X_val],
                 pretraining_ratio=0.8,
)

epoch 0  | loss: 1.34347 | val_0_unsup_loss_numpy: 89232504.0|  0:00:11s
epoch 1  | loss: 0.98772 | val_0_unsup_loss_numpy: 8663917.0|  0:00:22s
epoch 2  | loss: 0.98489 | val_0_unsup_loss_numpy: 39511256.0|  0:00:34s
epoch 3  | loss: 0.96982 | val_0_unsup_loss_numpy: 62581820.0|  0:00:44s
epoch 4  | loss: 0.96974 | val_0_unsup_loss_numpy: 1033215040.0|  0:00:55s
epoch 5  | loss: 0.96441 | val_0_unsup_loss_numpy: 653611584.0|  0:01:06s
epoch 6  | loss: 0.9639  | val_0_unsup_loss_numpy: 3123280640.0|  0:01:16s
epoch 7  | loss: 0.95778 | val_0_unsup_loss_numpy: 4257378048.0|  0:01:27s
epoch 8  | loss: 0.9589  | val_0_unsup_loss_numpy: 23222102016.0|  0:01:38s
epoch 9  | loss: 0.96101 | val_0_unsup_loss_numpy: 36810911744.0|  0:01:50s
epoch 10 | loss: 0.95535 | val_0_unsup_loss_numpy: 167888784.0|  0:02:01s
epoch 11 | loss: 0.95811 | val_0_unsup_loss_numpy: 242935456.0|  0:02:13s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_0_unsup_loss_numpy = 8663917.0


# Pre-trained 된 모델로 test

In [15]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [16]:
clf.fit(X_train,y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['logloss','f1'],
    from_unsupervised=unsupervised
)

epoch 0  | loss: 0.09569 | train_logloss: 0.01269 | train_f1: 0.75682 | valid_logloss: 0.14967 | valid_f1: 0.29579 |  0:00:11s
epoch 1  | loss: 0.00952 | train_logloss: 0.00736 | train_f1: 0.87983 | valid_logloss: 0.13115 | valid_f1: 0.34029 |  0:00:22s
epoch 2  | loss: 0.00788 | train_logloss: 0.00848 | train_f1: 0.78746 | valid_logloss: 0.21431 | valid_f1: 0.03168 |  0:00:33s
epoch 3  | loss: 0.00567 | train_logloss: 0.00201 | train_f1: 0.9702  | valid_logloss: 0.16939 | valid_f1: 0.04331 |  0:00:43s
epoch 4  | loss: 0.00366 | train_logloss: 0.00461 | train_f1: 0.91751 | valid_logloss: 0.06147 | valid_f1: 0.36    |  0:00:52s
epoch 5  | loss: 0.00333 | train_logloss: 0.00317 | train_f1: 0.9453  | valid_logloss: 0.09865 | valid_f1: 0.23345 |  0:01:02s
epoch 6  | loss: 0.00268 | train_logloss: 0.00326 | train_f1: 0.9501  | valid_logloss: 0.04979 | valid_f1: 0.44564 |  0:01:12s
epoch 7  | loss: 0.00226 | train_logloss: 0.00154 | train_f1: 0.96615 | valid_logloss: 0.06729 | valid_f1: 0.46

In [17]:
predicted_test=clf.predict(X_test)
score=f1_score(y_test,predicted_test)
print(score)

0.9663951120162932
