- https://medium.com/@vanillaxiangshuyang/self-supervised-learning-on-tabular-data-with-tabnet-544b3ec85cee
- https://colab.research.google.com/drive/1P8Obe07DP3VeOld08ThyT1HnChLip_LO#scrollTo=gvy9vUUNOP0W

- https://www.kaggle.com/code/sisharaneranjana/semi-supervised-pre-training-with-tabnet#%F0%9F%94%8FDescription-of-the-dataset-
- https://dacon.io/en/codeshare/3837

In [1]:
import pandas as pd
import numpy as np
import pickle

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# 원본 데이터
data = pd.read_csv("./dataset/generated/data3.csv")

# 데이터 전처리

In [3]:
df = data.copy()

# Time 을 일중 시간으로 변환
df.loc[:, "Time"] = df.loc[:, "Time"].apply(lambda x : x / 3600 % 24)

# Amount column 은 편차가 크므로 log-scale 로 변환
df['Amount'] = np.log(df.pop('Amount') + 0.001)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,5.008105
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,0.989913
2,2,0.000278,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,5.936641
3,3,0.000278,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,4.816249
4,4,0.000556,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,4.248367


# train_test_split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
features = df.values
labels = np.array(df.pop('Class'))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, random_state=0, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

print("X train shape: ", X_train.shape)
print("X validation shape: ", X_val.shape)
print("X test shape: ", X_test.shape)
print("Y train shape: ", y_train.shape)
print("Y validation shape: ", y_val.shape)
print("Y test shape: ", y_test.shape)

X train shape:  (71292, 32)
X validation shape:  (71293, 32)
X test shape:  (142585, 32)
Y train shape:  (71292,)
Y validation shape:  (71293,)
Y test shape:  (142585,)


In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
x_val= sc.transform(X_val)

# TabNetClassifier

https://github.com/dreamquark-ai/tabnet

In [9]:
# train on the whole dataset with labels
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

supervised = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [10]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

class F1_Score(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        score = f1_score(y_true, (y_score[:, 1]>0.5)*1)
        return score

In [11]:
supervised.fit(X_train, y_train,
               patience=5,
               eval_set=[(X_train, y_train), (x_val,y_val)],
               eval_metric=['logloss','f1']
)

epoch 0  | loss: 0.08366 | val_0_logloss: 0.0169  | val_0_f1: 0.45198 | val_1_logloss: 0.01779 | val_1_f1: 0.33437 |  0:00:10s
epoch 1  | loss: 0.01182 | val_0_logloss: 0.00814 | val_0_f1: 0.72632 | val_1_logloss: 0.00863 | val_1_f1: 0.69914 |  0:00:21s
epoch 2  | loss: 0.00755 | val_0_logloss: 0.00916 | val_0_f1: 0.74328 | val_1_logloss: 0.01024 | val_1_f1: 0.68195 |  0:00:31s
epoch 3  | loss: 0.00669 | val_0_logloss: 0.00903 | val_0_f1: 0.81416 | val_1_logloss: 0.011   | val_1_f1: 0.77641 |  0:00:41s
epoch 4  | loss: 0.00624 | val_0_logloss: 0.00756 | val_0_f1: 0.82511 | val_1_logloss: 0.0069  | val_1_f1: 0.79293 |  0:00:53s
epoch 5  | loss: 0.00707 | val_0_logloss: 0.00494 | val_0_f1: 0.85106 | val_1_logloss: 0.0047  | val_1_f1: 0.816   |  0:01:06s
epoch 6  | loss: 0.00529 | val_0_logloss: 0.00632 | val_0_f1: 0.76316 | val_1_logloss: 0.00655 | val_1_f1: 0.76271 |  0:01:18s
epoch 7  | loss: 0.00503 | val_0_logloss: 0.00436 | val_0_f1: 0.88837 | val_1_logloss: 0.00351 | val_1_f1: 0.85

In [12]:
predicted_test = supervised.predict(X_test)
score = f1_score(y_test,predicted_test)
print(score)

0.8737373737373738


In [13]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# TabNetPretrainer
unsupervised = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

In [14]:
unsupervised.fit(X_train,
                 eval_set=[X_val],
                 pretraining_ratio=0.8,
)

epoch 0  | loss: 1.33566 | val_0_unsup_loss_numpy: 5110445.5|  0:00:11s
epoch 1  | loss: 0.9939  | val_0_unsup_loss_numpy: 89464944.0|  0:00:22s
epoch 2  | loss: 0.98544 | val_0_unsup_loss_numpy: 3225056.5|  0:00:33s
epoch 3  | loss: 0.98276 | val_0_unsup_loss_numpy: 485151.21875|  0:00:43s
epoch 4  | loss: 0.97515 | val_0_unsup_loss_numpy: 62346912.0|  0:00:54s
epoch 5  | loss: 0.96886 | val_0_unsup_loss_numpy: 4731309056.0|  0:01:05s
epoch 6  | loss: 0.96686 | val_0_unsup_loss_numpy: 15139349504.0|  0:01:16s
epoch 7  | loss: 0.96801 | val_0_unsup_loss_numpy: 27528189952.0|  0:01:27s
epoch 8  | loss: 0.96398 | val_0_unsup_loss_numpy: 75670680.0|  0:01:37s
epoch 9  | loss: 0.96017 | val_0_unsup_loss_numpy: 380426048.0|  0:01:48s
epoch 10 | loss: 0.95443 | val_0_unsup_loss_numpy: 134012232.0|  0:02:00s
epoch 11 | loss: 0.9491  | val_0_unsup_loss_numpy: 81459648.0|  0:02:10s
epoch 12 | loss: 0.95885 | val_0_unsup_loss_numpy: 263123920.0|  0:02:20s
epoch 13 | loss: 0.95728 | val_0_unsup_l

# Pre-trained 된 모델로 test

In [15]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [16]:
clf.fit(X_train,y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['logloss','f1'],
    from_unsupervised=unsupervised
)

epoch 0  | loss: 0.05682 | train_logloss: 0.01411 | train_f1: 0.63934 | valid_logloss: 0.0685  | valid_f1: 0.24786 |  0:00:11s
epoch 1  | loss: 0.00809 | train_logloss: 0.00588 | train_f1: 0.82323 | valid_logloss: 0.06248 | valid_f1: 0.28479 |  0:00:23s
epoch 2  | loss: 0.00513 | train_logloss: 0.00446 | train_f1: 0.83688 | valid_logloss: 0.06317 | valid_f1: 0.28794 |  0:00:34s
epoch 3  | loss: 0.00485 | train_logloss: 0.00348 | train_f1: 0.86331 | valid_logloss: 0.0317  | valid_f1: 0.39628 |  0:00:45s
epoch 4  | loss: 0.00372 | train_logloss: 0.00187 | train_f1: 0.96056 | valid_logloss: 0.06221 | valid_f1: 0.27004 |  0:00:56s
epoch 5  | loss: 0.00146 | train_logloss: 9e-05   | train_f1: 1.0     | valid_logloss: 0.07236 | valid_f1: 0.08411 |  0:01:07s
epoch 6  | loss: 0.00085 | train_logloss: 7e-05   | train_f1: 0.99775 | valid_logloss: 0.07175 | valid_f1: 0.16143 |  0:01:19s
epoch 7  | loss: 0.00065 | train_logloss: 2e-05   | train_f1: 1.0     | valid_logloss: 0.07362 | valid_f1: 0.13

In [17]:
predicted_test=clf.predict(X_test)
score=f1_score(y_test,predicted_test)
print(score)

0.8585987261146497
