- https://medium.com/@vanillaxiangshuyang/self-supervised-learning-on-tabular-data-with-tabnet-544b3ec85cee
- https://colab.research.google.com/drive/1P8Obe07DP3VeOld08ThyT1HnChLip_LO#scrollTo=gvy9vUUNOP0W

- https://www.kaggle.com/code/sisharaneranjana/semi-supervised-pre-training-with-tabnet#%F0%9F%94%8FDescription-of-the-dataset-
- https://dacon.io/en/codeshare/3837

In [1]:
import pandas as pd
import numpy as np
import pickle

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# 원본 데이터
with open("./dataset/creditcard.pkl","rb") as file:
    data = pickle.load(file)
    
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 데이터 전처리

In [3]:
df = data.copy()

# Time 을 일중 시간으로 변환
df.loc[:, "Time"] = df.loc[:, "Time"].apply(lambda x : x / 3600 % 24)

# Amount column 은 편차가 크므로 log-scale 로 변환
df['Amount'] = np.log(df.pop('Amount') + 0.001)

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,5.008105
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,0.989913
2,0.000278,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,5.936641
3,0.000278,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,4.816249
4,0.000556,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,4.248367


# train_test_split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
features = df.values
labels = np.array(df.pop('Class'))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, random_state=0, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

print("X train shape: ", X_train.shape)
print("Y train shape: ", y_train.shape)
print("===============")
print("X validation shape: ", X_val.shape)
print("y validation shape: ", y_val.shape)
print("===============")
print("X test shape: ", X_test.shape)
print("Y test shape: ", y_test.shape)

X train shape:  (71201, 31)
Y train shape:  (71201,)
X validation shape:  (71202, 31)
y validation shape:  (71202,)
X test shape:  (142404, 31)
Y test shape:  (142404,)


In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
x_val= sc.transform(X_val)

# TabNetClassifier

https://github.com/dreamquark-ai/tabnet

In [9]:
# train on the whole dataset with labels
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

supervised = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [10]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

class F1_Score(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True

    def __call__(self, y_true, y_score):
        score = f1_score(y_true, (y_score[:, 1]>0.5)*1)
        return score

In [11]:
supervised.fit(X_train, y_train,
               patience=5,
               eval_set=[(X_train, y_train), (x_val,y_val)],
               eval_metric=['logloss','f1']
)

epoch 0  | loss: 0.04237 | val_0_logloss: 0.01191 | val_0_f1: 0.03279 | val_1_logloss: 0.01335 | val_1_f1: 0.03053 |  0:00:09s
epoch 1  | loss: 0.00929 | val_0_logloss: 0.00751 | val_0_f1: 0.53801 | val_1_logloss: 0.00917 | val_1_f1: 0.39053 |  0:00:17s
epoch 2  | loss: 0.00557 | val_0_logloss: 0.00422 | val_0_f1: 0.76415 | val_1_logloss: 0.00597 | val_1_f1: 0.72146 |  0:00:25s
epoch 3  | loss: 0.00502 | val_0_logloss: 0.00488 | val_0_f1: 0.63212 | val_1_logloss: 0.00653 | val_1_f1: 0.58883 |  0:00:32s
epoch 4  | loss: 0.00531 | val_0_logloss: 0.0044  | val_0_f1: 0.74286 | val_1_logloss: 0.00625 | val_1_f1: 0.65385 |  0:00:40s
epoch 5  | loss: 0.00557 | val_0_logloss: 0.00428 | val_0_f1: 0.68783 | val_1_logloss: 0.00591 | val_1_f1: 0.6114  |  0:00:48s
epoch 6  | loss: 0.00495 | val_0_logloss: 0.00416 | val_0_f1: 0.79464 | val_1_logloss: 0.00571 | val_1_f1: 0.69828 |  0:00:56s
epoch 7  | loss: 0.00437 | val_0_logloss: 0.00299 | val_0_f1: 0.78261 | val_1_logloss: 0.00453 | val_1_f1: 0.73

In [12]:
predicted_test = supervised.predict(X_test)
score = f1_score(y_test,predicted_test)
print(score)

0.7707317073170733


In [13]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# TabNetPretrainer
unsupervised = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

In [14]:
unsupervised.fit(X_train,
                 eval_set=[X_val],
                 pretraining_ratio=0.8,
)

epoch 0  | loss: 0.98345 | val_0_unsup_loss_numpy: 217.4924774169922|  0:00:07s
epoch 1  | loss: 0.34711 | val_0_unsup_loss_numpy: 266.1296691894531|  0:00:13s
epoch 2  | loss: 0.63993 | val_0_unsup_loss_numpy: 40.44477844238281|  0:00:19s
epoch 3  | loss: 0.27321 | val_0_unsup_loss_numpy: 289.6617431640625|  0:00:26s
epoch 4  | loss: 0.70554 | val_0_unsup_loss_numpy: 242.64297485351562|  0:00:34s
epoch 5  | loss: 0.75601 | val_0_unsup_loss_numpy: 222.50965881347656|  0:00:40s
epoch 6  | loss: -0.00715| val_0_unsup_loss_numpy: 589.055908203125|  0:00:47s
epoch 7  | loss: 0.48132 | val_0_unsup_loss_numpy: 167.38934326171875|  0:00:53s
epoch 8  | loss: 0.63309 | val_0_unsup_loss_numpy: 450.40167236328125|  0:01:00s
epoch 9  | loss: 0.79176 | val_0_unsup_loss_numpy: 107.77104949951172|  0:01:07s
epoch 10 | loss: 0.78177 | val_0_unsup_loss_numpy: 46.169281005859375|  0:01:13s
epoch 11 | loss: 0.05704 | val_0_unsup_loss_numpy: 731.3218383789062|  0:01:19s
epoch 12 | loss: 0.69741 | val_0_un

# Pre-trained 된 모델로 test

In [15]:
generated = pd.read_csv('./dataset/ctgan_generated_0320.csv')
generated.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,472,1.989026,6.759762,-3.436226,7.508126,-2.003154,-1.453226,4.816805,-9.721434,-6.471587,...,13.760328,0.463365,-2.383923,-0.485486,-1.485571,1.296262,0.110573,0.440638,219.8,1
1,6986,-2.064557,0.925835,-9.001415,2.821149,2.934994,0.809051,-2.381658,-0.717233,-1.331499,...,1.226586,1.401159,-0.804631,-0.842344,-0.119244,0.886038,0.461688,0.224494,357.95,1
2,6986,-0.695426,0.619314,-30.991254,2.067791,-5.309895,1.159233,-2.481335,-0.407266,-7.06615,...,-3.563537,0.418333,-0.282519,0.912088,-1.493152,-0.326489,0.438027,-1.15578,3.22,1
3,7535,1.904601,4.020354,-9.863136,4.722613,-3.603447,-3.186403,0.689895,-10.156282,0.433262,...,2.174024,0.633162,-1.02663,0.347991,-0.447925,0.185,-2.555247,-1.000923,45.49,1
4,7543,-11.7042,6.025343,-5.197522,1.851403,4.832916,-0.518359,0.454355,-5.387303,-4.300301,...,1.353062,1.197828,-4.923546,0.573469,-0.704693,0.569465,-0.046174,-0.136388,1.0,1


In [16]:
# 데이터 전처리
ctgan = generated.copy()

# Time 을 일중 시간으로 변환
ctgan.loc[:, "Time"] = ctgan.loc[:, "Time"].apply(lambda x : x / 3600 % 24)

# Amount column 은 편차가 크므로 log-scale 로 변환
eps=0.001
ctgan['Amount'] = np.log(ctgan.pop('Amount') + eps)

ctgan.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount
0,0.131111,1.989026,6.759762,-3.436226,7.508126,-2.003154,-1.453226,4.816805,-9.721434,-6.471587,...,13.760328,0.463365,-2.383923,-0.485486,-1.485571,1.296262,0.110573,0.440638,1,5.392723
1,1.940556,-2.064557,0.925835,-9.001415,2.821149,2.934994,0.809051,-2.381658,-0.717233,-1.331499,...,1.226586,1.401159,-0.804631,-0.842344,-0.119244,0.886038,0.461688,0.224494,1,5.880396
2,1.940556,-0.695426,0.619314,-30.991254,2.067791,-5.309895,1.159233,-2.481335,-0.407266,-7.06615,...,-3.563537,0.418333,-0.282519,0.912088,-1.493152,-0.326489,0.438027,-1.15578,1,1.169692
3,2.093056,1.904601,4.020354,-9.863136,4.722613,-3.603447,-3.186403,0.689895,-10.156282,0.433262,...,2.174024,0.633162,-1.02663,0.347991,-0.447925,0.185,-2.555247,-1.000923,1,3.817515
4,2.095278,-11.7042,6.025343,-5.197522,1.851403,4.832916,-0.518359,0.454355,-5.387303,-4.300301,...,1.353062,1.197828,-4.923546,0.573469,-0.704693,0.569465,-0.046174,-0.136388,1,0.001


In [17]:
ctgan_features = ctgan.values
ctgan_labels = np.array(ctgan.pop('Class'))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(ctgan_features, ctgan_labels, test_size=0.5, random_state=0, stratify=ctgan_labels)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

In [19]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

In [20]:
clf.fit(X_train,y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['logloss','f1'],
    from_unsupervised=unsupervised
)

epoch 0  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 1  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 2  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 3  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 4  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 5  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 6  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38596 |  0:00:00s
epoch 7  | loss: 0.0     | train_logloss: 0.59294 | train_f1: 0.31884 | valid_logloss: 0.44945 | valid_f1: 0.38

In [21]:
predicted_test=clf.predict(X_test)
score=f1_score(y_test,predicted_test)
print(score)

0.36065573770491804
