# TabNet: Attentive Interpretable Tabular Learning

In [49]:
import pandas as pd                                        # pandas dataframes
import numpy  as np                                        # numpy arrays
import torch                                               # PyTorch: neural network backend
from pytorch_tabnet.tab_model import TabNetClassifier      # TabNet: neural network classifier for tabular data
from sklearn.model_selection import KFold                  # k-fold cross-validation
from sklearn.metrics import accuracy_score                 # classification accuracy
from sklearn.metrics import cohen_kappa_score              # Cohen's kappa: chance-corrected agreement (an evaluation metric, not a loss)

In [44]:
# Wisconsin Diagnostic Breast Cancer (WDBC) data set; the file is read without a header row
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

# Skip column 0 (an identifier with no predictive value) and keep columns 1..31.
wdbc = pd.read_csv(url, header=None, usecols=range(1, 32))

# The first retained column holds the class label; every remaining column is a feature.
label, *features = wdbc.columns

# Integer-encode the labels: codes are assigned in order of first appearance in the file.
y = wdbc[label].factorize()[0]

# Design matrix (still a DataFrame at this point).
X = wdbc[features]

In [45]:
# Build both arrays from `wdbc` instead of rebinding X and y from themselves, so this
# cell can be re-run safely: calling .to_numpy() on an ndarray raises AttributeError.
X = wdbc[features].to_numpy()                    # design matrix as a numpy array
y = torch.tensor(pd.factorize(wdbc[label])[0])   # integer class codes as a torch tensor

In [46]:
X # preview the design matrix (numpy elides the middle rows/columns of a large array)

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [47]:
y[:40]  # preview: first 40 integer-encoded class labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [52]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)  # 5-fold CV plan, shuffled with a fixed seed

predictions_array = [] # out-of-fold predicted class labels, one array per fold
CV_score_array    = [] # best early-stopping score (validation log-loss) of each fold

# NOTE(review): each held-out fold is also the early-stopping eval_set, so the best
# epoch is selected on the same data the metrics below are computed on; the reported
# scores are therefore optimistic estimates.
for train_index, test_index in kf.split(X):
    
    X_train, X_valid = X[train_index], X[test_index]   # X variables
    y_train, y_valid = y[train_index], y[test_index]   # y vector
    
    classifier = TabNetClassifier(verbose=0,seed=42)   # fresh model per fold
    
    classifier.fit(X_train=X_train, y_train=y_train,   # in-sample data
               eval_set=[(X_valid, y_valid)],          # out-of-fold data
               patience=100,                           # stop after this many epochs without improvement
               max_epochs=2000,                        # maximum training epochs
               eval_metric=['logloss'])                # validation metric monitored for early stopping
    
    # Predict the held-out fold once and reuse the result everywhere below.
    y_pred = classifier.predict(X_valid)
    
    CV_score_array.append(classifier.best_cost)
    # Store the class labels as predicted. The earlier np.expm1(...) wrapper is the
    # inverse of a log1p target transform (a regression idiom); applied to class
    # labels it turned every predicted 1 into e - 1 (about 1.718).
    predictions_array.append(y_pred)
    
    print("True class labels:", y_valid[0:10])
    print("Predicted class labels:", y_pred[0:10])
    print("Kappa:", cohen_kappa_score(y1=y_valid, y2=y_pred))
    print("Accuracy:", accuracy_score(y_true=y_valid, y_pred=y_pred))

    # ignore UserWarning: CUDA initialization: Found no NVIDIA driver on your system


Early stopping occured at epoch 113 with best_epoch = 13 and best_val_0_logloss = 1.87535
Best weights from best epoch are automatically used!
True class labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])
Predicted class labels: [0 0 0 0 0 0 0 1 0 0]
Kappa: 0.2348993288590604
Accuracy: 0.5614035087719298

Early stopping occured at epoch 100 with best_epoch = 0 and best_val_0_logloss = 2.72705
Best weights from best epoch are automatically used!
True class labels: tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
Predicted class labels: [0 1 1 1 0 1 0 0 0 0]
Kappa: 0.38827136834036025
Accuracy: 0.7543859649122807

Early stopping occured at epoch 103 with best_epoch = 3 and best_val_0_logloss = 3.29532
Best weights from best epoch are automatically used!
True class labels: tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])
Predicted class labels: [0 1 1 0 1 1 0 0 0 1]
Kappa: 0.2411497730711044
Accuracy: 0.6140350877192983

Early stopping occured at epoch 107 with best_epoch = 7 and best_val_0_logloss = 0.9569
Bes

In [None]:
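# TODO: examine hyperparameters -- in every fold log shown above, early stopping selected a best epoch of 13 or earlier, so the default settings may suit this small data set poorly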