This notebook implements stacking for **binary** classification with the following meta-learner:
1. TabNet

# Imports & Dataset Setup

In [1]:
import numpy as np
from optuna import Trial, visualization
# Single seed value for the notebook; applied here to NumPy's global (legacy) RNG.
seed = 42
np.random.seed(seed)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import PredefinedSplit, GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
import torch
import matplotlib.pyplot as plt

In [2]:
import warnings
# Suppress every warning category from here on. This keeps cell output compact,
# but it also hides deprecation and numerical warnings raised by later cells.
warnings.filterwarnings("ignore")

In [3]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0


In [4]:
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

In [5]:
# MobileNetV2 prediction tables, one CSV per split.
# 'Unnamed: 0' is dropped from each; presumably it is the index column written
# when the CSV was saved -- TODO confirm against the notebook that produced it.
b_mobilenet_train = (
    pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_train.csv")
    .drop(columns=['Unnamed: 0'])
)
b_mobilenet_val = (
    pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_val.csv")
    .drop(columns=['Unnamed: 0'])
)
b_mobilenet_test = (
    pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_test.csv")
    .drop(columns=['Unnamed: 0'])
)

In [6]:
# DenseNet prediction tables, one CSV per split.
# 'Unnamed: 0' is dropped from each; presumably it is the index column written
# when the CSV was saved -- TODO confirm against the notebook that produced it.
b_densenet_train = (
    pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_train.csv")
    .drop(columns=['Unnamed: 0'])
)
b_densenet_val = (
    pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_val.csv")
    .drop(columns=['Unnamed: 0'])
)
b_densenet_test = (
    pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_test.csv")
    .drop(columns=['Unnamed: 0'])
)

In [7]:
# YOLOv8 prediction tables, one CSV per split.
# 'Unnamed: 0' is dropped from each; presumably it is the index column written
# when the CSV was saved -- TODO confirm against the notebook that produced it.
b_yolov8_train = (
    pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_train.csv")
    .drop(columns=['Unnamed: 0'])
)
b_yolov8_val = (
    pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_val.csv")
    .drop(columns=['Unnamed: 0'])
)
b_yolov8_test = (
    pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_test.csv")
    .drop(columns=['Unnamed: 0'])
)

In [8]:
# The YOLOv8 tables name their file column 'FilePath'; rename it to 'FileName'
# (presumably so it lines up with the other models' tables for the merge below -- verify).
# rename() returns a new frame that is rebound to the same name, avoiding in-place
# mutation of a frame an earlier cell created.
b_yolov8_train = b_yolov8_train.rename(columns={'FilePath': 'FileName'})
b_yolov8_val = b_yolov8_val.rename(columns={'FilePath': 'FileName'})
b_yolov8_test = b_yolov8_test.rename(columns={'FilePath': 'FileName'})

In [9]:
# Combine the three models' prediction tables into one frame per split.
# No `on=` is given, so pandas joins on every column name the two frames share,
# using its default inner join -- rows missing from any table are dropped.
# NOTE(review): the shared key columns are not visible here; confirm they are the intended ones.
train = pd.merge(pd.merge(b_mobilenet_train, b_yolov8_train), b_densenet_train)
val = pd.merge(pd.merge(b_mobilenet_val, b_yolov8_val), b_densenet_val)
test = pd.merge(pd.merge(b_mobilenet_test, b_yolov8_test), b_densenet_test)

In [10]:
# Columns fed to the meta-learner and the column holding the ground truth.
# NOTE(review): 'm', 'm_0' and 'd' are presumably the per-model prediction columns
# from the merged tables -- confirm which model produced each.
meta_feature_cols = ['m', 'm_0', 'd']
label_cols = ['y_true']

# Same selection applied to every split; results stay DataFrames.
X_train, X_val, X_test = (split[meta_feature_cols] for split in (train, val, test))
y_train, y_val, y_test = (split[label_cols] for split in (train, val, test))

# TabNet

In [11]:
# Convert the training split to NumPy arrays for TabNet.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely (calling .to_numpy() a second time would fail on the already-converted array).
X_train = np.asarray(X_train)
# Labels are kept as a single column of shape (n_samples, 1).
y_train = np.asarray(y_train).reshape(-1, 1)

In [12]:
# Convert the validation split to NumPy arrays, mirroring the training split.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely (calling .to_numpy() a second time would fail on the already-converted array).
X_val = np.asarray(X_val)
# Labels are kept as a single column of shape (n_samples, 1).
y_val = np.asarray(y_val).reshape(-1, 1)

In [13]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [20]:
# TabNet meta-learner trained on the stacked base-model features.
classifier = TabNetClassifier(verbose=1, seed=42)

# fit() is given 1-D label vectors, so flatten the column-shaped arrays once.
train_labels = y_train.ravel()
valid_labels = y_val.ravel()

# Accuracy and AUC are reported every epoch on both evaluation sets.
# NOTE(review): patience equals max_epochs, so early stopping presumably can never
# end the run before the final epoch -- confirm this is intended.
classifier.fit(
    X_train=X_train,
    y_train=train_labels,
    eval_set=[(X_train, train_labels), (X_val, valid_labels)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'auc'],
    max_epochs=200,
    patience=200,
)

epoch 0  | loss: 0.24923 | train_accuracy: 0.52136 | train_auc: 0.97897 | valid_accuracy: 0.48184 | valid_auc: 0.88353 |  0:00:00s
epoch 1  | loss: 0.15981 | train_accuracy: 0.42347 | train_auc: 0.98222 | valid_accuracy: 0.41768 | valid_auc: 0.88541 |  0:00:01s
epoch 2  | loss: 0.15551 | train_accuracy: 0.42347 | train_auc: 0.97583 | valid_accuracy: 0.41768 | valid_auc: 0.85729 |  0:00:01s
epoch 3  | loss: 0.14814 | train_accuracy: 0.42347 | train_auc: 0.94069 | valid_accuracy: 0.41768 | valid_auc: 0.8155  |  0:00:02s
epoch 4  | loss: 0.14586 | train_accuracy: 0.42347 | train_auc: 0.97706 | valid_accuracy: 0.41768 | valid_auc: 0.88093 |  0:00:03s
epoch 5  | loss: 0.14233 | train_accuracy: 0.42347 | train_auc: 0.96196 | valid_accuracy: 0.41768 | valid_auc: 0.86697 |  0:00:03s
epoch 6  | loss: 0.14009 | train_accuracy: 0.42347 | train_auc: 0.96395 | valid_accuracy: 0.41768 | valid_auc: 0.87681 |  0:00:04s
epoch 7  | loss: 0.13689 | train_accuracy: 0.42347 | train_auc: 0.96509 | valid_acc

In [21]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score

In [33]:
# ROC AUC of the fitted classifier on the training split, scored on column 1 of predict_proba.
# The value argument was missing from the print call, so the cell only printed the label.
print("AUC:", round(roc_auc_score(y_train, classifier.predict_proba(X_train)[:, 1]), 3))

AUC: 0.984


In [44]:
def report_split_metrics(title, model, features, labels):
    """Print accuracy, balanced accuracy and ROC AUC of `model` on one data split.

    Args:
        title: Header line printed before the three metrics.
        model: Fitted classifier exposing `predict` and `predict_proba`.
        features: Feature matrix (DataFrame or ndarray); converted with `np.asarray`.
        labels: True labels (single-column DataFrame or ndarray); flattened to 1-D.
    """
    features = np.asarray(features)
    labels = np.asarray(labels).ravel()
    # Run each inference pass once and reuse the result for every metric.
    predicted = model.predict(features)
    # Column 1 of predict_proba is used as the score for AUC, as in the original cell.
    scores = model.predict_proba(features)[:, 1]
    print(title)
    # Builtin round() works whether the metric comes back as a NumPy scalar or a plain float.
    print("Accuracy: ", round(accuracy_score(labels, predicted), 3))
    print("Balanced Accuracy: ", round(balanced_accuracy_score(labels, predicted), 3))
    print("AUC:", round(roc_auc_score(labels, scores), 3))


report_split_metrics("Testing on training set:", classifier, X_train, y_train)
report_split_metrics("\nTesting on validation set:", classifier, X_val, y_val)
report_split_metrics("\nTesting on testing set:", classifier, X_test, y_test)

Testing on training set:
Accuracy:  0.865
Balanced Accuracy:  0.88
AUC: 0.984

Testing on validation set:
Accuracy:  0.757
Balanced Accuracy:  0.778
AUC: 0.902

Testing on testing set:
Accuracy:  0.806
Balanced Accuracy:  0.823
AUC: 0.945
