This notebook has stacking implementation with the following metalearners for multiclass classification:
1. Random Forest
2. CatBoost
3. LightGBM

# Imports & Dataset Setup

In [1]:
import numpy as np
seed = 42
np.random.seed(seed)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import PredefinedSplit, GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
b_mobilenet_train = pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_train.csv").drop(columns=['Unnamed: 0'])
b_mobilenet_val = pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_val.csv").drop(columns=['Unnamed: 0'])
b_mobilenet_test = pd.read_csv("/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_test.csv").drop(columns=['Unnamed: 0'])

In [4]:
b_densenet_train = pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_train.csv").drop(columns=['Unnamed: 0'])
b_densenet_val = pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_val.csv").drop(columns=['Unnamed: 0'])
b_densenet_test = pd.read_csv("/kaggle/input/koa-densenet-preds/b_densenet_test.csv").drop(columns=['Unnamed: 0'])

In [5]:
b_yolov8_train = pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_train.csv").drop(columns=['Unnamed: 0'])
b_yolov8_val = pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_val.csv").drop(columns=['Unnamed: 0'])
b_yolov8_test = pd.read_csv("/kaggle/input/koa-yolov8-preds/b_yolov8_test.csv").drop(columns=['Unnamed: 0'])

In [6]:
b_yolov8_train.rename(columns = {'FilePath': 'FileName'}, inplace=True)
b_yolov8_val.rename(columns = {'FilePath': 'FileName'}, inplace=True)
b_yolov8_test.rename(columns = {'FilePath': 'FileName'}, inplace=True)

In [7]:
train = b_mobilenet_train.merge(b_yolov8_train).merge(b_densenet_train)
val = b_mobilenet_val.merge(b_yolov8_val).merge(b_densenet_val)
test = b_mobilenet_test.merge(b_yolov8_test).merge(b_densenet_test)

In [8]:
X_train = train[['m', 'm_0', 'd']]
X_val = val[['m', 'm_0', 'd']]
X_test = test[['m', 'm_0', 'd']]
y_train = train[['y_true']]
y_val = val[['y_true']]
y_test = test[['y_true']]

In [9]:
# For explicitly deefining the validation set in grid search CV
split_index = [-1]*len(X_train) + [0]*len(X_val)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0).ravel()
pds = PredefinedSplit(test_fold = split_index)

# Random Forest Classifier

In [20]:
rf = RandomForestClassifier(n_jobs = -1, warm_start=True)

In [22]:
distributions = {
    "n_estimators": [170,180,190],
    "criterion": ["gini", "entropy", "log_loss"],
    "min_samples_split": [6,8,10]
}


In [23]:
clf = GridSearchCV(estimator = rf,
                   cv=pds,
                   param_grid=distributions, 
                   n_jobs=-1,
                   verbose=1)

In [24]:
clf.fit(X,y)

Fitting 1 folds for each of 27 candidates, totalling 27 fits


In [25]:
clf.best_estimator_

In [28]:
print("Testing on training set:")
print("Accuracy: ", accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("AUC:", roc_auc_score(y_train,clf.best_estimator_.predict_proba(X_train)[:,1]).round(3))
print("\nTesting on validation set:")
print("Accuracy: ", accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("AUC:", roc_auc_score(y_val,clf.best_estimator_.predict_proba(X_val)[:,1]).round(3))
print("\nTesting on testing set:")
print("Accuracy: ", accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("AUC:", roc_auc_score(y_test,clf.best_estimator_.predict_proba(X_test)[:,1]).round(3))

Testing on training set:
Accuracy:  0.991
Balanced Accuracy:  0.99
AUC: 1.0

Testing on validation set:
Accuracy:  0.958
Balanced Accuracy:  0.956
AUC: 0.996

Testing on testing set:
Accuracy:  0.877
Balanced Accuracy:  0.872
AUC: 0.939


# CatBoost

In [10]:
! pip install -q catboost

In [12]:
from catboost import CatBoostClassifier

In [13]:
cbc = CatBoostClassifier(loss_function='MultiClass', verbose=False, random_seed=seed)

In [14]:
distributions = {
    "iterations": [100],
    "depth": [15],
    "learning_rate": [1e-5,0.00005]
}

In [15]:
clf = GridSearchCV(estimator = cbc,
                   cv=pds,
                   param_grid=distributions, 
                   n_jobs=-1,
                   verbose=1)

In [16]:
clf.fit(X,y)

Fitting 1 folds for each of 2 candidates, totalling 2 fits


In [17]:
clf.best_params_

{'depth': 15, 'iterations': 100, 'learning_rate': 5e-05}

In [19]:
print("Testing on training set:")
print("Accuracy: ", accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("AUC:", roc_auc_score(y_train,clf.best_estimator_.predict_proba(X_train)[:,1]).round(3))
print("\nTesting on validation set:")
print("Accuracy: ", accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("AUC:", roc_auc_score(y_val,clf.best_estimator_.predict_proba(X_val)[:,1]).round(3))
print("\nTesting on testing set:")
print("Accuracy: ", accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("AUC:", roc_auc_score(y_test,clf.best_estimator_.predict_proba(X_test)[:,1]).round(3))

Testing on training set:
Accuracy:  0.959
Balanced Accuracy:  0.955
AUC: 0.986

Testing on validation set:
Accuracy:  0.828
Balanced Accuracy:  0.819
AUC: 0.904

Testing on testing set:
Accuracy:  0.879
Balanced Accuracy:  0.875
AUC: 0.945


# LightGBM

In [20]:
import lightgbm as lgb

In [27]:
lgbm = lgb.LGBMClassifier(n_jobs=-1,random_state=seed, warm_start=True)

In [28]:
distributions = {
    "learning_rate": uniform(0.001,0.1),
    "n_estimators": randint(70,120),
}

In [34]:
clf = RandomizedSearchCV(estimator = cbc,
                       cv=pds,
                       param_distributions=distributions, 
                       n_iter = 500,
                       n_jobs=-1,
                       verbose=1,
                       random_state=seed)

In [35]:
clf.fit(X,y)

Fitting 1 folds for each of 500 candidates, totalling 500 fits


In [36]:
clf.best_params_

{'learning_rate': 0.09232405525564713, 'n_estimators': 76}

In [37]:
print("Testing on training set:")
print("Accuracy: ", accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_train,clf.best_estimator_.predict(X_train)).round(3))
print("AUC:", roc_auc_score(y_train,clf.best_estimator_.predict_proba(X_train)[:,1]).round(3))
print("\nTesting on validation set:")
print("Accuracy: ", accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_val,clf.best_estimator_.predict(X_val)).round(3))
print("AUC:", roc_auc_score(y_val,clf.best_estimator_.predict_proba(X_val)[:,1]).round(3))
print("\nTesting on testing set:")
print("Accuracy: ", accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("Balanced Accuracy: ", balanced_accuracy_score(y_test,clf.best_estimator_.predict(X_test)).round(3))
print("AUC:", roc_auc_score(y_test,clf.best_estimator_.predict_proba(X_test)[:,1]).round(3))

Testing on training set:
Accuracy:  0.96
Balanced Accuracy:  0.957
AUC: 0.987

Testing on validation set:
Accuracy:  0.828
Balanced Accuracy:  0.82
AUC: 0.906

Testing on testing set:
Accuracy:  0.874
Balanced Accuracy:  0.87
AUC: 0.945
