# MODEL COMPARISON

In [31]:
import model_functions
import importlib

importlib.reload(model_functions)

from model_functions import (
    preprocess_diabetes,
    logistic_cross_validation,
    svm_cross_validation,
    mlp_cross_validation,
    hybrid_nn_svm_cv
)

import torch.nn as nn
import numpy as np
import pandas as pd


# 1. Preprocess data

In [32]:
file = 'data/diabetes.csv'

df = pd.read_csv(file)

prep_data = preprocess_diabetes(df)

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]
y_train_numeric = y_train.replace({"Malignant": 1, "Benign": 0}).astype(float)
y_test_numeric  = y_test.replace({"Malignant": 1, "Benign": 0}).astype(float)

Selected 18 features from 21 (threshold=0.05)


# 2. Models comparison

## 2.0. Preparation

In [33]:
results = pd.DataFrame(
    columns=[
        "accuracy_mean", "accuracy_std",
        "f1_mean", "f1_std",
        "recall_mean", "recall_std",
        "auc_mean", "auc_std"
    ]
)

def store_results(model_name, metrics):
    results.loc[model_name] = [
        metrics["accuracy_mean"], metrics["accuracy_std"],
        metrics["f1_mean"],       metrics["f1_std"],
        metrics["recall_mean"],   metrics["recall_std"],
        metrics["auc_mean"],      metrics["auc_std"]
    ]

## 2.1. Run all models

In [34]:
#----------------------#
# LOGISTIC REGRESSION  #
#----------------------#

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

metrics_lr, preds_lr = logistic_cross_validation(
    X_train, y_train,
    k=10
)
store_results("Logistic Regression", metrics_lr)

Fold 1/10
Fold 2/10
Fold 3/10
Fold 4/10
Fold 5/10
Fold 6/10
Fold 7/10
Fold 8/10
Fold 9/10
Fold 10/10


In [35]:
#----------------------#
#         SVM          #
#----------------------#
metrics_svm, preds_svm = svm_cross_validation(
    X_train, y_train,
    k=10
)
store_results("SVM (RBF)", metrics_svm)

Fold 1/10
Fold 2/10
Fold 3/10
Fold 4/10
Fold 5/10
Fold 6/10
Fold 7/10
Fold 8/10
Fold 9/10
Fold 10/10


In [36]:
#----------------------#
#          MLP         #
#----------------------#

metrics_mlp, preds_mlp = mlp_cross_validation(
    X_train, y_train,
    k=10
)
store_results("MLP", metrics_mlp)

Fold 1/10
Fold 2/10
Fold 3/10
Fold 4/10
Fold 5/10
Fold 6/10
Fold 7/10
Fold 8/10
Fold 9/10
Fold 10/10


In [37]:
#----------------------#
#    HYBRID MODEL      #
#----------------------#

import torch

X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_numeric.to_numpy(), dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

metrics_hybrid, preds_hybrid = hybrid_nn_svm_cv(
    X_train_tensor, y_train_tensor,
    k=10
)
store_results("Hybrid NN + SVM", metrics_hybrid)



Fold 1/10


  y = column_or_1d(y, warn=True)



Fold 2/10


  y = column_or_1d(y, warn=True)



Fold 3/10


  y = column_or_1d(y, warn=True)



Fold 4/10


  y = column_or_1d(y, warn=True)



Fold 5/10


  y = column_or_1d(y, warn=True)



Fold 6/10


  y = column_or_1d(y, warn=True)



Fold 7/10


  y = column_or_1d(y, warn=True)



Fold 8/10


  y = column_or_1d(y, warn=True)



Fold 9/10


  y = column_or_1d(y, warn=True)



Fold 10/10


  y = column_or_1d(y, warn=True)


## 2.2. Results

In breast cancer detection:

- A false negative = cancer is present but missed → health risk

- A false positive = further exam → less risky

Therefore, clinically, recall (sensitivity) is the primary priority.

In [38]:
results.sort_values(by=["recall_mean", "auc_mean"], ascending=False).round(3)


Unnamed: 0,accuracy_mean,accuracy_std,f1_mean,f1_std,recall_mean,recall_std,auc_mean,auc_std
Hybrid NN + SVM,0.747,0.005,0.765,0.005,0.825,0.011,0.823,0.004
SVM (RBF),0.703,0.004,0.733,0.004,0.816,0.009,0.771,0.006
MLP,0.748,0.003,0.762,0.009,0.809,0.042,0.824,0.004
Logistic Regression,0.748,0.006,0.753,0.006,0.767,0.009,0.824,0.003


<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>accuracy_mean</th>
      <th>accuracy_std</th>
      <th>f1_mean</th>
      <th>f1_std</th>
      <th>recall_mean</th>
      <th>recall_std</th>
      <th>auc_mean</th>
      <th>auc_std</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Hybrid NN + SVM</th>
      <td>0.747</td>
      <td>0.005</td>
      <td>0.765</td>
      <td>0.005</td>
      <td>0.825</td>
      <td>0.011</td>
      <td>0.823</td>
      <td>0.004</td>
    </tr>
    <tr>
      <th>SVM (RBF)</th>
      <td>0.703</td>
      <td>0.004</td>
      <td>0.733</td>
      <td>0.004</td>
      <td>0.816</td>
      <td>0.009</td>
      <td>0.771</td>
      <td>0.006</td>
    </tr>
    <tr>
      <th>MLP</th>
      <td>0.748</td>
      <td>0.003</td>
      <td>0.762</td>
      <td>0.009</td>
      <td>0.809</td>
      <td>0.042</td>
      <td>0.824</td>
      <td>0.004</td>
    </tr>
    <tr>
      <th>Logistic Regression</th>
      <td>0.748</td>
      <td>0.006</td>
      <td>0.753</td>
      <td>0.006</td>
      <td>0.767</td>
      <td>0.009</td>
      <td>0.824</td>
      <td>0.003</td>
    </tr>
  </tbody>
</table>
</div>

# 3. Statistical test implementations

In [39]:
recalls_lr      = np.array(metrics_lr["recalls_per_fold"])
recalls_svm     = np.array(metrics_svm["recalls_per_fold"])
recalls_mlp     = np.array(metrics_mlp["recalls_per_fold"])
recalls_hybrid  = np.array(metrics_hybrid["recalls_per_fold"])

## 3.1. CI95 (Confidence Intervals)

In [40]:
from scipy import stats

def ci_95(values):
    mean = np.mean(values)
    sem = stats.sem(values)
    return (mean - 1.96 * sem, mean + 1.96 * sem)


In [41]:
ci95_hybrid = ci_95(recalls_hybrid)
ci95_svm    = ci_95(recalls_svm)
ci95_lr     = ci_95(recalls_lr)
ci95_mlp    = ci_95(recalls_mlp)


## 3.2. Wilcoxon Signed-Rank

In [42]:
from scipy.stats import wilcoxon

wilcox_hybrid_svm = wilcoxon(recalls_hybrid, recalls_svm)
wilcox_hybrid_lr  = wilcoxon(recalls_hybrid, recalls_lr)
wilcox_hybrid_mlp = wilcoxon(recalls_hybrid, recalls_mlp)

print("Wilcox Hybrid vs SVM", wilcox_hybrid_svm)

print("Wilcox Hybrid vs Logistic Regression", wilcox_hybrid_lr)


print("Wilcox Hybrid vs MLP", wilcox_hybrid_mlp)


Wilcox Hybrid vs SVM WilcoxonResult(statistic=np.float64(6.0), pvalue=np.float64(0.02734375))
Wilcox Hybrid vs Logistic Regression WilcoxonResult(statistic=np.float64(0.0), pvalue=np.float64(0.001953125))
Wilcox Hybrid vs MLP WilcoxonResult(statistic=np.float64(20.0), pvalue=np.float64(0.4921875))


## 3.3. Paired t-test

In [43]:
from scipy.stats import shapiro

shapiro_diff = shapiro(recalls_hybrid - recalls_svm)
print("Normality p-value:", shapiro_diff.pvalue)

Normality p-value: 0.537674214762842


They are not normal -> No paired t-test

In [44]:
stats_results = pd.DataFrame({
    "Model": ["Hybrid vs SVM","Hybrid vs LR","Hybrid vs MLP"],
    "Wilcoxon_p": [
        wilcox_hybrid_svm.pvalue,
        wilcox_hybrid_lr.pvalue,
        wilcox_hybrid_mlp.pvalue
    ],
    "CI95_Hybrid": [ci_95(recalls_hybrid)]*3,
    "CI95_Other": [
        ci_95(recalls_svm),
        ci_95(recalls_lr),
        ci_95(recalls_mlp)
    ]
})

In [45]:
stats_results

Unnamed: 0,Model,Wilcoxon_p,CI95_Hybrid,CI95_Other
0,Hybrid vs SVM,0.027344,"(0.8172390554419371, 0.832014805463887)","(0.8107633425368862, 0.8220117915635167)"
1,Hybrid vs LR,0.001953,"(0.8172390554419371, 0.832014805463887)","(0.7616013251238245, 0.7729311359043796)"
2,Hybrid vs MLP,0.492188,"(0.8172390554419371, 0.832014805463887)","(0.7809194292024884, 0.836157589783457)"


Table with diabetes

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Model</th>
      <th>Wilcoxon_p</th>
      <th>CI95_Hybrid</th>
      <th>CI95_Other</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Hybrid vs SVM</td>
      <td>0.027344</td>
      <td>(0.8172390554419371, 0.832014805463887)</td>
      <td>(0.8107633425368862, 0.8220117915635167)</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Hybrid vs LR</td>
      <td>0.001953</td>
      <td>(0.8172390554419371, 0.832014805463887)</td>
      <td>(0.7616013251238245, 0.7729311359043796)</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Hybrid vs MLP</td>
      <td>0.492188</td>
      <td>(0.8172390554419371, 0.832014805463887)</td>
      <td>(0.7809194292024884, 0.836157589783457)</td>
    </tr>
  </tbody>
</table>
</div>

Confidence interval interpretation:

Example:

- Hybrid CI(Rec): (0.916, 0.981)

- SVM CI(Rec): (0.903, 0.983)

Observation:

- Intervals overlap

- Overlap means no statistically meaningful separation

Same conclusion for LR and MLP.

“Pairwise comparisons between the Hybrid model and baseline models were conducted using the Wilcoxon signed-rank test. Although the Hybrid model achieved higher mean performance across all comparisons, none of the differences were statistically significant at the 5% level (p ≥ 0.0625). This suggests that while the Hybrid approach shows a consistent performance advantage, the observed improvements cannot be considered statistically significant given the current sample size.”