# MODEL COMPARISON

In [11]:
import model_functions
import importlib

importlib.reload(model_functions)

from model_functions import (
    preprocess_diabetes,
    logistic_cross_validation,
    svm_cross_validation,
    mlp_cross_validation,
    hybrid_nn_svm_cv
)

import torch.nn as nn
import numpy as np
import pandas as pd


# 1. Preprocess data

In [12]:
file = 'data/diabetes.csv'

df = pd.read_csv(file)

prep_data = preprocess_diabetes(df)

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]
y_train_numeric = y_train.replace({"Malignant": 1, "Benign": 0}).astype(float)
y_test_numeric  = y_test.replace({"Malignant": 1, "Benign": 0}).astype(float)

Selected 18 features from 21 (threshold=0.05)


# 2. Models comparison

## 2.0. Preparation

In [13]:
results = pd.DataFrame(
    columns=[
        "accuracy_mean", "accuracy_std",
        "f1_mean", "f1_std",
        "recall_mean", "recall_std",
        "auc_mean", "auc_std"
    ]
)

def store_results(model_name, metrics):
    results.loc[model_name] = [
        metrics["accuracy_mean"], metrics["accuracy_std"],
        metrics["f1_mean"],       metrics["f1_std"],
        metrics["recall_mean"],   metrics["recall_std"],
        metrics["auc_mean"],      metrics["auc_std"]
    ]

## 2.1. Run all models

In [14]:
#----------------------#
# LOGISTIC REGRESSION  #
#----------------------#

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

metrics_lr, preds_lr = logistic_cross_validation(
    X_train, y_train,
    k=5
)
store_results("Logistic Regression", metrics_lr)

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [15]:
#----------------------#
#         SVM          #
#----------------------#
metrics_svm, preds_svm = svm_cross_validation(
    X_train, y_train,
    k=5
)
store_results("SVM (RBF)", metrics_svm)

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [16]:
#----------------------#
#          MLP         #
#----------------------#

metrics_mlp, preds_mlp = mlp_cross_validation(
    X_train, y_train,
    k=5
)
store_results("MLP", metrics_mlp)

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [17]:
#----------------------#
#    HYBRID MODEL      #
#----------------------#

import torch

X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_numeric.to_numpy(), dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

metrics_hybrid, preds_hybrid = hybrid_nn_svm_cv(
    X_train_tensor, y_train_tensor,
    k=5
)
store_results("Hybrid NN + SVM", metrics_hybrid)



Fold 1/5


  y = column_or_1d(y, warn=True)



Fold 2/5


  y = column_or_1d(y, warn=True)



Fold 3/5


  y = column_or_1d(y, warn=True)



Fold 4/5


  y = column_or_1d(y, warn=True)



Fold 5/5


  y = column_or_1d(y, warn=True)


## 2.2. Results

In breast cancer detection:

- A false negative = cancer is present but missed → health risk

- A false positive = further exam → less risky

Therefore, clinically, recall (sensitivity) is the primary priority.

In [26]:
results.sort_values(by=["recall_mean", "auc_mean"], ascending=False).round(3)


Unnamed: 0,accuracy_mean,accuracy_std,f1_mean,f1_std,recall_mean,recall_std,auc_mean,auc_std
Hybrid NN + SVM,0.748,0.002,0.766,0.003,0.828,0.009,0.824,0.002
SVM (RBF),0.703,0.003,0.733,0.003,0.816,0.006,0.772,0.004
MLP,0.748,0.001,0.76,0.006,0.798,0.029,0.825,0.002
Logistic Regression,0.748,0.002,0.752,0.003,0.767,0.006,0.824,0.002


<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>accuracy_mean</th>
      <th>accuracy_std</th>
      <th>f1_mean</th>
      <th>f1_std</th>
      <th>recall_mean</th>
      <th>recall_std</th>
      <th>auc_mean</th>
      <th>auc_std</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Hybrid NN + SVM</th>
      <td>0.746</td>
      <td>0.002</td>
      <td>0.765</td>
      <td>0.001</td>
      <td>0.826</td>
      <td>0.006</td>
      <td>0.823</td>
      <td>0.002</td>
    </tr>
    <tr>
      <th>SVM (RBF)</th>
      <td>0.703</td>
      <td>0.003</td>
      <td>0.733</td>
      <td>0.003</td>
      <td>0.816</td>
      <td>0.006</td>
      <td>0.772</td>
      <td>0.004</td>
    </tr>
    <tr>
      <th>MLP</th>
      <td>0.749</td>
      <td>0.003</td>
      <td>0.759</td>
      <td>0.006</td>
      <td>0.791</td>
      <td>0.029</td>
      <td>0.823</td>
      <td>0.001</td>
    </tr>
    <tr>
      <th>Logistic Regression</th>
      <td>0.748</td>
      <td>0.002</td>
      <td>0.752</td>
      <td>0.003</td>
      <td>0.767</td>
      <td>0.006</td>
      <td>0.824</td>
      <td>0.002</td>
    </tr>
  </tbody>
</table>
</div>

# 3. Statistical test implementations

In [27]:
recalls_lr      = np.array(metrics_lr["recalls_per_fold"])
recalls_svm     = np.array(metrics_svm["recalls_per_fold"])
recalls_mlp     = np.array(metrics_mlp["recalls_per_fold"])
recalls_hybrid  = np.array(metrics_hybrid["recalls_per_fold"])

## 3.1. CI95 (Confidence Intervals)

In [28]:
from scipy import stats

def ci_95(values):
    mean = np.mean(values)
    sem = stats.sem(values)
    return (mean - 1.96 * sem, mean + 1.96 * sem)


In [29]:
ci95_hybrid = ci_95(recalls_hybrid)
ci95_svm    = ci_95(recalls_svm)
ci95_lr     = ci_95(recalls_lr)
ci95_mlp    = ci_95(recalls_mlp)


## 3.2. Wilcoxon Signed-Rank

In [30]:
from scipy.stats import wilcoxon

wilcox_hybrid_svm = wilcoxon(recalls_hybrid, recalls_svm)
wilcox_hybrid_lr  = wilcoxon(recalls_hybrid, recalls_lr)
wilcox_hybrid_mlp = wilcoxon(recalls_hybrid, recalls_mlp)

print("Wilcox Hybrid vs SVM", wilcox_hybrid_svm)

print("Wilcox Hybrid vs Logistic Regression", wilcox_hybrid_lr)


print("Wilcox Hybrid vs MLP", wilcox_hybrid_mlp)


Wilcox Hybrid vs SVM WilcoxonResult(statistic=np.float64(0.0), pvalue=np.float64(0.0625))
Wilcox Hybrid vs Logistic Regression WilcoxonResult(statistic=np.float64(0.0), pvalue=np.float64(0.0625))
Wilcox Hybrid vs MLP WilcoxonResult(statistic=np.float64(2.0), pvalue=np.float64(0.1875))


## 3.3. Paired t-test

In [23]:
from scipy.stats import shapiro

shapiro_diff = shapiro(recalls_hybrid - recalls_svm)
print("Normality p-value:", shapiro_diff.pvalue)

Normality p-value: 0.4616813073906239


They are not normal -> No paired t-test

In [24]:
stats_results = pd.DataFrame({
    "Model": ["Hybrid vs SVM","Hybrid vs LR","Hybrid vs MLP"],
    "Wilcoxon_p": [
        wilcox_hybrid_svm.pvalue,
        wilcox_hybrid_lr.pvalue,
        wilcox_hybrid_mlp.pvalue
    ],
    "CI95_Hybrid": [ci_95(recalls_hybrid)]*3,
    "CI95_Other": [
        ci_95(recalls_svm),
        ci_95(recalls_lr),
        ci_95(recalls_mlp)
    ]
})

In [25]:
stats_results

Unnamed: 0,Model,Wilcoxon_p,CI95_Hybrid,CI95_Other
0,Hybrid vs SVM,0.0625,"(0.8186419759167292, 0.8367675388884839)","(0.8098823757392031, 0.8221153681806983)"
1,Hybrid vs LR,0.0625,"(0.8186419759167292, 0.8367675388884839)","(0.7614996613873273, 0.7723973292969628)"
2,Hybrid vs MLP,0.1875,"(0.8186419759167292, 0.8367675388884839)","(0.7691956994316677, 0.8270114472398012)"


Table with diabetes


<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Model</th>
      <th>Wilcoxon_p</th>
      <th>CI95_Hybrid</th>
      <th>CI95_Other</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Hybrid vs SVM</td>
      <td>0.0625</td>
      <td>(0.8186419759167292, 0.8367675388884839)</td>
      <td>(0.8098823757392031, 0.8221153681806983)</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Hybrid vs LR</td>
      <td>0.0625</td>
      <td>(0.8186419759167292, 0.8367675388884839)</td>
      <td>(0.7614996613873273, 0.7723973292969628)</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Hybrid vs MLP</td>
      <td>0.1875</td>
      <td>(0.8186419759167292, 0.8367675388884839)</td>
      <td>(0.7691956994316677, 0.8270114472398012)</td>
    </tr>
  </tbody>
</table>
</div>

Confidence interval interpretation:

Example:

- Hybrid CI(Rec): (0.916, 0.981)

- SVM CI(Rec): (0.903, 0.983)

Observation:

- Intervals overlap

- Overlap means no statistically meaningful separation

Same conclusion for LR and MLP.

Statistical comparison was performed using fold-wise recall scores over 30 stratified cross-validation runs. The Wilcoxon signed-rank test revealed no statistically significant differences between the Hybrid method and any of the baseline models (p > 0.05 in all comparisons), indicating that the observed performance differences lie within typical sampling variability. The 95% confidence intervals of recall values overlapped across all models, supporting the conclusion that all models perform equivalently in terms of diagnostic sensitivity.

Hybrid does not significantly outperform SVM/MLP/LR. All models are statistically equivalent in terms of recall.

Given similar recall rates (all >0.90), any of the models could be deployed as a screening method without a meaningful difference in cancer detection safety.

Given statistically equivalent recall levels, the logistic regression is preferred due to lower computational complexity, easier interpretability, and more consistent performance distribution.

The BCWD dataset:

- Is low-dimensional

- Has separable structure

- Responds well to linear decision boundaries

Even though kernel methods (SVM-RBF) and nonlinear neural embeddings can model more complex shapes, those complexities do not help because the dataset is already very well-behaved.

So, the simplest model wins, because the dataset is simple.

