<a href="https://colab.research.google.com/github/tanishi22/FYP/blob/main/Code/ML/statistical_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import itertools
from scipy.stats import friedmanchisquare, wilcoxon
from scipy.stats import shapiro, ttest_rel, wilcoxon

Start with Friedman's test, a non-parametric statistical test used to compare three or more related groups; it is useful when the data do not meet the assumption
of normality.

Null hypothesis: All classifiers have the same median performance
If Friedman's test is significant (p < 0.05), then I can proceed to post-hoc comparisons with a paired t-test for each pair of classifiers, using a Bonferroni or Holm correction for multiple testing

Starting with intrinsic only comparison

In [None]:
# Score matrix for the intrinsic-only comparison: one array of 5-fold CV
# results per classifier. Insertion order of the pairs below fixes the
# classifier order used everywhere else.
scores = {
    name: np.array(fold_values)
    for name, fold_values in (
        ("LogisticRegression", [0.6364414, 0.6201882, 0.62532079, 0.63130881, 0.61077844]),
        ("SVM", [0.72968349, 0.69289991, 0.71343028, 0.71941831, 0.69022263]),
        ("RandomForest", [0.82720274, 0.803250, 0.78785287, 0.79298546, 0.7673225]),
        ("GradientBoosting", [0.85714286, 0.82121471, 0.80410607, 0.82292558, 0.82292558]),
        ("XGBoost", [0.8528657, 0.81608212, 0.80581694, 0.8100941, 0.81864842]),
        ("MLP", [0.738223781, 0.7245509, 0.74165954, 0.72626176, 0.71086399]),
    )
}

# Classifier names, in the same order as the dictionary above
classifiers = list(scores)

In [None]:
# Normality check: run the Shapiro–Wilk test on each classifier's array of
# CV scores and print the W statistic and p-value, one classifier per line.
from scipy.stats import shapiro

print("Shapiro–Wilk Normality Test Results:\n")
for model_name in scores:
    w_value, p_value = shapiro(scores[model_name])
    print(f"{model_name:<20} W = {w_value:.3f}, p = {p_value:.6f}")


Shapiro–Wilk Normality Test Results:

LogisticRegression   W = 0.984, p = 0.954820
SVM                  W = 0.912, p = 0.477785
RandomForest         W = 0.984, p = 0.956483
GradientBoosting     W = 0.853, p = 0.205313
XGBoost              W = 0.797, p = 0.076672
MLP                  W = 0.943, p = 0.685907


In [None]:
# Friedman omnibus test, followed by pairwise paired t-tests with Holm correction
# (intrinsic-only scores).
# friedmanchisquare, ttest_rel, np and itertools are imported in the notebook's first cell.

# Step 1: Friedman test for global differences across all classifiers
friedman_stat, friedman_p = friedmanchisquare(*(scores[clf] for clf in classifiers))
print(f"Friedman χ² = {friedman_stat:.3f}, p = {friedman_p:.5f}")

# Step 2: post-hoc pairwise comparisons, only when the omnibus test is significant
if friedman_p < 0.05:
    print("\nFriedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.")

    pairs = list(itertools.combinations(classifiers, 2))
    ttest_stats = []
    raw_pvals = []

    for clf1, clf2 in pairs:
        # Paired test: assumes both arrays list the same CV folds in the same order
        stat, pval = ttest_rel(scores[clf1], scores[clf2])
        ttest_stats.append(stat)
        raw_pvals.append(pval)

    # Holm step-down correction.
    # The k-th smallest raw p-value (k = 0, 1, ...) is scaled by (m - k). The adjusted
    # p-value is the running maximum of those scaled values, capped at 1. Without the
    # running maximum a larger raw p-value can receive a smaller adjusted p-value,
    # which is not a valid Holm adjustment.
    m = len(raw_pvals)
    sorted_indices = np.argsort(raw_pvals)
    scaled_sorted = (m - np.arange(m)) * np.asarray(raw_pvals, dtype=float)[sorted_indices]
    holm_pvals = np.empty(m, dtype=float)
    holm_pvals[sorted_indices] = np.minimum(np.maximum.accumulate(scaled_sorted), 1.0)

    # Display results in the original pair order
    print("\nPairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):")
    print("-----------------------------------------------------------------------")
    print(f"{'Model A':<20} {'Model B':<20} {'t-stat':>7} {'p-raw':>9} {'p-holm':>9}")
    print("-----------------------------------------------------------------------")
    for (clf1, clf2), t_stat, p_raw, p_holm in zip(pairs, ttest_stats, raw_pvals, holm_pvals):
        print(f"{clf1:<20} {clf2:<20} {t_stat:7.2f} {p_raw:9.5f} {p_holm:9.5f}")


Friedman χ² = 24.543, p = 0.00017

Friedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.

Pairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):
-----------------------------------------------------------------------
Model A              Model B               t-stat     p-raw    p-holm
-----------------------------------------------------------------------
LogisticRegression   SVM                   -23.08   0.00002   0.00023
LogisticRegression   RandomForest          -25.43   0.00001   0.00017
LogisticRegression   GradientBoosting      -27.16   0.00001   0.00015
LogisticRegression   XGBoost               -26.48   0.00001   0.00016
LogisticRegression   MLP                   -29.09   0.00001   0.00012
SVM                  RandomForest          -11.72   0.00030   0.00242
SVM                  GradientBoosting      -14.14   0.00015   0.00145
SVM                  XGBoost               -13.54   0.00017   0.00155
SVM                  MLP    

trying extrinsic + intrinsic comparisons

In [None]:
# Score matrix for the extrinsic + intrinsic comparison: one array of 5-fold CV
# results per classifier. Insertion order of the pairs below fixes the
# classifier order used by the cells that read this dictionary.
combined_scores = {
    name: np.array(fold_values)
    for name, fold_values in (
        ("LogisticRegression", [0.74486301, 0.73886986, 0.75428082, 0.76797945, 0.75664096]),
        ("SVM", [0.83304795, 0.79195205, 0.78767123, 0.81335616, 0.80548415]),
        ("RandomForest", [0.89297945, 0.87157534, 0.86558219, 0.87328767, 0.8688946]),
        ("GradientBoosting", [0.88527397, 0.87585616, 0.87671233, 0.87585616, 0.87746358]),
        ("XGBoost", [0.89126712, 0.86729452, 0.88613014, 0.89297945, 0.88431877]),
        ("MLP", [0.82534247, 0.8005137, 0.79623288, 0.82876712, 0.82262211]),
    )
}

# Classifier names, in the same order as the dictionary above
combined_classifiers = list(combined_scores)

In [None]:
# Friedman omnibus test, followed by pairwise paired t-tests with Holm correction
# (scores read from combined_scores / combined_classifiers).
# friedmanchisquare, ttest_rel, np and itertools are imported in the notebook's first cell.

# Step 1: Friedman test for global differences across all classifiers
friedman_stat, friedman_p = friedmanchisquare(*(combined_scores[clf] for clf in combined_classifiers))
print(f"Friedman χ² = {friedman_stat:.3f}, p = {friedman_p:.5f}")

# Step 2: post-hoc pairwise comparisons, only when the omnibus test is significant
if friedman_p < 0.05:
    print("\nFriedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.")

    pairs = list(itertools.combinations(combined_classifiers, 2))
    ttest_stats = []
    raw_pvals = []

    for clf1, clf2 in pairs:
        # Paired test: assumes both arrays list the same CV folds in the same order
        stat, pval = ttest_rel(combined_scores[clf1], combined_scores[clf2])
        ttest_stats.append(stat)
        raw_pvals.append(pval)

    # Holm step-down correction.
    # The k-th smallest raw p-value (k = 0, 1, ...) is scaled by (m - k). The adjusted
    # p-value is the running maximum of those scaled values, capped at 1. Without the
    # running maximum a larger raw p-value can receive a smaller adjusted p-value,
    # which is not a valid Holm adjustment.
    m = len(raw_pvals)
    sorted_indices = np.argsort(raw_pvals)
    scaled_sorted = (m - np.arange(m)) * np.asarray(raw_pvals, dtype=float)[sorted_indices]
    holm_pvals = np.empty(m, dtype=float)
    holm_pvals[sorted_indices] = np.minimum(np.maximum.accumulate(scaled_sorted), 1.0)

    # Display results in the original pair order
    print("\nPairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):")
    print("-----------------------------------------------------------------------")
    print(f"{'Model A':<20} {'Model B':<20} {'t-stat':>7} {'p-raw':>9} {'p-holm':>9}")
    print("-----------------------------------------------------------------------")
    for (clf1, clf2), t_stat, p_raw, p_holm in zip(pairs, ttest_stats, raw_pvals, holm_pvals):
        print(f"{clf1:<20} {clf2:<20} {t_stat:7.2f} {p_raw:9.5f} {p_holm:9.5f}")


Friedman χ² = 22.143, p = 0.00049

Friedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.

Pairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):
-----------------------------------------------------------------------
Model A              Model B               t-stat     p-raw    p-holm
-----------------------------------------------------------------------
LogisticRegression   SVM                    -5.84   0.00428   0.02141
LogisticRegression   RandomForest          -15.21   0.00011   0.00131
LogisticRegression   GradientBoosting      -21.31   0.00003   0.00040
LogisticRegression   XGBoost               -34.76   0.00000   0.00006
LogisticRegression   MLP                   -10.08   0.00054   0.00327
SVM                  RandomForest          -15.55   0.00010   0.00130
SVM                  GradientBoosting      -10.64   0.00044   0.00397
SVM                  XGBoost               -12.20   0.00026   0.00260
SVM                  MLP    

In [None]:
# Extrinsic-only comparison.
# Score matrix: one array of 5-fold CV results per classifier.
# NOTE(review): this rebinds `combined_scores` / `combined_classifiers`, replacing
# the values assigned by the earlier extrinsic + intrinsic cell.
combined_scores = {
    name: np.array(fold_values)
    for name, fold_values in (
        ("LogisticRegression", [0.82363014, 0.82277397, 0.8364726, 0.84760274, 0.82262211]),
        ("SVM", [0.7739726, 0.76712329, 0.77739726, 0.79195205, 0.79005998]),
        ("RandomForest", [0.76969178, 0.76626712, 0.77054795, 0.78767123, 0.77035133]),
        ("GradientBoosting", [0.85445205, 0.83390411, 0.84503425, 0.87157534, 0.84575835]),
        ("XGBoost", [0.8630137, 0.85616438, 0.87756849, 0.88356164, 0.85089974]),
        ("MLP", [0.78681507, 0.7885274, 0.7859589, 0.80308219, 0.80462725]),
    )
}

# Classifier names, in the same order as the dictionary above
combined_classifiers = list(combined_scores)

In [None]:
# Friedman omnibus test, followed by pairwise paired t-tests with Holm correction
# (scores read from combined_scores / combined_classifiers).
# friedmanchisquare, ttest_rel, np and itertools are imported in the notebook's first cell.

# Step 1: Friedman test for global differences across all classifiers
friedman_stat, friedman_p = friedmanchisquare(*(combined_scores[clf] for clf in combined_classifiers))
print(f"Friedman χ² = {friedman_stat:.3f}, p = {friedman_p:.5f}")

# Step 2: post-hoc pairwise comparisons, only when the omnibus test is significant
if friedman_p < 0.05:
    print("\nFriedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.")

    pairs = list(itertools.combinations(combined_classifiers, 2))
    ttest_stats = []
    raw_pvals = []

    for clf1, clf2 in pairs:
        # Paired test: assumes both arrays list the same CV folds in the same order
        stat, pval = ttest_rel(combined_scores[clf1], combined_scores[clf2])
        ttest_stats.append(stat)
        raw_pvals.append(pval)

    # Holm step-down correction.
    # The k-th smallest raw p-value (k = 0, 1, ...) is scaled by (m - k). The adjusted
    # p-value is the running maximum of those scaled values, capped at 1. Without the
    # running maximum a larger raw p-value can receive a smaller adjusted p-value,
    # which is not a valid Holm adjustment.
    m = len(raw_pvals)
    sorted_indices = np.argsort(raw_pvals)
    scaled_sorted = (m - np.arange(m)) * np.asarray(raw_pvals, dtype=float)[sorted_indices]
    holm_pvals = np.empty(m, dtype=float)
    holm_pvals[sorted_indices] = np.minimum(np.maximum.accumulate(scaled_sorted), 1.0)

    # Display results in the original pair order
    print("\nPairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):")
    print("-----------------------------------------------------------------------")
    print(f"{'Model A':<20} {'Model B':<20} {'t-stat':>7} {'p-raw':>9} {'p-holm':>9}")
    print("-----------------------------------------------------------------------")
    for (clf1, clf2), t_stat, p_raw, p_holm in zip(pairs, ttest_stats, raw_pvals, holm_pvals):
        print(f"{clf1:<20} {clf2:<20} {t_stat:7.2f} {p_raw:9.5f} {p_holm:9.5f}")


Friedman χ² = 25.000, p = 0.00014

Friedman is significant (p < 0.05). Proceeding to pairwise paired t-tests with Holm correction.

Pairwise Paired t-tests (t-stat, raw p-val, Holm-adjusted p-val):
-----------------------------------------------------------------------
Model A              Model B               t-stat     p-raw    p-holm
-----------------------------------------------------------------------
LogisticRegression   SVM                    10.66   0.00044   0.00395
LogisticRegression   RandomForest           23.79   0.00002   0.00026
LogisticRegression   GradientBoosting       -4.66   0.00959   0.02877
LogisticRegression   XGBoost               -15.69   0.00010   0.00116
LogisticRegression   MLP                     6.68   0.00261   0.01568
SVM                  RandomForest            2.20   0.09263   0.09263
SVM                  GradientBoosting      -15.23   0.00011   0.00119
SVM                  XGBoost               -12.96   0.00020   0.00205
SVM                  MLP    