In [1]:
import numpy as np
from scipy.stats import f_oneway

# Define metric data for each model with two experiments each
data = {
    "YOLO-11m": {
        "Accuracy": [0.9553, 0.9483],
        "Precision": [0.9803, 0.9814],
        "Recall": [0.9740, 0.9657],
        "F1-score": [0.9921, 0.9908]
    },
    "YOLO-10m": {
        "Accuracy": [0.8706, 0.7685],
        "Precision": [0.9161, 0.8101],
        "Recall": [0.9460, 0.9374],
        "F1-score": [0.9842, 0.9563]
    },
    "YOLO-11n": {
        "Accuracy": [0.9774, 0.9553],
        "Precision": [0.9933, 0.9803],
        "Recall": [0.9839, 0.9740],
        "F1-score": [0.9949, 0.9921]
    },
    "YOLO-10n": {
        "Accuracy": [0.8706, 0.9288],
        "Precision": [0.9161, 0.9494],
        "Recall": [0.9460, 0.9772],
        "F1-score": [0.9842, 0.9867]
    },
    "EfficientNetV2": {
        "Accuracy": [0.5324, 0.5325],
        "Precision": [0.1065, 0.1065],
        "Recall": [0.2000, 0.2000],
        "F1-score": [0.1390, 0.1390]
    },
    "ResNet50": {
        "Accuracy": [0.5524, 0.6100],
        "Precision": [0.2775, 0.2398],
        "Recall": [0.2230, 0.2999],
        "F1-score": [0.1817, 0.2614]
    },
    "MobileNet": {
        "Accuracy": [0.9225, 0.9538],
        "Precision": [0.7165, 0.9441],
        "Recall": [0.7197, 0.9058],
        "F1-score": [0.7123, 0.9226]
    },
    "Custom Model": {
        "Accuracy": [0.9364, 0.9641],
        "Precision": [0.9360, 0.9637],
        "Recall": [0.9363, 0.9640],
        "F1-score": [0.9356, 0.9638]
    }
}

# Perform ANOVA tests across all models for each metric
anova_results = {}
for metric in ["Accuracy", "Precision", "Recall", "F1-score"]:
    metric_values = [data[model][metric] for model in data]  # Extract metric values for all models
    anova_results[metric] = f_oneway(*metric_values)  # Perform ANOVA

# Print results
print("\n=== ANOVA Test Results ===")
for metric, result in anova_results.items():
    f_stat, p_val = result
    print(f"{metric} - F-statistic: {f_stat:.4f}, P-value: {p_val:.4f}")
    
    if p_val < 0.05:
        print(f"  🚀 Significant difference found in {metric} across models!")
    else:
        print(f"  ✅ No significant difference in {metric}.")



=== ANOVA Test Results ===
Accuracy - F-statistic: 50.6238, P-value: 0.0000
  🚀 Significant difference found in Accuracy across models!
Precision - F-statistic: 58.8684, P-value: 0.0000
  🚀 Significant difference found in Precision across models!
Recall - F-statistic: 82.4160, P-value: 0.0000
  🚀 Significant difference found in Recall across models!
F1-score - F-statistic: 80.4408, P-value: 0.0000
  🚀 Significant difference found in F1-score across models!


In [2]:
import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Model Names (Expanded with experiments)
models = [
    "YOLO-11m_1", "YOLO-11m_2", "YOLO-10m_1", "YOLO-10m_2", 
    "YOLO-11n_1", "YOLO-11n_2", "YOLO-10n_1", "YOLO-10n_2", 
    "EfficientNetV2_1", "EfficientNetV2_2", "ResNet50_1", "ResNet50_2", 
    "MobileNet_1", "MobileNet_2", "Custom_1", "Custom_2"
]

# Define metric data for each model (Expanded with experiments)
accuracy = [
    0.9553, 0.9483, 0.8706, 0.7685, 
    0.9774, 0.9553, 0.8706, 0.9288, 
    0.5324, 0.5325, 0.5524, 0.610, 
    0.9225, 0.9538, 0.9364, 0.9641
]

precision = [
    0.9803, 0.9814, 0.9161, 0.8101, 
    0.9933, 0.9803, 0.9161, 0.9494, 
    0.1065, 0.1065, 0.2775, 0.2398, 
    0.7165, 0.9441, 0.9360, 0.9637
]

recall = [
    0.9740, 0.9657, 0.9460, 0.9374, 
    0.9839, 0.9740, 0.9460, 0.9772, 
    0.2000, 0.2000, 0.2230, 0.2999, 
    0.7197, 0.9058, 0.9363, 0.9640
]

f1_score = [
    0.9921, 0.9908, 0.9842, 0.9563, 
    0.9949, 0.9921, 0.9842, 0.9867, 
    0.1390, 0.1390, 0.1817, 0.2614, 
    0.7123, 0.9226, 0.9356, 0.9638
]

# Convert to Pandas DataFrame
df = pd.DataFrame({
    "Model": models * 4, 
    "Metric": ["Accuracy"] * len(models) + ["Precision"] * len(models) + 
              ["Recall"] * len(models) + ["F1-score"] * len(models),
    "Value": accuracy + precision + recall + f1_score
})

# Perform Tukey’s HSD test
tukey_results = pairwise_tukeyhsd(df["Value"], df["Model"], alpha=0.05)

# Print results
print("\n=== Tukey’s HSD Test Results ===")
print(tukey_results)



=== Tukey’s HSD Test Results ===
          Multiple Comparison of Means - Tukey HSD, FWER=0.05           
     group1           group2      meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------
        Custom_1         Custom_2   0.0278    1.0 -0.2281  0.2838  False
        Custom_1 EfficientNetV2_1  -0.6916    0.0 -0.9475 -0.4357   True
        Custom_1 EfficientNetV2_2  -0.6916    0.0 -0.9475 -0.4356   True
        Custom_1      MobileNet_1  -0.1683 0.5732 -0.4243  0.0876  False
        Custom_1      MobileNet_2  -0.0045    1.0 -0.2604  0.2514  False
        Custom_1       ResNet50_1  -0.6274    0.0 -0.8834 -0.3715   True
        Custom_1       ResNet50_2  -0.5833    0.0 -0.8392 -0.3274   True
        Custom_1       YOLO-10m_1  -0.0068    1.0 -0.2628  0.2491  False
        Custom_1       YOLO-10m_2   -0.068 0.9998 -0.3239  0.1879  False
        Custom_1       YOLO-10n_1  -0.0068    1.0 -0.2628  0.2491  False
        Custom_1 