In [7]:
import pandas as pd

# Read the per-classifier metric tables. The target names and the file paths
# are listed in the same order, so each CSV lands in its df_<classifier> name.
(
    df_lightgbm,
    df_catboost,
    df_xgboost,
    df_resnet,
    df_autogluon,
    df_autosklearn,
) = map(
    pd.read_csv,
    (
        '/content/lightgbm_metrics.csv',
        '/content/catboost_metrics.csv',
        '/content/xgboost_metrics.csv',
        '/content/resnet_metrics.csv',
        '/content/autogluon_metrics.csv',
        '/content/autosklearn_metrics.csv',
    ),
)

print("Classifier metrics CSV files loaded into DataFrames: df_lightgbm, df_catboost, df_xgboost, df_resnet, df_autogluon, df_autosklearn.")

Classifier metrics CSV files loaded into DataFrames: df_lightgbm, df_catboost, df_xgboost, df_resnet, df_autogluon, df_autosklearn.


In [8]:
# Classifier order used whenever results are iterated or tabulated.
classifiers = ['lightgbm', 'catboost', 'xgboost', 'resnet', 'autogluon', 'autosklearn']

# Classifier name -> DataFrame holding that classifier's metric columns.
classifier_dfs = {
    'lightgbm': df_lightgbm,
    'catboost': df_catboost,
    'xgboost': df_xgboost,
    'resnet': df_resnet,
    'autogluon': df_autogluon,
    'autosklearn': df_autosklearn
}

# Display name of each metric -> name of the DataFrame column that stores it.
metric_mapping = {
    'Mean AUC OVO': 'auc_ovo',
    'Mean Accuracy (ACC)': 'accuracy',
    'G-Mean': 'gmean',
    'Mean Cross-Entropy (CE)': 'cross_entropy'
}

# all_results[metric display name][classifier] -> list with one value per
# DataFrame row (the notebook assumes 30 rows per classifier; not checked here).
all_results = {}
for display_name, column in metric_mapping.items():
    all_results[display_name] = {
        clf: classifier_dfs[clf][column].tolist() for clf in classifiers
    }

print("Data restructured into 'all_results' dictionary.")

# Show a small sample of the structure for verification.
sample_values = all_results['Mean AUC OVO']['lightgbm']
print(f"Example for 'Mean AUC OVO' and 'lightgbm': {sample_values[:5]}...")
print(f"Number of values for 'Mean AUC OVO' and 'lightgbm': {len(sample_values)}")

Data restructured into 'all_results' dictionary.
Example for 'Mean AUC OVO' and 'lightgbm': [0.9997125446962574, 0.9989591444547612, 0.5750249889163095, 0.9613442632688824, 1.0]...
Number of values for 'Mean AUC OVO' and 'lightgbm': 30


In [9]:
!pip install scikit-posthocs
import numpy as np
import pandas as pd
from scipy import stats
import scikit_posthocs as sp
from scipy.stats import f as f_dist

# =============================================================================
# CONFIDENCE INTERVALS (95%)
# =============================================================================
# For every (metric, classifier) pair, summarise the values with the sample
# mean, the sample standard deviation (ddof=1) and a two-sided Student-t
# confidence interval for the mean. One record per pair is appended to
# `confidence_results`, which is finally turned into `df_confidence`.
print("=" * 80)
print("CALCULATING 95% CONFIDENCE INTERVALS")
print("=" * 80)

confidence_level = 0.95  # loop-invariant, so defined once
confidence_results = []

for metric_name, metric_results in all_results.items():
    print(f"\n{'='*60}")
    print(f"METRIC: {metric_name.upper()}")
    print(f"{'='*60}\n")

    for classifier_name in classifiers:
        values = metric_results[classifier_name]
        n_samples = len(values)

        if n_samples < 2:
            # A sample standard deviation needs at least two values. Say so
            # explicitly instead of silently dropping the classifier.
            print(f"{classifier_name:15} (N={n_samples}) skipped: "
                  f"at least 2 values are needed for a confidence interval")
            continue

        arr = np.array(values)
        mean_val = np.mean(arr)
        std_val = np.std(arr, ddof=1)  # sample standard deviation
        std_err = stats.sem(arr)       # standard error of the mean (ddof=1)
        degrees_freedom = n_samples - 1

        if std_err > 0:
            confidence_interval = stats.t.interval(
                confidence_level,
                degrees_freedom,
                loc=mean_val,
                scale=std_err
            )
        else:
            # scipy returns (nan, nan) for scale=0, which happens when every
            # value is identical; the interval then collapses onto the mean.
            # (A NaN standard error also ends up here and yields NaN bounds.)
            confidence_interval = (mean_val, mean_val)

        confidence_results.append({
            'metric': metric_name,
            'classifier': classifier_name,
            'mean': mean_val,
            'std': std_val,
            'ci_lower': confidence_interval[0],
            'ci_upper': confidence_interval[1]
        })

        print(f"{classifier_name:15} (N={n_samples}) Mean: {mean_val:.4f} | Std: {std_val:.4f} | "
              f"CI 95%: [{confidence_interval[0]:.4f}, {confidence_interval[1]:.4f}]")

df_confidence = pd.DataFrame(confidence_results)

# =============================================================================
# FRIEDMAN TEST AND NEMENYI POST-HOC TEST
# =============================================================================
print("\n" + "="*80)
print("PERFORMING FRIEDMAN TEST AND NEMENYI POST-HOC TEST")
print("=" * 80)

for metric_name, metric_results in all_results.items():
    print("\n" + "="*80)
    print(f"ANALYSIS FOR METRIC: {metric_name.upper()}")
    print("="*80)
    print()

    # Data Matrix: Rows = 30 Repetitions, Columns = Classifiers
    # Ensure classifiers are in a consistent order
    data_matrix = np.array([metric_results[clf] for clf in classifiers]).T
    n_datasets, n_classifiers = data_matrix.shape

    print(f"Configuration: {n_datasets} repetitions \u00d7 {n_classifiers} classifiers")

    # -------------------------------------------------------------------------
    # STEP 1: Calculate ranks
    # -------------------------------------------------------------------------
    ranks_matrix = np.zeros_like(data_matrix, dtype=float)
    for i in range(n_datasets):
        if metric_name == "Mean Cross-Entropy (CE)":
            # Lower Cross-Entropy is better, so rank directly (ascending)
            ranks_matrix[i, :] = stats.rankdata(data_matrix[i, :], method='average')
        else:
            # Higher values (AUC, Accuracy, G-Mean) are better, so rank descending
            ranks_matrix[i, :] = stats.rankdata(-data_matrix[i, :], method='average')

    mean_ranks = np.mean(ranks_matrix, axis=0)

    rank_df = pd.DataFrame({
        'Classificador': classifiers,
        'Rank M\u00e9dio': mean_ranks
    }).sort_values('Rank M\u00e9dio')

    print("\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510")
    print("\u2502  MEAN RANKS (Lower Rank = Better Performance)          \u2502")
    print("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518")
    print(rank_df.to_string(index=False))
    print()

    # -------------------------------------------------------------------------
    # STEP 2: Friedman + Iman-Davenport Test
    # -------------------------------------------------------------------------
    k = n_classifiers
    N = n_datasets

    # Original Friedman statistic
    # R_j_mean_sq_sum = np.sum(mean_ranks**2)
    # chi2_F_original = (12 * N / (k * (k + 1))) * (R_j_mean_sq_sum - (k * (k + 1)**2) / 4)

    # Iman-Davenport F-statistic
    # Calculate Friedman Chi-square directly from mean ranks
    R_squared_sum = np.sum(mean_ranks**2)
    chi_sq_friedman = (12 * N) / (k * (k + 1)) * (R_squared_sum - (k * (k + 1)**2) / 4)

    if (N * (k - 1) - chi_sq_friedman) == 0:
        # This can happen if chi_sq_friedman is exactly N * (k-1), which is theoretical maximum
        F_F = np.inf if chi_sq_friedman > 0 else 0 # Or handle as an edge case where no variance
    else:
        F_F = ((N - 1) * chi_sq_friedman) / (N * (k - 1) - chi_sq_friedman)

    df1 = k - 1
    df2 = (k - 1) * (N - 1)
    p_value_friedman = 1 - f_dist.cdf(F_F, df1, df2)

    print("\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510")
    print("\u2502  FRIEDMAN TEST (Iman-Davenport)                             \u2502")
    print("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518")
    print(f"  F_F: {F_F:.4f} | p-value: {p_value_friedman:.4f}")

    if p_value_friedman < 0.05:
        print(f"  \u2713 Reject H0: There is a statistically significant difference between models.")

        # ---------------------------------------------------------------------
        # STEP 3: Nemenyi Post-Hoc Test
        # ---------------------------------------------------------------------
        print("\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510")
        print("\u2502  NEMENYI POST-HOC TEST                                      \u2502")
        print("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518")

        # scikit_posthocs.posthoc_nemenyi_friedman expects the data in a specific format:
        # each row is a sample (dataset/repetition), each column is a group (classifier)
        # So, the data_matrix is already in the correct format (N rows x k columns)
        # It also implicitly handles ranking internally, consistent with Friedman.
        pairwise_pvalues = sp.posthoc_nemenyi_friedman(data_matrix)
        pairwise_pvalues.index = classifiers
        pairwise_pvalues.columns = classifiers

        print(f"Matrix of Nemenyi p-Values:")
        print(pairwise_pvalues)
        print("\nPairs with significant difference (p < 0.05):")
        found_significant = False
        for i in range(len(classifiers)): # Loop through rows (classifiers)
            for j in range(i + 1, len(classifiers)): # Loop through columns (other classifiers)
                p_val = pairwise_pvalues.iloc[i, j]
                if p_val < 0.05:
                    found_significant = True
                    # Determine which classifier is better based on mean ranks
                    # Ensure mean_ranks corresponds to the order of 'classifiers'
                    rank_clf_i = mean_ranks[classifiers.index(pairwise_pvalues.index[i])]
                    rank_clf_j = mean_ranks[classifiers.index(pairwise_pvalues.columns[j])]

                    if rank_clf_i < rank_clf_j:
                        better_clf = pairwise_pvalues.index[i]
                        worse_clf = pairwise_pvalues.columns[j]
                    else:
                        better_clf = pairwise_pvalues.columns[j]
                        worse_clf = pairwise_pvalues.index[i]

                    print(f"  \u2713 {better_clf} vs {worse_clf} (p={p_val:.4f}) -> {better_clf} performs significantly better")

        if not found_significant:
            print("  No significant pairwise differences found by Nemenyi.")

    else:
        print(f"  \u2717 Do not reject H0. The models perform equivalently.")

    print("\n" + "-"*80)

print("Statistical analysis complete.")

CALCULATING 95% CONFIDENCE INTERVALS

METRIC: MEAN AUC OVO

lightgbm        (N=30) Mean: 0.8876 | Std: 0.1203 | CI 95%: [0.8427, 0.9325]
catboost        (N=30) Mean: 0.8855 | Std: 0.1283 | CI 95%: [0.8376, 0.9334]
xgboost         (N=30) Mean: 0.8838 | Std: 0.1277 | CI 95%: [0.8362, 0.9315]
resnet          (N=30) Mean: 0.8911 | Std: 0.1261 | CI 95%: [0.8440, 0.9382]
autogluon       (N=30) Mean: 0.8906 | Std: 0.1272 | CI 95%: [0.8430, 0.9381]
autosklearn     (N=30) Mean: 0.8722 | Std: 0.1367 | CI 95%: [0.8211, 0.9233]

METRIC: MEAN ACCURACY (ACC)

lightgbm        (N=30) Mean: 0.8307 | Std: 0.1689 | CI 95%: [0.7677, 0.8938]
catboost        (N=30) Mean: 0.8320 | Std: 0.1704 | CI 95%: [0.7684, 0.8956]
xgboost         (N=30) Mean: 0.8279 | Std: 0.1696 | CI 95%: [0.7646, 0.8912]
resnet          (N=30) Mean: 0.8270 | Std: 0.1825 | CI 95%: [0.7589, 0.8952]
autogluon       (N=30) Mean: 0.8433 | Std: 0.1674 | CI 95%: [0.7808, 0.9059]
autosklearn     (N=30) Mean: 0.8485 | Std: 0.1666 | CI 95%: [0.