In [12]:
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

In [2]:
from raise_utils.interpret import ScottKnott

In [67]:
def get_data(dataset: str, task='issue-close-time'):
    results = {}
    for dir in os.listdir(f'./classical/results/{task}'):
        if dir == 'ff' and task != 'defect':
            results[dir] = [0.718, 0.644]
            continue
            
        if dataset in os.listdir(f'./classical/results/{task}/{dir}'):
            result = []
            for file in os.listdir(f'./classical/results/{task}/{dir}/{dataset}'):
                with open(f'./classical/results/{task}/{dir}/{dataset}/{file}') as f:
                    lines = f.readlines()
                    lines = [x for x in lines if x.startswith('Accuracy')]
                    lines = [float(x.split(': ')[1]) for x in lines]

                    result.append(lines[0])

            results[dir] = result
        
    # Now do ff
    if task == 'defect':
        if dataset in os.listdir(f'./results/{task}/'):
            result = []
            for file in os.listdir(f'./results/{task}/{dataset}'):
                with open(f'./results/{task}/{dataset}/{file}') as f:
                    lines = f.readlines()
                    lines = [x for x in lines if x.startswith('[main] Accuracy')]
                    lines = [eval(x.split(': ')[1]) for x in lines]

                    result.extend(max(lines))

            results['ff'] = result
    
    return results

In [68]:
def run_stats(data):
    # Perform Kruskal-Wallis test
    _, p_value = stats.kruskal(*data.values())
    print(f"Kruskal-Wallis test p-value: {p_value}")

    if p_value < 0.05:
        # Calculate medians for each group
        group_medians = {key: np.median(val) for key, val in data.items()}
        print(f"Group medians: {group_medians}")

        # Find the group with the largest median
        max_group = max(group_medians, key=group_medians.get)
        print(f"Group with the largest median: {max_group}")

        # Perform pairwise Mann-Whitney U tests
        groups = list(data.keys())
        num_groups = len(groups)
        p_values = np.zeros((num_groups, num_groups))

        for i in range(num_groups):
            for j in range(i+1, num_groups):
                _, p = stats.mannwhitneyu(data[groups[i]], data[groups[j]], alternative='two-sided')
                p_values[i, j] = p
                p_values[j, i] = p

        print('Pairwise Mann-Whitney U tests')
        print(pd.DataFrame(p_values, index=groups, columns=groups))
        print()

        # Apply Bonferroni correction for multiple comparisons
        adjusted_p_values = multipletests(p_values.ravel(), method='fdr_bh')[1].reshape(p_values.shape)
        post_hoc = pd.DataFrame(adjusted_p_values, index=groups, columns=groups)

        print("Pairwise Mann-Whitney U tests with Benjamini/Hochberg correction:")
        print(post_hoc)
        print()

        # Check if the group with the largest median is significantly better than the others
        significantly_better = True
        for key in data.keys():
            if key != max_group and post_hoc.loc[max_group, key] >= 0.05:
                significantly_better = False
                break

        if significantly_better:
            print(f"The group '{max_group}' with the largest median IS significantly better than the others, ", end='')
        else:
            print("The group with the largest median IS NOT significantly better than all the others, ", end='')
        
        print(f'and the highest p-value is {round(max(post_hoc[max_group]), 2)}')
    else:
        print("There is no significant difference among the groups.")

In [69]:
data = get_data('firefox-3class', task='issue-close-time')

In [70]:
run_stats(data)

Kruskal-Wallis test p-value: 0.027758946516771684
Group medians: {'logistic': 0.4932289583452374, 'random-5': 0.4610879378321239, 'nb': 0.49497171590194844, 'ff': 0.681}
Group with the largest median: ff
Pairwise Mann-Whitney U tests
          logistic  random-5        nb        ff
logistic  0.000000  0.053103  0.967635  0.008658
random-5  0.053103  0.000000  0.194128  0.008658
nb        0.967635  0.194128  0.000000  0.008658
ff        0.008658  0.008658  0.008658  0.000000

Pairwise Mann-Whitney U tests with Benjamini/Hochberg correction:
          logistic  random-5        nb        ff
logistic  0.000000  0.070804  0.967635  0.013853
random-5  0.070804  0.000000  0.221860  0.013853
nb        0.967635  0.221860  0.000000  0.013853
ff        0.013853  0.013853  0.013853  0.000000

The group 'ff' with the largest median IS significantly better than the others, and the highest p-value is 0.01


## Scott-Knott

In [39]:
sk = ScottKnott(data)
sk.pprint()

   1   logistic (                         |     *                  ), 0.581,  0.620,  0.625,  0.631,  0.636
   2         nb (                         |     *                  ), 0.621,  0.625,  0.631,  0.638,  0.647
   3         ff (                         |       --*              ), 0.673,  0.673,  0.712,  0.712,  0.712
   4   random-5 (                         |             -*-        ), 0.787,  0.788,  0.809,  0.836,  0.845
