In [3]:
import os
import pandas as pd
import pandas as pd
from scipy.stats import mannwhitneyu

Statistical significance analysis
==

The objective of this notebook is to evaluate the statistical significance of the observed differences in terms of energy consumption.

### Scenario wise comparison

Compare the distribution of each utility scenario to a baseline (bloated or GNU, depending on the RQ). Each scenario is executed 10 times.

In [4]:
# Use the current working directory as the root from which result folders are resolved
# (assumes the notebook is launched from the directory containing them — TODO confirm)
current_dir = os.getcwd()

# The data/result directories are built from current_dir in the analysis cells further down


def computeMannWhitneyU(input_file_path, utilities, baseline):
    """
    Run a two-sided Mann-Whitney U test per scenario between a baseline and each utility.

    The ``;``-separated CSV is grouped by (Program, TestNr). Inside every group,
    the PSYS values of each requested utility are compared with the PSYS values
    of the baseline utility.

    Parameters:
        input_file_path (str): Path to the CSV file. It must provide the columns
            "Program", "TestNr", "Utilities" and "PSYS".
        utilities (list): Values of the "Utilities" column to compare against the baseline.
            An entry equal to ``baseline`` is skipped, as in computeMannWhitneyUGlobal.
        baseline (str): Value of the "Utilities" column used as the reference (e.g. "bloated").

    Returns:
        pd.DataFrame: One row per (Program, TestNr, Utility) that had data on both
        sides, with columns ["Program", "TestNr", "Utility", "U_stat", "p_value"].
        The utility sample is passed first to ``mannwhitneyu``, so U_stat is the
        statistic SciPy reports for the utility sample.
    """
    # Load the measurements
    df = pd.read_csv(input_file_path, delimiter=";", engine="python")

    results = []

    # One test per scenario, i.e. per (Program, TestNr) pair
    for (program, test_nr), group in df.groupby(["Program", "TestNr"]):
        # PSYS values of the reference utility for this scenario
        psys_baseline = group.loc[group["Utilities"] == baseline, "PSYS"]
        if psys_baseline.empty:
            continue  # No reference sample: nothing can be compared in this scenario

        for utility in utilities:
            if utility == baseline:
                continue  # Comparing the baseline with itself carries no information

            # PSYS values of the utility under test for this scenario
            psys_other = group.loc[group["Utilities"] == utility, "PSYS"]
            if psys_other.empty:
                continue  # This utility was not measured for this scenario

            u_stat, p_value = mannwhitneyu(psys_other, psys_baseline, alternative="two-sided")
            results.append((program, test_nr, utility, u_stat, p_value))

    # Collect everything in a DataFrame so it can be displayed and post-processed
    return pd.DataFrame(results, columns=["Program", "TestNr", "Utility", "U_stat", "p_value"])

def check_significance(results_df, n_baseline=10, n_utility=10, alpha=0.05):
    """
    Summarise per-scenario Mann-Whitney results for every (Program, Utility) pair.

    For each pair the function reports:
    - whether all p-values of the pair are significant (p < alpha);
    - the mean U-statistic over the scenarios of the pair;
    - whether the utility tends to give lower or higher values than the baseline,
      obtained by comparing the mean U-statistic with the midpoint
      ``n_baseline * n_utility / 2`` of the possible U range.

    Parameters:
        results_df (pd.DataFrame): DataFrame with columns
            ["Program", "TestNr", "Utility", "U_stat", "p_value"]. U_stat must be
            the statistic of the utility sample (computeMannWhitneyU passes the
            utility sample first to ``mannwhitneyu``).
        n_baseline (int): Number of observations in each baseline sample.
            Defaults to 10, the number of repetitions per scenario.
        n_utility (int): Number of observations in each utility sample. Defaults to 10.
        alpha (float): Significance threshold applied to the p-values. Defaults to 0.05.

    Returns:
        pd.DataFrame: DataFrame with columns
        ["Program", "Utility", "All_Significant", "Mean_U_stat", "Effect"].
        "Effect" is "Lower" when the mean U-statistic is strictly below the
        midpoint and "Higher" otherwise (a mean exactly at the midpoint is
        therefore labelled "Higher").
    """
    output_columns = ["Program", "Utility", "All_Significant", "Mean_U_stat", "Effect"]

    # Nothing to aggregate: return an empty frame with the documented columns
    if results_df.empty:
        return pd.DataFrame(columns=output_columns)

    # One row per (Program, Utility): are all scenarios significant, and what is the mean U?
    summary_df = (
        results_df
        .groupby(["Program", "Utility"])
        .agg(
            All_Significant=("p_value", lambda p: (p < alpha).all()),
            Mean_U_stat=("U_stat", "mean"),
        )
        .reset_index()
    )

    # U ranges from 0 to n_baseline * n_utility; half of that is the "no shift" value
    u_midpoint = (n_baseline * n_utility) / 2
    summary_df["Effect"] = (
        summary_df["Mean_U_stat"].lt(u_midpoint).map({True: "Lower", False: "Higher"})
    )

    return summary_df




#### RQ1 - Debloated vs. bloated

**Research hypothesis:** debloating techniques impact energy consumption.  

**Null hypothesis:** The debloated utility [DEBOP/COV/CHISEL] has the same distribution in terms of energy consumption as the bloated utility [BLOATED].  

In [5]:
# Path to the CSV holding the debloating measurements
results_dir = os.path.join(current_dir, "debloating_results")
debloat_result = os.path.join(results_dir, 'cat_all_repeats.csv')

# Test each debloated variant against the "bloated" baseline, one test per (Program, TestNr)
utilities_to_compare = ["chisel", "debop", "cov"]
baseline = "bloated"
df =computeMannWhitneyU(debloat_result,utilities_to_compare, baseline)
# Display the per-scenario test results
df

Unnamed: 0,Program,TestNr,Utility,U_stat,p_value
0,date,1,chisel,43.0,0.623176
1,date,1,debop,40.0,0.472509
2,date,1,cov,39.0,0.427355
3,date,2,chisel,53.0,0.850107
4,date,2,debop,53.0,0.850107
5,date,2,cov,57.0,0.623176
6,grep,1,chisel,100.0,0.000182
7,grep,1,debop,44.0,0.677585
8,grep,1,cov,43.0,0.623176
9,grep,2,chisel,100.0,0.000183


In [6]:
# Summarise per (Program, Utility), then keep only the pairs whose scenarios are all significant
sig =check_significance(df)
significan_utility = sig[sig["All_Significant"] == True]
significan_utility

Unnamed: 0,Program,Utility,All_Significant,Mean_U_stat,Effect
3,grep,chisel,True,100.0,Higher
6,gzip,chisel,True,100.0,Higher


In [7]:
# Share of (Program, Utility) pairs for which every scenario is significant
percentage_significant = len(significan_utility) / len(sig) * 100
print(f" {percentage_significant} % of the results are significant")

 11.11111111111111 % of the results are significant


#### RQ1 - ALTERNATIVE 

In [8]:
# Path to the CSV holding the measurements of the alternative implementations
results_dir = os.path.join(current_dir, "results")
alternative_result = os.path.join(results_dir, 'cat_all_repeats.csv')
# Test ToyBox and BusyBox against the "GNU" baseline, one test per (Program, TestNr)
utilities_to_compare = ["ToyBox", "BusyBox"]
baseline = "GNU"
df=computeMannWhitneyU(alternative_result,utilities_to_compare, baseline)
# Display the per-scenario test results
df

Unnamed: 0,Program,TestNr,Utility,U_stat,p_value
0,base64,1,ToyBox,100.0,0.000183
1,base64,1,BusyBox,92.0,0.001706
2,base64,2,ToyBox,100.0,0.000183
3,base64,2,BusyBox,100.0,0.000183
4,basename,1,ToyBox,44.5,0.705351
...,...,...,...,...,...
105,true,2,BusyBox,46.0,0.791337
106,wc,1,ToyBox,0.0,0.000183
107,wc,1,BusyBox,0.0,0.000183
108,wc,2,ToyBox,0.0,0.000183


In [9]:
# Summarise per (Program, Utility), then keep only the pairs whose scenarios are all significant
sig =check_significance(df)
significan_utility = sig[sig["All_Significant"] == True]
# Save the significant pairs to a CSV file (relative path, no index column)
significan_utility.to_csv("significant_utility.csv", index=False)

In [10]:
# Share of (Program, Utility) pairs for which every scenario is significant
percentage_significant = len(significan_utility) / len(sig) * 100
print(f" {percentage_significant} % of the results are significant")

 32.142857142857146 % of the results are significant


## Analysis at global level

In [None]:
def computeMannWhitneyUGlobal(input_file_path, utilities, baseline):
    """
    Compare each utility with a baseline over the whole data set.

    All PSYS values of a utility, regardless of Program or TestNr, form one
    sample that is tested against all PSYS values of the baseline with a
    two-sided Mann-Whitney U test.

    Parameters:
        input_file_path (str): Path to the ``;``-separated CSV file.
        utilities (list): Utilities to test against the baseline; an entry
            equal to the baseline is ignored.
        baseline (str): The reference utility (e.g., "bloated").

    Returns:
        pd.DataFrame: One row per tested utility, with columns
        ["Utility", "U_stat", "p_value"].
    """
    measurements = pd.read_csv(input_file_path, delimiter=";", engine="python")

    # Reference sample: every PSYS value recorded for the baseline
    reference = measurements.loc[measurements["Utilities"] == baseline, "PSYS"]

    rows = []
    # The baseline itself is never tested against its own sample
    for candidate in (name for name in utilities if name != baseline):
        sample = measurements.loc[measurements["Utilities"] == candidate, "PSYS"]

        # A test needs observations on both sides
        if reference.empty or sample.empty:
            continue

        statistic, pvalue = mannwhitneyu(sample, reference, alternative="two-sided")
        rows.append((candidate, statistic, pvalue))

    return pd.DataFrame(rows, columns=["Utility", "U_stat", "p_value"])

In [None]:
# Path to the CSV holding the debloating measurements
results_dir = os.path.join(current_dir, "debloating_results")
debloat_result = os.path.join(results_dir, 'cat_all_repeats.csv')

# Test each debloated variant against the "bloated" baseline over all measurements at once
utilities_to_compare = ["chisel", "debop", "cov"]
baseline = "bloated"
df =computeMannWhitneyUGlobal(debloat_result,utilities_to_compare, baseline)
# Display the global test results
df

Unnamed: 0,Utility,U_stat,p_value
0,chisel,8001.5,0.136358
1,debop,6890.5,0.565565
2,cov,7034.5,0.758976


In [None]:
# Path to the CSV holding the measurements of the alternative implementations
results_dir = os.path.join(current_dir, "results")
alternative_result = os.path.join(results_dir, 'cat_all_repeats.csv')
# Test ToyBox and BusyBox against the "GNU" baseline over all measurements at once
utilities_to_compare = ["ToyBox", "BusyBox"]
baseline = "GNU"
df=computeMannWhitneyUGlobal(alternative_result,utilities_to_compare, baseline)
# Display the global test results
df

Unnamed: 0,Utility,U_stat,p_value
0,ToyBox,152893.0,0.755212
1,BusyBox,143035.5,0.11896
