In [1]:
import os
import pandas as pd

In [2]:
# Get the directory with the notebooks, including its subdirectories
current_dir = os.getcwd()

# Get the directories with data and results
data_dir = os.path.join(current_dir, "data/debloating")
temporary_dir= os.path.join(current_dir, "debloating_temporary_data")
results_dir = os.path.join(current_dir, "debloating_results")

project_dir = os.path.dirname(current_dir)


In [3]:
# Using this function, the purpose is to cancatenate all generated measurements for each repeated experiment 
def concatenate_csv_files(directory_path, output_file):
    file_list = os.listdir(directory_path)
    csv_files = [file for file in file_list if file.endswith(".csv")]

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    dfs = []
    for file in csv_files:
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path, delimiter=';') # This is important, the delimiter
        
        filename_parts = file[:-4].split("_")  # Split filename and remove ".csv" extension        
        df['Program'] = filename_parts[0].split("-")[0]
        df['TestNr'] = filename_parts[0].split("-")[1]
        df['Utilities'] = filename_parts[1]
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)
    concatenated_df.to_csv(output_file, index=False, sep=';')  # Use semicolon as delimiter

    print(f"CSV files concatenated and saved to '{output_file}'.")

for i in range(1,11):
    # All data into files in the folder save in a new CSV file
    input_data = os.path.join(data_dir, 'repeat'+str(i)+'/') # You need to change this for each new folder that you want to concatenate its files
    output_data = os.path.join(results_dir, 'cat_repeat'+str(i)+'.csv') # You need to change this for each new folder given above

    concatenate_csv_files(input_data, output_data)

CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat1.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat2.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat3.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat4.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat5.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_repeat6.csv'.
CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-en

In [4]:
# Using this function, the purpose is to concatenate all resulting files from all 5 executions
def concatenate_allcsv_files(directory_path, output_file):
    file_list = os.listdir(directory_path)
    csv_files = [file for file in file_list if file.endswith(".csv")]

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    dfs = []
    for file in csv_files:
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path, delimiter=';') # This is important, the delimiter
        df['Repetition'] = file  # Add filename as a new column
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)
    concatenated_df.to_csv(output_file, index=False, sep=';')  # Use semicolon as delimiter


    print(f"CSV files concatenated and saved to '{output_file}'.")

# Save in a new CSV file
output_alldata = os.path.join(results_dir, 'cat_all_repeats.csv')

concatenate_allcsv_files(temporary_dir, output_alldata)

CSV files concatenated and saved to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/cat_all_repeats.csv'.


In [5]:
# Now we need to find the average value of PSYS for all measurements of a program 
def calculate_average_PSYS(input_file, output_file):
    # Read the CSV file
    df = pd.read_csv(input_file, delimiter=';')
    
    # Calculate average of 'PSYS' column for each unique combination of 'Program' and 'Utilities'
    average_psys = df.groupby(['Program', 'Utilities'])['PSYS'].mean().reset_index()
    
    # Write the average values to a new file separated with a semicolon
    average_psys.to_csv(output_file, index=False, sep=';')
    
    print(f"Average PSYS values written to '{output_file}'.")

# Required inputs and outputs
input_data = os.path.join(results_dir, 'cat_all_repeats.csv')
output_data = os.path.join(results_dir, 'averagePSYS_all_repeats.csv')

calculate_average_PSYS(input_data, output_data)


Average PSYS values written to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/averagePSYS_all_repeats.csv'.


In [6]:

def calculate_average_and_std_PSYS(input_file, output_file):
    # Read the CSV file
    df = pd.read_csv(input_file, delimiter=';')
    
    # Convert 'PSYS' values from microjoules to joules
    # df['PSYS'] = df['PSYS'] / 1000000  # 1 microjoule = 1e-6 joules

    # Calculate average of 'PSYS' column for each unique combination of 'Program' and 'Utilities'
    average_psys = df.groupby(['Program', 'Utilities'])['PSYS'].mean().reset_index()
    average_psys['PSYS'] = average_psys['PSYS'].round(2)  # Round the average values to 2 decimals

    # Calculate standard deviation of 'PSYS' column for each unique combination of 'Program' and 'Utilities'
    std_psys = df.groupby(['Program', 'Utilities'])['PSYS'].std().reset_index()
    std_psys['PSYS'] = std_psys['PSYS'].round(2)  # Round the standard deviation values to 2 decimals
    
    # Merge the average and standard deviation DataFrames based on 'Program' and 'Utilities' columns
    result_df = pd.merge(average_psys, std_psys, on=['Program', 'Utilities'], suffixes=('_avg', '_std'))

    # Write the average and standard deviation values to a new file separated with a semicolon
    result_df.to_csv(output_file, index=False, sep=';')

    print(f"Average PSYS and STD values written to '{output_file}'.")

# Required inputs and outputs
input_data = os.path.join(results_dir, 'cat_all_repeats.csv')
output_data = os.path.join(results_dir, 'averagePSYS_with_std_all_repeats.csv')

calculate_average_and_std_PSYS(input_data, output_data)

Average PSYS and STD values written to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/averagePSYS_with_std_all_repeats.csv'.


In [7]:
import pandas as pd

# Required inputs and outputs
input_data = os.path.join(results_dir, 'averagePSYS_with_std_all_repeats.csv')
output_data = os.path.join(results_dir, 'averagePSYS_with_std_all_repeats_pivot.csv')

data = pd.read_csv(input_data, sep=';')

# Pivot the data to create new columns for GNU, ToyBox, and BusyBox
pivoted_data = data.pivot(index='Program', columns='Utilities', values=['PSYS_avg', 'PSYS_std'])

# Flatten the multi-level column index
pivoted_data.columns = [f'{col[0]}_{col[1]}' for col in pivoted_data.columns]

# Reset the index to make 'Program' a regular column again
pivoted_data = pivoted_data.reset_index()

pivoted_data.to_csv(output_data, index=False, sep=';')

# Display the reorganized data
print(pivoted_data)


       Program  PSYS_avg_bloated  PSYS_avg_chisel  PSYS_avg_cov  \
0         date         287914.25        284160.70     284352.85   
1         grep         317086.10        457307.85     315260.85   
2         gzip         334755.55        549107.40     331841.20   
3        mkdir         289446.30        287996.70     287615.30   
4  printokens2         282000.05        281304.40     285134.15   
5          sed         285567.55        288768.75     290404.50   

   PSYS_avg_debop  PSYS_std_bloated  PSYS_std_chisel  PSYS_std_cov  \
0       284331.55          14287.28          9876.11       9444.81   
1       315404.45          19601.77         20620.21      17533.55   
2       331715.95          12876.30         22471.20      16107.98   
3       287520.60          15787.29         13904.30      11606.01   
4       281771.20           8763.53         12498.42      21416.34   
5       287371.10           9693.99         15200.94      21546.36   

   PSYS_std_debop  
0         9677.53  

In [8]:
# Now we need to find the average value of PSYS and DURATION for all measurements of a program 
def calculate_average_PSYS_DURATION(input_file, output_file):
    # Read the CSV file
    df = pd.read_csv(input_file, delimiter=';')
    
    # Calculate average of 'PSYS' column for each unique combination of 'Program' and 'Utilities'
    average_psys = df.groupby(['Program', 'Utilities'])['PSYS'].mean().reset_index()
    
    # Calculate average of 'DURATION' column for each unique combination of 'Program' and 'Utilities'
    average_duration = df.groupby(['Program', 'Utilities'])['DURATION'].mean().reset_index()
    
    # Merge the two DataFrames based on 'Program' and 'Utilities'
    merged_df = pd.merge(average_psys, average_duration, on=['Program', 'Utilities'])
    
    # Write the average values to a new file separated with a semicolon
    merged_df.to_csv(output_file, index=False, sep=';')
    
    print(f"Average PSYS and DURATION values written to '{output_file}'.")

# Required inputs and outputs
input_data = os.path.join(results_dir, 'cat_all_repeats.csv')
output_data = os.path.join(results_dir, 'averageDURATION_all_repeats.csv')

calculate_average_PSYS_DURATION(input_data, output_data)

Average PSYS and DURATION values written to '/home/rlefeuvr/Documents/Workspace/Debloat/bloat-energy-consumption/notebooks/debloating_results/averageDURATION_all_repeats.csv'.


In [15]:
# After we need to put together the PSYS and Size values for each program in each of the 3 different implementations

csv_with_ec = os.path.join(results_dir, 'averagePSYS_all_repeats.csv')
csv_with_size = os.path.join(results_dir, 'debloat_experiments_size.csv') 
output_file = os.path.join(results_dir, 'all_ec_bsize.csv')

# Read the first CSV file
df1 = pd.read_csv(csv_with_ec, delimiter=';')
# Read the second CSV file
df2 = pd.read_csv(csv_with_size, delimiter=';')
df1['Program'] = df1['Program'].replace('printokens2', 'printtokens2')
# Merge the two DataFrames based on 'Program' column
#NOTE TODO FIX THE MANUAL PROCESSING NEEDED : REPLACE PRINTOKEN TO PRINTTOKEN
merged_df = pd.merge(df1, df2, on='Program')
# Add the 'Size' values for each 'Utilities' based on the Program and fillna with 0 if not found
df1['Size'] = merged_df.apply(lambda row: row[row['Utilities']], axis=1)

# Save the final DataFrame to a new CSV file, including 'Utilities' and 'PSYS' columns
df1.to_csv(output_file, index=False, sep=';')


In [16]:
df1['Program'] = df1['Program'].str.strip()
print(df1[df1['Program'].str.contains('sed', na=False)])


   Program Utilities       PSYS    Size
20     sed   bloated  285567.55  174472
21     sed    chisel  288768.75  155224
22     sed       cov  290404.50   87856
23     sed     debop  287371.10   85772


## Corelation between PSYS and Size

In [17]:
# Now we want to calculate the Peason correlation between the EC and Size 
# for all utilities in each 3 different implementations

# The Spearman correlation with the p-value

import pandas as pd
from scipy.stats import spearmanr

input_data = os.path.join(results_dir, 'all_ec_bsize.csv')
# Read the CSV file into a DataFrame
df = pd.read_csv(input_data, delimiter=";")
print(df)
# Convert 'PSYS' and 'Size' columns to numeric
df['PSYS'] = pd.to_numeric(df['PSYS'])
df['Size'] = pd.to_numeric(df['Size'])

# Calculate Spearman correlation and p-value for each 'Utilities' separately
results = df.groupby('Utilities').apply(lambda x: spearmanr(x['PSYS'], x['Size']))

# Extract the correlation coefficients and p-values
correlations = results.apply(lambda x: x.correlation)
p_values = results.apply(lambda x: x.pvalue)

# Combine correlations and p-values into a DataFrame
spearman_df_size = pd.DataFrame({'Spearman Correlation': correlations, 'p-value': p_values})

# Print the correlations and p-values
print(spearman_df_size)

         Program Utilities       PSYS    Size
0           date   bloated  287914.25   94240
1           date    chisel  284160.70   27952
2           date       cov  284352.85   37536
3           date     debop  284331.55   37536
4           grep   bloated  317086.10  162640
5           grep    chisel  457307.85  111664
6           grep       cov  315260.85   87656
7           grep     debop  315404.45   87664
8           gzip   bloated  334755.55  104152
9           gzip    chisel  549107.40   91720
10          gzip       cov  331841.20   56616
11          gzip     debop  331715.95   56624
12         mkdir   bloated  289446.30   49360
13         mkdir    chisel  287996.70   19696
14         mkdir       cov  287615.30   23392
15         mkdir     debop  287520.60   23392
16  printtokens2   bloated  282000.05   21176
17  printtokens2    chisel  281304.40   21168
18  printtokens2       cov  285134.15   21184
19  printtokens2     debop  281771.20   21192
20           sed   bloated  285567

  results = df.groupby('Utilities').apply(lambda x: spearmanr(x['PSYS'], x['Size']))


In [18]:
def merge_correlation_dfs(spearman: pd.DataFrame, pearson: pd.DataFrame) -> pd.DataFrame:
    # Rename columns for clarity
    spearman = spearman.rename(columns={
        'Spearman Correlation': 'Spearman',
        'p-value': 'p-value_s'
    })
    
    pearson = pearson.rename(columns={
        'Pearson Correlation': 'Pearson',
        'p-value': 'p-value_p'
    })
    
    # Merge on index (Utilities)
    merged_df = spearman.merge(pearson, left_index=True, right_index=True)
    
    return merged_df
def print_latex_df(df: pd.DataFrame, filename: str = "output.tex"):
    # Define column format: 'l' for first column, a separator '|', and 'c' for the rest
    column_format = "l| " + "c " * (len(df.columns))

    # Convert DataFrame to LaTeX without index and with correct format
    latex_str = df.to_latex(
        index=True,  # No row index
        float_format=lambda x: f"{x:.2g}",  # Limit float precision
        column_format=column_format.strip(),  # Ensure clean formatting
        escape=False  # Prevent LaTeX escaping
    )

    # Remove any unwanted extra row
    latex_str = latex_str.replace("\\toprule\n &", "\\toprule\n")
    latex_str = latex_str.replace("\\midrule\n &", "\\midrule\n") 
    latex_str = latex_str.replace("Utilities &  &  &  &  \\\\", "")  
    # Make headers bold dynamically

    headers = " & ".join(f"{col}" for col in df.columns)
    headers = df.index.name+" & "+headers
    latex_str = latex_str.replace(" & ".join(df.columns), headers)
    print(latex_str)
    with open(filename, "w") as f:
        f.write(latex_str)



def print_latex_compact_df(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    label1: str = "DF1",
    label2: str = "DF2",
    filename: str = "compact_output.tex"
):
    def format_number(x):
        try:
            x = float(x)
            if abs(x) <= 1e-3 and x != 0.0:
                return f"{x:.2e}"  # scientific notation
            else:
                return f"{x:.3f}"
        except:
            return str(x)

    def format_df(df):
        df_formatted = df.copy()
        for idx, row in df.iterrows():
            
            if float(row.iloc[1]) > 0.05:
                df_formatted.loc[idx] = row.apply(lambda x: f"\\hatchedCell{{{format_number(x)}}}")
            else:
                df_formatted.loc[idx] = row.apply(lambda x: format_number(x))
        return df_formatted

    df1_fmt = format_df(df1)
    df2_fmt = format_df(df2)

    # Add \textit{} around the index (utility names)
    df1_fmt.index = [f"\\textit{{{i}}}" for i in df1_fmt.index]
    df2_fmt.index = [f"\\textit{{{i}}}" for i in df2_fmt.index]

    combined = pd.concat([df1_fmt, df2_fmt], axis=1, keys=[label1, label2])
    combined.columns = pd.MultiIndex.from_tuples(combined.columns)

    latex_str = combined.to_latex(
        escape=False,
        multicolumn=True,
        multicolumn_format='c',
        multirow=True,
        index=True,
        column_format="l|cc||cc||cc"
    )

    print(latex_str)
    with open(filename, "w") as f:
        f.write(latex_str)


In [19]:
# The Pearson correlation with the p-value

import pandas as pd
from scipy.stats import pearsonr

input_data = os.path.join(results_dir, 'all_ec_bsize.csv')
# Read the CSV file into a DataFrame
df = pd.read_csv(input_data, delimiter=";")

# Convert 'PSYS' and 'Size' columns to numeric
df['PSYS'] = pd.to_numeric(df['PSYS'])
df['Size'] = pd.to_numeric(df['Size'])

# Calculate Spearman correlation and p-value for each 'Utilities' separately
results = df.groupby('Utilities').apply(lambda x: pearsonr(x['PSYS'], x['Size']))

# Extract the correlation coefficients and p-values
correlations = results.apply(lambda x: x.correlation)
p_values = results.apply(lambda x: x.pvalue)

# Combine correlations and p-values into a DataFrame
pearson_df = pd.DataFrame({'Pearson Correlation': correlations, 'p-value': p_values})

# Print the correlations and p-values
print(pearson_df)



           Pearson Correlation   p-value
Utilities                               
bloated               0.334806  0.516557
chisel                0.386750  0.448800
cov                   0.465291  0.352430
debop                 0.463585  0.354437


  results = df.groupby('Utilities').apply(lambda x: pearsonr(x['PSYS'], x['Size']))


In [20]:
merged_df_size = merge_correlation_dfs(spearman_df_size, pearson_df)
print_latex_df(merged_df_size, filename=os.path.join(results_dir, "correlation_size.tex"))

\begin{tabular}{l| c c c c}
\toprule
 Utilities & Spearman & p-value_s & Pearson & p-value_p \\

\midrule
bloated & 0.31 & 0.54 & 0.33 & 0.52 \\
chisel & 0.6 & 0.21 & 0.39 & 0.45 \\
cov & 0.6 & 0.21 & 0.47 & 0.35 \\
debop & 0.6 & 0.21 & 0.46 & 0.35 \\
\bottomrule
\end{tabular}



## Correlation between PSYS and duration

In [28]:
# Now we want to calculate the Peason correlation between the EC and Duration 
# for all utilities in each 3 different implementations

# The Spearman correlation with the p-value

import pandas as pd
from scipy.stats import spearmanr

input_data = os.path.join(results_dir, 'averageDURATION_all_repeats.csv')
# Read the CSV file into a DataFrame
df = pd.read_csv(input_data, delimiter=";")

# Convert 'PSYS' and 'Size' columns to numeric
df['PSYS'] = pd.to_numeric(df['PSYS'])
df['DURATION'] = pd.to_numeric(df['DURATION'])

# Calculate Spearman correlation and p-value for each 'Utilities' separately
results = df.groupby('Utilities').apply(lambda x: spearmanr(x['PSYS'], x['DURATION']))

# Extract the correlation coefficients and p-values
correlations = results.apply(lambda x: x.correlation)
p_values = results.apply(lambda x: x.pvalue)

# Combine correlations and p-values into a DataFrame
spearman_df_perf = pd.DataFrame({'Spearman Correlation': correlations, 'p-value': p_values})

# Print the correlations and p-values
print(spearman_df_perf)

           Spearman Correlation   p-value
Utilities                                
bloated                1.000000  0.000000
chisel                 1.000000  0.000000
cov                    0.885714  0.018845
debop                  0.942857  0.004805


  results = df.groupby('Utilities').apply(lambda x: spearmanr(x['PSYS'], x['DURATION']))


In [29]:
# The Pearson correlation with the p-value

import pandas as pd
from scipy.stats import pearsonr

input_data = os.path.join(results_dir, 'averageDURATION_all_repeats.csv')
# Read the CSV file into a DataFrame
df = pd.read_csv(input_data, delimiter=";")

# Convert 'PSYS' and 'Size' columns to numeric
df['PSYS'] = pd.to_numeric(df['PSYS'])
df['DURATION'] = pd.to_numeric(df['DURATION'])

# Calculate Spearman correlation and p-value for each 'Utilities' separately
results = df.groupby('Utilities').apply(lambda x: pearsonr(x['PSYS'], x['DURATION']))

# Extract the correlation coefficients and p-values
correlations = results.apply(lambda x: x.correlation)
p_values = results.apply(lambda x: x.pvalue)

# Combine correlations and p-values into a DataFrame
pearson_df = pd.DataFrame({'Pearson Correlation': correlations, 'p-value': p_values})

# Print the correlations and p-values
print(pearson_df)

           Pearson Correlation       p-value
Utilities                                   
bloated               0.998515  3.304415e-06
chisel                0.999928  7.679358e-09
cov                   0.996567  1.765990e-05
debop                 0.998403  3.822160e-06


  results = df.groupby('Utilities').apply(lambda x: pearsonr(x['PSYS'], x['DURATION']))


In [30]:
merged_df_perf = merge_correlation_dfs(spearman_df_perf, pearson_df)
print_latex_df(merged_df_perf, filename=os.path.join(results_dir, "correlation_perf.tex"))

\begin{tabular}{l| c c c c}
\toprule
 Utilities & Spearman & p-value_s & Pearson & p-value_p \\

\midrule
bloated & 1 & 0 & 1 & 3.3e-06 \\
chisel & 1 & 0 & 1 & 7.7e-09 \\
cov & 0.89 & 0.019 & 1 & 1.8e-05 \\
debop & 0.94 & 0.0048 & 1 & 3.8e-06 \\
\bottomrule
\end{tabular}



In [31]:
print_latex_compact_df(spearman_df_perf,spearman_df_size, label1="Energy/Perf", label2="Energy/Binary Size", filename=os.path.join(results_dir, "correlation_compact_debloat.tex"))

\begin{tabular}{l|cc||cc||cc}
\toprule
 & \multicolumn{2}{c}{Energy/Perf} & \multicolumn{2}{c}{Energy/Binary Size} \\
 & Spearman Correlation & p-value & Spearman Correlation & p-value \\
\midrule
\textit{bloated} & 1.000 & 0.000 & \hatchedCell{0.314} & \hatchedCell{0.544} \\
\textit{chisel} & 1.000 & 0.000 & \hatchedCell{0.600} & \hatchedCell{0.208} \\
\textit{cov} & 0.886 & 0.019 & \hatchedCell{0.600} & \hatchedCell{0.208} \\
\textit{debop} & 0.943 & 0.005 & \hatchedCell{0.600} & \hatchedCell{0.208} \\
\bottomrule
\end{tabular}



  df_formatted.loc[idx] = row.apply(lambda x: format_number(x))
  df_formatted.loc[idx] = row.apply(lambda x: format_number(x))
  df_formatted.loc[idx] = row.apply(lambda x: f"\\hatchedCell{{{format_number(x)}}}")
  df_formatted.loc[idx] = row.apply(lambda x: f"\\hatchedCell{{{format_number(x)}}}")
