In [4]:
import os
import pandas as pd
from statistics import mean

In [23]:
def calculate_mean_size(folder_path, binaries):
    """Return the mean on-disk size of the files matching each binary.

    The tree under ``folder_path`` is walked recursively exactly once. A file
    counts towards a binary when its file name contains at least one of the
    substrings listed for that binary.

    Parameters
    ----------
    folder_path : str or path-like
        Root directory to scan (sub-directories included).
    binaries : mapping
        Maps a label to a list of substrings, e.g.
        ``{"date-8.21": ["date-8.21_I2"]}``.

    Returns
    -------
    dict
        ``{label: mean size in bytes}`` in the iteration order of
        ``binaries``; the value is ``0`` when no file matched the label.

    Notes
    -----
    * Matching is a plain substring test on the file name, so a pattern such
      as ``"date-8.21"`` also matches ``"date-8.21_I2"``.
    * ``os.walk`` ignores listing errors by default, so a missing or
      unreadable ``folder_path`` yields ``0`` for every label rather than
      raising.
    """
    # Resolve the patterns once so the directory walk below stays a single pass.
    patterns_by_binary = [(binary, binaries[binary]) for binary in binaries]
    binary_sizes = {binary: [] for binary, _ in patterns_by_binary}
    if not patterns_by_binary:
        return {}

    # Walk the tree once for all binaries instead of once per binary.
    for root, _, files in os.walk(folder_path):
        for file in files:
            size = None  # looked up lazily, at most once per file
            for binary, patterns in patterns_by_binary:
                if any(name in file for name in patterns):
                    if size is None:
                        size = os.path.getsize(os.path.join(root, file))
                    binary_sizes[binary].append(size)

    return {binary: mean(sizes) if sizes else 0 for binary, sizes in binary_sizes.items()}

def create_dataframe(ref_folder, targeted_directories, studied_binaries, ref_binaries):
    """Build a table comparing reference sizes with averaged reduced sizes.

    Parameters
    ----------
    ref_folder : str
        Folder scanned with ``ref_binaries`` to obtain the reference sizes.
    targeted_directories : list of str
        Folders scanned with ``studied_binaries``; their per-binary means are
        averaged together to form the reduced size.
    studied_binaries : mapping
        Label -> file-name substrings, used for the targeted folders.
    ref_binaries : mapping
        Label -> file-name substrings, used for the reference folder.

    Returns
    -------
    pandas.DataFrame
        One row per label of ``ref_binaries`` with the columns ``Binary``,
        ``Reference Size``, ``Reduced Size`` and ``Difference (%)``.
    """
    baseline = calculate_mean_size(ref_folder, ref_binaries)
    per_folder = {
        folder: calculate_mean_size(folder, studied_binaries)
        for folder in targeted_directories
    }

    labels, reference_col, reduced_col, difference_col = [], [], [], []
    for label, ref_size in baseline.items():
        # A label absent from a folder's result contributes 0 to the average.
        folder_means = [per_folder[folder].get(label, 0) for folder in targeted_directories]
        reduced = round(mean(folder_means), 0)

        # The percentage uses the rounded reduced size and the unrounded reference size.
        if ref_size != 0:
            change = 100 * ((reduced - ref_size) / ref_size)
        else:
            change = 0

        labels.append(label)
        reference_col.append(round(ref_size, 2))
        reduced_col.append(reduced)
        difference_col.append(change)

    return pd.DataFrame(
        {
            "Binary": labels,
            "Reference Size": reference_col,
            "Reduced Size": reduced_col,
            "Difference (%)": difference_col,
        }
    )

# Folder scanned for the reference sizes.
ref_folder = "../pre-experiment/bloated/"

# Folders whose matching files are averaged into the "Reduced Size" column.
targeted_directories = [
    "../pre-experiment/debloated/chisel/",
    "../pre-experiment/debloated/debop/",
    "../pre-experiment/debloated/cov/",
]

# Label -> file-name substrings looked up inside each targeted folder.
studied_binaries = {
    "date-8.21": [
        "date-8.21_I2",
    ],
    "grep-2.4.2": [
        "grep-2.4.2_p0.2train",
    ],
    "gzip-1.3": [
        "gzip-1.3_p0.2train",
    ],
    "mkdir-5.2.1": [
        "mkdir-5.2.1_I0",
        "mkdir-5.2.1_I5",
    ],
    "printtokens2": [
        "printtokens2_p0.3train",
        "printtokens2_p0.2train",
    ],
    "sed-4.1.5": [
        "sed-4.1.5_p0.1train",
        "sed-4.1.5_p0.3train",
    ],
}

# In the reference folder each binary is matched by its own label.
ref_binaries = {label: [label] for label in studied_binaries}

result_df = create_dataframe(
    ref_folder,
    targeted_directories,
    studied_binaries,
    ref_binaries,
)
print(result_df)

         Binary  Reference Size  Reduced Size  Difference (%)
0     date-8.21           94240       34341.0      -63.560059
1    grep-2.4.2          162640       95661.0      -41.182366
2      gzip-1.3          104152       68320.0      -34.403564
3   mkdir-5.2.1           49360       22160.0      -55.105348
4  printtokens2           21176       21181.0        0.023612
5     sed-4.1.5          174472      109617.0      -37.172154
