In [None]:
from natsort import natsorted
import numpy as np
import os
import pandas as pd

root_folder = '' # Folder holding the real trace datasets; it is searched recursively for CSV files (presumably a placeholder to fill in before running)

def find_and_sort_csv_files(folder):
    """Return every ``.csv`` file found beneath *folder*, naturally sorted.

    The directory tree is walked recursively with ``os.walk``. Each match is
    the walked directory path joined with the file name, so every returned
    path already carries *folder* as its prefix.
    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.endswith('.csv')
    ]
    return natsorted(matches)

# Collect every real-trace CSV under root_folder once, in natural sort order.
all_csv_files_sorted = find_and_sort_csv_files(root_folder)
# Small constant added to the denominators in process_dataset so that a pair
# of zero-valued statistics does not cause a division by zero.
epsilon = 1e-10

def _summary_statistics(values):
    """Return (mean, std, median, 25th pct, 75th pct, variance) of *values*."""
    return (
        np.mean(values),
        np.std(values),
        np.median(values),
        np.percentile(values, 25),
        np.percentile(values, 75),
        np.var(values),
    )


def process_dataset(file_path, predict_folder='...'):
    """Score how closely a synthetic trace matches its real counterpart.

    The real data is read from *file_path*. The synthetic data is read from
    ``<predict_folder>/<parent directory name>_<file stem>.csv``.

    For every column of the real file, six summary statistics (mean, standard
    deviation, median, 25th and 75th percentiles, variance) are computed for
    both files and compared with the symmetric relative difference
    ``|a - b| / ((|a| + |b|) / 2 + epsilon)``. The six differences are
    averaged, capped at 2 and scaled to a percentage, giving one score per
    column; the mean of those scores is returned.

    Relies on the module-level ``epsilon`` constant.

    Args:
        file_path: Path to the real-trace CSV file.
        predict_folder: Folder containing the synthetic-trace CSV files.
            Defaults to the ``'...'`` placeholder, which produces the same
            path as the previously hard-coded one.

    Returns:
        The mean per-column statistical difference, in percent.

    Raises:
        KeyError: If a column of the real file is absent from the synthetic
            file.
    """
    df_test = pd.read_csv(file_path)
    dir_name = os.path.basename(os.path.dirname(file_path))
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    final_name = f'{dir_name}_{file_name}'
    # Target folder of the results (synthetic trace data).
    df_predict = pd.read_csv(f'{predict_folder}/{final_name}.csv')

    differences_list = []
    for col in df_test.columns:
        test_stats = _summary_statistics(df_test[col])
        pred_stats = _summary_statistics(df_predict[col])
        # Symmetric relative difference per statistic; epsilon keeps the
        # denominator non-zero when both statistics are zero.
        relative_diffs = [
            abs(stat_test - stat_pred)
            / ((abs(stat_test) + abs(stat_pred)) / 2 + epsilon)
            for stat_test, stat_pred in zip(test_stats, pred_stats)
        ]
        final_diff = np.mean(relative_diffs)
        # Each term is mathematically bounded by 2 (|a - b| <= |a| + |b|);
        # the cap only guards against floating-point overshoot.
        if final_diff > 2:
            final_diff = 2
        final_diff *= 100
        differences_list.append(final_diff)
    return np.mean(differences_list)

In [None]:
# Evaluate every discovered real-trace CSV, printing the per-file score and
# then the mean score across all files.
all_differences = []

for csv_file in all_csv_files_sorted:
    # The discovered paths already start with root_folder, so they are used
    # as-is; joining root_folder again would duplicate a relative prefix
    # (e.g. 'data/data/x.csv') and make the read fail.
    file_path = csv_file
    difference = process_dataset(file_path)
    print(f'{file_path}\nAverage Statistical Difference: {difference:.2f}%\n')
    all_differences.append(difference)

the_final_difference = np.mean(all_differences)
print(f'The Final Difference: {the_final_difference:.2f}%\n')