In [None]:
from natsort import natsorted
from sklearn.metrics import accuracy_score, r2_score
import numpy as np
import os
import pandas as pd

# Directory that is searched recursively for the real dataset CSV files.
# NOTE(review): the empty string looks like a placeholder to be filled in
# before running - confirm.
root_folder = '' # For the datasets (real BCLT and IOLT)

def find_and_sort_csv_files(folder):
    """Return the paths of all ``.csv`` files under *folder*, sorted with ``natsorted``.

    The tree below *folder* is traversed with ``os.walk``. A file is included
    when its name ends with the literal (case-sensitive) suffix ``.csv``. Each
    returned path is the walked directory joined with the file name.
    """
    matching_paths = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder)
        for name in names
        if name.endswith('.csv')
    ]
    return natsorted(matching_paths)

# Every CSV path found under root_folder, collected once and reused by all
# evaluation cells below.
all_csv_files_sorted = find_and_sort_csv_files(root_folder)
# Small constant added to the denominators of the relative differences in
# process_dataset so that they never divide by zero.
epsilon = 1e-10

def _symmetric_relative_difference(test_value, pred_value):
    """Return ``|a - b|`` divided by the mean of ``|a|`` and ``|b|``.

    The module-level ``epsilon`` is added to the denominator so the ratio stays
    finite when both values are zero.
    """
    return abs(test_value - pred_value) / ((abs(test_value) + abs(pred_value)) / 2 + epsilon)


def _statistical_difference(test_values, pred_values):
    """Return the mean relative difference of six summary statistics, in percent.

    The statistics compared are mean, standard deviation, median, 25th
    percentile, 75th percentile and variance. The averaged ratio is capped at
    2 before being scaled to a percentage (so the result is at most 200).
    """
    summary_statistics = (
        np.mean,
        np.std,
        np.median,
        lambda values: np.percentile(values, 25),
        lambda values: np.percentile(values, 75),
        np.var,
    )
    relative_differences = [
        _symmetric_relative_difference(statistic(test_values), statistic(pred_values))
        for statistic in summary_statistics
    ]
    final_diff = np.mean(relative_differences)
    if final_diff > 2:
        final_diff = 2
    return final_diff * 100


def process_dataset(file_path):
    """Score a synthetic dataset against the held-out tail of a real dataset.

    The real CSV at *file_path* is split by row order: the first
    ``round(len * (1 - test_size))`` rows are the training part and the
    remaining rows are the test part. ``test_size`` is read from the
    module-level variable of that name, so it must be set before calling.

    The synthetic CSV is loaded from
    ``.../<parent dir>_<file stem>_<test_size in percent>.csv``.

    Each column is categorised by its number of distinct values in the
    training part:

    * more than 15 -> continuous, scored with ``r2_score`` (values outside
      ``[0, 1]`` are replaced by 0);
    * 2 to 15 -> discrete, scored with ``accuracy_score``;
    * at most 1 -> immutable, counted as an accuracy of 1.

    Args:
        file_path: Path of the real dataset CSV.

    Returns:
        A tuple ``(average_score, difference)``, both in percent.
        ``average_score`` is the mean of the average R2 score and the average
        accuracy, or whichever of the two exists when the dataset has only
        one kind of column. NaN scores are ignored when averaging.
        ``difference`` is the mean over all columns of the statistical
        difference between the test part and the synthetic data.
    """
    df = pd.read_csv(file_path)
    train_length = int(np.round(len(df) * (1 - test_size)))
    test_length = len(df) - train_length
    df_train = df.head(train_length)
    df_test = df.tail(test_length)

    dir_name = os.path.basename(os.path.dirname(file_path))
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    final_name = f'{dir_name}_{file_name}_{int(np.round(test_size * 100))}'
    df_predict = pd.read_csv(f'.../{final_name}.csv') # For the target folder of the results (synthetic BCLT and IOLT)

    r2_scores = []
    accuracies = []
    differences_list = []
    for col in df_train.columns:
        unique_values = df_train[col].nunique()
        if unique_values > 15:
            # Continuous column.
            r2 = r2_score(df_test[col], df_predict[col])
            if r2 < 0 or r2 > 1:
                r2 = 0
            r2_scores.append(r2)
        elif unique_values > 1:
            # Discrete column.
            accuracies.append(accuracy_score(df_test[col], df_predict[col]))
        else:
            # Immutable column: a single value (or none) in the training part.
            accuracies.append(1)
        differences_list.append(_statistical_difference(df_test[col], df_predict[col]))
    difference = np.mean(differences_list)

    # np.nanmean skips NaN scores, like a skipna mean would.
    if r2_scores and accuracies:
        average_score = (np.nanmean(r2_scores) + np.nanmean(accuracies)) / 2 * 100
    elif r2_scores:
        # Only continuous columns: there is no accuracy to average in, so the
        # score is the R2 average alone instead of failing on a missing value.
        average_score = np.nanmean(r2_scores) * 100
    elif accuracies:
        average_score = np.nanmean(accuracies) * 100
    else:
        # No columns at all.
        average_score = np.nan
    return average_score, difference

# Accumulators shared across the evaluation cells: each cell appends its
# overall score and overall difference for one test_size value.
really_really_final_score = []
really_really_final_difference = []

In [None]:
# Evaluate every dataset with test_size = 0.2 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.2

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Evaluate every dataset with test_size = 0.5 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.5

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Evaluate every dataset with test_size = 0.8 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.8

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Evaluate every dataset with test_size = 0.9 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.9

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Evaluate every dataset with test_size = 0.95 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.95

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Evaluate every dataset with test_size = 0.99 (read as a global by
# process_dataset) and record the averages over all datasets.
test_size = 0.99

all_averages = []
all_differences = []

for csv_file in all_csv_files_sorted:
    # The collected paths were built from the os.walk roots and therefore
    # already start with root_folder. Joining root_folder again would
    # duplicate the prefix whenever root_folder is a relative path.
    file_path = csv_file
    average_score, difference = process_dataset(file_path)
    print(f'{file_path}\nFinal Score: {average_score:.2f}%\nAverage Statistical Difference: {difference:.2f}%\n')
    all_averages.append(average_score)
    all_differences.append(difference)

the_final_score = np.mean(all_averages)
the_final_difference = np.mean(all_differences)
print(f'The Final Score: {the_final_score:.2f}%\nThe Final Difference: {the_final_difference:.2f}%\n')
really_really_final_score.append(the_final_score)
really_really_final_difference.append(the_final_difference)

In [None]:
# Average the overall score and overall difference across every test_size
# value whose cell has appended to the accumulators, then report both.
its_the_final = np.mean(really_really_final_score)
its_the_difference = np.mean(really_really_final_difference)
print(f'Really Really Final Score: {its_the_final:.2f}%\nReally Really Final Difference: {its_the_difference:.2f}%')