In [3]:
import pandas as pd


# Per-experiment result files, keyed by a short experiment name
file_paths = dict(
    fold_transformed_LGBM_XGB="fold_transformed_LGBM_XGB_model_performance_results.csv",
    fold_model_transformed="fold_model_transformed_performance_results.csv",
    fold_LGBM_XGB="fold__LGBM_XGB_model_performance_results.csv",
    fold_model="fold_model_performance_results.csv",
)

# Read each CSV into its own DataFrame, stored under the same key as its path
dataframes = {}
for name, path in file_paths.items():
    dataframes[name] = pd.read_csv(path)

# Standardize dataset column names and assign default values for missing datasets.
# Each frame is modified in place so the entries of `dataframes` stay the same objects.
for df in dataframes.values():
    if "Data Type" in df.columns:
        df.rename(columns={"Data Type": "Dataset"}, inplace=True)
    elif "DataType" in df.columns:
        df.rename(columns={"DataType": "Dataset"}, inplace=True)
    elif "Dataset" not in df.columns:
        # Only fill in the default when the file carries no dataset label at
        # all; a file that already has a "Dataset" column keeps its values.
        # NOTE(review): files loaded from the `fold_transformed_*` /
        # `*_transformed_*` paths also get "original" here if they lack a
        # dataset column -- confirm that is the intended label for them.
        df["Dataset"] = "original"

    # Rename "RF" to "Random Forest" so both spellings group together later
    df["Model"] = df["Model"].replace("RF", "Random Forest")

# Metric columns that are averaged and reported
metric_columns = [
    "accuracy",
    "precision",
    "recall",
    "f1_score",
    "threshold_accuracy",
    "top_3_accuracy",
]


def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return the per-(Fold, Model, Dataset) mean of every metric column.

    All rows of *df* that share the same ``Fold``, ``Model`` and ``Dataset``
    values are collapsed into one row holding the arithmetic mean of each
    column listed in ``metric_columns`` (the original author notes these rows
    are the 18 labels). The grouping keys come back as ordinary columns,
    followed by the metric columns.
    """
    group_keys = ["Fold", "Model", "Dataset"]
    grouped = df.groupby(group_keys)
    per_fold_means = grouped[metric_columns].mean()
    # Move the grouping keys out of the index and back into regular columns.
    return per_fold_means.reset_index()

# Reduce every loaded file to its per-fold metric averages
processed_dfs = {}
for key, df in dataframes.items():
    processed_dfs[key] = process_dataframe(df)

# Stack the per-file summaries into one long table with a fresh index
all_data = pd.concat(processed_dfs.values(), ignore_index=True)

# For each Model-Dataset combination, compute the mean and standard deviation
# of every metric over its rows (one row per fold; 5 folds per the original note)
final_results = (
    all_data
    .groupby(["Model", "Dataset"])[metric_columns]
    .agg(["mean", "std"])
    .reset_index()
)

# Reformat the results to show "mean ± std" directly in a single column per
# metric. `final_results` has two-level columns -- ("Model", ""),
# ("Dataset", ""), (metric, "mean"), (metric, "std") -- so a fresh flat table
# is built from them instead of adding columns and subsetting afterwards.
final_results_formatted = pd.DataFrame(
    {
        "Model": final_results[("Model", "")],
        "Dataset": final_results[("Dataset", "")],
    }
)
for metric in metric_columns:
    # Format whole columns at once rather than applying a lambda row by row;
    # this also works when there are no result rows. A missing std (NaN, e.g.
    # only one row in the group) is rendered as "nan", same as an f-string.
    mean_text = final_results[(metric, "mean")].map("{:.4f}".format)
    std_text = final_results[(metric, "std")].map("{:.4f}".format)
    final_results_formatted[metric] = mean_text + " ± " + std_text

# Save the results to a CSV file
output_file = "Combined_model_performance_results.csv"
final_results_formatted.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to Combined_model_performance_results.csv
