In [1]:
import pandas as pd
import re

# File paths: short source name -> CSV file holding that source's results
file_paths = {
    "fold_transformed_LGBM_XGB": "fold_transformed_LGBM_XGB_model_performance_results.csv",
    "fold_model_transformed": "fold_model_transformed_performance_results.csv",
    "fold_LGBM_XGB": "fold__LGBM_XGB_model_performance_results.csv",
    "fold_model": "fold_model_performance_results.csv"
}

# Matches one parenthesised annotation plus any whitespace directly before it.
# Compiled once so every frame reuses the same pattern object.
_PAREN_NOTE = re.compile(r"\s*\(.*?\)")


def _standardize_frame(df):
    """Return a new frame with a uniform ``Dataset``/``Model``/``Label`` schema.

    * ``"Data Type"`` (preferred) or ``"DataType"`` is renamed to ``"Dataset"``.
    * If none of those columns nor ``"Dataset"`` itself exists, a ``"Dataset"``
      column filled with ``"original"`` is added.
    * ``Model`` values equal to ``"RF"`` become ``"Random Forest"``.
    * Parenthesised text is stripped from ``Label``.

    The input frame is not modified.
    """
    if "Data Type" in df.columns:
        df = df.rename(columns={"Data Type": "Dataset"})
    elif "DataType" in df.columns:
        df = df.rename(columns={"DataType": "Dataset"})
    elif "Dataset" not in df.columns:
        # Only fill in the default when the file carries no dataset column at
        # all; an already-present "Dataset" column must not be overwritten.
        df = df.assign(Dataset="original")

    return df.assign(
        # Rename "RF" to "Random Forest"
        Model=df["Model"].replace("RF", "Random Forest"),
        # Vectorised cleanup: missing labels (NaN) stay missing instead of
        # raising TypeError the way a per-element re.sub would.
        Label=df["Label"].str.replace(_PAREN_NOTE, "", regex=True),
    )


# Load data and standardize every table right after reading it
dataframes = {
    name: _standardize_frame(pd.read_csv(path))
    for name, path in file_paths.items()
}

# Metric columns that are summarised for every group
metric_columns = [
    "accuracy",
    "precision",
    "recall",
    "f1_score",
    "threshold_accuracy",
    "top_3_accuracy",
]


def process_dataframe_with_labels(df):
    """Summarise the metric columns per (Label, Model, Dataset) group.

    Rows of ``df`` are grouped by ``Label``, ``Model`` and ``Dataset``; for each
    group the mean and standard deviation of every column listed in
    ``metric_columns`` are computed.

    Returns a DataFrame with one row per group. The grouping keys are regular
    columns again, and the column labels are two-level: ``(metric, "mean")``
    and ``(metric, "std")`` for metrics, ``(key, "")`` for the grouping keys.
    """
    group_keys = ["Label", "Model", "Dataset"]
    per_group = df.groupby(group_keys)[metric_columns]
    summary = per_group.agg(["mean", "std"])
    # Move the grouping keys out of the index and back into columns
    return summary.reset_index()

# Summarise every loaded table, keeping the same source-name keys
processed_dfs = dict(
    zip(dataframes, map(process_dataframe_with_labels, dataframes.values()))
)

# Stack the per-source summaries into a single table; the original row
# indices are discarded in favour of a fresh running index
summary_frames = list(processed_dfs.values())
all_data_with_labels = pd.concat(summary_frames, ignore_index=True)

# Write the combined table to disk (no index column) and report where it went
output_file = "Combined_model_performance_by_label.csv"
all_data_with_labels.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to Combined_model_performance_by_label.csv
