In [5]:
import os
import pandas as pd

In [29]:
# Root folder that is scanned for per-dataset sub-folders.
directory: str = '.'

# Accumulates one row (a list of values) per dataset for the final DataFrame.
comparison_data: list = []

def get_best_metrics(file_path, selection_metric='Test G-Mean', decimals=3):
    """Return the rounded test metrics of the best row in a results CSV.

    The CSV is loaded with ``pandas.read_csv`` and the row with the highest
    value in ``selection_metric`` is chosen (the first such row on ties).

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        selection_metric: Column whose maximum decides which row is "best".
        decimals: Number of decimal places the returned values are rounded to.

    Returns:
        A dict with the keys 'Test Balanced Accuracy', 'Test F1 Score',
        'Test ROC AUC' and 'Test G-Mean' taken from the best row.

    Raises:
        KeyError: If ``selection_metric`` or a reported column is missing.
        ValueError: If the file has no row with a non-missing
            ``selection_metric`` value (e.g. a header-only file).
    """
    reported_metrics = (
        'Test Balanced Accuracy',
        'Test F1 Score',
        'Test ROC AUC',
        'Test G-Mean',
    )

    df = pd.read_csv(file_path)
    scores = df[selection_metric]

    # idxmax() on an empty or all-NaN column fails with an opaque error (or
    # yields NaN); fail early with a message that names the offending file.
    if not scores.notna().any():
        raise ValueError(
            f"{file_path!r} has no rows with a usable {selection_metric!r} value"
        )

    best_row = df.loc[scores.idxmax()]
    return {name: round(best_row[name], decimals) for name in reported_metrics}

# Collect one comparison row per dataset folder that holds both result files.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the row order of the final table reproducible across runs and platforms.
metric_keys = (
    'Test Balanced Accuracy',
    'Test F1 Score',
    'Test ROC AUC',
    'Test G-Mean',
)

for folder_name in sorted(os.listdir(directory)):
    dataset_path = os.path.join(directory, folder_name)
    if not os.path.isdir(dataset_path):
        continue  # Only sub-folders can be datasets

    # Result files are named after the folder that contains them
    results_file = os.path.join(dataset_path, f"{folder_name}_results.csv")
    original_results_file = os.path.join(dataset_path, f"{folder_name}_original_results.csv")

    # Skip datasets for which either result file is missing
    if not (os.path.isfile(results_file) and os.path.isfile(original_results_file)):
        continue

    # Best-row metrics for the oversampled and the original data
    oversampled_metrics = get_best_metrics(results_file)
    original_metrics = get_best_metrics(original_results_file)

    # Row layout: dataset name, oversampled metrics, original metrics
    comparison_data.append(
        [folder_name]
        + [oversampled_metrics[key] for key in metric_keys]
        + [original_metrics[key] for key in metric_keys]
    )

# Build the two-level column header: an unlabeled 'Dataset' column followed by
# the same four test metrics for the oversampled and then the original data.
metric_names = ['Test Balanced Accuracy', 'Test F1 Score', 'Test ROC AUC', 'Test G-Mean']
column_tuples = [('', 'Dataset')]
for group in ('Oversampled Data', 'Original Data'):
    column_tuples.extend((group, metric) for metric in metric_names)
columns = pd.MultiIndex.from_tuples(column_tuples, names=['', ''])

# Assemble the comparison table from the collected rows, with a fresh 0..n-1 index
comparison_df = pd.DataFrame(comparison_data, columns=columns).reset_index(drop=True)

# Last expression of the cell: renders the table in the notebook
comparison_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Oversampled Data,Oversampled Data,Oversampled Data,Oversampled Data,Original Data,Original Data,Original Data,Original Data
Unnamed: 0_level_1,Dataset,Test Balanced Accuracy,Test F1 Score,Test ROC AUC,Test G-Mean,Test Balanced Accuracy,Test F1 Score,Test ROC AUC,Test G-Mean
0,abalone19,0.898,0.476,0.945,0.845,0.5,0.498,0.839,0.707
1,ecoli1,0.779,0.632,0.779,0.778,0.798,0.667,0.798,0.797
2,flare-F,0.734,0.499,0.807,0.726,0.5,0.49,0.225,0.707
3,poker-8-9_vs_5,0.824,0.077,0.824,0.805,0.667,0.5,0.667,0.577
4,yeast5,0.972,0.5,0.972,0.972,0.828,0.138,0.828,0.809
5,yeast6,0.823,0.607,0.929,0.859,0.618,0.634,0.854,0.781
