In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

In [2]:
def _plot_fold_band(ax, epochs, fold_curves, label, color, fold_color):
    """
    Draws every fold's curve faintly, then the across-fold mean with a +/- 1 std band.

    Parameters:
    - ax: Matplotlib axes to draw on.
    - epochs: x values shared by all curves.
    - fold_curves: Non-empty list of equally long 1D arrays, one per fold.
    - label: Legend label of the mean curve.
    - color: Color of the mean curve and of the std band.
    - fold_color: Color of the individual fold curves.
    """
    for curve in fold_curves:
        ax.plot(epochs, curve, color=fold_color, alpha=0.3)
    stacked = np.array(fold_curves)
    mean = np.mean(stacked, axis=0)
    std = np.std(stacked, axis=0)
    ax.plot(epochs, mean, label=label, color=color, linewidth=2)
    ax.fill_between(epochs, mean - std, mean + std, alpha=0.2, color=color)


def plot_crossval_histories(dataset, n_folds=5):
    """
    Plots the cross-validation histories and includes the averaged test set metrics in the plots.
    Handles cases where there is only one target feature without task names in the column names.

    One subplot row is drawn per task (plus an 'overall' row for the plain 'loss' column):
    the left panel shows the loss, the right panel shows every other known metric. Histories
    are truncated to the length of the shortest fold so they can be averaged.

    Parameters:
    - dataset: Path object representing the dataset directory containing the history CSV files
      (history_1.csv ... history_<n_folds>.csv) and the averaged_results_<task>.csv files.
    - n_folds: Number of history files to read. Defaults to 5.

    Raises:
    - ValueError: If n_folds is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1.")

    # Define known metrics
    known_metrics = ['loss', 'mean_absolute_error', 'sparse_categorical_accuracy']

    # Mapping of history metric names to averaged results metric names
    metric_name_mapping = {
        'mean_absolute_error': 'mae',
        'sparse_categorical_accuracy': 'accuracy',
        # Add other mappings if necessary
    }

    # Load one history per fold; folds may stop at different epochs, so remember the shortest
    histories = [pd.read_csv(dataset / f"history_{fold}.csv") for fold in range(1, n_folds + 1)]
    min_length = min(len(history) for history in histories)

    # Assuming all histories have the same structure
    columns = histories[0].columns

    # Create a mapping from task names to their corresponding averaged results files
    task_to_result_file = {}
    task_names_in_results = []
    for result_file in dataset.glob("averaged_results_*.csv"):
        parts = result_file.name.split("averaged_results_")
        if len(parts) < 2:
            continue
        task_name = parts[1].replace(".csv", "")
        task_to_result_file[task_name] = result_file
        task_names_in_results.append(task_name)

    # With exactly one results file, history columns without a task prefix belong to that task
    if len(task_names_in_results) == 1:
        single_task_name = task_names_in_results[0]
    else:
        single_task_name = 'main'  # Default task name

    # {task_name: {'metrics': {metric_name: [fold arrays]}, 'loss': [fold arrays],
    #              'val_metrics': {...}, 'val_loss': [...], 'test_metrics': {name: value}}}
    metrics_data = {}

    # For each column, parse and collect data
    for col in columns:
        is_val = col.startswith('val_')
        col_base = col[4:] if is_val else col

        if col_base == 'lr':
            continue  # The learning rate is not plotted

        if col_base == 'loss':
            # The plain loss column is shown as its own 'overall' row
            task_name, metric_name = 'overall', 'loss'
        else:
            for metric in known_metrics:
                if col_base == metric:
                    task_name = single_task_name  # No task prefix in the column name
                    metric_name = metric
                    break
                elif col_base.endswith(metric):
                    task_name = col_base[:-len(metric)].rstrip('_')
                    metric_name = metric
                    break
            else:
                print(f"Could not parse column '{col}'")
                continue

            # Remove 'output_' prefix from task name
            if task_name.startswith('output_'):
                task_name = task_name[len('output_'):]

        entry = metrics_data.setdefault(
            task_name,
            {'metrics': {}, 'loss': [], 'val_metrics': {}, 'val_loss': [], 'test_metrics': {}},
        )
        fold_curves = [history[col][:min_length].values for history in histories]
        if metric_name == 'loss':
            entry['val_loss' if is_val else 'loss'].extend(fold_curves)
        else:
            entry['val_metrics' if is_val else 'metrics'].setdefault(metric_name, []).extend(fold_curves)

    # Read the averaged results for each task, excluding 'overall'
    for task, entry in metrics_data.items():
        if task == 'overall':
            continue  # Skip the 'overall' task as there is no averaged results file
        if task not in task_to_result_file:
            print(f"Averaged results file for task '{task}' not found in '{dataset}'.")
            continue

        df_averaged = pd.read_csv(task_to_result_file[task])
        # Get the 'Weighted Mean' row
        weighted_mean_row = df_averaged[df_averaged['k'] == 'Weighted Mean']
        if weighted_mean_row.empty:
            print(f"'Weighted Mean' row not found in averaged results for task '{task}'.")
            continue

        # Every column except the bookkeeping ones is treated as a metric
        metric_columns = [col for col in df_averaged.columns if col not in ['k', 'weight']]
        test_metrics = {}
        for metric in metric_columns:
            value = weighted_mean_row.iloc[0][metric]
            if pd.notna(value):
                test_metrics[metric] = float(value)
        entry['test_metrics'] = test_metrics

    # Now plot per task
    tasks = list(metrics_data.keys())
    num_tasks = len(tasks)
    if num_tasks == 0:
        # plt.subplots() rejects a grid with zero rows
        print(f"No plottable columns found in the histories of '{dataset}'.")
        return

    epochs = np.arange(min_length)

    # Define colors
    training_color = 'blue'
    validation_color = 'orange'
    fold_color = 'gray'
    test_color = 'red'

    # Map internal task names to friendly names
    task_name_mapping = {
        'concept_name_next': 'Next Activity',
        'time_timestamp_next': 'Next Time',
        'time_timestamp_last': 'Remaining Time',
        'overall': 'Overall'
    }
    # Add the single task name to the mapping if not present
    if single_task_name not in task_name_mapping:
        task_name_mapping[single_task_name] = 'Main Task'

    # squeeze=False keeps axes two-dimensional even when there is only one task
    fig, axes = plt.subplots(num_tasks, 2, figsize=(12, 5 * num_tasks), squeeze=False)

    for i, task in enumerate(tasks):
        task_data = metrics_data[task]
        friendly_task_name = task_name_mapping.get(task, task)

        # Plot loss
        ax_loss = axes[i, 0]
        if task_data['loss']:
            _plot_fold_band(ax_loss, epochs, task_data['loss'], 'Training Loss', training_color, fold_color)
        if task_data['val_loss']:
            _plot_fold_band(ax_loss, epochs, task_data['val_loss'], 'Validation Loss', validation_color, fold_color)

        if task_data['loss'] or task_data['val_loss']:
            # Plot test metric as a horizontal line
            if 'loss' in task_data['test_metrics']:
                test_loss = task_data['test_metrics']['loss']
                ax_loss.axhline(y=test_loss, color=test_color, linestyle='-', linewidth=2, label='Test Loss')
            ax_loss.set_title(f'{friendly_task_name} - Training and Validation Loss')
            ax_loss.set_xlabel('Epochs')
            ax_loss.set_ylabel('Loss')
            ax_loss.legend(loc='upper right', fontsize='small')
        else:
            ax_loss.axis('off')  # Hide the axis if no data

        # Plot metrics
        ax_metric = axes[i, 1]
        plotted = False  # Flag to check if any data is plotted
        # Sorted so that the legend order does not depend on set iteration order
        all_metrics = sorted(set(task_data['metrics']) | set(task_data['val_metrics']))
        for metric_name in all_metrics:
            train_curves = task_data['metrics'].get(metric_name)
            if train_curves:
                _plot_fold_band(ax_metric, epochs, train_curves, f'Training {metric_name}', training_color, fold_color)
                plotted = True

            val_curves = task_data['val_metrics'].get(metric_name)
            if val_curves:
                _plot_fold_band(ax_metric, epochs, val_curves, f'Validation {metric_name}', validation_color, fold_color)
                plotted = True

            # Map the metric name to the test metric name
            test_metric_name = metric_name_mapping.get(metric_name, metric_name)

            # Plot test metric as a horizontal line
            if test_metric_name in task_data['test_metrics']:
                test_metric_value = task_data['test_metrics'][test_metric_name]
                ax_metric.axhline(y=test_metric_value, color=test_color, linestyle='-', linewidth=2, label=f'Test {metric_name}')
                plotted = True
            else:
                print(f"Test metric '{test_metric_name}' not found for task '{task}'.")

        if plotted:
            ax_metric.set_title(f'{friendly_task_name} - Training and Validation Metrics')
            ax_metric.set_xlabel('Epochs')
            ax_metric.set_ylabel('Metric')
            ax_metric.legend(loc='lower right', fontsize='small')
        else:
            ax_metric.axis('off')  # Hide the axis if no data

    plt.tight_layout()
    # plt.savefig(dataset / 'train_val_history.png')
    plt.show()
    plt.close(fig)  # Release the figure instead of merely clearing it

In [3]:
def plot_holdout_history(dataset):
    """
    Plots the training and validation metrics for the holdout approach.
    Includes test metrics from the results files in the plots.

    One subplot row is drawn per task (plus an 'overall' row for the plain 'loss' column):
    the left panel shows the loss, the right panel shows every other known metric. The test
    value of a metric is the weighted mean over the rows of the task's results file, where
    rows with a missing metric value are left out of both numerator and denominator.

    Parameters:
    - dataset: Path object representing the dataset directory containing the history CSV file
      (history_1.csv) and the results_*__<task>.csv files.
    """
    # Define known metrics
    known_metrics = ['loss', 'mean_absolute_error', 'sparse_categorical_accuracy']

    # Mapping of history metric names to results metric names
    metric_name_mapping = {
        'mean_absolute_error': 'mae',
        'sparse_categorical_accuracy': 'accuracy',
        # Add other mappings if necessary
    }

    # Read the history CSV file
    history = pd.read_csv(dataset / "history_1.csv")

    # Assuming the history has the same structure as in cross-validation
    columns = history.columns

    # Create a mapping from task names to their corresponding result files
    task_to_result_file = {}
    task_names_in_results = []
    for result_file in dataset.glob("results_*__*.csv"):
        parts = result_file.name.split("__")
        if len(parts) < 2:
            continue
        task_name = parts[1].replace(".csv", "")
        # Remove parentheses and quotes from task name
        task_name = task_name.strip("()'\"").replace("', '", "_").replace(" ", "_")
        task_to_result_file[task_name] = result_file
        task_names_in_results.append(task_name)

    # With exactly one results file, history columns without a task prefix belong to that task
    if len(task_names_in_results) == 1:
        single_task_name = task_names_in_results[0]
    else:
        single_task_name = 'main'  # Default task name

    # {task_name: {'metrics': {metric_name: array}, 'loss': array or None,
    #              'val_metrics': {...}, 'val_loss': array or None, 'test_metrics': {name: value}}}
    metrics_data = {}

    # For each column, parse and collect data
    for col in columns:
        is_val = col.startswith('val_')
        col_base = col[4:] if is_val else col

        if col_base == 'lr':
            continue  # The learning rate is not plotted

        if col_base == 'loss':
            # The plain loss column is shown as its own 'overall' row
            task_name, metric_name = 'overall', 'loss'
        else:
            for metric in known_metrics:
                if col_base == metric:
                    task_name = single_task_name  # No task prefix in the column name
                    metric_name = metric
                    break
                elif col_base.endswith(metric):
                    task_name = col_base[:-len(metric)].rstrip('_')
                    metric_name = metric
                    break
            else:
                print(f"Could not parse column '{col}'")
                continue

            # Remove 'output_' prefix from task name
            if task_name.startswith('output_'):
                task_name = task_name[len('output_'):]

        entry = metrics_data.setdefault(
            task_name,
            {'metrics': {}, 'loss': None, 'val_metrics': {}, 'val_loss': None, 'test_metrics': {}},
        )
        data = history[col].values
        if metric_name == 'loss':
            entry['val_loss' if is_val else 'loss'] = data
        else:
            entry['val_metrics' if is_val else 'metrics'][metric_name] = data

    # Read the results for each task, excluding 'overall'
    for task, entry in metrics_data.items():
        if task == 'overall':
            continue  # Skip the 'overall' task as there is no separate results file
        if task not in task_to_result_file:
            print(f"Results file for task '{task}' not found in '{dataset}'.")
            continue

        results_path = task_to_result_file[task]
        df_results = pd.read_csv(results_path)
        # Exclude 'Weighted Mean' row if present
        if 'k' in df_results.columns and 'Weighted Mean' in df_results['k'].values:
            df_results = df_results[df_results['k'] != 'Weighted Mean']

        if 'weight' not in df_results.columns:
            print(f"No 'weight' column found in results file '{results_path.name}'. Cannot compute weighted mean.")
            continue

        # Every column except the bookkeeping ones is treated as a metric
        metric_columns = [col for col in df_results.columns if col not in ['k', 'weight']]
        weights = pd.to_numeric(df_results['weight'], errors='coerce')

        test_metrics = {}
        for metric in metric_columns:
            values = pd.to_numeric(df_results[metric], errors='coerce')
            # Normalise by the weight of the rows that actually carry a value; dividing by
            # the total weight would pull the mean towards zero whenever values are missing
            valid = values.notna() & weights.notna()
            weight_sum = weights[valid].sum()
            if weight_sum > 0:
                test_metrics[metric] = (values[valid] * weights[valid]).sum() / weight_sum
        entry['test_metrics'] = test_metrics

    # Now plot per task
    tasks = list(metrics_data.keys())
    num_tasks = len(tasks)
    if num_tasks == 0:
        # plt.subplots() rejects a grid with zero rows
        print(f"No plottable columns found in the history of '{dataset}'.")
        return

    epochs = np.arange(len(history))

    # Define colors
    training_color = 'blue'
    validation_color = 'orange'
    test_color = 'red'  # Use red color for test metrics

    # Map internal task names to friendly names
    task_name_mapping = {
        'concept_name_next': 'Next Activity',
        'time_timestamp_next': 'Next Time',
        'time_timestamp_last': 'Remaining Time',
        'overall': 'Overall'
    }
    # Add the single task name to the mapping if not present
    if single_task_name not in task_name_mapping:
        task_name_mapping[single_task_name] = 'Main Task'

    # squeeze=False keeps axes two-dimensional even when there is only one task
    fig, axes = plt.subplots(num_tasks, 2, figsize=(12, 5 * num_tasks), squeeze=False)

    for i, task in enumerate(tasks):
        task_data = metrics_data[task]
        friendly_task_name = task_name_mapping.get(task, task)

        # Plot loss
        ax_loss = axes[i, 0]
        plotted = False  # Flag to check if any data is plotted
        if task_data['loss'] is not None:
            ax_loss.plot(epochs, task_data['loss'], label='Training Loss', color=training_color, linewidth=2)
            plotted = True

        if task_data['val_loss'] is not None:
            ax_loss.plot(epochs, task_data['val_loss'], label='Validation Loss', color=validation_color, linewidth=2)
            plotted = True

        if plotted:
            # Plot test loss as a horizontal line
            if 'loss' in task_data['test_metrics']:
                test_loss = task_data['test_metrics']['loss']
                ax_loss.axhline(y=test_loss, color=test_color, linestyle='-', linewidth=2, label='Test Loss')
            ax_loss.set_title(f'{friendly_task_name} - Training and Validation Loss')
            ax_loss.set_xlabel('Epochs')
            ax_loss.set_ylabel('Loss')
            ax_loss.legend(loc='upper right', fontsize='small')
        else:
            ax_loss.axis('off')  # Hide the axis if no data

        # Plot metrics
        ax_metric = axes[i, 1]
        plotted = False
        # Sorted so that the legend order does not depend on set iteration order
        all_metrics = sorted(set(task_data['metrics']) | set(task_data['val_metrics']))
        for metric_name in all_metrics:
            train_curve = task_data['metrics'].get(metric_name)
            if train_curve is not None:
                ax_metric.plot(epochs, train_curve, label=f'Training {metric_name}', color=training_color, linewidth=2)
                plotted = True

            val_curve = task_data['val_metrics'].get(metric_name)
            if val_curve is not None:
                ax_metric.plot(epochs, val_curve, label=f'Validation {metric_name}', color=validation_color, linewidth=2)
                plotted = True

            # Map the metric name to the test metric name
            test_metric_name = metric_name_mapping.get(metric_name, metric_name)

            # Plot test metric as a horizontal line
            if test_metric_name in task_data['test_metrics']:
                test_metric_value = task_data['test_metrics'][test_metric_name]
                ax_metric.axhline(y=test_metric_value, color=test_color, linestyle='-', linewidth=2, label=f'Test {metric_name}')
                plotted = True
            else:
                print(f"Test metric '{test_metric_name}' not found for task '{task}'.")

        if plotted:
            ax_metric.set_title(f'{friendly_task_name} - Training and Validation Metrics')
            ax_metric.set_xlabel('Epochs')
            ax_metric.set_ylabel('Metric')
            ax_metric.legend(loc='lower right', fontsize='small')
        else:
            ax_metric.axis('off')  # Hide the axis if no data

    plt.tight_layout()
    plt.show()
    plt.close(fig)  # Release the figure instead of merely clearing it

In [4]:
def average_results_over_folds(dataset):
    """
    Averages the results over all folds and stores the result in a CSV file for each task.
    Additionally, prints the weighted mean accuracy for classification tasks and
    the weighted mean MAE for regression tasks.

    For every task, the per-fold rows are grouped by 'k'; each metric is averaged over the
    folds and the weight of the first fold (in sorted file order) is kept. A final
    'Weighted Mean' row holds, per metric, the weight-weighted mean over all 'k' rows that
    have a value. The result is written to averaged_results_<task>.csv inside dataset.

    Parameters:
    - dataset: Path object representing the dataset directory containing the results CSV files
      (results_<fold>__<task>.csv, each with at least the columns 'k' and 'weight').
    """
    # Sorted so that the 'first' weight and the reference metric columns do not depend on
    # the order in which the file system happens to list the files
    result_files = sorted(dataset.glob("results_*__*.csv"))

    # {task_name: {'dfs': [one frame per fold], 'metric_columns': [...]}}
    task_data = {}

    # Process each result file
    for file_path in result_files:
        # Extract task name from the filename
        parts = file_path.name.split("__")
        if len(parts) < 2:
            continue  # Skip files that don't match the pattern
        task_name = parts[1].replace(".csv", "")
        # Remove parentheses and quotes from task name
        task_name = task_name.strip("()'\"").replace("', '", "_").replace(" ", "_")

        # The fold number sits between 'results_' and the double underscore
        fold_number = int(parts[0].split('_')[1])

        raw = pd.read_csv(file_path, index_col=False)

        # Drop a pre-computed 'Weighted Mean' row; copy so the column assignments below
        # operate on an independent frame instead of a slice of `raw`
        df = raw[raw['k'] != 'Weighted Mean'].copy()

        # Convert 'k' and 'weight' to numeric, turning unparsable entries into NaN
        df['k'] = pd.to_numeric(df['k'], errors='coerce')
        df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

        # Identify metric columns dynamically
        metric_columns = [col for col in df.columns if col not in ['k', 'weight', 'fold']]

        # Convert metric columns to numeric
        df[metric_columns] = df[metric_columns].apply(pd.to_numeric, errors='coerce')

        # Drop rows with NaN values in 'k' or 'weight' and tag the rows with their fold
        df = df.dropna(subset=['k', 'weight']).assign(fold=fold_number)

        # Store the dataframe in the task_data dictionary
        if task_name not in task_data:
            task_data[task_name] = {'dfs': [], 'metric_columns': metric_columns}
        elif set(metric_columns) != set(task_data[task_name]['metric_columns']):
            # Ensure metric_columns are consistent across folds
            print(f"Warning: Metric columns differ for task {task_name} in file {file_path}")
        task_data[task_name]['dfs'].append(df)

    # Process each task
    for task_name, data in task_data.items():
        metric_columns = data['metric_columns']

        # Concatenate all folds data for this task
        df_all_folds = pd.concat(data['dfs'], ignore_index=True)

        # Group by 'k' and compute the mean of metrics over folds
        agg_dict = {'weight': 'first'}
        for metric in metric_columns:
            agg_dict[metric] = 'mean'
        df_mean = df_all_folds.groupby('k').agg(agg_dict).reset_index()

        # Compute the overall weighted mean for each metric
        weighted_metrics = {}
        for metric in metric_columns:
            valid = df_mean[metric].notna()
            total_weight_metric = df_mean.loc[valid, 'weight'].sum()
            if total_weight_metric > 0:
                weighted_metrics[metric] = (
                    (df_mean.loc[valid, metric] * df_mean.loc[valid, 'weight']).sum() / total_weight_metric
                )
            else:
                # No usable value (or no weight) for this metric
                weighted_metrics[metric] = float('nan')

        # Create the 'Weighted Mean' row
        weighted_mean_row = {'k': 'Weighted Mean', 'weight': ''}
        for metric in metric_columns:
            weighted_mean_row[metric] = weighted_metrics[metric]

        # Append the 'Weighted Mean' row to the averaged dataframe using pd.concat
        df_mean = pd.concat([df_mean, pd.DataFrame([weighted_mean_row])], ignore_index=True)

        # Reorder columns to match the original file
        df_mean = df_mean[['k', 'weight'] + metric_columns]

        # Save the averaged results to a new CSV file in the dataset directory
        output_file = dataset / f"averaged_results_{task_name}.csv"
        df_mean.to_csv(output_file, index=False)

        # Print the weighted mean accuracy or MAE
        if 'accuracy' in metric_columns:
            weighted_accuracy = weighted_metrics['accuracy']
            print(f"Accuracy '{task_name}': {weighted_accuracy:.4f}")
        elif 'mae' in metric_columns:
            weighted_mae = weighted_metrics['mae']
            print(f"MAE '{task_name}': {weighted_mae:.4f}")

In [5]:
def print_holdout_weighted_mean(dataset):
    """
    Reads the results CSV files for the holdout dataset, computes the weighted mean values,
    and prints the weighted mean accuracy (for classification tasks) or weighted mean MAE
    (for regression tasks). If neither metric is present, every metric column is printed.

    The weighted mean of a metric is taken over the rows that have a value for it, i.e. rows
    with a missing value contribute neither to the numerator nor to the denominator. This
    matches how average_results_over_folds computes its 'Weighted Mean' row.

    Parameters:
    - dataset: Path object representing the dataset directory containing the results CSV files
      (results_*__<task>.csv; files without a double underscore in the name are skipped).
    """
    # Sorted so that the printed order does not depend on file-system listing order
    result_files = sorted(dataset.glob("results_*.csv"))

    # Process each result file
    for file_path in result_files:
        # Extract task name from the filename
        parts = file_path.name.split("__")
        if len(parts) < 2:
            continue  # Skip files that don't match the pattern
        task_name = parts[1].replace(".csv", "")
        # Remove parentheses and quotes from task name
        task_name = task_name.strip("()'\"").replace("', '", "_").replace(" ", "_")

        # Read the CSV file
        df = pd.read_csv(file_path, index_col=False)

        if 'weight' not in df.columns:
            print(f"No 'weight' column found in file {file_path}. Cannot compute weighted mean.")
            continue

        has_k = 'k' in df.columns
        if has_k:
            # Drop a pre-computed 'Weighted Mean' row; copy so the column assignments below
            # operate on an independent frame instead of a slice
            df = df[df['k'] != 'Weighted Mean'].copy()
            df['k'] = pd.to_numeric(df['k'], errors='coerce')
        df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

        # Identify metric columns dynamically
        metric_columns = [col for col in df.columns if col not in ['k', 'weight']]

        # Convert metric columns to numeric
        if metric_columns:
            df[metric_columns] = df[metric_columns].apply(pd.to_numeric, errors='coerce')

        # Drop rows with NaN values in 'k' or 'weight'
        if has_k:
            df = df.dropna(subset=['k', 'weight'])

        # Compute the overall weighted mean for each metric
        weighted_metrics = {}
        for metric in metric_columns:
            valid = df[metric].notna()
            # Only the weight of rows that carry a value belongs in the denominator
            weight_sum = df.loc[valid, 'weight'].sum()
            if weight_sum > 0:
                weighted_metrics[metric] = (df.loc[valid, metric] * df.loc[valid, 'weight']).sum() / weight_sum
            else:
                # No usable value (or no weight) for this metric
                weighted_metrics[metric] = float('nan')

        # Print the weighted mean accuracy or MAE
        if 'accuracy' in metric_columns:
            weighted_accuracy = weighted_metrics['accuracy']
            print(f"Accuracy {task_name}: {weighted_accuracy:.4f}")
        elif 'mae' in metric_columns:
            weighted_mae = weighted_metrics['mae']
            print(f"MAE {task_name}: {weighted_mae:.4f}")
        else:
            # If other metrics are present, you can adjust this section to print them
            for metric in metric_columns:
                weighted_value = weighted_metrics[metric]
                print(f"{metric} {task_name}: {weighted_value:.4f}")

In [6]:
# def rename_file(dataset):
#     concept_name_target = "next"
#     time_timestamp_target = "next"
    
    
#     for file in dataset.iterdir():
#         if "download" in file.name:
#             file.unlink()
#         # print(file.name)
#         elif "results_" in file.name or "predictions_" in file.name:
#             if "averaged" in file.name:
#                 file.unlink()
#             else:
#                 # print(file.name)
#                 name_parts = file.name.split("__")
#                 if "concept_name" in name_parts[1]:
#                     name_parts[1] = f"('concept_name', '{concept_name_target}').csv"
#                 elif "time_timestamp" in name_parts[1]:
#                     name_parts[1] = f"('time_timestamp', '{time_timestamp_target}').csv"
#                 new_file = Path(name_parts[0]+ "__" + name_parts[1])
#                 new_file_path = dataset / new_file
                
#                 if not (new_file_path.exists() and new_file_path.is_file()):
#                     file.rename(new_file_path)
            
#             # print(new_filename)
#             # rename file

In [None]:
# Driver cell: walks <cwd>/<job>/<approach>/<dataset> and runs the averaging and plotting
# functions defined above on the first dataset of each approach.
current_dir = Path.cwd()  # Get the current directory

for job_idx, job in enumerate(current_dir.iterdir()):
    # NOTE(review): job_idx != 0 skips whichever entry iterdir() yields first, and that
    # order is arbitrary -- confirm what this was meant to exclude.
    if not job.is_dir() or job_idx == 0:
        continue
    if job != current_dir / "15":  # TODO: for testing
        continue

    print(f"########################## {job.name} ##########################")

    # iterdir() yields entries in arbitrary order, so sort to make the index-based
    # dispatch below deterministic.
    # NOTE(review): assumes the cross-validation folder sorts before the holdout folder
    # (e.g. 'cross_val' < 'holdout') -- confirm against the actual folder names.
    approaches = sorted(path for path in job.iterdir() if path.is_dir())
    if len(approaches) > 2:
        # Fail before any file is written rather than after two approaches were processed
        raise ValueError(f"Too many approach folders in {job}. Expected 2.")

    # cross_val and holdout
    for app_idx, approach in enumerate(approaches):
        datasets = sorted(path for path in approach.iterdir() if path.is_dir())

        # cross_val
        if app_idx == 0:
            print("############# cross_val #############")
            # loop over datasets in approach
            for dataset in datasets:
                print(f"----- {dataset.name} -----")
                # Average results over folds
                average_results_over_folds(dataset)
                # Plot histories
                plot_crossval_histories(dataset)
                break  # Only the first dataset is processed

        # holdout
        else:
            print("############# holdout #############")
            # loop over datasets in approach
            for dataset in datasets:
                print(f"----- {dataset.name} -----")
                # Print weighted mean values for holdout
                print_holdout_weighted_mean(dataset)
                # Plot history for holdout
                plot_holdout_history(dataset)
                break  # Only the first dataset is processed