# Sweep analysis

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.constants import METADATA_COLUMNS, METRIC_COLUMNS

plt.style.use("seaborn-v0_8-dark")
sns.set_style("whitegrid", {"grid.color": ".7", "grid.linestyle": ":"})
pd.set_option("display.max_rows", 100)

# Dataset Overview

The output displays the DataFrame `df`, which contains all **successful runs** from the dataset. This DataFrame includes:

- **run_name**: A unique identifier for each run, generated based on the combination of feature values.
- **Features**: A list of features with more than one unique value, used for analysis.
- **Performance Metrics**:
  - `avg_fps`: Average frames per second achieved during the run.
  - `std_fps`: Standard deviation of the frames per second, indicating performance consistency.
  - `avg_loss`: Average loss achieved during the run.
  - `std_loss`: Standard deviation of the loss, indicating performance consistency.
  - `avg_loss_tail`: Average loss achieved over the last iterations of the run.
  - `std_loss_tail`: Standard deviation of the loss over the last iterations of the run, indicating performance consistency.

Additionally, the following information is printed:

- **Features**: A list of all features considered for analysis.
- **Binary Features**: A subset of features that have exactly two unique values.
- **Run Counts**:
  - **All runs**: Total number of runs in the dataset.
  - **Successful runs**: Number of runs where `avg_fps` is not zero.
  - **Failed runs**: Number of runs where `avg_fps` is zero (due to `error`, `oom`, `oom-skipped`).


In [None]:
# Read the sweep summary CSV into a raw DataFrame
df_path = "outputs/runs/sweep_000/runs_summary.csv"
df_raw = pd.read_csv(df_path)

# Partition the raw rows by the value of their "status" column
failure_statuses = ["error", "oom", "oom-skipped"]
is_failed = df_raw["status"].isin(failure_statuses)
is_success = df_raw["status"].isin(["success"])
df_failed = df_raw.loc[is_failed].copy()
df = df_raw.loc[is_success].copy()

# Metric and metadata columns are never treated as features
excluded_columns = [*METRIC_COLUMNS, *METADATA_COLUMNS]

# Every remaining column is a candidate feature; keep only the ones that
# take more than one distinct value across the successful runs
all_columns = list(df.columns)
features = [
    col
    for col in all_columns
    if col not in excluded_columns and df[col].nunique() > 1
]


# Create df IDs
def create_run_id(row):
    """Build a compact identifier from a row of feature values.

    Each column name is abbreviated to the lowercase initials of its
    ``_``- or ``.``-separated words and is immediately followed by the
    value, e.g. ``train_batch_size=8`` becomes ``tbs8``. The per-column
    tokens are joined with ``_`` in the row's iteration order.

    Args:
        row: Object exposing ``items()`` that yields ``(column, value)``
            pairs, such as a ``pandas.Series`` or a ``dict``.

    Returns:
        The identifier string; an empty string when ``row`` has no items.
    """

    def abbreviate(column):
        # Leading, trailing or doubled separators produce empty segments;
        # they are skipped because they have no first character to take.
        words = str(column).replace(".", "_").split("_")
        return "".join(word[0].lower() for word in words if word)

    return "_".join(f"{abbreviate(col)}{val}" for col, val in row.items())


# Tag every successful run with an ID derived from its feature values
df.loc[:, "run_name"] = df[features].apply(create_run_id, axis=1)

# Feature subsets used by the plots below: everything except the batch
# size, and the features that take exactly two distinct values
no_bs_features = [name for name in features if name != "train_batch_size"]
binary_features = [name for name in features if df[name].nunique() == 2]

print(f"Features: {features}")
print(f"Binary features: {binary_features}")

# Fastest runs first
df_sorted = df.sort_values(by="avg_fps", ascending=False)

# Summary table columns: run ID, then the features, then every metric column
display_columns = ["run_name", *features, *METRIC_COLUMNS]

print(f"All runs: {len(df_raw)}")
print(f"Successful runs: {len(df)}")
print(f"Failed runs: {len(df_failed)}")

# Last expression of the cell: rendered as the cell output
df_sorted[display_columns]

In [None]:
# Extract unique values from DataFrame for consistent plotting
backend_col = "dynamo_config.dynamo_backend"
backend_order = sorted(df[backend_col].unique())
precision_order = sorted(df["mixed_precision"].unique())

# Define colors using a colormap based on number of precision types
colors = plt.cm.Set2(np.linspace(0, 1, len(precision_order)))
color_dict_precision = dict(zip(precision_order, colors))

# Select the highest-avg_fps run of every (backend, precision) pair once, so
# the bar chart and the printed report are guaranteed to describe the same runs
top_runs = {}
for backend in backend_order:
    backend_data = df[df[backend_col] == backend]
    for precision in precision_order:
        precision_data = backend_data[backend_data["mixed_precision"] == precision]
        if not precision_data.empty:
            top_runs[(backend, precision)] = precision_data.nlargest(
                1, "avg_fps"
            ).iloc[0]

# Create figure
plt.figure(figsize=(15, 8))

# Bars are grouped by backend; combinations without any run get no bar
x_positions = []
x_labels = []
current_x = 0

for backend in backend_order:
    for precision in precision_order:
        top_run = top_runs.get((backend, precision))
        if top_run is None:
            continue

        # Plot bar with the run's own FPS standard deviation as error bar
        plt.bar(
            current_x,
            top_run["avg_fps"],
            color=color_dict_precision[precision],
            yerr=top_run["std_fps"],
            capsize=5,
        )

        # Add value label on top of bar
        plt.text(
            current_x,
            top_run["avg_fps"],
            f'{top_run["avg_fps"]:.2f}\n(bs={top_run["train_batch_size"]})',
            ha="center",
            va="bottom",
        )

        x_positions.append(current_x)
        x_labels.append(f"{backend}\n{precision}")
        current_x += 1

    # Add space between backend groups
    current_x += 0.5

# Customize plot
plt.xticks(x_positions, x_labels, rotation=45, ha="right")
plt.ylabel("FPS")
plt.title("Top Performing Configurations by Backend and Precision")

# Add legend for precision types
legend_elements = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color_dict_precision[prec], label=prec)
    for prec in precision_order
]
plt.legend(handles=legend_elements, title="Precision")

plt.grid(True, linestyle=":", alpha=0.7)
plt.tight_layout()
plt.show()

# Print detailed information about top configurations
print("\nTop configurations:")
for backend in backend_order:
    print(f"\nBackend: {backend}")

    for precision in precision_order:
        top_run = top_runs.get((backend, precision))
        if top_run is None:
            continue
        print(f"\n{precision}:")
        # "\u00b1" is the plus-minus sign, written as an escape so the source
        # stays ASCII and cannot be mis-decoded again
        print(f"  FPS: {top_run['avg_fps']:.2f} \u00b1 {top_run['std_fps']:.2f}")
        print(f"  Batch size: {top_run['train_batch_size']}")
        print(f"  Run name: {top_run['run_name']}")

## FPS vs train_batch_size: Overview

In this section, we'll examine the relationship between FPS and train_batch_size, while also considering the influence of the dynamo backend (`dynamo_config.dynamo_backend`) and the mixed-precision mode (`mixed_precision`). This visualization aims to provide insights into how these factors interact and affect performance.

In [None]:
# Overview scatter of all successful runs: batch size on x, average FPS on y.
# Marker color encodes the dynamo backend, marker shape the mixed-precision mode.
plt.figure(figsize=(20, 6))
scatter = sns.scatterplot(
    x="train_batch_size",
    y="avg_fps",
    hue="dynamo_config.dynamo_backend",
    style="mixed_precision",
    data=df,
    palette="Set2",
    s=100,
)

# Title and legend name the columns actually mapped to hue and style above
scatter.set_title(
    "FPS vs train_batch_size, Colored by dynamo_backend and Styled by mixed_precision"
)
scatter.set_xlabel("train_batch_size")
scatter.set_ylabel("FPS")

plt.legend(loc="best", title="dynamo_backend and mixed_precision")

plt.show()

## FPS vs train_batch_size: Scatter feature-wise Analysis

This section presents an examination of how different features affect the relationship between FPS and train_batch_size. We've created a grid of scatter plots, each focusing on a specific feature's impact on performance.

Key aspects of this visualization:
- Grid Layout: The plots are arranged in a grid, with each subplot dedicated to a different feature.
- Error Bars: Each data point includes error bars representing the standard deviation of FPS, giving insight into the variability of performance **within the same run**.
- Color Coding: Different values of each feature are represented by distinct colors, allowing for easy comparison within each plot.

Axes:
- X-axis: train_batch_size
- Y-axis: Average FPS

This view allows us to:
- Identify how each feature independently affects the FPS-train_batch_size relationship
- Spot any features that have a more pronounced impact on performance
- Detect potential interactions between features and train_batch_size
- Observe the variability in performance for different feature values and train_batch_sizes


In [None]:
# Two plots per row: an odd number of features needs one extra,
# half-filled row
n_features = len(no_bs_features)
full_rows, leftover = divmod(n_features, 2)
n_rows = full_rows + leftover

# Build the n_rows x 2 grid of axes
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=2,
    figsize=(20, 8 * n_rows),
    constrained_layout=True,
)
fig.suptitle("FPS vs train_batch_size, Faceted by Features", fontsize=16, y=1.02)

# Work with a flat sequence of axes from here on
axes = axes.flatten()


# Custom function to plot scatter points with error bars
def scatter_with_errorbars(data, x, y, yerr, ax, hue, **kwargs):
    """Draw one error-bar series on ``ax`` per distinct value of ``data[hue]``.

    Args:
        data: DataFrame holding the columns named by ``x``, ``y``, ``yerr``
            and ``hue``.
        x: Name of the column plotted on the x-axis.
        y: Name of the column plotted on the y-axis.
        yerr: Name of the column used as the vertical error-bar size.
        ax: Object whose ``errorbar`` method does the drawing.
        hue: Name of the column whose distinct values define the series.
        **kwargs: Extra keyword arguments forwarded to ``ax.errorbar``.
    """
    # Series are drawn in order of first appearance of each hue value
    for level in data[hue].unique():
        rows_at_level = data[hue] == level
        subset = data.loc[rows_at_level]
        ax.errorbar(
            subset[x],
            subset[y],
            yerr=subset[yerr],
            fmt="o",  # markers only, no connecting line
            capsize=3,
            label=level,
            **kwargs,
        )


# One subplot per non-batch-size feature; zip() pairs each feature with the
# next axis and stops as soon as either sequence runs out
for ax, feature in zip(axes, no_bs_features):
    scatter_with_errorbars(
        df,
        x="train_batch_size",
        y="avg_fps",
        yerr="std_fps",
        ax=ax,
        hue=feature,
    )
    ax.set_title(f"FPS vs train_batch_size, by {feature}")
    ax.set_xlabel("train_batch_size")
    ax.set_ylabel("avg_fps")
    ax.legend(title=feature)
    ax.grid(True, linestyle=":", alpha=0.7)

# Grid cells beyond the last feature received no plot: drop them
for spare_ax in axes[len(no_bs_features):]:
    fig.delaxes(spare_ax)

plt.show()

## FPS vs train_batch_size: Averaged feature-wise Analysis

This section presents an examination of how different features affect the relationship between FPS and train_batch_size. We've created a grid of point plots, each focusing on a specific feature's impact on performance.

Key aspects of this visualization:
- Grid Layout: The plots are arranged in a grid, with each subplot dedicated to a different feature.
- Error Bars: Each data point includes error bars representing the standard deviation of FPS, giving insight into the variability of performance **between multiple runs**.
- Color Coding: Different values of each feature are represented by distinct colors, allowing for easy comparison within each plot.

Axes:
- X-axis: train_batch_size
- Y-axis: Average FPS

This view allows us to:
- Identify how each feature independently affects the FPS-train_batch_size relationship
- Spot any features that have a more pronounced impact on performance
- Detect potential interactions between features and train_batch_size
- Observe the variability in performance for different feature values and train_batch_sizes


In [None]:
# Lay the plots out two per row. The figure height grows with the number of
# rows, so the grid must be n_rows x 2 (not 2 x n_rows) for the per-plot
# size to stay constant as features are added.
n_rows = len(no_bs_features) // 2 + len(no_bs_features) % 2
fig, axes = plt.subplots(
    n_rows,
    2,
    figsize=(30, 10 * n_rows),
    constrained_layout=True,
)
fig.suptitle(
    "FPS over train_batch_size",
    fontsize=20,
    y=1.02,
)

# Flatten the axes array for easy iteration
axes = axes.flatten()

# One point plot per non-batch-size feature; errorbar="sd" draws the standard
# deviation of avg_fps over the runs sharing a batch size and feature value
for ax, feature in zip(axes, no_bs_features):
    sns.pointplot(
        x="train_batch_size",
        y="avg_fps",
        hue=feature,
        data=df,
        errorbar="sd",
        dodge=True,
        capsize=0.1,
        ax=ax,
    )

    ax.set_title(f"Effect of {feature} on FPS", fontsize=16)
    ax.set_xlabel("train_batch_size", fontsize=14)
    ax.set_ylabel("avg_fps", fontsize=14)
    ax.legend(title=feature, loc="lower right", fontsize=12, title_fontsize=14)
    ax.grid(True, linestyle=":", alpha=0.7)

    # Get current tick positions and labels
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()

    # Set ticks first, then labels with rotation
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45, ha="right")

# Remove the empty cell left over when the number of features is odd
for spare_ax in axes[len(no_bs_features):]:
    fig.delaxes(spare_ax)

# Show the plot
plt.show()

## FPS distribution by feature: violin plots

This visualization presents an examination of how different features affect both the average and standard deviation of FPS. The plot is organized as a grid of violin plots and stripplots, providing a multi-faceted view of the data.
The plots are arranged in a 2 x N grid, where N is the number of features. The top row shows the effect on average FPS, while the bottom row shows the effect on standard deviation of FPS.

Dual Plot Type: Each cell in the grid contains two plot types overlaid:
- A violin showing the distribution of FPS values
- A stripplot showing individual data points

Axes:
- X-axis: Different categories of each feature
- Y-axis: FPS values (Average or Standard Deviation)

Violin Plot Details:
- The violin plots are cut at the extremes of the data
- Inner quartiles are displayed within the violin plots

This visualization allows us to:
- Compare the distribution of FPS across different categories within each feature
- Identify features that have a significant impact on both average FPS and its variability
- Spot outliers and understand the spread of data points within each category
- Observe potential patterns or trends across features

By presenting both the average and standard deviation of FPS, this plot provides insights into the performance distribution across different feature categories. This can be interesting for identifying configurations that offer both high and stable FPS.

In [None]:
# 2 x N grid: top row shows avg_fps, bottom row std_fps, one column per feature.
# squeeze=False keeps `axes` two-dimensional even when there is a single
# feature, so axes[row, col] indexing below always works.
fig, axes = plt.subplots(
    2,
    len(no_bs_features),
    figsize=(10 * len(no_bs_features), 10),
    sharex="col",
    squeeze=False,
)

for col, feature in enumerate(no_bs_features):
    for row, metric in enumerate(("avg_fps", "std_fps")):
        ax = axes[row, col]

        # Distribution of the metric per feature value
        sns.violinplot(
            x=feature,
            y=metric,
            data=df,
            ax=ax,
            cut=0,
            inner="quartile",
            hue=feature,
            legend=False,
        )
        # Individual runs overlaid on the violins
        sns.stripplot(
            x=feature,
            y=metric,
            data=df,
            color="black",
            alpha=0.7,
            ax=ax,
            jitter=True,
        )
        ax.set_title(f"Effect of {feature} on {metric}")
        if row == 1:
            # Only the bottom row gets an explicit x label
            ax.set_xlabel(feature)
        ax.set_ylabel(metric)
        ax.tick_params(axis="x", rotation=45)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

## Heatmap: Average FPS Across Features vs train_batch_size

This visualization presents a series of heatmaps examining the relationship between various features, train_batch_size, and average frames per second (FPS). The plot is organized as a grid of individual heatmaps, providing a comprehensive view of how different factors interact to affect performance.
Key aspects of this visualization:
The heatmaps are arranged in a grid, with each cell representing a different feature's interaction with train_batch_size.

Heatmap Structure:
- X-axis: Different categories or values of each feature
- Y-axis: train_batch_size
- Color scale: Represents the average FPS, with a "coolwarm" color scheme

Data Representation:
- Each cell in the heatmap shows the average FPS for a specific combination of feature value and train_batch_size
- Actual FPS values are annotated within each cell for precise reading

In [None]:
# Two heatmaps per row; divmod gives (full rows, leftover plot) and their
# sum is the row count rounded up
grid_rows = sum(divmod(len(no_bs_features), 2))
fig, axes = plt.subplots(grid_rows, 2, figsize=(20, 8 * grid_rows))
fig.suptitle("Heatmaps of avg_fps Across no_bs_features", fontsize=16, y=1.02)

# Work with a flat sequence of axes from here on
axes = axes.flatten()

# One heatmap per feature: rows are batch sizes, columns are feature values,
# and each cell holds the pivot_table aggregate (mean by default) of avg_fps
for ax, feature in zip(axes, no_bs_features):
    heatmap_data = df.pivot_table(
        index="train_batch_size", columns=feature, values="avg_fps"
    )

    sns.heatmap(heatmap_data, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)

    ax.set_title(f"avg_fps: {feature} vs train_batch_size")
    ax.set_xlabel(feature)
    ax.set_ylabel("train_batch_size")
    ax.tick_params(axis="x", rotation=45)

# Grid cells beyond the last feature received no heatmap: drop them
for spare_ax in axes[len(no_bs_features):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

# Impact of Binary Features on Performance Metrics

The plots illustrate how toggling each binary feature affects system performance, specifically focusing on the average frames per second (avg_fps) and the standard deviation of fps (std_fps).
Here's what's being visualized:

- **Comparison Between Feature States**: For each binary feature, the plots compare the two possible states (e.g., 0 and 1) while keeping all other feature settings constant.

- **Top Plot (avg_fps Difference)**:
  - **Y-Axis**: Displays the difference in avg_fps between the two states of the binary feature for each configuration.
  - **X-Axis**: Each bar represents a unique configuration of the other features (denoted by `custom_id`).
  - **Interpretation**: Positive bars indicate that avg_fps is higher for the second (sorted) value of the feature than for the first, while negative bars indicate that it is lower.

- **Bottom Plot (std_fps Difference)**:
  - **Y-Axis**: Shows the difference in std_fps between the two feature states for each configuration.
  - **X-Axis**: Mirrors the top plot for direct comparison.
  - **Interpretation**: Highlights how the feature toggle affects the consistency of performance.

These plots help visualize how each binary feature influences performance metrics across different configurations, making it easier to identify features that have significant positive or negative effects on system performance.

TODO: run the comparison on execution time (1/FPS) rather than FPS, for a more coherent reading.

In [None]:
def analyze_binary_feature(df, feature, features):
    """Compare runs that differ only in the value of one binary feature.

    Runs are grouped by an ID built from all features except ``feature``.
    Groups containing both values of ``feature`` are pivoted so the two
    values sit side by side, and the per-group differences of ``avg_fps``
    and ``std_fps`` are computed as (second sorted value - first sorted value).

    The input DataFrame is not modified.

    Args:
        df: DataFrame of runs with the columns in ``features`` plus
            ``avg_fps`` and ``std_fps``.
        feature: Name of the feature to analyze.
        features: Names of all features; every one except ``feature`` is
            used to identify a configuration.

    Returns:
        ``None`` when no configuration has runs for both values of
        ``feature``. Otherwise a tuple ``(df_pivot, unique_values)`` where
        ``df_pivot`` has one row per configuration with the columns
        ``custom_id``, ``avg_fps_<value>``, ``std_fps_<value>``,
        ``avg_fps_diff`` and ``std_fps_diff``, and ``unique_values`` is the
        sorted list of values of ``feature`` among the matched runs.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when a configuration has
            more than one run for the same value of ``feature``.
    """
    # Print DataFrame info for debugging
    print(f"Analyzing feature: {feature}")
    print(f"Feature unique values: {df[feature].unique()}")

    # Create a custom ID excluding the current feature
    other_features = [f for f in features if f != feature]
    if other_features:
        custom_id = df[other_features].apply(create_run_id, axis=1)
    else:
        # No other feature to build an ID from: all runs belong to one
        # configuration, identified by the empty ID
        custom_id = pd.Series("", index=df.index)

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # a "custom_id" column as a side effect
    runs = df.assign(custom_id=custom_id)

    # Find configurations that were run with both states of the feature
    states_per_id = runs.groupby("custom_id")[feature].nunique()
    matching_ids = states_per_id.index[states_per_id == 2]

    # Filter the dataframe for matching runs
    df_matching = runs[runs["custom_id"].isin(matching_ids)]

    if df_matching.empty:
        print(f"No matching data for feature: {feature}")
        return None

    # Pivot the data for comparison, then flatten the (metric, value) column
    # pairs into "<metric>_<value>" names
    df_pivot = df_matching.pivot(
        index="custom_id", columns=feature, values=["avg_fps", "std_fps"]
    )
    df_pivot.columns = [f"{metric}_{value}" for metric, value in df_pivot.columns]
    df_pivot = df_pivot.reset_index()

    # Calculate differences
    unique_values = sorted(df_matching[feature].unique())
    if len(unique_values) < 2:
        print(f"Not enough unique values for feature: {feature}")
        return None

    low, high = unique_values[0], unique_values[1]
    for metric in ("avg_fps", "std_fps"):
        df_pivot[f"{metric}_diff"] = (
            df_pivot[f"{metric}_{high}"] - df_pivot[f"{metric}_{low}"]
        )

    return df_pivot, unique_values


# Analyze each binary feature
for feature in binary_features:
    result = analyze_binary_feature(df, feature, features)

    # None means no configuration was run with both values of the feature
    if result is None:
        continue

    df_pivot, unique_values = result

    # reset_index() adds a positional "index" column, which gives every
    # configuration its own bar slot; computed once and shared by both plots
    plot_data = df_pivot.reset_index()

    # Create plots: avg_fps difference on top, std_fps difference below
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(25, 12))

    for ax, metric, palette in (
        (ax1, "avg_fps", "Blues_d"),
        (ax2, "std_fps", "Greens_d"),
    ):
        sns.barplot(
            x="index",
            y=f"{metric}_diff",
            data=plot_data,
            hue="index",
            palette=palette,
            legend=False,
            ax=ax,
        )
        ax.set_title(
            f"Difference in {metric}\n{feature}: {unique_values[1]} vs {unique_values[0]}"
        )
        ax.set_ylabel(f"{metric} Difference")
        # Zero line separates increases from decreases
        ax.axhline(0, color="red", linestyle="--")

    # The top plot shares its x positions with the bottom one, so its own
    # tick marks and labels are hidden
    ax1.tick_params(axis="x", which="both", bottom=False, labelbottom=False)

    # Label the bottom x-axis with the configuration IDs
    x_ticks = np.arange(len(df_pivot))
    ax2.set_xticks(x_ticks)
    ax2.set_xticklabels(df_pivot["custom_id"], rotation=90, ha="right")

    plt.tight_layout()
    plt.show()

    print(f"\nAnalyzing feature: {feature}")
    print(f"Unique values: {df[feature].unique()}")
    print(f"Value counts:\n{df[feature].value_counts()}")
    print(f"Number of matching run_names: {len(df_pivot)}")

# Heatmaps of Failed Runs Across Features

The plots display heatmaps illustrating the number of failed runs across different configurations. Each heatmap corresponds to a specific feature (excluding "train_batch_size"), with:

- **X-Axis**: Different values of the feature under consideration.
- **Y-Axis**: Various "train_batch_size" settings.
- **Color Intensity**: Represents the count of failed runs for each combination of "train_batch_size" and the feature value.

These visualizations help identify patterns and correlations between feature settings, train_batch_sizes, and the frequency of failed runs, highlighting configurations that may lead to higher failure rates.

In [None]:
# A sweep without failures has nothing to count: an empty table cannot be
# drawn as a heatmap, so report it instead of plotting
if df_failed.empty:
    print("No failed runs to plot.")
else:
    # Create a figure with subplots (2 columns)
    n_rows = len(no_bs_features) // 2 + len(no_bs_features) % 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(20, 8 * n_rows))
    fig.suptitle("Heatmaps of Failed Runs Across Features", fontsize=16, y=1.02)

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    # One heatmap per feature: rows are batch sizes, columns are feature
    # values, each cell is the number of failed runs for that combination
    for ax, feature in zip(axes, no_bs_features):
        heatmap_data = df_failed.pivot_table(
            index="train_batch_size", columns=feature, aggfunc="size", fill_value=0
        )

        sns.heatmap(heatmap_data, annot=True, cmap="YlOrRd", fmt="d", ax=ax)

        ax.set_title(f"Failed Runs: {feature} vs train_batch_size")
        ax.set_xlabel(feature)
        ax.set_ylabel("train_batch_size")
        ax.tick_params(axis="x", rotation=45)

    # Remove any unused subplots
    for spare_ax in axes[len(no_bs_features):]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()

# Distribution of Failed Runs Across Features

The plots display histograms illustrating how failed runs are distributed across different feature values. Each subplot corresponds to a specific feature, showing the count of failed runs associated with each value or range of that feature.

- **X-Axis**: Represents the values or categories of the feature being analyzed.
- **Y-Axis**: Indicates the number of failed runs for each feature value.

These visualizations help identify patterns and trends in the failure rates related to specific feature settings, making it easier to pinpoint features that may contribute to higher failure occurrences.

In [None]:
# Two histograms per row; an odd number of features needs one extra,
# half-filled row
full_rows, leftover = divmod(len(features), 2)
hist_rows = full_rows + leftover
fig, axes = plt.subplots(hist_rows, 2, figsize=(20, 6 * hist_rows))
fig.suptitle("Distribution of Failed Runs Across Features", fontsize=16, y=1.02)

# Work with a flat sequence of axes from here on
axes = axes.flatten()

# One histogram of the failed runs per feature (batch size included)
for ax, feature in zip(axes, features):
    sns.histplot(df_failed[feature], bins=10, kde=False, ax=ax)

    ax.set_title(f"Failed Runs: {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count of Failed Runs")
    ax.tick_params(axis="x", rotation=45)

# Grid cells beyond the last feature received no histogram: drop them
for spare_ax in axes[len(features):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

# Export notebook

In [None]:
output_dir = os.path.dirname(df_path)
notebook_name = 'analysis_notebook.ipynb'
output_format = 'html'

# If you want to set a custom output filename, e.g. analysis_notebook.html:
output_file = os.path.join(output_dir, "analysis_notebook.html")

# Export the notebook to HTML
!jupyter nbconvert --to {output_format} --no-input --output {output_file} {notebook_name}

print(f"Successfully exported notebook to {output_file}")