In [None]:
import inspect
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from adjustText import adjust_text

import config

In [None]:
def load_and_preprocess(file_path, required_cols=None, cols_to_numeric=None):
    """
    Load an Excel/CSV file, drop rows missing required values, convert columns to numeric.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to the input file. Paths ending in .xlsx/.xlsm/.xls (any case) are
        read with ``pd.read_excel``; everything else is read with ``pd.read_csv``.
    required_cols : list of str, optional
        Rows with a missing value in any of these columns are dropped.
    cols_to_numeric : list of str, optional
        Columns converted with ``pd.to_numeric(errors='coerce')``. Values that
        cannot be parsed become NaN; those rows are kept, because the
        conversion runs after the required-column filter.

    Returns
    -------
    pandas.DataFrame
        The loaded and cleaned table.
    """
    # Only a truly empty cell counts as missing. pandas' default NA strings are
    # switched off so that a literal "NA" value survives as text, and "" is
    # re-registered so that blank cells still become NaN and can be dropped by
    # the required-column filter. The same rule applies to both file types.
    na_options = {"keep_default_na": False, "na_values": [""]}

    path_text = os.fspath(file_path)
    if path_text.lower().endswith((".xlsx", ".xlsm", ".xls")):
        df = pd.read_excel(file_path, **na_options)
    else:
        df = pd.read_csv(file_path, **na_options)

    if required_cols:
        df = df.dropna(subset=required_cols)
    if cols_to_numeric:
        for col in cols_to_numeric:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


In [None]:
# Columns in which a row must have a value to be kept by load_and_preprocess.
required_cols = [
    "Edit Distance (Avg)",
    "Emissions (kg CO2e)",
    "Agent Design",
    "Prompt Type",
    "Model Name",
    "Parsing Accuracy (%)",
    "LCS (Avg)",
    "Compute Time (s)",
    "CPU Energy (kWh)",
    "GPU Energy (kWh)",
    "RAM Energy (kWh)",
    "Energy Consumed (kWh)",
]

# Metric columns that load_and_preprocess converts to numeric dtype.
cols_to_numeric = [
    "Edit Distance (Avg)",
    "Emissions (kg CO2e)",
    "Parsing Accuracy (%)",
    "LCS (Avg)",
    "Compute Time (s)",
    "CPU Energy (kWh)",
    "GPU Energy (kWh)",
    "RAM Energy (kWh)",
    "Energy Consumed (kWh)",
]
#df = load_and_preprocess("experiments_complete.xlsx", required_cols, cols_to_numeric)

In [None]:
def process_and_average(file_path):
    """
    Load an emissions CSV, group runs by project name (timestamp suffix removed),
    and average the numeric metrics of each group.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the columns listed in ``columns`` below.
        A missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One row per ``group_key`` with the mean of duration, emissions,
        cpu_energy, gpu_energy, ram_energy and energy_consumed.
        Rows with no project name are excluded.
    """
    columns = [
        "timestamp", "project_name", "duration", "emissions",
        "cpu_energy", "gpu_energy", "ram_energy", "energy_consumed",
        "country_name", "country_iso_code", "os", "python_version",
        "codecarbon_version"
    ]
    numeric_cols = ["duration", "emissions", "cpu_energy", "gpu_energy", "ram_energy", "energy_consumed"]

    raw = pd.read_csv(file_path)
    # A run without a project name cannot be assigned to any group, so drop it
    # explicitly instead of failing inside the regex substitution.
    selected = raw[columns].dropna(subset=["project_name"])

    # Strip a trailing "_YYYYMMDD-HHMMSS" stamp so repeated runs share one key.
    # .assign returns a new frame, which avoids writing into a slice of `raw`.
    group_key = selected["project_name"].astype(str).str.replace(
        r'_[0-9]{8}-[0-9]{6}$', '', regex=True
    )
    keyed = selected.assign(group_key=group_key)

    grouped = keyed.groupby("group_key")[numeric_cols].mean().reset_index()
    return grouped

# result_df = process_and_average("emissions_filtered.csv")
# print(result_df)

In [None]:
# Scatterplot - Edit Distance vs Emissions
def plot_edit_distance_vs_emissions(df):
    """
    Scatterplot: Edit Distance vs Emissions, colored by Agent Design, shaped by Prompt Type,
    annotated by Model Name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Edit Distance (Avg)", "Emissions (kg CO2e)", "Agent Design",
        "Prompt Type" and "Model Name". Rows where either coordinate is NaN get
        no text label, since a label cannot be placed at a non-finite position.

    The figure is saved to config.PLOT_DIR as "edit_distance_vs_emissions.png"
    (dpi=300) and then shown.
    """
    x_col = "Edit Distance (Avg)"
    y_col = "Emissions (kg CO2e)"
    agent_palette = {
        'NA': 'gray', 'MA': 'darkorange', 'TA': 'teal', 'SA': 'mediumpurple', 'DA': 'deeppink'
    }
    plt.figure(figsize=(12, 8))
    sns.set(style="whitegrid", context="talk")
    ax = sns.scatterplot(
        data=df,
        x=x_col,
        y=y_col,
        hue="Agent Design",
        style="Prompt Type",
        palette=agent_palette,
        s=150,
        edgecolor="black",
        alpha=0.85
    )

    # Only annotate rows that have both coordinates; numeric coercion upstream
    # can leave NaN in these columns.
    labelled = df.dropna(subset=[x_col, y_col])
    texts = [
        ax.text(x_val, y_val, model_name, fontsize=10, ha='center', va='center', alpha=0.8)
        for x_val, y_val, model_name in zip(labelled[x_col], labelled[y_col], labelled["Model Name"])
    ]
    # Nothing to de-overlap when there are no labels.
    if texts:
        adjust_text(texts, ax=ax, only_move={'points':'y', 'text':'y'}, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    ax.set_title("Edit Distance vs Emissions by Agent Design and Prompt Type")
    ax.set_xlabel("Edit Distance (Avg) — Lower is Better")
    ax.set_ylabel("Emissions (kg CO₂e) — Lower is Better")
    plt.tight_layout()
    plt.savefig(os.path.join(config.PLOT_DIR, "edit_distance_vs_emissions.png"), dpi=300)
    plt.show()

# plot_edit_distance_vs_emissions(df)

In [None]:
# Barplot - Energy Consumption by Agent Design and Model
def plot_energy_by_agent_model(df):
    """
    Barplot: Energy Consumed (kWh) per Agent Design, one bar per Model Name (hue),
    drawn without error bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Agent Design", "Energy Consumed (kWh)" and "Model Name".

    The figure is saved to config.PLOT_DIR as "energy_by_agent_model.png"
    (dpi=300) and then shown.
    """
    # seaborn 0.12 introduced `errorbar` and deprecated `ci`; choose whichever
    # keyword the installed version declares so the call works on both without
    # a deprecation warning.
    if "errorbar" in inspect.signature(sns.barplot).parameters:
        no_error_bars = {"errorbar": None}
    else:
        no_error_bars = {"ci": None}

    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=df,
        x="Agent Design",
        y="Energy Consumed (kWh)",
        hue="Model Name",
        **no_error_bars
    )
    plt.title("Energy Consumption per Agent Design by Model")
    plt.ylabel("Energy Consumed (kWh)")
    plt.tight_layout()
    plt.savefig(os.path.join(config.PLOT_DIR, "energy_by_agent_model.png"), dpi=300)
    plt.show()

# plot_energy_by_agent_model(df)

In [None]:
# Stacked Barplot - Energy Components
def plot_stacked_energy_components(df):
    """
    Stacked barplot: summed CPU, GPU and RAM energy (kWh) per Agent Design.
    Saves the figure to config.PLOT_DIR as "stacked_energy_components.png" and shows it.
    """
    component_cols = ["CPU Energy (kWh)", "GPU Energy (kWh)", "RAM Energy (kWh)"]
    totals_by_design = df.groupby("Agent Design")[component_cols].sum()

    # DataFrame.plot creates the figure and hands back its Axes.
    ax = totals_by_design.plot(kind='bar', stacked=True, colormap="Paired", figsize=(10, 6))
    ax.set_title("Stacked Energy Use by Component (CPU/GPU/RAM) per Agent Design")
    ax.set_ylabel("Energy (kWh)")
    ax.set_xlabel("Agent Design")

    fig = ax.get_figure()
    fig.tight_layout()
    target = os.path.join(config.PLOT_DIR, "stacked_energy_components.png")
    fig.savefig(target, dpi=300)
    plt.show()

# plot_stacked_energy_components(df)

In [None]:
# Heatmaps
def plot_heatmap(df, value, title, filename, fmt=".2f", cmap="YlGnBu"):
    """
    Annotated heatmap of `value`, pivoted with Agent Design as rows and Prompt Type
    as columns (cells use pivot_table's default aggregation).

    `title` is the figure title, `filename` the output name inside config.PLOT_DIR,
    `fmt` the annotation format string and `cmap` the colormap.
    The figure is saved at dpi=300 and then shown.
    """
    pivoted = df.pivot_table(index="Agent Design", columns="Prompt Type", values=value)

    fig = plt.figure(figsize=(8, 6))
    ax = sns.heatmap(pivoted, annot=True, fmt=fmt, cmap=cmap, linewidths=0.5)
    ax.set_title(title)
    ax.set_ylabel("Agent Design")
    ax.set_xlabel("Prompt Type")

    fig.tight_layout()
    target = os.path.join(config.PLOT_DIR, filename)
    fig.savefig(target, dpi=300)
    plt.show()

# plot_heatmap(df, "Parsing Accuracy (%)", "Heatmap of Parsing Accuracy", "accuracy_heatmap.png", fmt=".1f")
# plot_heatmap(df, "Edit Distance (Avg)", "Heatmap of Edit Distance (Avg)", "edit_distance_heatmap.png", fmt=".2f", cmap="coolwarm")
# plot_heatmap(df, "Emissions (kg CO2e)", "Heatmap of Emissions", "emissions_heatmap.png", fmt=".2e", cmap="OrRd")

In [None]:
# Boxplot - Parsing Accuracy by Agent Design
def plot_parsing_accuracy_boxplot(df):
    """
    Boxplot: Parsing Accuracy (%) per Agent Design, split by Prompt Type (hue).
    Saves the figure to config.PLOT_DIR as "accuracy_by_agent_prompt.png" and shows it.
    """
    fig = plt.figure(figsize=(10, 6))
    box_options = dict(
        x="Agent Design",
        y="Parsing Accuracy (%)",
        hue="Prompt Type",
        palette="pastel",
    )
    ax = sns.boxplot(data=df, **box_options)
    ax.set_title("Parsing Accuracy by Agent Design and Prompt Type")
    ax.set_ylabel("Parsing Accuracy (%)")

    fig.tight_layout()
    target = os.path.join(config.PLOT_DIR, "accuracy_by_agent_prompt.png")
    fig.savefig(target, dpi=300)
    plt.show()

#plot_parsing_accuracy_boxplot(df)

In [None]:
# Scatterplot - LCS vs Compute Time
def plot_lcs_vs_compute_time(df):
    """
    Scatterplot: LCS (Avg) on x against Compute Time (s) on y, colored by Agent Design
    and shaped by Prompt Type.
    Saves the figure to config.PLOT_DIR as "lcs_vs_compute_time.png" and shows it.
    """
    x_col, y_col = "LCS (Avg)", "Compute Time (s)"
    marker_options = dict(s=100, edgecolor="black", alpha=0.85)

    fig = plt.figure(figsize=(8, 6))
    ax = sns.scatterplot(
        data=df,
        x=x_col,
        y=y_col,
        hue="Agent Design",
        style="Prompt Type",
        **marker_options
    )
    ax.set_title("LCS vs Compute Time")
    ax.set_xlabel("LCS (Avg)")
    ax.set_ylabel("Compute Time (s)")

    fig.tight_layout()
    target = os.path.join(config.PLOT_DIR, "lcs_vs_compute_time.png")
    fig.savefig(target, dpi=300)
    plt.show()

#plot_lcs_vs_compute_time(df)

In [None]:
def plot_metrics_boxplot(design, results_dir=config.RESULT_DIR, plots_dir=config.PLOT_DIR):
    """
    Boxplot of the "Edit Distance" and "LCS Length" columns for one design.

    Reads "<design>_per_line_metrics.csv" from `results_dir`, writes
    "<design>_boxplot.png" to `plots_dir`, closes the figure without showing it,
    and prints the path of the saved file.
    """
    metric_cols = ["Edit Distance", "LCS Length"]
    source_csv = os.path.join(results_dir, f"{design}_per_line_metrics.csv")
    save_path = os.path.join(plots_dir, f"{design}_boxplot.png")

    per_line = pd.read_csv(source_csv)

    fig = plt.figure(figsize=(10, 6))
    ax = sns.boxplot(data=per_line[metric_cols])
    ax.set_title(f"{design}: Distribution of Edit Distance and LCS Length")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(save_path)
    plt.close()
    print(f"Boxplot saved to: {save_path}")