In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def analyze_resume_predictions(csv_path, output_dir="analysis_outputs"):
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(csv_path)

    # --- 1. Average duration_seconds ---
    avg_duration_model = df.groupby("model")["duration_seconds"].mean().reset_index()
    avg_duration_prompt = df.groupby("prompt_path")["duration_seconds"].mean().reset_index()
    avg_duration_combined = df.groupby(["model", "prompt_path"])["duration_seconds"].mean().reset_index()

    # Save to CSV
    avg_duration_model.to_csv(os.path.join(output_dir, "avg_duration_by_model.csv"), index=False)
    avg_duration_prompt.to_csv(os.path.join(output_dir, "avg_duration_by_prompt_path.csv"), index=False)
    avg_duration_combined.to_csv(os.path.join(output_dir, "avg_duration_by_model_and_prompt.csv"), index=False)

    # Plotting helper
    def plot_bar(data, x, y, title, filename, hue=None):
        plt.figure(figsize=(10, 6))
        sns.barplot(data=data, x=x, y=y, hue=hue)
        plt.title(title)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    plot_bar(avg_duration_model, "model", "duration_seconds", "Avg Duration by Model", "avg_duration_model.png")
    plot_bar(avg_duration_prompt, "prompt_path", "duration_seconds", "Avg Duration by Prompt Path", "avg_duration_prompt_path.png")
    plot_bar(avg_duration_combined, "model", "duration_seconds", "Avg Duration by Model & Prompt", "avg_duration_combined.png", hue="prompt_path")

    # --- 2. Frequency counts for each categorical prediction column ---
    pred_columns = [
        "pred_factual_accuracy", "pred_alignment",
        "pred_section_length", "pred_grammar", "pred_justification"
    ]

    for col in pred_columns:
        # By model
        model_counts = df.groupby(["model", col]).size().reset_index(name="count")
        plot_bar(model_counts, col, "count", f"{col} by Model", f"{col}_by_model.png", hue="model")

        # By prompt_path
        prompt_counts = df.groupby(["prompt_path", col]).size().reset_index(name="count")
        plot_bar(prompt_counts, col, "count", f"{col} by Prompt Path", f"{col}_by_prompt_path.png", hue="prompt_path")

        # By model + prompt_path
        combined_counts = df.groupby(["model", "prompt_path", col]).size().reset_index(name="count")
        for (model_val, prompt_val), group in combined_counts.groupby(["model", "prompt_path"]):
            title = f"{col} for Model: {model_val}, Prompt: {prompt_val}"
            fname = f"{col}_model_{model_val}_prompt_{prompt_val}.png".replace("/", "_")
            plot_bar(group, col, "count", title, fname)

    print(f"Analysis complete. Outputs saved to {output_dir}")


In [3]:
csv_path = r"C:\Users\viren\Desktop\Rizzume\Code\rizzume_ml\experiment_results\prompt_model_variations\prompt_model_variations_05_13_2025_22_08_47.csv"
analyze_resume_predictions(csv_path)

KeyError: 'pred_justification'