In [None]:
import os

# Run the notebook from the repository root so the relative paths used by the
# later cells ("outputs/...", "misc/...") resolve correctly.
# Set the SVG_GLYPH_LLM_ROOT environment variable to use a checkout that lives
# elsewhere; when it is unset the original location is used.
os.chdir(os.environ.get("SVG_GLYPH_LLM_ROOT", "/home/vecglypher/codes/svg_glyph_llm/"))
os.getcwd()

In [None]:
#!/usr/bin/env python3
"""
Script to gather metrics from score_stats JSON files and create a pandas DataFrame.
"""

import glob
import json
from pathlib import Path

import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns


def gather_metrics(
    pattern="outputs/250813-alphanumeric/*/*/infer_ocr_eval/score_stats-use_case_*.json",
):
    """
    Gather metrics from all score_stats-use_case_{False,True}.json files
    and return a pandas DataFrame.

    Every matched file contributes one row made of ``exp_name``, ``job_name``,
    ``file_path`` and all top-level fields of the JSON document. Files that
    cannot be processed (unreadable JSON, a job directory whose name is not an
    integer after stripping ``checkpoint-``, ...) are reported and skipped.

    Args:
        pattern: Glob pattern of the files to load. The experiment and job
            names are taken from the 4th- and 3rd-from-last path components,
            so a custom pattern must keep the
            ``<exp_name>/<job_name>/<dir>/<file>`` tail.

    Returns:
        A DataFrame sorted by ``job_name`` whose leading columns are
        ``exp_name`` and ``use_case`` (the latter only when the JSON files
        provide it). An empty DataFrame is returned when no file matches or
        no file could be processed.
    """
    # glob returns matches in arbitrary order; sort so that the log output and
    # the row order of the result are reproducible.
    files = sorted(glob.glob(pattern))

    if not files:
        print(f"No files found matching pattern: {pattern}")
        return pd.DataFrame()

    print(f"Found {len(files)} files")

    # List to store all rows
    rows = []

    for file_path in files:
        try:
            # Path structure: .../exp_name/checkpoint/infer_ocr_eval/score_stats-use_case_*.json
            path_parts = Path(file_path).parts

            # [NOTE] change here to match your path structure
            exp_name = path_parts[-4]  # 4th from last: experiment directory
            job_name = path_parts[-3]  # 3rd from last: checkpoint directory

            # [NOTE] change here to trim unused prefix
            exp_name = exp_name.replace("250813-alphanumeric-", "")
            exp_name = exp_name.replace("train_fonts-", "")
            exp_name = exp_name.replace("-full_sft", "")
            job_name = job_name.replace("checkpoint-", "")

            # [NOTE] change here to match type
            exp_name = str(exp_name)
            job_name = int(job_name)  # numeric so that sorting follows the step count

            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # The JSON fields are unpacked last, so a field named like one of
            # the explicit keys takes precedence over the path-derived value.
            row = {
                "exp_name": exp_name,
                "job_name": job_name,
                "file_path": file_path,
                **data,
            }

            rows.append(row)
            print(f"Processed: {file_path}")

        except Exception as e:
            # Best effort: one broken file must not abort the whole summary.
            print(f"Error processing {file_path}: {e}")
            continue

    if not rows:
        print("No valid data found")
        return pd.DataFrame()

    # Create DataFrame; a stable sort keeps the (sorted) file order for rows
    # that share the same job_name.
    df = pd.DataFrame(rows)
    df = df.sort_values("job_name", kind="stable")

    # Put exp_name and use_case first. use_case comes from the JSON payload,
    # so only move it when it is actually there instead of raising KeyError.
    leading = [col for col in ("exp_name", "use_case") if col in df.columns]
    if "use_case" not in df.columns:
        print("Warning: no 'use_case' field found in the loaded files")
    cols = leading + [col for col in df.columns if col not in leading]
    df = df[cols]

    print(f"\nCreated DataFrame with {len(df)} rows and {len(df.columns)} columns")
    print(f"Columns: {list(df.columns)}")

    return df


def _resolve_ylim(df, ylim):
    """Return the y-limits for both panels: ``ylim`` itself, or limits derived from ``df["accuracy"]`` when ``ylim`` is None."""
    if ylim is not None:
        return tuple(ylim)

    accuracy = df["accuracy"].dropna()
    if accuracy.empty:
        # Nothing to derive limits from; let matplotlib autoscale.
        return None

    y_min = accuracy.min()
    y_max = accuracy.max()
    # 5% padding around the data; a fixed margin when all values are equal so
    # the axis does not collapse to zero height.
    padding = (y_max - y_min) * 0.05 or 0.01
    return (y_min - padding, y_max + padding)


def _plot_accuracy_panel(ax, panel_df, title, marker, legend_title, ylim):
    """Draw one accuracy-vs-steps line per experiment on ``ax``; ``ax`` is left untouched when ``panel_df`` is empty."""
    if panel_df.empty:
        return

    for exp_name in panel_df["exp_name"].unique():
        exp_data = panel_df[panel_df["exp_name"] == exp_name]
        ax.plot(
            exp_data["job_name"],
            exp_data["accuracy"],
            marker=marker,
            label=exp_name,
            linewidth=2,
            markersize=6,
        )
        print(f"exp_name: {exp_name}, {exp_data}")

    ax.set_title(title, fontsize=14, fontweight="bold")
    ax.set_xlabel("Steps", fontsize=12)
    ax.set_ylabel("Accuracy", fontsize=12)
    # Legend is anchored just outside the upper-right corner of the axes.
    ax.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc="upper left")
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis="x", rotation=45)
    if ylim is not None:
        ax.set_ylim(ylim)


def create_plots(df, output_dir, ylim=(0.6, 0.85)):
    """Create two plots: one for use_case=True and one for use_case=False.

    The figure has two side-by-side panels (left: ``use_case == True``,
    right: ``use_case == False``), each with one line per ``exp_name`` showing
    ``accuracy`` against ``job_name``. It is saved as ``accuracy_plots.png``
    inside ``output_dir`` and then shown.

    Args:
        df: DataFrame with the columns ``exp_name``, ``job_name``,
            ``use_case`` and ``accuracy``. Nothing is plotted or saved when it
            is empty.
        output_dir: Directory for the PNG, as a ``str`` or ``Path``. It is
            created if it does not exist.
        ylim: ``(low, high)`` y-axis limits shared by both panels. Pass
            ``None`` to derive them from the accuracy range of ``df`` with 5%
            padding.

    Returns:
        None.
    """
    if df.empty:
        print("No data to plot")
        return

    # Set up the plotting style
    plt.style.use("default")
    sns.set_palette("husl")

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Same limits on both panels so they can be compared at a glance.
    shared_ylim = _resolve_ylim(df, ylim)

    # [NOTE] change here to sort by key
    df = df.sort_values(["exp_name", "job_name"])

    # Element-wise comparison (rather than truthiness) so that only rows whose
    # use_case equals True / False are selected.
    _plot_accuracy_panel(
        ax1,
        df[df["use_case"].eq(True)],
        title="LMM OCR Accuracy (cased)",
        marker="o",
        legend_title="Experiment Name",
        ylim=shared_ylim,
    )
    _plot_accuracy_panel(
        ax2,
        df[df["use_case"].eq(False)],
        title="LMM OCR Accuracy (uncased)",
        marker="s",
        legend_title="Model",
        ylim=shared_ylim,
    )

    # Adjust layout to prevent overlap
    fig.tight_layout()

    # Save the plot; accept str paths and make sure the target directory exists.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_dir / "accuracy_plots.png", dpi=300, bbox_inches="tight")
    print("Plots saved as 'accuracy_plots.png'")

    # Show the plot
    plt.show()

In [None]:
# Collect all score_stats files into one table, persist it, then plot it.
output_dir = Path("misc")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "metrics_summary.csv"

df = gather_metrics()

# The summary is written tab-separated even though the file name ends in .csv.
df.to_csv(output_file, sep="\t", index=False)
print(f"\nSaved results to: {output_file}")

create_plots(df, output_dir)


In [None]:
# Re-plot a previously saved, tab-separated metrics table.
# Relies on `output_dir` from the cell above; create_plots always saves to
# "accuracy_plots.png" in that directory, so this overwrites the earlier figure.
input_csv = "misc/prev_csv.csv"
df = pd.read_csv(
    input_csv,
    sep="\t",
)
create_plots(df=df, output_dir=output_dir)