In [None]:
import os
# Change the process working directory to the hard-coded absolute path below.
# NOTE(review): presumably the project root on the author's machine — confirm
# before running elsewhere; os.chdir raises if the directory does not exist.
os.chdir("/home/vecglypher/codes/svg_glyph_llm/")
# Bare expression: the returned path is discarded unless an interactive front
# end echoes it (presumably a notebook cell, given the "In [...]" markers).
os.getcwd()

In [None]:
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def load_field_statistics(file_path):
    """Load the field value statistics from a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII values load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)


def plot_histogram_for_field(
    field_name, field_data, max_labels_threshold=50, max_value_labels=100
):
    """
    Plot a bar chart of value counts for one field, largest count first.

    The title carries the field's ``unique_count`` and ``total_occurrences``.
    Every entry is drawn as a bar; x-axis tick labels are hidden (and a note is
    shown instead) when there are more than ``max_labels_threshold`` entries,
    and the per-bar count labels are omitted when there are more than
    ``max_value_labels`` entries.

    Args:
        field_name: Name of the field; underscores become spaces and the
            result is title-cased for the plot title.
        field_data: Dictionary with the keys ``"unique_count"``,
            ``"total_occurrences"`` and ``"values"``; ``"values"`` is an
            iterable of dictionaries each holding ``"value"`` and ``"count"``.
        max_labels_threshold: Maximum number of entries before hiding x-axis
            labels.
        max_value_labels: Maximum number of entries for which the count is
            written above each bar.

    Returns:
        The matplotlib figure containing the plot. It is created with
        ``plt.subplots`` and is therefore also pyplot's current figure.

    Raises:
        KeyError: If ``field_data`` or one of its value entries lacks an
            expected key.
    """
    unique_count = field_data["unique_count"]
    total_occurrences = field_data["total_occurrences"]

    # Order entries by count, descending; sorted() is stable, so entries with
    # equal counts keep their original relative order.
    entries = sorted(
        ((item["value"], item["count"]) for item in field_data["values"]),
        key=lambda entry: entry[1],
        reverse=True,
    )
    names = [name for name, _ in entries]
    counts = [count for _, count in entries]
    positions = range(len(entries))

    # Use an explicit figure/axes pair rather than pyplot's implicit state so
    # every call below is guaranteed to target this plot.
    fig, ax = plt.subplots(figsize=(12, 8))
    bars = ax.bar(positions, counts, color="skyblue", alpha=0.7)

    ax.set_title(
        f'{field_name.replace("_", " ").title()} Distribution\n'
        f"Unique Count: {unique_count:,} | Total Occurrences: {total_occurrences:,}",
        fontsize=14,
        fontweight="bold",
    )
    ax.set_xlabel("Values", fontsize=12)
    ax.set_ylabel("Count", fontsize=12)

    if len(names) <= max_labels_threshold:
        ax.set_xticks(list(positions))
        ax.set_xticklabels(names, rotation=45, ha="right")
    else:
        # Too many entries for readable tick labels: hide them and say so.
        ax.set_xticks([])
        ax.text(
            0.02,
            0.98,
            f"X-axis labels hidden ({unique_count:,} entries)",
            transform=ax.transAxes,
            fontsize=10,
            bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
            verticalalignment="top",
        )

    # Write each count just above its bar, but only for a readable bar count.
    if counts and len(counts) <= max_value_labels:
        # Loop invariants: the vertical gap is 1% of the tallest bar, and the
        # font shrinks once there are more than 30 bars.
        label_offset = max(counts) * 0.01
        label_size = 8 if len(counts) > 30 else 10
        for bar, count in zip(bars, counts):
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + label_offset,
                str(count),
                ha="center",
                va="bottom",
                fontsize=label_size,
            )

    ax.grid(axis="y", alpha=0.3)
    fig.tight_layout()

    return fig


def create_all_histograms(json_file_path, output_dir=None):
    """
    Create, optionally save, and show a histogram for each configured field.

    The statistics are loaded with ``load_field_statistics``. For every name
    in the fixed field list that is present in the loaded data, a figure is
    built with ``plot_histogram_for_field``, saved as
    ``<field_name>_histogram.png`` when ``output_dir`` is given, shown, and
    then closed. Fields missing from the data only produce a printed warning.

    Args:
        json_file_path: Path to the JSON file containing field statistics
        output_dir: Directory to save plots (optional). It is created if it
            does not exist; when falsy, nothing is written to disk.
    """
    # Load the data
    data = load_field_statistics(json_file_path)

    # Fields to plot
    fields_to_plot = [
        "font_family_dir_name",
        "style",
        "weight",
        "num_fonts",
        "category",
        "stroke",
        "classifications",
    ]

    # Create the output directory once up front instead of once per field.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for field_name in fields_to_plot:
        if field_name not in data:
            print(f"Warning: Field '{field_name}' not found in data")
            continue

        print(f"Creating histogram for {field_name}...")
        fig = plot_histogram_for_field(field_name, data[field_name])

        try:
            if output_dir:
                output_path = os.path.join(output_dir, f"{field_name}_histogram.png")
                # Save through the returned figure rather than pyplot's
                # implicit "current figure".
                fig.savefig(output_path, dpi=300, bbox_inches="tight")
                print(f"Saved: {output_path}")

            # Show the plot
            plt.show()
        finally:
            # Release the figure so figures do not accumulate across fields,
            # even if saving or showing fails.
            plt.close(fig)


def main():
    """Generate the font-metadata histograms from the hard-coded locations.

    Reads the field value statistics JSON and hands it, together with the
    directory the plots are saved to, to ``create_all_histograms``.
    """
    stats_json = "/home/vecglypher/codes/svg_glyph_llm_data/processed/google_font_metadata/field_value_statistics.json"
    plots_dir = "/home/vecglypher/codes/svg_glyph_llm/misc/font_metadata_plots"

    create_all_histograms(json_file_path=stats_json, output_dir=plots_dir)


# Run the histogram generation only when this code executes as the top-level
# module (a script run, or an interactive session where __name__ is
# "__main__"); importing it as a module does not trigger main().
if __name__ == "__main__":
    main()
