# Get metadata

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Data root: the "data" directory next to the current working directory's parent.
data_dir = Path.cwd().parent / "data"

# Resolve both input files first so the read calls below stay short.
metadata_path = data_dir / "processed/vasr/metadata.parquet"
mapping_path = data_dir / "processed/vasr/mapping.json"

metadata_df = pd.read_parquet(metadata_path)
# "id" and "shard" are forced to str instead of letting pandas infer a dtype
# from the JSON; presumably this matches the key dtypes in the parquet
# metadata so the merge keys line up -- TODO confirm.
mapping_df = pd.read_json(mapping_path, dtype={"id": str, "shard": str})

# Default (inner) join on the three shared key columns.
df = pd.merge(metadata_df, mapping_df, on=["id", "shard", "split"])
df

## Get split

In [None]:
# Keep only the rows whose "split" column equals the chosen split name.
split = "train"
in_split = df["split"].eq(split)
df = df.loc[in_split]
df

## Get training subset

In [None]:
# Restrict df to the examples listed in the {hour}h training manifest.
hour = 200
manifest_file = data_dir / f"processed/vasr/{hour}h/train.tsv"
subset_ids = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(manifest_file, "r", encoding="utf-8") as f:
    # The first line is discarded (presumably a header / root-directory line
    # -- confirm against the manifest format).
    next(f, None)
    for line in f:
        fields = line.split()
        if not fields:
            # Blank or whitespace-only line (e.g. a trailing empty line);
            # indexing fields[0] would raise IndexError.
            continue
        # The ID is the part of the first whitespace-separated field that
        # precedes the first "-".
        subset_ids.append(fields[0].split("-")[0])
df = df[df["id"].isin(subset_ids)]
df

# Count examples

In [None]:
# Number of examples per topic, largest first, plus each topic's share of the total.
# dropna=False keeps rows whose topic is missing as their own group, so this
# table agrees with the pie chart, which counts topics with
# value_counts(dropna=False). Without it, missing topics are dropped here and
# the percentages are taken over labelled rows only.
count_df = (
    df.groupby("topic", dropna=False)
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
)
# Fraction in [0, 1] of all counted examples (not multiplied by 100).
count_df["percent"] = count_df["count"] / count_df["count"].sum()
count_df

In [None]:
# Write the per-topic counts to docs/vasr, without the DataFrame index column.
subject = "200h"
csv_path = Path.cwd().parent / f"docs/vasr/topic_distribution_{subject}.csv"
count_df.to_csv(csv_path, index=False)

# Plot pie chart

In [None]:
# Per-topic example counts; rows with a missing topic are counted too.
topic_counts = df["topic"].value_counts(dropna=False)

# One "husl" palette colour per topic, in the same order as the counts.
colors = sns.color_palette("husl", n_colors=len(topic_counts))
labels = topic_counts.index.to_list()
sizes = topic_counts.tolist()

fig, ax = plt.subplots(figsize=(10, 8))
# Wedges carry only their percentage text; topic names go in the legend.
wedges, texts, autotexts = ax.pie(
    x=sizes,
    colors=colors,
    autopct="%1.1f%%",
    startangle=140,
)
# Anchor the legend to the right of the axes so it does not cover the pie.
ax.legend(
    wedges,
    labels,
    title="Topics",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)
# Equal axis scaling so the pie is drawn as a circle.
ax.axis("equal")

plt.show()

In [None]:
# Save the pie chart created in the plotting cell.
subject = "200h"
# Save through the `fig` handle rather than plt.savefig: this cell runs after
# plt.show() in a previous cell, so pyplot's "current figure" is no longer the
# pie chart and plt.savefig would write a blank image.
fig.savefig(Path.cwd().parent / f"docs/vasr/topic_distribution_{subject}.png", bbox_inches="tight")