# Get metadata

In [None]:
import pandas as pd
from pathlib import Path

# The data folder is resolved relative to the parent of the current working directory.
data_dir = Path.cwd().parent.joinpath("data")

# Per-example metadata, plus the id/shard mapping (id and shard are read as strings).
metadata_df = pd.read_parquet(data_dir / "processed/vasr/metadata.parquet")
mapping_df = pd.read_json(
    data_dir / "processed/vasr/mapping.json",
    dtype={"id": str, "shard": str},
)

# Default (inner) join: only rows whose (id, shard, split) appear in both tables are kept.
df = pd.merge(metadata_df, mapping_df, on=["id", "shard", "split"])
df

## Get split

In [None]:
# Keep only the rows whose "split" column equals the selected split name.
split = "train"
df = df.loc[df["split"].eq(split)]
df

## Get training subset

In [None]:
# Restrict df to the examples listed in the {hour}h training manifest.
hour = 200
manifest_file = data_dir / f"processed/vasr/{hour}h/train.tsv"

# Explicit encoding so the result does not depend on the machine's locale
# (assumes the manifest is UTF-8 -- TODO confirm).
with open(manifest_file, "r", encoding="utf-8") as f:
    _ = f.readline()  # the first line is skipped; it is not treated as an entry
    # Example id = first whitespace-delimited field, truncated at its first "-".
    # Blank lines (e.g. a trailing empty line) are ignored instead of raising IndexError.
    subset_ids = [
        line.split()[0].split("-")[0]
        for line in f
        if line.strip()
    ]

df = df[df["id"].isin(subset_ids)]
df

# Count channels

In [None]:
# Number of distinct values in the "channel" column (a missing value, if any, counts as one).
df["channel"].nunique(dropna=False)

# Count examples

In [None]:
# One row per channel with its number of examples, largest first.
# "percent" is each channel's share of the total, stored as a fraction (count / sum),
# not scaled by 100.
count_df = (
    df.groupby("channel")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count")
    .assign(percent=lambda counts: counts["count"] / counts["count"].sum())
)
count_df

In [None]:
# Write the per-channel distribution as CSV (without the index column);
# `subject` becomes the suffix of the output file name.
subject = "200h"
count_df.to_csv(
    Path.cwd().parent.joinpath(f"docs/vasr/source_distribution_{subject}.csv"),
    index=False,
)