In [None]:
import polars as pl
from pathlib import Path
from toolz import groupby, valmap, keyfilter, concat
from scipy.stats import entropy

In [None]:
# Directory holding one structured-output parquet file per model run.
# (Plain string literal: the former f-string had no placeholders.)
folder = Path("../structured_output")
# Sort the matches so the load order does not depend on filesystem listing order.
files = sorted(folder.glob("*structured_output_df.parquet"))

In [None]:
def load(path: Path):
    """Read the parquet file at ``path`` and return what ``pl.read_parquet`` yields."""
    return pl.read_parquet(path)

In [None]:
# Load every parquet file, keyed by its file name without the extension.
data = dict((parquet_path.stem, load(parquet_path)) for parquet_path in files)
# Bucket the loaded frames by their length (`len` of each frame).
grouped_data = groupby(len, data.values())

In [None]:
# How many loaded frames share each frame length.
{n_rows: len(frames) for n_rows, frames in grouped_data.items()}

In [None]:
# Flatten the length buckets back into a single list of frames and stack them.
concat_dfs = pl.concat(
    [frame for frames in grouped_data.values() for frame in frames]
)

In [None]:
# Distinct values of the `model` column; later cells use them as column names.
# `unique()` gives no ordering guarantee, so sort the names to keep `models[0]`
# and the per-model column order stable across kernel restarts.
models = concat_dfs['model'].unique().sort().to_list()

In [None]:
cat_col = "primary_category"
# One row per email (date/body/from/subject) with one column per model holding
# that model's category. `all_equal` flags rows where every model column holds
# the same value; rows missing a value for any model are dropped afterwards.
aligned_categories = (
    concat_dfs
    .pivot(
        on="model",
        index=["date", "body", "from", "subject"],
        values=cat_col,
        aggregate_function="first",
    )
    .with_columns((pl.concat_list(models).list.n_unique() == 1).alias("all_equal"))
    .drop_nulls(subset=models)
)
# Fraction of the remaining emails on which all models agree.
aligned_categories.get_column("all_equal").mean()

In [None]:
# Drop emails that already carry a supervised label in an earlier training
# dataset; they do not need to be relabelled for this experiment.
for _file in Path("..").glob("training_dataset_with_labels*.parquet"):
    tmp_df = pl.read_parquet(_file)
    # Only rows that actually have a label count as "already labelled".
    tmp_df = tmp_df.filter(pl.col("supervised_label").is_not_null())
    aligned_categories = aligned_categories.join(
        tmp_df,
        how="anti",
        on=["date", "body", "from"],
    )

In [None]:
# (rows, columns) left after removing the previously labelled emails.
aligned_categories.shape

In [None]:
# Build the pool of emails on which every model agrees.
# The agreement sample drawn from it should contain the same number of emails
# for each category, so display how many unanimous emails each category has.
filtered = aligned_categories.filter(pl.col("all_equal"))
filtered.group_by(pl.col(models[0])).agg(pl.len())

In [None]:
# Draw up to 10 unanimous emails per category (fewer when a category has less),
# with a fixed seed so the draw is repeatable.
agree_sample = pl.concat(
    [
        category_df.sample(n=min(10, category_df.height), seed=1)
        for category_df in filtered.partition_by(models[0])
    ]
)
agree_sample.height

In [None]:
# (rows, columns) of the agreement sample.
agree_sample.shape

In [None]:
# create a sample of emails where the models (almost) maximally disagree
unequal_filtered = aligned_categories.filter(~pl.col("all_equal"))

# Score each email by the entropy of its per-model category counts: the more
# evenly the models spread over different categories, the higher the score.
entropies = []
for i, _data in enumerate(unequal_filtered[models].iter_rows()):
    vcs = pl.Series("categories", _data).value_counts()
    entropies.append(entropy(vcs["count"]))
    if i == 0:
        # Show the first row's counts and score as a sanity check.
        print(vcs)
        print(entropies[0])

# Entropy takes only a handful of distinct values, so ties are common.
# `maintain_order=True` makes the sort stable, so tied rows keep their input
# order and the head/tail slices below select the same emails on every run.
unequal_filtered = unequal_filtered.with_columns(pl.Series("entropy", entropies)).sort(
    "entropy", descending=True, maintain_order=True
)
# Lowest-entropy (mildest disagreement) rows sit at the end of the sorted frame.
low_entropy_sample = unequal_filtered[-75:].drop("entropy")
# Highest-entropy (strongest disagreement) rows sit at the start.
disagree_sample = unequal_filtered[:150].drop("entropy")

In [None]:
# create a sample of emails in between the two extremes
# (expects `unequal_filtered` to still carry its `entropy` column and ordering)
mid = len(unequal_filtered) // 2
# Clamp the lower bound at 0: with fewer than 150 rows `mid - 75` is negative,
# which a slice would interpret as an offset from the END of the frame and
# silently return the low-entropy tail instead of the middle.
middle_sample = unequal_filtered[max(mid - 75, 0):mid + 75].drop("entropy")

In [None]:
# Peek at the first two rows of the high-disagreement sample.
disagree_sample.head(2)

In [None]:
# Peek at the first two rows of the low-entropy sample.
low_entropy_sample.head(2)

In [None]:
# Peek at the first two rows of the mid-entropy sample.
middle_sample.head(2)

In [None]:
# Frequency of each primary category over all model outputs, most common
# first, leaving out the "N/A" placeholder.
val_counts = (
    concat_dfs
    .get_column("primary_category")
    .value_counts(sort=True)
    .filter(pl.col("primary_category").ne("N/A"))
)
# The ten rarest categories.
val_counts.tail(10)

In [None]:
# Display the list of model names.
models

In [None]:
categories = concat_dfs['primary_category'].unique().to_list()

# For every category, draw up to 50 emails (fixed seed) for which at least one
# model predicted that category, then stack the per-category draws.
samples = []
for cat in categories:
    # True for a row when any model column equals the current category.
    _any_model_hit = pl.any_horizontal([pl.col(m) == cat for m in models])
    _cat_rows = aligned_categories.filter(_any_model_hit)
    samples.append(_cat_rows.sample(n=min(50, _cat_rows.height), seed=1))
samples = pl.concat(samples)

In [None]:
# Concatenate all five samples (agreement, low-entropy, high-disagreement,
# mid-entropy and per-category), drop emails that were picked more than once,
# and save the result to a parquet file.
# `maintain_order=True` keeps rows in concatenation order so the written file
# is identical from run to run; without it the row order is unspecified.
pl.concat(
    [agree_sample, low_entropy_sample, disagree_sample, middle_sample, samples]
).unique(
    keep="first",
    subset=["date", "body", "from", "subject"],
    maintain_order=True,
).write_parquet(
    "../training_dataset.parquet"
)