In [1]:
import polars as pl
from pathlib import Path
from toolz import groupby, valmap, keyfilter, concat
from scipy.stats import entropy

In [None]:
# Location of the `*structured_output_df.parquet` files, relative to the
# notebook's working directory.
folder = Path("../structured_output")
# Sorted so the files are always visited in the same order.
files = sorted(folder.glob("*structured_output_df.parquet"))

In [None]:
def load(path: Path):
    """Read the parquet file at ``path`` and return the resulting polars DataFrame."""
    return pl.read_parquet(path)

In [None]:
# Load every file into a DataFrame keyed by the file name without its extension,
# then bucket the loaded frames by their row count.
data = {path.stem: load(path) for path in files}
grouped_data = groupby(len, data.values())

In [5]:
# Number of DataFrames in each row-count bucket.
valmap(len, grouped_data)

{9996: 13, 23992: 3}

In [6]:
# Stack every loaded frame (across all row-count buckets) into one long DataFrame.
concat_dfs = pl.concat(
    frame for bucket in grouped_data.values() for frame in bucket
)

In [7]:
# Distinct model names in order of first appearance. `maintain_order=True` keeps
# the list (and therefore `models[0]`) identical on every run; a plain
# `unique()` gives no ordering guarantee.
models = concat_dfs["model"].unique(maintain_order=True).to_list()

In [None]:
# Reshape to one row per email with one column per model, each holding the
# value of `cat_col` that model produced for the email.
cat_col = "primary_category"
aligned_categories = (
    concat_dfs.pivot(
        index=["date", "body", "from", "subject"],
        on="model",
        values=cat_col,
        aggregate_function="first",
    )
    # discard emails with a missing value in any model column
    .drop_nulls(subset=models)
    # True when all model columns hold the same category for the email
    .with_columns(all_equal=pl.concat_list(models).list.n_unique() == 1)
)
# fraction of the remaining emails on which all models agree
aligned_categories["all_equal"].mean()

0.14995998399359745

In [9]:
# Exclude emails that already carry a label in an earlier supervised dataset;
# they don't need to be relabelled for this experiment.
for _file in Path("..").glob("training_dataset_with_labels*.parquet"):
    tmp_df = pl.read_parquet(_file)
    # only rows that actually have a supervised label count as "already labelled"
    tmp_df = tmp_df.drop_nulls(subset=["supervised_label"])
    aligned_categories = aligned_categories.join(
        tmp_df,
        on=["date", "body", "from"],
        how="anti",
    )

In [10]:
# (rows, columns) of the aligned frame at this point in the notebook.
aligned_categories.shape

(9298, 21)

In [None]:
# Keep only the emails on which all models agree and count them per category.
# Every model column is identical on these rows, so the first model's column
# serves as the category.
# The sample drawn from this frame should have the same number of emails for each category.
filtered = aligned_categories.filter(pl.col("all_equal"))
filtered.group_by(models[0]).len()

Qwen-2.5-1.5b,len
str,u32
"""Travel, scheduling and calenda…",6
"""Financial information""",22
"""News alerts and newsletters""",130
"""Customer service and support""",4
"""Promotional emails""",703
"""Shopping and order confirmatio…",120
"""Personal or professional corre…",26
"""Programming, educational, and …",433


In [None]:
# Draw up to 10 emails per agreed category (all of them when a category has
# fewer than 10), using a fixed seed, and stack the draws.
agree_sample = pl.concat(
    [
        group.sample(n=min(10, group.height), seed=1)
        for group in filtered.partition_by(models[0])
    ]
)
agree_sample.height

70

In [13]:
# (rows, columns) of the agreement sample.
agree_sample.shape

(70, 21)

In [14]:
# create a sample of emails where the models (almost) maximally disagree,
# plus a sample of the emails with the least disagreement among the
# non-unanimous ones
unequal_filtered = aligned_categories.filter(~pl.col("all_equal"))

# Per-email entropy of the distribution of categories across the models:
# higher entropy means the votes are spread over more categories.
entropies = []
for i, _data in enumerate(unequal_filtered[models].iter_rows()):
    # sort=True fixes the order in which the counts are handed to `entropy`,
    # so emails with the same vote distribution get the same float and
    # really do tie in the sort below.
    vcs = pl.Series("categories", _data).value_counts(sort=True)
    entropies.append(entropy(vcs["count"]))
    if i == 0:
        # show one worked example
        print(vcs)
        print(entropies[0])

# Many emails share the same entropy; maintain_order=True breaks those ties by
# the existing row order so the slices below select the same emails on every run.
unequal_filtered = unequal_filtered.with_columns(pl.Series("entropy", entropies)).sort(
    "entropy", descending=True, maintain_order=True
)
# The frame is sorted by descending entropy: the tail holds the 75 lowest-entropy
# emails and the head the 150 highest-entropy ones. With fewer than 225 rows the
# two slices overlap.
low_entropy_sample = unequal_filtered[-75:].drop("entropy")
disagree_sample = unequal_filtered[:150].drop("entropy")

shape: (3, 2)
┌─────────────────────────────────┬───────┐
│ categories                      ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ Customer service and support    ┆ 1     │
│ Account security and privacy    ┆ 14    │
│ Personal or professional corre… ┆ 1     │
└─────────────────────────────────┴───────┘
0.46341355882643


In [15]:
# create a sample of emails in between the two extremes
mid = len(unequal_filtered) // 2
# Clamp the start at 0: with fewer than 150 rows `mid - 75` is negative and the
# slice would count from the end of the frame instead of around the middle.
middle_sample = unequal_filtered[max(mid - 75, 0):mid + 75].drop("entropy")

In [16]:
# Peek at the two highest-entropy emails.
disagree_sample.head(2)

date,body,from,subject,Command-R-7b,Dolphin-3,Falcon-3-3b,Falcon-3-7b,InternLM-3,Llama-3.1-8b,Llama-3.2-3b,Marco-o1,Nemo,Qwen-2.5-1.5b,Qwen-2.5-3b,Qwen-2.5-7b,Smallthinker,Granite-3.1-2b,Granite-3.1-8b,Tulu-3,all_equal
"datetime[μs, UTC]",str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool
2016-04-18 15:09:56 UTC,"""*** Top Meetups for you *** …","""Meetup <info@meetup.com>""","""Meetups this week with: Azurit…","""News alerts and newsletters""","""Promotional emails""","""Travel, scheduling and calenda…","""Other""","""Other""","""News alerts and newsletters""","""Travel, scheduling and calenda…","""Other""","""Promotional emails""","""Personal or professional corre…","""Customer service and support""","""Programming, educational, and …","""News alerts and newsletters""","""Medical information""","""News alerts and newsletters""","""Shopping and order confirmatio…",False
2013-01-30 17:29:43 UTC,"""Hello Students, REMINDER: T…","""""Littke, Michelle R"" <mlittke@…","""LAST DAY TO ADD CLASSES""","""Other""","""Programming, educational, and …","""Travel, scheduling and calenda…","""Travel, scheduling and calenda…","""News alerts and newsletters""","""Travel, scheduling and calenda…","""Travel, scheduling and calenda…","""Personal or professional corre…","""Programming, educational, and …","""Promotional emails""","""Customer service and support""","""Programming, educational, and …","""News alerts and newsletters""","""Account security and privacy""","""Account security and privacy""","""Shopping and order confirmatio…",False


In [17]:
# Peek at the first two emails of the low-entropy sample.
low_entropy_sample.head(2)

date,body,from,subject,Command-R-7b,Dolphin-3,Falcon-3-3b,Falcon-3-7b,InternLM-3,Llama-3.1-8b,Llama-3.2-3b,Marco-o1,Nemo,Qwen-2.5-1.5b,Qwen-2.5-3b,Qwen-2.5-7b,Smallthinker,Granite-3.1-2b,Granite-3.1-8b,Tulu-3,all_equal
"datetime[μs, UTC]",str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool
2024-02-12 17:31:42 UTC,"""Blaze Pizza | Fast Fire'd -…","""Blaze Pizza <no_reply_at_Blaze…","""See which Pizza was Made for Y…","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Promotional emails""","""Shopping and order confirmatio…","""Promotional emails""","""Promotional emails""",False
2014-01-18 00:06:50 UTC,""" Your Weekly Summary Saturd…","""""Mint.com"" <team@mint.com>""","""Your Weekly Financial Summary …","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Shopping and order confirmatio…","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""",False


In [18]:
# Peek at the first two emails of the mid-entropy sample.
middle_sample.head(2)

date,body,from,subject,Command-R-7b,Dolphin-3,Falcon-3-3b,Falcon-3-7b,InternLM-3,Llama-3.1-8b,Llama-3.2-3b,Marco-o1,Nemo,Qwen-2.5-1.5b,Qwen-2.5-3b,Qwen-2.5-7b,Smallthinker,Granite-3.1-2b,Granite-3.1-8b,Tulu-3,all_equal
"datetime[μs, UTC]",str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool
2017-07-15 19:28:23 UTC,""" <!DOCTYPE HTML PUBLIC ""-//…","""Venmo <venmo@venmo.com>""","""You completed Momchil Tomov's …","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Shopping and order confirmatio…","""Financial information""","""Financial information""","""Personal or professional corre…","""Financial information""","""Shopping and order confirmatio…","""Promotional emails""","""Financial information""","""Financial information""","""Financial information""",False
2018-06-10 14:36:16 UTC,""" <!DOCTYPE HTML PUBLIC ""-//…","""Venmo <venmo@venmo.com>""","""You paid Sebastian Van Dyck $9…","""Financial information""","""Account security and privacy""","""Financial information""","""Financial information""","""Financial information""","""Financial information""","""Shopping and order confirmatio…","""Financial information""","""Financial information""","""Personal or professional corre…","""Financial information""","""Financial information""","""Financial information""","""Shopping and order confirmatio…","""Financial information""","""Financial information""",False


In [None]:
# How often each category was predicted across all models, most frequent first,
# ignoring the "N/A" value.
val_counts = (
    concat_dfs.get_column("primary_category")
    .value_counts(sort=True)
    .filter(pl.col("primary_category").ne("N/A"))
)
# the ten least frequent categories
val_counts.tail(10)

primary_category,count
str,u32
"""Programming, educational, and …",37418
"""News alerts and newsletters""",22434
"""Shopping and order confirmatio…",19583
"""Financial information""",16421
"""Personal or professional corre…",12341
"""Travel, scheduling and calenda…",7281
"""Other""",6932
"""Account security and privacy""",6058
"""Customer service and support""",4646
"""Medical information""",1579


In [20]:
# Show the list of model names.
models

['Qwen-2.5-1.5b',
 'Qwen-2.5-3b',
 'Falcon-3-3b',
 'Qwen-2.5-7b',
 'Marco-o1',
 'Granite-3.1-2b',
 'Tulu-3',
 'Dolphin-3',
 'Falcon-3-7b',
 'Granite-3.1-8b',
 'Llama-3.2-3b',
 'Command-R-7b',
 'Smallthinker',
 'InternLM-3',
 'Nemo',
 'Llama-3.1-8b']

In [21]:
# For every category, sample up to 50 emails that at least one model assigned
# to that category.
# Sorted so the categories are always visited in the same order.
# NOTE(review): this list holds every distinct `primary_category` value,
# including the "N/A" value that the `val_counts` cell filters out. Confirm
# whether emails should be sampled for it.
categories = concat_dfs["primary_category"].unique().sort().to_list()

per_category_samples = []
for cat in categories:
    # rows where any model column equals `cat`
    _tmp_df = aligned_categories.filter(
        pl.any_horizontal(*[pl.col(m) == cat for m in models])
    )
    per_category_samples.append(_tmp_df.sample(n=min(50, len(_tmp_df)), seed=1))
samples = pl.concat(per_category_samples)

In [22]:
# Concatenate the five samples (unanimous, low-entropy, high-entropy,
# mid-entropy and per-category), drop emails that were drawn more than once,
# and save the result to a parquet file.
# maintain_order=True keeps the rows in concatenation order so the written
# file is the same on every run.
pl.concat(
    [agree_sample, low_entropy_sample, disagree_sample, middle_sample, samples]
).unique(
    keep="first", subset=["date", "body", "from", "subject"], maintain_order=True
).write_parquet(
    "../training_dataset.parquet"
)