In [None]:
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_predict,
    cross_validate,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from toolz import keymap, merge
from tqdm.auto import tqdm

# Define models

In [None]:
# Candidate classifiers, keyed by the short names used in `model_params` and in
# the saved scan results.
# Every estimator with a stochastic component gets a fixed `random_state` so that
# re-running the notebook reproduces the same cross-validation scores.
models = {
    # probability=True enables predict_proba / predict_log_proba, which the
    # soft-voting ensemble and the entropy analysis further down rely on.
    "svc": SVC(probability=True, class_weight="balanced", random_state=0),
    "knn": KNeighborsClassifier(n_jobs=1),
    "rf": RandomForestClassifier(class_weight="balanced", n_jobs=1, random_state=0),
    "gnb": GaussianNB(),
    "lr": LogisticRegression(
        class_weight="balanced",
        solver="saga",
        n_jobs=1,
        max_iter=5_000,
        tol=1e-1,
        random_state=0,  # saga shuffles the data, so it needs a seed too
    ),
}

## Create sampling function 

In [None]:
# Sample items so that the items kept for each category are reasonably distant
# from each other - adds robustness to the model.
def get_sample(df: "pl.DataFrame", n=100, random_state=0, quantile=0.95) -> "pl.DataFrame":
    """Pick up to ``n`` mutually distant rows with a softened k-Center greedy pass.

    Distances are cosine distances between the vectors in the
    ``snowflake_embedding_clean`` column. Starting from one random row, each step
    adds the row sitting at the ``quantile`` position of the "distance to the
    nearest already-selected row" ordering. Using a quantile instead of the
    maximum keeps extreme outliers from dominating the sample.

    Args:
        df: Frame with a ``snowflake_embedding_clean`` column.
        n: Number of rows to keep. Frames with ``len(df) <= n`` are returned as is.
        random_state: Seed for choosing the first row.
        quantile: Position (0..1) in the ascending distance ordering at which each
            new row is taken; 1.0 gives the classic farthest-point choice.

    Returns:
        ``df`` itself when it has at most ``n`` rows, otherwise the selected rows
        in selection order (``df[indices]``).
    """
    if len(df) <= n:
        return df

    embeddings = df["snowflake_embedding_clean"].to_numpy()
    distance_mtx = squareform(pdist(embeddings, metric="cosine"))

    rng = np.random.default_rng(random_state)
    first = int(rng.integers(0, len(df), size=1)[0])
    sample = [first]
    # Boolean mask instead of `idx in list`: O(1) membership checks.
    chosen = np.zeros(len(df), dtype=bool)
    chosen[first] = True
    # Distance from every row to its nearest selected row.
    dist_row = distance_mtx[first]

    for _ in range(n - 1):
        # make a bit robust, use quantile
        order = np.argsort(dist_row)
        start = min(max(int(len(order) * quantile), 0), len(order) - 1)
        # Try the quantile position first, then walk up to the farthest row, and
        # only then walk down towards the nearest. The slices keep every index in
        # bounds; the old `pos += 1` walk could step past the end of the array
        # (IndexError), e.g. when duplicate embeddings tie at distance zero.
        candidates = np.concatenate([order[start:], order[:start][::-1]])
        # len(sample) < n < len(df), so at least one candidate is still unselected.
        idx = int(candidates[~chosen[candidates]][0])

        sample.append(idx)
        chosen[idx] = True
        dist_row = np.minimum(dist_row, distance_mtx[idx])

    return df[sample]

# Load data, create training dataset

Run this cell and the parameter-scan cells below once for each LLM, changing `llm_name` between runs.

In [None]:
llm_name = "tulu"

# Structured-output parquet written for each supported LLM.
structured_output_paths = {
    "granite": "../structured_output_v5/granite3_1-dense-8b_structured_output_df.parquet",
    "tulu": "../structured_output_v5/tulu3_structured_output_df.parquet",
}
if llm_name not in structured_output_paths:
    raise ValueError("llm_name not recognized")

email_keys = ["subject", "from", "body", "date"]
embedding_cols = ["snowflake_embedding_clean", "gte_embedding_clean"]

structured_lf = pl.scan_parquet(structured_output_paths[llm_name])
embeddings_lf = pl.scan_parquet("../emails_with_tokens_and_embeddings_v5.parquet").select(
    email_keys + embedding_cols
)

# merge with embedding df
df = structured_lf.join(embeddings_lf, on=email_keys, how="left").collect()

# Keep only rows that have both embeddings, a non-zero-sum snowflake embedding
# and a real category label.
df = df.drop_nulls(subset=embedding_cols)
nonzero_embedding = pl.col("snowflake_embedding_clean").arr.sum() != 0
has_category = pl.col("primary_category") != "N/A"  # came from CUDA error
df = df.filter(nonzero_embedding).filter(has_category)

In [None]:
# Number of emails per LLM-assigned primary category, sorted by count.
df['primary_category'].value_counts(sort=True)

In [None]:
# create class-balanced training dataset:
# at most 400 diverse rows from every primary category
category_frames = df.partition_by("primary_category", maintain_order=True)
sample = pl.concat(
    [get_sample(category_df, n=400, random_state=0) for category_df in category_frames]
)

# Run parameter scans

In [None]:
# Hyper-parameter grids for the scan. Keys are plain estimator parameter names;
# the scan cell prefixes them with "pca__" / "model__" to address pipeline steps.
pca_params = dict(n_components=[25, 50, 250, 400])

svc_params = dict(kernel=["linear", "rbf"], C=[1e-2, 0.1, 0.9])

knn_params = dict(
    n_neighbors=[5, 15, 50, 100],
    weights=["distance"],
    metric=["euclidean", "cosine", "manhattan"],
)

rf_params = dict(n_estimators=[50, 100, 150], max_depth=[5, 20, None])

# NOTE(review): `models` has no "gb" entry, so this grid looks unused, while
# "gnb" has no grid here and is scanned over the PCA grid only - confirm intent.
gb_params = dict(n_estimators=[50, 100, 500], max_depth=[5, 20, None])

lr_params = dict(C=[0.001, 0.1, 1, 10, 100], penalty=["l1", "l2"])

# Grid lookup by model short name.
model_params = dict(
    svc=svc_params,
    knn=knn_params,
    rf=rf_params,
    gb=gb_params,
    lr=lr_params,
)

In [None]:
# Grid-search every (embedding, model) pair and keep the ranked CV table of each.
results = {}

warnings.filterwarnings("ignore")

# The target vector is the same for every run of the scan.
labels = sample["primary_category"].to_numpy()

for embedding_name in tqdm(["gte_embedding_clean", "snowflake_embedding_clean"]):
    embedding = sample[embedding_name].to_numpy()

    for model_name, model in tqdm(models.items()):
        print(f"Running {model_name} with {embedding_name}")
        pipeline = Pipeline(
            steps=[("pca", PCA()), ("scaler", StandardScaler()), ("model", model)]
        )
        # Model grid (empty when the model has none) plus the shared PCA grid,
        # each key prefixed with the name of the pipeline step it configures.
        _model_params = {
            **{f"model__{key}": grid for key, grid in model_params.get(model_name, {}).items()},
            **{f"pca__{key}": grid for key, grid in pca_params.items()},
        }

        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=_model_params,
            scoring="f1_weighted",
            cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=0),
            n_jobs=5,
            verbose=1,
        )
        grid_search.fit(embedding, labels)

        ranked = pl.DataFrame(grid_search.cv_results_, strict=False).sort(
            "rank_test_score"
        )
        results[(embedding_name, model_name)] = ranked
        print(ranked['mean_test_score'][0])

In [None]:
# Stack the ten best configurations of every (embedding, model) scan into one
# table, best score first. `params` is stringified so that scans with different
# parameter sets can share a column.
with warnings.catch_warnings(action="ignore"):
    combined_results = pl.concat(
        [
            ranked.select(["mean_test_score", "params"])
            .head(10)
            .with_columns(
                pl.col("params").map_elements(str, return_dtype=pl.String),
                embedding=pl.lit(scan_embedding),
                model=pl.lit(scan_model),
            )
            for (scan_embedding, scan_model), ranked in results.items()
        ]
    ).sort("mean_test_score", descending=True)
combined_results.head(10)

In [None]:
# Persist this LLM's scan summary; the comparison cells further down read it back.
combined_results.write_parquet(f"../{llm_name}_model_selection_results.parquet")

# Plot parameter scan results within LLM

In [None]:
# Re-score the best configuration of the scan on both embeddings.
best_row = combined_results.row(0, named=True)
# `params` holds the str() of a dict of plain literals, so literal_eval rebuilds
# it without executing code the way eval() would.
best_params = ast.literal_eval(best_row["params"])
# Use the model the parameters were tuned for, not a hard-coded "svc": the
# `model__*` keys only fit that estimator.
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[best_row["model"]])]
)
pipe = pipe.set_params(**best_params)

targets = sample["primary_category"].to_numpy()
cv_options = dict(
    cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=0),
    n_jobs=5,
    scoring=["f1_weighted", "accuracy"],
)

# x: snowflake embedding, y: gte embedding (40 fold scores each)
x = cross_validate(pipe, sample["snowflake_embedding_clean"].to_numpy(), targets, **cv_options)
y = cross_validate(pipe, sample["gte_embedding_clean"].to_numpy(), targets, **cv_options)

plt_df = pl.concat(
    [
        pl.DataFrame({"mdl": "snowflake", "f1": x["test_f1_weighted"]}),
        pl.DataFrame({"mdl": "gte", "f1": y["test_f1_weighted"]}),
    ]
)

In [None]:
# Per-fold F1 of each embedding (swarm) with the mean and a 2-SE bar on top.
f1_by_embedding = dict(data=plt_df, x="mdl", y="f1")
ax = sns.swarmplot(**f1_by_embedding, hue="mdl", zorder=1)
ax = sns.pointplot(
    **f1_by_embedding,
    ax=ax,
    color="k",
    linestyle="none",
    errorbar=("se", 2),
    zorder=2,
)
sns.despine()

In [None]:
# Split the combined scan table by embedding, best score first.
snow, gte = (
    combined_results.filter(pl.col("embedding") == embedding_column).sort(
        "mean_test_score", descending=True
    )
    for embedding_column in ("snowflake_embedding_clean", "gte_embedding_clean")
)

In [None]:
# Pair the rank-k snowflake score (x=0) with the rank-k gte score (x=1), k = 0..4.
for rank in range(5):
    paired_scores = [snow["mean_test_score"][rank], gte["mean_test_score"][rank]]
    plt.plot([0, 1], paired_scores, marker="o", color="k")
sns.despine()

In [None]:
# Out-of-fold predictions for every training row, used for the confusion matrix.
y_true = sample["primary_category"].to_numpy()
confusion_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
y_pred = cross_val_predict(
    pipe,
    sample["snowflake_embedding_clean"].to_numpy(),
    y_true,
    cv=confusion_cv,
)

In [None]:
# plot confusion matrix
# unique() gives no ordering guarantee unless maintain_order=True; without it the
# row/column order of the matrix could change from run to run.
labels = sample["primary_category"].unique(maintain_order=True).to_list()
cm = confusion_matrix(y_true, y_pred, labels=labels)
# Normalise every row (= true category) to percentages.
cm = cm / cm.sum(axis=1, keepdims=True) * 100

# The matrix is square, so state the orientation instead of relying on inference:
# each matrix row becomes a frame row, columns are the predicted categories.
cm_df = pl.DataFrame(cm, schema=labels, orient="row").to_pandas()
cm_df.index = labels
# seaborn uses these names as the heatmap's axis labels.
cm_df.index.name = "True category"
cm_df.columns.name = "Predicted category"

In [None]:
# Row-normalised confusion matrix; the colour scale is capped at 50 %.
heatmap_options = dict(
    annot=True,
    fmt="0.0f",
    cmap="mako",
    vmax=50,
    cbar_kws={"label": "Percent (%)"},
)
sns.heatmap(cm_df, **heatmap_options)

In [None]:
# compare tulu with granite

# NOTE(review): the names say "top_5" but ten rows are loaded; the paired plot
# below only uses the first five of them.
top_5_tulu = pl.read_parquet("../tulu_model_selection_results.parquet").head(10)
top_5_granite = pl.read_parquet("../granite_model_selection_results.parquet").head(10)
to_plt = pl.concat(
    [
        frame.with_columns(llm=pl.lit(llm_label))
        for llm_label, frame in (("tulu", top_5_tulu), ("granite", top_5_granite))
    ]
)

In [None]:
# Best scan scores per LLM, one point per configuration, coloured by classifier type.
sns.swarmplot(data=to_plt, x="llm", y="mean_test_score", hue='model')

In [None]:
# Pair the rank-k tulu score (x=0) with the rank-k granite score (x=1), k = 0..4.
for rank in range(5):
    paired_scores = [
        top_5_tulu["mean_test_score"][rank],
        top_5_granite["mean_test_score"][rank],
    ]
    plt.plot([0, 1], paired_scores, marker="o", color="k")
sns.despine()

# Train classifiers for the top parameter set for each classifier type

In [None]:
def load_dataset(llm_name):
    """Build the per-category training sample for one LLM's labels.

    Reads that LLM's structured-output parquet, left-joins the email embeddings
    on (subject, from, body, date), drops rows without usable embeddings or with
    the "N/A" category, and keeps up to 400 diverse rows per primary category via
    ``get_sample``.

    Args:
        llm_name: ``"tulu"`` or ``"granite"``.

    Returns:
        The concatenated per-category samples.

    Raises:
        ValueError: If ``llm_name`` is neither ``"tulu"`` nor ``"granite"``.
    """
    if llm_name == "tulu":
        llm_prefix = "tulu3"
    elif llm_name == "granite":
        llm_prefix = "granite3_1-dense-8b"
    else:
        raise ValueError("llm_name not recognized")

    email_keys = ["subject", "from", "body", "date"]
    embedding_cols = ["snowflake_embedding_clean", "gte_embedding_clean"]

    structured_lf = pl.scan_parquet(
        f"../structured_output_v5/{llm_prefix}_structured_output_df.parquet"
    )
    embeddings_lf = pl.scan_parquet(
        "../emails_with_tokens_and_embeddings_v5.parquet"
    ).select(email_keys + embedding_cols)

    # merge with embedding df
    df = structured_lf.join(embeddings_lf, on=email_keys, how="left").collect()

    df = df.drop_nulls(subset=embedding_cols)
    nonzero_embedding = pl.col("snowflake_embedding_clean").arr.sum() != 0
    has_category = pl.col("primary_category") != "N/A"  # came from CUDA error
    df = df.filter(nonzero_embedding).filter(has_category)

    # create class-balanced training dataset
    category_frames = df.partition_by("primary_category", maintain_order=True)
    return pl.concat(
        [get_sample(category_df, n=400, random_state=0) for category_df in category_frames]
    )

In [None]:
# Training sample of each LLM, keyed by LLM short name.
sample_map = {llm: load_dataset(llm) for llm in ("granite", "tulu")}

In [None]:
# Best configuration of every (classifier type, LLM) pair, best score first.
scans_by_llm = [
    pl.read_parquet(f"../{llm}_model_selection_results.parquet").with_columns(
        llm=pl.lit(llm)
    )
    for llm in ["tulu", "granite"]
]

# Within each group keep the whole (score, params, embedding) record with the top
# score. The struct column is named after its first field, hence the unnest target.
best_record = (
    pl.struct(["mean_test_score", "params", "embedding"])
    .sort_by("mean_test_score", descending=True)
    .first()
)

top_results = (
    pl.concat(scans_by_llm)
    .group_by(["model", "llm"])
    .agg(best_record)
    .unnest("mean_test_score")
    .sort("mean_test_score", descending=True)
)

In [None]:
# One row per (model, llm) pair with its best score, params string and embedding.
top_results

In [None]:
# Repeated cross-validation of the best configuration of every (model, llm) pair;
# one output record per fold.
output = []
for result in top_results.iter_rows(named=True):
    _sample = sample_map[result['llm']]
    # if result['model'] == 'lr':  # ignore for now, takes too long
        # continue
    print(result)
    # `params` is the str() of a dict of plain literals; literal_eval parses it
    # without executing code the way eval() would.
    best_params = ast.literal_eval(result['params'])
    pipe = Pipeline(
        [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[result['model']])]
    )
    pipe = pipe.set_params(**best_params)
    x = cross_validate(
        pipe,
        _sample[result["embedding"]].to_numpy(),
        _sample["primary_category"].to_numpy(),
        cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=0),
        n_jobs=6,
        scoring=["f1_weighted", "accuracy"],
    )
    # merge() returns a new dict each time, so the loop variable `result` is no
    # longer rebound while the fold records are built.
    output.extend(
        merge(result, dict(f1=f1, acc=acc, iteration=i))
        for i, (f1, acc) in enumerate(zip(x['test_f1_weighted'], x['test_accuracy']))
    )

In [None]:
# Turn the per-fold records into a frame and persist them.
output = pl.DataFrame(output)
output.write_parquet("../top_classifier_runs_per_type.parquet", compression_level=5)

## Try making a stacking or bagging classifier

In [None]:
from sklearn.ensemble import BaggingClassifier, StackingClassifier, VotingClassifier

In [None]:
# Base pipeline for bagging: the overall best configuration.
best_overall = top_results.row(0, named=True)
# Take the model from the same row as its parameters (instead of assuming "svc")
# and parse the stored params string with literal_eval rather than eval().
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[best_overall["model"]])]
).set_params(**ast.literal_eval(best_overall["params"]))

In [None]:
# Bag ten copies of the best pipeline, each trained on a bootstrap of the rows
# with features drawn with replacement. Seeded so that the bootstrap draws, and
# therefore the scores, are reproducible.
bag = BaggingClassifier(
    estimator=pipe,
    n_estimators=10,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Mean weighted F1 of the bagged pipeline on the granite sample.
granite_sample = sample_map["granite"]
xval = cross_validate(
    estimator=bag,
    X=granite_sample["snowflake_embedding_clean"].to_numpy(),
    y=granite_sample["primary_category"].to_numpy(),
    scoring=["f1_weighted", "accuracy"],
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=0),
    n_jobs=1,
)
xval['test_f1_weighted'].mean()

In [None]:
# Row of `top_results` that holds the tuned parameters for each base model.
# NOTE(review): these positions are hard-coded against one particular ordering
# of `top_results`; the assert below catches a stale mapping.
idx_map = {
    'svc': 0,
    'knn': 3,
    'lr': 2
}

models_list = []

for k, v in idx_map.items():
    tuned = top_results.row(v, named=True)
    assert tuned["model"] == k, f"top_results row {v} is {tuned['model']!r}, expected {k!r}"
    # literal_eval parses the stored params string without executing code.
    pipe = Pipeline(
        [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[k])]
    ).set_params(**ast.literal_eval(tuned["params"]))
    # Only the configured model step is kept; the tuned PCA setting is dropped.
    models_list.append((k, pipe['model']))

stack = StackingClassifier(estimators=models_list)

# The stacked models share one default PCA + scaler front end.
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", stack)]
)

In [None]:
# Mean weighted F1 of the stacking pipeline on the granite sample.
granite_sample = sample_map["granite"]
xval = cross_validate(
    estimator=pipe,
    X=granite_sample["snowflake_embedding_clean"].to_numpy(),
    y=granite_sample["primary_category"].to_numpy(),
    scoring=["f1_weighted", "accuracy"],
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=0),
    n_jobs=4,
)
xval['test_f1_weighted'].mean()

In [None]:
# Row of `top_results` that holds the tuned parameters for each base model.
# NOTE(review): these positions are hard-coded against one particular ordering
# of `top_results`; the assert below catches a stale mapping.
idx_map = {
    'svc': 0,
    'knn': 3,
    'lr': 2
}

models_list = []

for k, v in idx_map.items():
    tuned = top_results.row(v, named=True)
    assert tuned["model"] == k, f"top_results row {v} is {tuned['model']!r}, expected {k!r}"
    # literal_eval parses the stored params string without executing code.
    pipe = Pipeline(
        [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[k])]
    ).set_params(**ast.literal_eval(tuned["params"]))
    # Only the configured model step is kept; the tuned PCA setting is dropped.
    models_list.append((k, pipe['model']))

# Soft voting combines the base models' predicted class probabilities.
voter = VotingClassifier(estimators=models_list, voting='soft')

# The voting members share one default PCA + scaler front end.
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", voter)]
)

In [None]:
# Mean weighted F1 of the soft-voting pipeline on the granite sample.
granite_sample = sample_map["granite"]
xval = cross_validate(
    estimator=pipe,
    X=granite_sample["snowflake_embedding_clean"].to_numpy(),
    y=granite_sample["primary_category"].to_numpy(),
    scoring=["f1_weighted", "accuracy"],
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=0),
    n_jobs=4,
)
xval['test_f1_weighted'].mean()

# Compare embeddings

In [None]:
# Best overall configuration; the cells below assume it is the SVC.
result = top_results.row(0, named=True)
assert result['model'] == 'svc'
result

In [None]:
# Pipeline configured with the best result's parameters. The params string is
# parsed with literal_eval, which accepts only Python literals, instead of eval().
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[result['model']])]
).set_params(**ast.literal_eval(result['params']))

In [None]:
# Repeated CV of the same pipeline on each embedding; one record per fold.
_sample = sample_map["granite"]
targets = _sample["primary_category"].to_numpy()

fold_records = []
for embedding in ["snowflake_embedding_clean", "gte_embedding_clean"]:
    x = cross_validate(
        pipe,
        _sample[embedding].to_numpy(),
        targets,
        scoring=["f1_weighted", "accuracy"],
        cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=0),
        n_jobs=6,
    )
    fold_scores = zip(x['test_f1_weighted'], x['test_accuracy'])
    for i, (f1, acc) in enumerate(fold_scores):
        # Later keys win in merge(), so `embedding` is overwritten with the one
        # actually evaluated here.
        result = merge(result, dict(f1=f1, acc=acc, iteration=i, embedding=embedding))
        fold_records.append(result)
output = pl.DataFrame(fold_records)

In [None]:
# Preview the per-fold records of the embedding comparison.
output.head()

In [None]:
# Persist the embedding comparison.
output.write_parquet("../compare_embeddings_for_classification.parquet", compression_level=5)

# Test on validation data (all other samples)

In [None]:
# granite: full labelled data set (not only the training sample), with the
# snowflake embedding attached.
email_keys = ["subject", "from", "body", "date"]

granite_lf = pl.scan_parquet(
    "../structured_output_v5/granite3_1-dense-8b_structured_output_df.parquet"
)
snowflake_lf = pl.scan_parquet("../emails_with_tokens_and_embeddings_v5.parquet").select(
    email_keys + ["snowflake_embedding_clean"]
)

# merge with embedding df
df = granite_lf.join(snowflake_lf, on=email_keys, how="left").collect()

df = df.drop_nulls(subset=["snowflake_embedding_clean"])
nonzero_embedding = pl.col("snowflake_embedding_clean").arr.sum() != 0
has_category = pl.col("primary_category") != "N/A"  # came from CUDA error
df = df.filter(nonzero_embedding).filter(has_category)

In [None]:
# Number of usable granite-labelled emails.
len(df)

In [None]:
# Best overall configuration; the cells below assume it is the SVC.
result = top_results.row(0, named=True)
assert result['model'] == 'svc'
result

In [None]:
# Pipeline configured with the best result's parameters. The params string is
# parsed with literal_eval, which accepts only Python literals, instead of eval().
pipe = Pipeline(
    [("pca", PCA()), ("scaler", StandardScaler()), ("model", models[result['model']])]
).set_params(**ast.literal_eval(result['params']))

In [None]:
# Fit the final pipeline on the whole granite training sample.
_sample = sample_map["granite"]
train_embeddings = _sample["snowflake_embedding_clean"].to_numpy()
train_labels = _sample["primary_category"].to_numpy()
pipe = pipe.fit(train_embeddings, train_labels)

In [None]:
# remove entries from _sample: the anti-join keeps only emails whose
# (subject, from, body, date) key is absent from the training sample
email_keys = ["subject", "from", "body", "date"]
smaller_df = df.join(_sample.select(email_keys), on=email_keys, how="anti")

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score

In [None]:
# Predicted category for every row of `smaller_df`.
preds = pipe.predict(smaller_df["snowflake_embedding_clean"].to_numpy())

In [None]:
# Balanced accuracy against the LLM-assigned categories of `smaller_df`.
balanced_accuracy_score(smaller_df["primary_category"].to_numpy(), preds)

In [None]:
# Weighted F1 on the same rows (the metric used during the parameter scan).
f1_score(smaller_df["primary_category"].to_numpy(), preds, average='weighted')

In [None]:
# Class log-probabilities for the first 1000 rows of `smaller_df` only, so a row
# position in `log_probs` is also the row position in `smaller_df`.
log_probs = pipe.predict_log_proba(smaller_df["snowflake_embedding_clean"].to_numpy()[:1000])

In [None]:
def entropy(log_probs):
    """Shannon entropy (in nats) of each row of a log-probability matrix.

    Args:
        log_probs: 2-D array-like of natural-log class probabilities, one row per
            sample. ``-inf`` entries (probability zero) are allowed.

    Returns:
        1-D array with ``-sum(p * log p)`` for every row.
    """
    log_probs = np.asarray(log_probs, dtype=float)
    probs = np.exp(log_probs)
    # A zero-probability class has log p = -inf, and 0 * -inf is NaN, which would
    # poison the whole row. Its contribution is 0 by the usual p*log(p) -> 0
    # limit, so those entries are left at zero and only p > 0 terms are multiplied.
    terms = np.zeros_like(probs)
    np.multiply(probs, log_probs, out=terms, where=probs > 0)
    return -np.sum(terms, axis=1)

In [None]:
# Prediction entropy per row and the ascending ordering of the rows
# (most confident first); the plot shows the sorted entropy curve.
entropies = entropy(log_probs)
sorted_inds = np.argsort(entropies)
plt.plot(entropies[sorted_inds])

In [None]:
# Entropy at position 700 of the ascending ordering.
entropies[sorted_inds[700]]

In [None]:
# 95th percentile of the prediction entropies.
np.quantile(entropies, 0.95)

In [None]:
# Entropy at position 950 of the ascending ordering.
entropies[sorted_inds[950]]

In [None]:
# Class-probability profiles of rows 2 and 31: one line per row, x axis = class index.
plt.plot(np.exp(log_probs[[2, 31]]).T)

In [None]:
from rich.table import Table
from IPython.display import display, HTML


def print_email(idx):
    """Show row ``idx`` of ``smaller_df``: sender and subject as text, body as HTML."""
    print(smaller_df["from"][idx])
    print(smaller_df["subject"][idx])
    # NOTE(review): the body is rendered as raw HTML, so any markup contained in
    # the email is interpreted by the notebook front end.
    display(HTML(smaller_df["body"][idx]))


def create_confidence_output(log_probs, model, n_show=4):
    """Build a rich table of the ``n_show`` most probable categories for one sample.

    Args:
        log_probs: 1-D array of class log-probabilities for a single sample.
        model: Fitted estimator exposing ``classes_`` in the same order as
            ``log_probs``.
        n_show: Number of categories to list.

    Returns:
        A ``rich.table.Table`` with "Confidence" (percent) and "Category"
        columns, most probable category first.
    """
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Confidence", justify="right")
    table.add_column("Category", justify="left")
    # Positions of the n_show largest log-probabilities, largest first.
    top_idx = np.argsort(log_probs)[::-1][:n_show]
    top_pct = np.exp(log_probs[top_idx]) * 100
    for pct, cat in zip(top_pct, model.classes_[top_idx]):
        table.add_row(f"{pct:0.0f}%", cat)
    return table

In [None]:
# The 15 lowest-entropy (most confident) predictions.
for i in range(15):
    idx = int(sorted_inds[i])
    print_email(idx)
    display(create_confidence_output(log_probs[idx], pipe))

In [None]:
# The 15 highest-entropy (least confident) predictions, most uncertain first.
for i in range(1, 16):
    idx = int(sorted_inds[-i])
    print_email(idx)
    display(create_confidence_output(log_probs[idx], pipe))

In [None]:
# Inspect the email at position 940 of the ascending entropy ordering.
idx = int(sorted_inds[940])

print("entropy:", entropies[idx])
print_email(idx)
display(create_confidence_output(log_probs[idx], pipe))