## Config

In [None]:
# import necessary libraries
import os
import json
import re
import random
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd
from tqdm.auto import tqdm as progress
from sklearn.preprocessing import LabelEncoder
# %pip install torch transformers
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import json
import time
import numpy as np
from typing import Optional, Literal
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from sklearn.preprocessing import LabelEncoder
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Model id handed to SentenceTransformer(...) by sbert_client().
# NOTE(review): the recorded outputs in this notebook show 384-dim embeddings,
# which does not look like this model's output size — confirm which model
# produced the cached results.
EMB_MODEL = "Qwen/Qwen3-Embedding-8B"

# Make call to Data
# Ensure that messages_by_top_agency.csv is in the parent directory
# This is assuming that the messages have already been created
DATA_PATH = os.environ.get("DATA_PATH", "../messages_by_top_agency.csv")
TEXT_COL = os.environ.get("TEXT_COL", "message")
LABEL_COLS = ["top_agency"]

# Threshold for "threshold" method (cosine)
OTHER_THRESHOLD = float(os.environ.get("OTHER_THRESHOLD", "0.50"))

# Seed every RNG the notebook draws from. torch must be seeded too: the
# projection-head weight init and the shuffling DataLoader both use torch's
# global generator, so without this the contrastive training is not reproducible.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

## Data

In [4]:
def normalize(s: str) -> str:
    """
    Return a canonical form of ``s``: lowercased, stripped of leading/trailing
    whitespace, with every run of whitespace collapsed to a single space.

    Anything that is not a ``str`` (e.g. None or NaN) yields the empty string.
    """
    if isinstance(s, str):
        lowered = s.lower().strip()
        return re.sub(r"\s+", " ", lowered)
    return ""


def load_data(path: str, text_col: str) -> pd.DataFrame:
    """
    Load a CSV, verify the text column exists, drop rows without text and
    normalize the remaining text.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.
        text_col: Name of the column holding the message text.

    Returns:
        A new DataFrame with rows whose ``text_col`` is missing removed,
        ``text_col`` passed through ``normalize``, and a fresh 0..n-1 index.

    Raises:
        KeyError: If ``text_col`` is not a column of the CSV.
    """
    raw = pd.read_csv(path)
    if text_col not in raw.columns:
        raise KeyError(f"Missing columns: {text_col}")

    # The rest of the notebook addresses rows by position (the split indices
    # come from np.arange(len(df))). Resetting the index after dropna keeps
    # index labels and positions identical, so label-based lookups cannot
    # silently hit the wrong row when some rows were dropped.
    cleaned = raw.dropna(subset=[text_col]).reset_index(drop=True)
    cleaned[text_col] = cleaned[text_col].map(normalize)
    return cleaned

## Embedding Model (Sentence-Transformers)

In [13]:
# Single-slot cache: the most recently loaded model and the id it was loaded from.
_SBERT = None
_SBERT_MODEL_NAME = None

def sbert_client(model_name: str = EMB_MODEL):
    """Return a cached SentenceTransformer for ``model_name``, loading it on first use.

    Only one model is kept in memory at a time. Asking for a different
    ``model_name`` than the cached one loads that model and replaces the cache,
    instead of silently returning the previously loaded model.
    Device selection is left to SentenceTransformer's own defaults.
    """
    global _SBERT, _SBERT_MODEL_NAME
    if _SBERT is None or _SBERT_MODEL_NAME != model_name:
        _SBERT = SentenceTransformer(model_name)
        _SBERT_MODEL_NAME = model_name
    return _SBERT

def embed_one(
    # Single text to embedding vector
    text: str,
    client=None,
    model_id: str = EMB_MODEL,
    *,
    # Embedding parameters
    dimensions: Optional[int] = None,   # expected vector size; checked, never used to truncate
    normalize: bool = True,             # forwarded to encode() as normalize_embeddings
    # Retry parameters: accepted for interface compatibility, not used by this implementation
    max_retries: int = 3,
    backoff_base: float = 1.5,
) -> np.ndarray:
    """Embed a single text into a float32 vector.

    Args:
        text: Text to embed. None and NaN are treated as the empty string;
            any other value is converted with ``str()``.
        client: Object with a SentenceTransformer-style ``encode`` method.
            When None, ``sbert_client(model_id)`` is used.
        model_id: Model id used to build the client and in error messages.
        dimensions: If given, the expected length of the embedding.
        normalize: Passed to ``encode`` as ``normalize_embeddings``.
        max_retries: Unused; kept so existing callers do not break.
        backoff_base: Unused; kept so existing callers do not break.

    Returns:
        The embedding as a 1-D ``np.float32`` array.

    Raises:
        ValueError: If ``dimensions`` is given and differs from the model's
            output size.
    """
    if client is None:
        client = sbert_client(model_id)

    # Handle None or NaN input
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""
    text = str(text)

    # encode() takes a batch; send a batch of one and take its only row.
    vec = client.encode(
        [text],
        convert_to_numpy=True,
        normalize_embeddings=normalize,
        show_progress_bar=False,
    )[0].astype(np.float32)

    # Optional dimensions
    if dimensions is not None and vec.shape[0] != dimensions:
        raise ValueError(
            f"dimensions={dimensions} requested, but SBERT '{model_id}' outputs {vec.shape[0]} dims."
        )

    # The original fell off the end here and returned None.
    return vec

def embed_all(
    texts: List[str],
    *,
    dimensions: Optional[int] = None,
    normalize: bool = True,
    batch_size: int = 64,
    show_progress_bar: bool = True,
    model_id: str = EMB_MODEL,
) -> np.ndarray:
    """Embed every text in ``texts`` and return the float32 embedding matrix.

    None / NaN entries are embedded as the empty string; all other entries are
    converted with ``str()``. When ``dimensions`` is given it is only checked
    against the model's output width; a mismatch raises ``ValueError``.
    """
    encoder = sbert_client(model_id)

    # Replace missing values so the encoder only ever sees strings.
    cleaned = [
        "" if (t is None or (isinstance(t, float) and np.isnan(t))) else str(t)
        for t in texts
    ]

    encoded = encoder.encode(
        cleaned,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=normalize,
        show_progress_bar=show_progress_bar,
    )
    X = encoded.astype(np.float32)

    if dimensions is None or X.shape[1] == dimensions:
        return X

    raise ValueError(
        f"dimensions={dimensions} requested, but SBERT '{model_id}' outputs {X.shape[1]} dims."
    )

## Train/Test Split

In [14]:
def split_indices(
    n: int,
    # Split ratios by train, val, test
    ratios: Tuple[float, float, float] = (0.6, 0.2, 0.2),
    seed: int = SEED,
):
    """Shuffle ``range(n)`` with a seeded RNG and cut it into train/val/test index arrays.

    The train and val sizes are ``int(n * ratio)``; the test split receives
    every remaining index, so the third ratio is not used to size anything.
    """
    # Seeded permutation of 0..n-1
    order = np.arange(n)
    np.random.RandomState(seed).shuffle(order)

    # Cut points (the test ratio is unpacked only to require a 3-tuple)
    train_frac, val_frac, _test_frac = ratios
    train_end = int(n * train_frac)
    val_end = train_end + int(n * val_frac)

    return order[:train_end], order[train_end:val_end], order[val_end:]

## Main

In [17]:
# 1) Data
df = load_data(DATA_PATH, TEXT_COL)
print("Loaded:", df.shape)
print("Columns:", list(df.columns))
# Optional: create concatenated column for top_agency + issue_area
if all(c in df.columns for c in ["top_agency", "issue_area"]):
    df["top_agency_issue_area_concat"] = (
        df["top_agency"].astype(str) + " | " + df["issue_area"].astype(str)
    )
texts = df[TEXT_COL].tolist()

# 2) Convert to / Load Embedding
# You likely don't have s4_data/X_ALL.npy yet, so this will compute the embeddings
emb_cache_path = "s4_data/X_ALL.npy"
# The cache directory must exist before np.save below; on a fresh checkout it
# does not, and the save would fail only after the embeddings were computed.
os.makedirs(os.path.dirname(emb_cache_path), exist_ok=True)

X_ALL = None
if os.path.exists(emb_cache_path):
    print("Loading precomputed embeddings...")
    X_ALL = np.load(emb_cache_path)
    print(f"Loaded X_ALL.npy with shape {X_ALL.shape}")
    # A cache built from a different CSV would misalign rows and labels;
    # discard it rather than index into the wrong embeddings.
    if X_ALL.shape[0] != len(texts):
        print(
            f"Cached embeddings have {X_ALL.shape[0]} rows but the data has "
            f"{len(texts)} texts; recomputing."
        )
        X_ALL = None

if X_ALL is None:
    print("Computing embeddings...")
    X_ALL = embed_all(texts, batch_size=64, normalize=True)
    np.save(emb_cache_path, X_ALL)
    print(f"Saved X_ALL.npy with shape {X_ALL.shape}")

Loaded: (16065, 17)
Columns: ['top_agency', 'sub_agency', 'issue_area', 'program_title', 'objectives', 'trust', 'efficacy', 'knowledge', 'message', 'prompt_used', 'tag_string', 'agency_issue_pair', 'quality_validated', 'quality_status', 'repaired', 'generation_attempts_repair', 'notes']
Computing embeddings...


Batches: 100%|██████████| 252/252 [00:42<00:00,  5.94it/s]


Saved X_ALL.npy with shape (16065, 384)


In [18]:
# 3) Split (load-or-create, shared indices across all labels)
split_dir = "s4_data"
os.makedirs(split_dir, exist_ok=True)

tr_path, va_path, te_path = (
    f"{split_dir}/{stem}.npy" for stem in ("TR_IDX", "VA_IDX", "TE_IDX")
)

# Reuse the saved split only when all three index files are present.
if all(os.path.exists(p) for p in (tr_path, va_path, te_path)):
    print("Loading existing split indices...")
    TR_IDX, VA_IDX, TE_IDX = (np.load(p) for p in (tr_path, va_path, te_path))
else:
    print("Creating new split indices...")
    TR_IDX, VA_IDX, TE_IDX = split_indices(len(df))
    np.save(tr_path, TR_IDX)
    np.save(va_path, VA_IDX)
    np.save(te_path, TE_IDX)
    print("Saved TR/VA/TE indices to s4_data/")

# Embeddings per split
X_tr, X_va, X_te = X_ALL[TR_IDX], X_ALL[VA_IDX], X_ALL[TE_IDX]

# Labels per split: one entry per label column, plus the fitted encoder
y_tr, y_va, y_te, label_encoders = {}, {}, {}, {}

for label_name in LABEL_COLS:
    print(f"Processing label column: {label_name}")
    labels_raw = df[label_name].to_numpy()

    # Fit on all rows so every split shares one string -> int id mapping
    le = LabelEncoder()
    y_all_int = le.fit_transform(labels_raw)
    label_encoders[label_name] = le

    y_tr[label_name], y_va[label_name], y_te[label_name] = (
        y_all_int[TR_IDX],
        y_all_int[VA_IDX],
        y_all_int[TE_IDX],
    )

print("Split ready.")

Creating new split indices...
Saved TR/VA/TE indices to s4_data/
Processing label column: top_agency
Split ready.


## Contrastive Learning

In [19]:
# 4) Contrastive learning: X -> Z  (using top_agency as supervision, val for best epoch)

print("Training contrastive projection head (X -> Z) using top_agency labels...")

# Prefer the GPU when one is visible to torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Supervision labels for the train and validation splits
y_tr_super, y_va_super = y_tr["top_agency"], y_va["top_agency"]

# Training tensors: float32 features, int64 class ids
X_tr_t = torch.from_numpy(X_tr).to(torch.float32)
y_tr_t = torch.from_numpy(y_tr_super).to(torch.int64)

dataset = TensorDataset(X_tr_t, y_tr_t)
loader = DataLoader(dataset, shuffle=True, batch_size=256, drop_last=True)

# Projection head input / output widths
proj_dim = 256
in_dim = X_tr.shape[1]

class ProjectionHead(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) whose output rows are L2-normalized."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        hidden_dim = 512
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        # Keep the attribute name `net`: it determines the state_dict keys.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project ``x`` and scale each row to unit L2 norm."""
        return F.normalize(self.net(x), dim=1)

# Training hyperparameters
epochs = 10
temperature = 0.3

# Projection head and its optimizer
model = ProjectionHead(in_dim, proj_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def supervised_contrastive_loss(z, labels, temperature=0.1):
    """Compute the supervised contrastive (SupCon) loss for one batch.

    For every anchor that has at least one other sample with the same label,
    the loss is the negative mean log-probability of its positives, where the
    probabilities come from a softmax over temperature-scaled cosine
    similarities to all *other* samples in the batch. The result is averaged
    over those anchors.

    Args:
        z: ``[B, D]`` tensor of embeddings (re-normalized here along dim 1).
        labels: ``B`` integer class ids, any shape that flattens to ``[B]``.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        A scalar tensor. When no anchor in the batch has a positive, a zero
        that is still connected to the graph is returned instead of NaN.
    """
    z = F.normalize(z, dim=1)
    sim = torch.matmul(z, z.T) / temperature  # [B, B]

    batch_size = z.size(0)
    labels = labels.contiguous().view(-1, 1)  # [B, 1]

    self_mask = torch.eye(batch_size, dtype=torch.bool, device=z.device)
    positives = torch.eq(labels, labels.T) & ~self_mask

    # log(sum_{a != i} exp(sim[i, a])) via logsumexp: exp(sim) overflows float32
    # once 1/temperature exceeds ~88, which turned the loss into inf/NaN.
    log_denom = torch.logsumexp(
        sim.masked_fill(self_mask, float("-inf")), dim=1, keepdim=True
    )
    log_prob = sim - log_denom

    pos_count = positives.sum(dim=1)
    has_pos = pos_count > 0
    if not bool(has_pos.any()):
        # Mean over an empty set would be NaN; contribute nothing instead.
        return sim.sum() * 0.0

    # torch.where (not multiplication) so masked-out entries can never leak NaN/inf.
    pos_sum = torch.where(positives, log_prob, torch.zeros_like(log_prob)).sum(dim=1)
    mean_log_prob_pos = pos_sum[has_pos] / pos_count[has_pos]

    return -mean_log_prob_pos.mean()

from sklearn.metrics import f1_score

def build_centroids(X, y):
    """Map each distinct label in ``y`` to the mean of the rows of ``X`` carrying it."""
    return {lab: X[y == lab].mean(axis=0) for lab in np.unique(y)}

def predict_with_centroids(X, centroids):
    """Assign each row of ``X`` the label of the centroid with the highest cosine similarity."""
    class_ids = np.array(list(centroids.keys()))
    centroid_mat = np.vstack([centroids[c] for c in class_ids])

    def _unit_rows(M):
        # Row-wise L2 normalization; the epsilon guards against zero rows.
        return M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-8)

    cos = _unit_rows(X) @ _unit_rows(centroid_mat).T
    return class_ids[cos.argmax(axis=1)]

# Train the projection head with the SupCon loss. After every epoch, score a
# nearest-centroid classifier on the validation split and keep the weights of
# the epoch with the highest validation macro-F1.
best_va_f1 = -1.0   # below any possible F1, so the first epoch always becomes the best
best_state = None   # CPU copy of the best epoch's state_dict

model.train()
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    n_batches = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        z = model(xb)
        loss = supervised_contrastive_loss(z, yb, temperature)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    # max(..., 1) avoids a division by zero when the loader yields no batch
    avg_loss = running_loss / max(n_batches, 1)
    # end="" so the validation score printed below lands on the same line
    print(f"Epoch {epoch:02d} | SupCon loss = {avg_loss:.4f}", end="")

    
    # Validation: project train and val embeddings with the current weights
    model.eval()
    with torch.no_grad():
        Z_tr_tmp = model(torch.from_numpy(X_tr).float().to(device)).cpu().numpy()
        Z_va_tmp = model(torch.from_numpy(X_va).float().to(device)).cpu().numpy()

    # Centroids come from the train split; predictions are scored on the val split
    centroids_tmp = build_centroids(Z_tr_tmp, y_tr_super)
    y_va_pred_tmp = predict_with_centroids(Z_va_tmp, centroids_tmp)
    f1_va_tmp = f1_score(y_va_super, y_va_pred_tmp, average="macro")
    print(f" | val F1_macro = {f1_va_tmp:.4f}")

    if f1_va_tmp > best_va_f1:
        best_va_f1 = f1_va_tmp
        # Clone so later optimizer steps cannot modify the saved snapshot
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    # Back to training mode for the next epoch
    model.train()

print(f"Best val F1_macro = {best_va_f1:.4f}")

# Restore the best-epoch weights before producing the final projections
if best_state is not None:
    model.load_state_dict(best_state)

# Final projected embeddings Z for all three splits
model.eval()
with torch.no_grad():
    Z_tr = model(torch.from_numpy(X_tr).float().to(device)).cpu().numpy()
    Z_va = model(torch.from_numpy(X_va).float().to(device)).cpu().numpy()
    Z_te = model(torch.from_numpy(X_te).float().to(device)).cpu().numpy()

print("Z shapes:", Z_tr.shape, Z_va.shape, Z_te.shape)


Training contrastive projection head (X -> Z) using top_agency labels...
Using device: cpu
Epoch 01 | SupCon loss = 4.9465 | val F1_macro = 0.6906
Epoch 02 | SupCon loss = 4.5314 | val F1_macro = 0.7586
Epoch 03 | SupCon loss = 4.3167 | val F1_macro = 0.7851
Epoch 04 | SupCon loss = 4.1669 | val F1_macro = 0.7936
Epoch 05 | SupCon loss = 4.0222 | val F1_macro = 0.7990
Epoch 06 | SupCon loss = 3.9108 | val F1_macro = 0.7974
Epoch 07 | SupCon loss = 3.8135 | val F1_macro = 0.8056
Epoch 08 | SupCon loss = 3.7315 | val F1_macro = 0.8006
Epoch 09 | SupCon loss = 3.6456 | val F1_macro = 0.8028
Epoch 10 | SupCon loss = 3.5804 | val F1_macro = 0.7997
Best val F1_macro = 0.8056
Z shapes: (9639, 256) (3213, 256) (3213, 256)


## Centroids

In [20]:
label_name = "top_agency"

# Pull this label column's integer-encoded vectors out of the per-split dicts
y_tr_top, y_va_top, y_te_top = (
    split_labels[label_name] for split_labels in (y_tr, y_va, y_te)
)

def build_centroids(X, y):
    """Map each distinct label in ``y`` to the mean of the rows of ``X`` carrying it."""
    return {lab: X[y == lab].mean(axis=0) for lab in np.unique(y)}

def predict_with_centroids(X, centroids):
    """Assign each row of ``X`` the label of the centroid with the highest cosine similarity."""
    class_ids = np.array(list(centroids.keys()))
    centroid_mat = np.vstack([centroids[c] for c in class_ids])  # [n_classes, dim]

    def _unit_rows(M):
        # Row-wise L2 normalization; the epsilon guards against zero rows.
        return M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-8)

    # cosine similarity: [n_samples, n_classes]
    cos = _unit_rows(X) @ _unit_rows(centroid_mat).T
    return class_ids[cos.argmax(axis=1)]


def eval_centroid(name, X_tr_, X_te_, y_tr_, y_te_):
    """Fit class centroids on the training data, predict the evaluation data and print metrics.

    Prints accuracy, macro and weighted F1 and the sklearn classification
    report, then returns ``(centroids, y_pred)``.
    """
    print(f"\n=== Centroid classifier on {name} ===")
    class_centroids = build_centroids(X_tr_, y_tr_)
    predicted = predict_with_centroids(X_te_, class_centroids)

    print(f"Accuracy      : {accuracy_score(y_te_, predicted):.4f}")
    print(f"F1 (macro)    : {f1_score(y_te_, predicted, average='macro'):.4f}")
    print(f"F1 (weighted) : {f1_score(y_te_, predicted, average='weighted'):.4f}")
    print("\nClassification report (truncated):")
    print(classification_report(y_te_, predicted, digits=3))

    return class_centroids, predicted


# 1) X -> centroids (embeddings as produced by the embedding model)
centroids_X, y_pred_X = eval_centroid(
    name="X (Titan raw embedding)",
    X_tr_=X_tr,
    X_te_=X_te,
    y_tr_=y_tr_top,
    y_te_=y_te_top,
)

# 2) Z -> centroids (embeddings after the contrastive projection head)
centroids_Z, y_pred_Z = eval_centroid(
    name="Z (contrastive embedding)",
    X_tr_=Z_tr,
    X_te_=Z_te,
    y_tr_=y_tr_top,
    y_te_=y_te_top,
)


=== Centroid classifier on X (Titan raw embedding) ===
Accuracy      : 0.6026
F1 (macro)    : 0.6036
F1 (weighted) : 0.6060

Classification report (truncated):
              precision    recall  f1-score   support

           0      0.706     0.632     0.667       152
           1      0.829     0.534     0.650       191
           2      0.697     0.485     0.572       171
           3      0.638     0.777     0.701       202
           4      0.814     0.679     0.741       187
           5      0.397     0.370     0.383       162
           6      0.562     0.489     0.523       184
           7      0.598     0.718     0.652       209
           8      0.536     0.513     0.524       189
           9      0.477     0.595     0.529       190
          10      0.354     0.586     0.441       186
          11      0.713     0.641     0.676       198
          12      0.870     0.870     0.870       215
          13      0.673     0.532     0.594       201
          14      0.563     

## Compare

In [21]:
# Sanity check: train / val / test shapes of the raw embeddings, projections and labels
print("X shapes:", *(part.shape for part in (X_tr, X_va, X_te)))
print("Z shapes:", *(part.shape for part in (Z_tr, Z_va, Z_te)))
print("y sizes :", *(part.shape for part in (y_tr_top, y_va_top, y_te_top)))

def simple_metrics(title, y_true, y_pred):
    """Print ``title`` followed by accuracy and macro-F1, and return them as ``(acc, f1_macro)``."""
    scores = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )
    print(title)
    print(f"  accuracy : {scores[0]:.4f}")
    print(f"  f1_macro : {scores[1]:.4f}")
    return scores


# ======================================
# 1) Baseline: label embedding + cosine
# ======================================
print("\n=== Baseline: label embedding cosine (TEST) ===")

# Class ids seen in training and their original label strings
le_top = label_encoders[label_name]
classes = np.unique(y_tr_top)
class_texts = le_top.inverse_transform(classes)

# Embed the label strings with the same model as the messages;
# `dimensions` makes embed_all fail if the widths differ
label_vecs = embed_all(list(class_texts), dimensions=X_tr.shape[1], normalize=True)

# Unit-normalize both sides so the dot product below is the cosine similarity
label_vecs = label_vecs / (np.linalg.norm(label_vecs, axis=1, keepdims=True) + 1e-8)
X_te_norm = X_te / (np.linalg.norm(X_te, axis=1, keepdims=True) + 1e-8)

# Predict the class whose label embedding is most similar to each message
sims = np.matmul(X_te_norm, label_vecs.T)
idx = np.argmax(sims, axis=1)
y_te_label_cos = classes[idx]

simple_metrics("TEST (label-embedding cosine)", y_te_top, y_te_label_cos)


# ======================================
# 2) Bayesian: GaussianNB on Z
# ======================================
print("\n=== Bayesian: GaussianNB on Z (TEST) ===")

# Fit on the projected training embeddings, score on the projected test embeddings
gnb = GaussianNB().fit(Z_tr, y_tr_top)
y_te_gnb = gnb.predict(Z_te)

simple_metrics("TEST (GaussianNB on Z)", y_te_top, y_te_gnb)


# ======================================
# 3) kNN on Z (TEST)
# ======================================
print("\n=== kNN on Z (TEST) ===")

# After the loop, `knn` and `y_te_knn` hold the results for the last k only
for k in (3, 5, 10):
    knn = KNeighborsClassifier(n_neighbors=k, metric="euclidean", n_jobs=-1).fit(
        Z_tr, y_tr_top
    )
    y_te_knn = knn.predict(Z_te)
    simple_metrics(f"TEST (kNN on Z, k={k})", y_te_top, y_te_knn)


X shapes: (9639, 384) (3213, 384) (3213, 384)
Z shapes: (9639, 256) (3213, 256) (3213, 256)
y sizes : (9639,) (3213,) (3213,)

=== Baseline: label embedding cosine (TEST) ===


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


TEST (label-embedding cosine)
  accuracy : 0.3442
  f1_macro : 0.3193

=== Bayesian: GaussianNB on Z (TEST) ===
TEST (GaussianNB on Z)
  accuracy : 0.8017
  f1_macro : 0.8041

=== kNN on Z (TEST) ===
TEST (kNN on Z, k=3)
  accuracy : 0.8017
  f1_macro : 0.8009
TEST (kNN on Z, k=5)
  accuracy : 0.8024
  f1_macro : 0.8020
TEST (kNN on Z, k=10)
  accuracy : 0.8033
  f1_macro : 0.8031


## Final

In [22]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# final evaluation: centroid classifier on Z (TEST set)

print("\n=== Centroid classifier on Z (TEST) ===")

# Centroids from the projected training data, predictions for the projected test data
# (build_centroids / predict_with_centroids are defined in earlier cells)
centroids_Z = build_centroids(Z_tr, y_tr_top)
y_te_cent = predict_with_centroids(Z_te, centroids_Z)

acc, f1_macro, f1_weighted = (
    accuracy_score(y_te_top, y_te_cent),
    f1_score(y_te_top, y_te_cent, average="macro"),
    f1_score(y_te_top, y_te_cent, average="weighted"),
)

print(f"Accuracy      : {acc:.4f}")
print(f"F1 (macro)    : {f1_macro:.4f}")
print(f"F1 (weighted) : {f1_weighted:.4f}")
print("\nFull classification report:")
print(classification_report(y_te_top, y_te_cent, digits=3))


=== Centroid classifier on Z (TEST) ===
Accuracy      : 0.8030
F1 (macro)    : 0.8032
F1 (weighted) : 0.8046

Full classification report:
              precision    recall  f1-score   support

           0      0.930     0.789     0.854       152
           1      0.768     0.817     0.792       191
           2      0.851     0.836     0.844       171
           3      0.827     0.851     0.839       202
           4      0.918     0.834     0.874       187
           5      0.673     0.710     0.691       162
           6      0.768     0.793     0.781       184
           7      0.792     0.785     0.788       209
           8      0.837     0.815     0.826       189
           9      0.630     0.753     0.686       190
          10      0.811     0.828     0.819       186
          11      0.821     0.788     0.804       198
          12      0.939     0.926     0.932       215
          13      0.890     0.801     0.843       201
          14      0.779     0.764     0.772       

## Error Analysis

### Annotation
- Config
    - `LABEL_COLS = ["top_agency"]`
- Split
    - `TR_IDX, VA_IDX, TE_IDX`: integer arrays of row indices into df ->
        - `X_tr, X_va, X_te`: text embeddings split by index
        - `y_tr, y_va, y_te`: dicts keyed by label column name (e.g. `"top_agency"`), each holding the integer-encoded labels for that split
- Contrastive Learning
    - `X_tr, X_va, X_te` -> `Z_tr, Z_va, Z_te`: text embeddings passed through the trained projection head, split by index
- Centroids
    - `y_tr_top, y_va_top, y_te_top`: the `top_agency` entries taken out of the dicts above (plain integer-encoded label arrays, ready to be used)
- Final
    - `y_te_cent`: predicted label on the test set using centroids
- Compare
    - `y_te_label_cos, y_te_gnb, y_te_knn`: predicted label on test set using corresponding approaches

In [None]:
import numpy as np

# or y_te_label_cos, y_te_gnb, y_te_knn in place of y_te_cent
y_true, y_pred = np.asarray(y_te_top), np.asarray(y_te_cent)

# 1. misclassified positions INSIDE the test split
mis_local_idx = np.flatnonzero(y_true != y_pred)

print("Misclassified (local):", mis_local_idx[:20])

# 2. convert to original df indexes
mis_global_idx = np.take(TE_IDX, mis_local_idx)

print("Misclassified (global df indexes):", mis_global_idx[:20])

Misclassified (local): [ 1  3  9 10 11 15 16 21 31 47 58 59 62 67 68 69 83 88 90 98]
Misclassified (global df indexes): [ 8964 11754  4531  9866   365 16034   147  8599  6230  8980  6272  8616
    16 13199  3493  9651  2355  4592  8656  6356]


In [None]:
# choose your prediction method: "cent", "label_cos", "gnb", or "knn"
METHOD = "cent"

# method name -> predicted label vector on the test split
pred_map = dict(
    cent=y_te_cent,
    label_cos=y_te_label_cos,
    gnb=y_te_gnb,
    knn=y_te_knn,
)

y_true, y_pred = np.asarray(y_te_top), np.asarray(pred_map[METHOD])


In [None]:
# local positions within the test split
mis_local_idx = np.flatnonzero(y_true != y_pred)
print(f"Total misclassified samples ({METHOD}):", len(mis_local_idx))

# convert to original dataframe row indices
mis_global_idx = np.take(TE_IDX, mis_local_idx)

print("\nFirst 20 global misclassified indexes:")
print(mis_global_idx[:20])


Total misclassified samples (cent): 577

First 20 global misclassified indexes:
[ 8964 11754  4531  9866   365 16034   147  8599  6230  8980  6272  8616
    16 13199  3493  9651  2355  4592  8656  6356]


In [None]:
le = label_encoders["top_agency"]

# Turn the integer class ids of the misclassified samples back into label strings
true_names, pred_names = (
    le.inverse_transform(ids[mis_local_idx]) for ids in (y_te_top, y_pred)
)

In [None]:
#true_names = label_encoder.inverse_transform(y_true[mis_local_idx])
#pred_names = label_encoder.inverse_transform(y_pred[mis_local_idx])


import pandas as pd

# One row per misclassified test sample.
# - `true_label` / `pred_label` hold the decoded label names. The original dict
#   literal listed each key twice, so the integer-id entries were dead code that
#   the name entries silently overwrote.
# - mis_global_idx contains positional row numbers (TE_IDX indexes rows by
#   position), so the message lookup uses .iloc rather than label-based .loc.
df_errors = pd.DataFrame({
    "global_idx": mis_global_idx,
    "local_te_idx": mis_local_idx,
    "true_label": true_names,
    "pred_label": pred_names,
    "message": df["message"].iloc[mis_global_idx].values,
})

df_errors.head()


Unnamed: 0,global_idx,local_te_idx,true_label,pred_label,message
0,8964,1,Department of Labor,Other Independent Federal Agency or Commission,"this model is not designed to give legal, fina..."
1,11754,3,Department of Veteran's Affairs (VA),Executive Office of the President,is writing to express her support for and ask ...
2,4531,9,Department of Energy,Other Independent Federal Agency or Commission,"dear senator/representative, i'm writing to in..."
3,9866,10,Department of State,Department of Health and Human Services (HHS),dear senator/representative: my name is aliyu ...
4,365,11,Department of Agriculture (USDA),Executive Office of the President,"dear senator/representative , as an american a..."


In [None]:
#!pip install openpyxl

# Export misclassified messages to Excel
output_path = "misclassified_messages.xlsx"
with pd.ExcelWriter(output_path) as writer:
    df_errors.to_excel(writer, index=False)

print("Saved to:", output_path)


Saved to: misclassified_messages.xlsx


In [None]:
# Export a reproducible random sample of up to 20 misclassified messages.
# DataFrame.sample raises when n exceeds the number of rows, so cap n for
# methods that produce fewer than 20 errors.
n_sample = min(20, len(df_errors))
sample = df_errors.sample(n=n_sample, random_state=42)

output_path = "misclassified_sample20.xlsx"
sample.to_excel(output_path, index=False)

print("Saved to:", output_path)

Saved to: misclassified_sample20.xlsx
