## Novel model approach

This notebook contains the source code for all model training and testing (including the hyperparameter grid search) carried out before the final best model was developed and submitted. The approach is a variation on the transformer architecture with different heads, as detailed in the report.

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, average_precision_score
import itertools
from tqdm import tqdm

In [None]:
from pcl_tf.dataset_manager import DatasetManager as DM
from pcl_tf.collation import collate_fn
from pcl_tf.tf import warmup_model, get_tokenizer

In [None]:
# --- Notebook-wide settings -------------------------------------------------
# Passed to PCLModel as n_labels.
NUM_LABELS = 7
# Batch size shared by the train and dev DataLoaders.
LOAD_BATCH_SIZE = 16
# cache_dir handed to warmup_model.
LOCAL_CACHE_DIR = "./models_cache"
# Checkpoint used for the tokenizer and the cache warm-up.
MODEL_NAME = "albert-base-v2"
# Forwarded unchanged to both DataLoaders.
NUM_WORKERS = 8
PIN_MEMORY = False

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

In [None]:
# Load the tokenizer and touch the encoder once up front, before any training
# cell needs them (presumably this populates LOCAL_CACHE_DIR -- confirm in
# pcl_tf.tf.warmup_model).
print("Warming up tokenizer...")
tokenizer = get_tokenizer(MODEL_NAME)

print("Warming up encoder (downloads model if needed)...")

# The returned object is intentionally discarded; only the call itself matters.
_ = warmup_model(MODEL_NAME, cache_dir=LOCAL_CACHE_DIR, device=None)
print("Model cache warmup completed.")

In [None]:
# Locations of the input files, relative to the notebook's working directory.
# Label files for the train / dev splits (each is passed to DatasetManager).
train_labels_path = "data/train_semeval_parids-labels.csv"
dev_labels_path = "data/dev_semeval_parids-labels.csv"
# Paragraph table loaded into texts_df and indexed by par_id.
texts_path = "data/dontpatronizeme_pcl_cleaned.csv"
# NOTE(review): test_path and cats_path are not referenced by any of the
# cells visible in this notebook.
test_path = "data/task4_test.tsv"
cats_path = "data/dontpatronizeme_categories.tsv"

In [None]:
# Load the paragraph table, force par_id to integers and use it as the index
# so rows can be looked up by paragraph id.
texts_df = (
    pd.read_csv(texts_path, low_memory=False)
    .astype({"par_id": int})
    .set_index("par_id")
)

In [None]:
# Training split: the train label file combined with the shared texts_df table.
training_ds = DM(train_labels_path, texts_df=texts_df)
# Presumably prints summary statistics for the split -- see DatasetManager.
training_ds.print_stats()

In [None]:
# Dev split: the dev label file combined with the same texts_df table.
dev_ds = DM(dev_labels_path, texts_df=texts_df)
# Presumably prints summary statistics for the split -- see DatasetManager.
dev_ds.print_stats()

In [None]:
from functools import partial

# Same call as in the warm-up cell; repeated here so the loaders below are
# always built from the MODEL_NAME tokenizer.
tokenizer = get_tokenizer(MODEL_NAME)

# Bind the tokenizer with functools.partial rather than a lambda: with
# num_workers > 0 the collate function is sent to worker processes, and a
# lambda defined in a notebook cell cannot be pickled when the "spawn" start
# method is used (the default on Windows and macOS).
batch_collate = partial(collate_fn, tokenizer)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    training_ds,
    batch_size=LOAD_BATCH_SIZE,
    shuffle=True,
    collate_fn=batch_collate,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
)

# Dev batches keep dataset order so evaluation is reproducible.
dev_loader = DataLoader(
    dev_ds,
    batch_size=LOAD_BATCH_SIZE,
    shuffle=False,
    collate_fn=batch_collate,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
)

In [None]:
def evaluate_dev(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking. Sigmoid is applied to both heads' logits before the
    metrics are computed.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=...)``; must
            return a mapping with ``"logit_bin"`` and ``"logit_multi"``
            tensors.
        dataloader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. Column 0 of
            ``labels`` is used as the binary target, the remaining columns
            as the multi-label targets.
        device: Device the model inputs are moved to.

    Returns:
        dict with
        ``"micro_f1"``: F1 over all flattened (sample, label) decisions of
        the multi-label head, thresholded at 0.5 (i.e. micro-averaged), and
        ``"bin_ap"``: average precision of the binary head.
    """
    model.eval()
    bin_probs = []
    bin_labels = []
    multi_probs = []
    multi_labels = []

    with torch.no_grad():
        for b in dataloader:
            input_ids = b["input_ids"].to(device)
            attention_mask = b["attention_mask"].to(device)
            # Labels are only consumed by the CPU-side metrics below, so they
            # are not copied to `device` and back.
            labels = b["labels"]
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            bin_probs.append(torch.sigmoid(out["logit_bin"]).cpu().numpy())
            multi_probs.append(torch.sigmoid(out["logit_multi"]).cpu().numpy())
            bin_labels.append(labels[:, 0].cpu().numpy())
            multi_labels.append(labels[:, 1:].cpu().numpy())

    # Stitch the per-batch arrays into one array per quantity.
    bin_probs = np.concatenate(bin_probs)
    bin_labels = np.concatenate(bin_labels)
    multi_probs = np.concatenate(multi_probs)
    multi_labels = np.concatenate(multi_labels)

    multi_preds = (multi_probs >= 0.5).astype(int)
    micro_f1 = f1_score(multi_labels.flatten(), multi_preds.flatten(), zero_division=0)
    return {"micro_f1": micro_f1, "bin_ap": average_precision_score(bin_labels, bin_probs)}


In [None]:
from pcl_tf.tf import PCLModel

def train_and_eval(config):
    """Train a fresh PCLModel for one hyperparameter setting and score it on dev.

    Args:
        config: Mapping with the keys ``"model_name"``, ``"dropout"``,
            ``"lr"``, ``"wd"`` and ``"epochs"``. Any other key is ignored.

    Returns:
        The metrics dict produced by ``evaluate_dev`` on the dev loader.
    """
    from functools import partial

    model_name = config["model_name"]
    if model_name == MODEL_NAME:
        # The module-level loaders were built with MODEL_NAME's tokenizer.
        run_train_loader, run_dev_loader = train_loader, dev_loader
    else:
        # A different checkpoint needs input ids from its own tokenizer;
        # feeding it batches tokenized for MODEL_NAME would silently train on
        # mismatched vocabulary ids.
        run_collate = partial(collate_fn, get_tokenizer(model_name))
        run_train_loader = DataLoader(
            training_ds, batch_size=LOAD_BATCH_SIZE, shuffle=True,
            collate_fn=run_collate, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS,
        )
        run_dev_loader = DataLoader(
            dev_ds, batch_size=LOAD_BATCH_SIZE, shuffle=False,
            collate_fn=run_collate, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS,
        )
    # NOTE(review): config["max_len"] is not consumed anywhere in this
    # function. collate_fn's signature is not visible from this notebook, so
    # whether a maximum length can be passed through it needs confirming.

    model = PCLModel(model_name, n_labels=NUM_LABELS, dropout=config["dropout"], device=DEVICE).to(DEVICE)
    optim = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["wd"])
    print("Model and optimizer created")

    print("Starting training...")
    for epoch in range(config["epochs"]):
        # evaluate_dev leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        running = 0.0
        for batch in run_train_loader:
            input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
            attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
            labels = batch["labels"].to(DEVICE, non_blocking=True)
            optim.zero_grad()
            # The model returns its own loss under "loss" when labels are given.
            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out["loss"]
            loss.backward()
            optim.step()
            running += loss.item()
        # Mean training loss over the epoch's batches.
        print(f"Epoch {epoch+1}, Loss: {running/len(run_train_loader)}")
    metrics = evaluate_dev(model, run_dev_loader, DEVICE)
    return metrics

In [None]:
# Hyperparameter search space. The grid search runs every combination of the
# listed values: 2 * 4 * 5 * 2 * 3 * 1 = 240 configurations.
grid = {
    "model_name": ["albert-base-v2", "microsoft/deberta-v3-small"],
    "lr": [2e-5, 5e-5, 1e-4, 5e-4],
    # Passed to AdamW as weight_decay.
    "wd": [1e-4, 5e-4, 1e-3, 1e-2, 5e-3],
    # NOTE(review): train_and_eval as defined in this notebook never reads
    # "max_len", so the two values below only repeat each other setting twice.
    "max_len": [128, 256],
    "dropout": [0, 0.1, 0.01],
    # Number of training epochs per configuration.
    "epochs": [10]
}

In [None]:
# Release unused cached GPU memory held by PyTorch's allocator before the
# grid search starts.
torch.cuda.empty_cache()

In [None]:
# Expand the grid into its full Cartesian product and run each configuration,
# collecting one result row per configuration.
keys, values = zip(*grid.items())
results = []
all_combos = list(itertools.product(*values))  # materialised so tqdm knows the total
for combo in tqdm(all_combos, desc="Grid"):
    cfg = dict(zip(keys, combo))
    try:
        metrics = train_and_eval(cfg)
        results.append(dict(cfg, **metrics))
        print("CFG:", cfg, "=>", metrics)
    except Exception as e:
        # Best-effort search: a failing configuration is recorded with its
        # error message and the loop moves on to the next one.
        print("Error for cfg", cfg, ":", e)
        results.append(dict(cfg, error=str(e)))

In [None]:
# Persist every run (successful or failed) and report the best configuration.
res_df = pd.DataFrame(results)
res_df.to_csv("grid_results.csv", index=False)
print("Saved grid_results.csv")

# The "micro_f1" column only exists if at least one run produced metrics.
if "micro_f1" in res_df.columns:
    best_idx = res_df["micro_f1"].idxmax()
else:
    best_idx = None

if best_idx is None:
    print("No successful runs found.")
else:
    print("Best config:\n", res_df.loc[best_idx].to_dict())