## Novel model approach

This notebook serves as the source code for all the model testing and training (along with hyperparam grid search) before the development/submission of the final best model. This model approach tries a variation on the transformer architecture, with different heads, as detailed in the report.

In [31]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, average_precision_score
import itertools
from tqdm import tqdm

In [32]:
from pcl_tf.dataset_manager import DatasetManager as DM
from pcl_tf.collation import collate_fn
from pcl_tf.tf import warmup_model, get_tokenizer

In [33]:
NUM_LABELS = 7
LOAD_BATCH_SIZE = 64
LOCAL_CACHE_DIR = './models_cache'
MODEL_NAME = "albert-base-v2"
NUM_WORKERS = 0
PIN_MEMORY = False
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cpu


In [34]:
print('Warming up tokenizer...')
tokenizer = get_tokenizer(MODEL_NAME)

print('Warming up encoder (downloads model if needed)...')

_ = warmup_model(MODEL_NAME, device=None, cache_dir=LOCAL_CACHE_DIR)
print('Model cache warmup completed.')

Warming up tokenizer...
Warming up encoder (downloads model if needed)...
Model cache warmup completed.


In [35]:
train_labels_path = "data/train_semeval_parids-labels.csv"
dev_labels_path = "data/dev_semeval_parids-labels.csv"
texts_path = "data/dontpatronizeme_pcl_cleaned.csv"
test_path = "data/task4_test.tsv"
cats_path = "data/dontpatronizeme_categories.tsv"

In [36]:
texts_df = pd.read_csv(texts_path, low_memory=False)
texts_df["par_id"] = texts_df["par_id"].astype(int)
texts_df = texts_df.set_index("par_id")

In [37]:
training_ds = DM(train_labels_path, texts_df=texts_df)
training_ds.print_stats()

Total samples: 8375
Binary distribution: [7581  794]
Multilabel distribution: [574. 160. 162. 192. 145. 363.  29.]


In [38]:
dev_ds = DM(dev_labels_path, texts_df=texts_df)
dev_ds.print_stats()

Total samples: 2093
Binary distribution: [1894  199]
Multilabel distribution: [142.  36.  62.  38.  52. 106.  11.]


In [39]:
def collate_fn_wrapper(tokenizer):
    def collate_fn_inner(batch):
        return collate_fn(tokenizer, batch)
    return collate_fn_inner

In [40]:
tokenizer = get_tokenizer(MODEL_NAME)


train_loader = DataLoader(training_ds, batch_size=LOAD_BATCH_SIZE, shuffle=True, 
                          collate_fn=collate_fn_wrapper(tokenizer), pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)

dev_loader = DataLoader(dev_ds, batch_size=LOAD_BATCH_SIZE, shuffle=False, 
                        collate_fn=collate_fn_wrapper(tokenizer), pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)

In [41]:
def evaluate_dev(model, dataloader, device):
    model.eval()
    bin_probs=[]
    bin_labels=[]
    multi_probs=[]
    multi_labels=[]

    with torch.no_grad():
        for b in dataloader:
            input_ids = b["input_ids"].to(device)
            attention_mask = b["attention_mask"].to(device)
            
            labels = b["labels"].to(device)
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            
            bin_probs.append(torch.sigmoid(out["logit_bin"]).cpu().numpy())
            multi_probs.append(torch.sigmoid(out["logit_multi"]).cpu().numpy())
            
            bin_labels.append(labels[:,0].cpu().numpy())
            multi_labels.append(labels[:,1:].cpu().numpy())
            
    bin_probs = np.concatenate(bin_probs); bin_labels = np.concatenate(bin_labels)
    multi_probs = np.concatenate(multi_probs); multi_labels = np.concatenate(multi_labels)
    micro_f1 = f1_score(multi_labels.flatten(), (multi_probs>=0.5).astype(int).flatten(), zero_division=0)
    return {"micro_f1": micro_f1, "bin_ap": average_precision_score(bin_labels, bin_probs)}


In [45]:
import optuna
from pcl_tf.tf import PCLModel

def objective(trial):
    model_name = trial.suggest_categorical("model_name", ["albert-base-v2", "roberta-base"])
    lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
    wd = trial.suggest_loguniform("wd", 1e-4, 1e-2)
    max_len = trial.suggest_categorical("max_len", [128, 256])
    dropout = trial.suggest_uniform("dropout", 0.0, 0.5)
    epochs = trial.suggest_int("epochs", 3, 10)

    model = PCLModel(model_name, n_labels=NUM_LABELS, dropout=dropout, device=DEVICE).to(DEVICE)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

    for epoch in range(epochs):
        print("Epoch:", epoch + 1)
        model.train()
        running_loss = 0.0
        
        for batch in train_loader:
            input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
            attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
            labels = batch["labels"].to(DEVICE, non_blocking=True)

            optim.zero_grad()
            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out["loss"]
            loss.backward()
            optim.step()
            running_loss += loss.item()

    metrics = evaluate_dev(model, dev_loader, DEVICE)
    micro_f1 = metrics["micro_f1"]
    print("Epoch metrics: ", str(metrics))
    # Optuna minimizes the objective, so return the negative of the metric to maximize it
    return -micro_f1

In [43]:
torch.cuda.empty_cache()

In [46]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[32m[I 2026-02-19 10:05:37,752][0m A new study created in memory with name: no-name-b0035d9c-c6eb-4d37-9b0c-fefac9922f00[0m
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
  wd = trial.suggest_loguniform("wd", 1e-4, 1e-2)
  dropout = trial.suggest_uniform("dropout", 0.0, 0.5)


Epoch: 1


[33m[W 2026-02-19 10:20:06,570][0m Trial 0 failed with parameters: {'model_name': 'roberta-base', 'lr': 8.426022355911104e-05, 'wd': 0.001087502734761261, 'max_len': 256, 'dropout': 0.3071273341278175, 'epochs': 5} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/Users/pc/Code/Python/pcl-detection/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/0c/b8h_62051hz984rp1ys4tbvw0000gn/T/ipykernel_57235/1629578714.py", line 26, in objective
    out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
  File "/Users/pc/Code/Python/pcl-detection/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/Users/pc/Code/Python/pcl-detection/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
    return for

KeyboardInterrupt: 

In [None]:
best_trial = study.best_trial
print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

In [None]:
results = []
for trial in study.trials:
    results.append({**trial.params, "value": trial.value})
res_df = pd.DataFrame(results)
res_df.to_csv("optuna_results.csv", index=False)
print("Saved optuna_results.csv")