## Novel model approach

This notebook contains the source code used to train and evaluate candidate models, including the hyperparameter search (run with Optuna), before the final best model was developed and submitted. The approach explores a variation on the transformer architecture with different heads, as detailed in the report.

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, average_precision_score
import itertools
from tqdm import tqdm

In [None]:
from pcl_tf.dataset_manager import DatasetManager as DM
from pcl_tf.collation import collate_fn
from pcl_tf.tf import warmup_model, get_tokenizer

In [None]:
# --- Run configuration -------------------------------------------------------
NUM_LABELS = 7                      # passed as `n_labels` when PCLModel is built
LOAD_BATCH_SIZE = 16                # batch size of both the train and dev DataLoader
LOCAL_CACHE_DIR = './models_cache'  # passed as `cache_dir` to warmup_model
MODEL_NAME = "albert-base-v2"       # checkpoint used for the warm-up cell
NUM_WORKERS = 0                     # forwarded to DataLoader(num_workers=...)
PIN_MEMORY = False                  # forwarded to DataLoader(pin_memory=...)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Seed the global RNGs once, up front, so that weight initialisation and
# DataLoader shuffling are repeatable under "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Warm-up step: obtain the tokenizer and load the encoder once for MODEL_NAME
# before any training starts, so a download (if one is needed) happens here.
# NOTE(review): the search space in `objective` also includes
# "microsoft/deberta-v3-small", which is not warmed up by this cell.
print('Warming up tokenizer...')
tokenizer = get_tokenizer(MODEL_NAME)

print('Warming up encoder (downloads model if needed)...')

# The return value is discarded; the call is made only for its side effects.
# NOTE(review): the meaning of device=None is defined in pcl_tf.tf — confirm there.
_ = warmup_model(MODEL_NAME, device=None, cache_dir=LOCAL_CACHE_DIR)
print('Model cache warmup completed.')

Warming up tokenizer...
Warming up encoder (downloads model if needed)...


Loading weights:   0%|          | 0/25 [00:00<?, ?it/s]

[1mAlbertModel LOAD REPORT[0m from: albert-base-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
predictions.LayerNorm.weight | UNEXPECTED |  | 
predictions.LayerNorm.bias   | UNEXPECTED |  | 
predictions.bias             | UNEXPECTED |  | 
predictions.dense.weight     | UNEXPECTED |  | 
predictions.dense.bias       | UNEXPECTED |  | 
predictions.decoder.bias     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model cache warmup completed.


In [None]:
# Input file locations, relative to the directory the notebook is run from.
# The two label files are given to DatasetManager; `texts_path` is read into
# `texts_df`. `test_path` and `cats_path` are not used by the cells shown in
# this notebook.
train_labels_path = "data/train_semeval_parids-labels.csv"
dev_labels_path = "data/dev_semeval_parids-labels.csv"
texts_path = "data/dontpatronizeme_pcl_cleaned.csv"
test_path = "data/task4_test.tsv"
cats_path = "data/dontpatronizeme_categories.tsv"

In [None]:
# Read the texts file once, force `par_id` to int, and use it as the index.
# The resulting frame is shared by the train and dev DatasetManager instances.
texts_df = (
    pd.read_csv(texts_path, low_memory=False)
    .astype({"par_id": int})
    .set_index("par_id")
)

In [None]:
# Training split: built from the train label file plus the shared texts frame;
# print_stats() reports a summary of the resulting dataset.
training_ds = DM(train_labels_path, texts_df=texts_df)
training_ds.print_stats()

In [None]:
# Dev split: same construction as the training split, using the dev label file.
# This is the dataset the hyperparameter search is scored on.
dev_ds = DM(dev_labels_path, texts_df=texts_df)
dev_ds.print_stats()

In [9]:
def collate_fn_wrapper(tokenizer):
    """Bind `tokenizer` to `collate_fn` and return a one-argument collate callable.

    `DataLoader` invokes its `collate_fn` with the batch only, whereas the
    project-level `collate_fn` is called as `collate_fn(tokenizer, batch)`.

    The callable is built with `functools.partial` instead of a nested
    function: a function defined inside another function cannot be pickled,
    which breaks `DataLoader(num_workers > 0)` on platforms that start worker
    processes with "spawn".

    Args:
        tokenizer: Object forwarded as the first positional argument of
            `collate_fn`.

    Returns:
        A callable `f(batch)` equivalent to `collate_fn(tokenizer, batch)`.
    """
    from functools import partial  # local import keeps this cell self-contained

    return partial(collate_fn, tokenizer)

In [None]:
def evaluate_dev(model, dataloader, device, threshold=0.5):
    """Score `model` on every batch of `dataloader` and return dev metrics.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking. Each batch must provide "input_ids", "attention_mask"
    and "labels"; column 0 of the labels is used as the binary target and the
    remaining columns as the multi-label targets. The model output must expose
    "logit_bin" and "logit_multi", which are passed through a sigmoid.

    Args:
        model: Callable as `model(input_ids=..., attention_mask=...)`.
        dataloader: Iterable of batch dicts as described above.
        device: Device the model inputs are moved to.
        threshold: Probability at or above which a multi-label output counts
            as a positive prediction. Defaults to 0.5.

    Returns:
        dict with
            "micro_f1": F1 over all (sample, label) pairs of the multi-label
                head, i.e. computed on the flattened predictions and targets;
            "bin_ap": average precision of the binary head.
    """
    model.eval()
    bin_probs, bin_labels = [], []
    multi_probs, multi_labels = [], []

    with torch.no_grad():
        for b in dataloader:
            input_ids = b["input_ids"].to(device)
            attention_mask = b["attention_mask"].to(device)
            out = model(input_ids=input_ids, attention_mask=attention_mask)

            bin_probs.append(torch.sigmoid(out["logit_bin"]).cpu().numpy())
            multi_probs.append(torch.sigmoid(out["logit_multi"]).cpu().numpy())

            # Labels are only consumed by the host-side metrics, so they are
            # never copied to `device` (this avoids a round trip per batch).
            labels = b["labels"].cpu()
            bin_labels.append(labels[:, 0].numpy())
            multi_labels.append(labels[:, 1:].numpy())

    bin_probs = np.concatenate(bin_probs)
    bin_labels = np.concatenate(bin_labels)
    multi_probs = np.concatenate(multi_probs)
    multi_labels = np.concatenate(multi_labels)

    # Flattening turns the multi-label problem into one binary problem over all
    # (sample, label) pairs, which is what micro-averaged F1 measures.
    multi_preds = (multi_probs >= threshold).astype(int)
    micro_f1 = f1_score(multi_labels.flatten(), multi_preds.flatten(), zero_division=0)
    return {"micro_f1": micro_f1, "bin_ap": average_precision_score(bin_labels, bin_probs)}


In [None]:
import optuna
import pcl_tf.collation as pcl_collation
from pcl_tf.tf import PCLModel, get_tokenizer

def objective(trial):
    """Optuna objective: fine-tune one sampled configuration and score it on dev.

    Samples the encoder checkpoint, learning rate, weight decay, maximum
    sequence length and number of epochs from `trial`, trains a fresh
    `PCLModel` on `training_ds` with AdamW, and evaluates it once on `dev_ds`
    after the last epoch.

    Args:
        trial: The `optuna.trial.Trial` supplying the hyperparameters.

    Returns:
        The dev-set "micro_f1" reported by `evaluate_dev` (the study maximises
        it). The accompanying "bin_ap" is stored on the trial as a user
        attribute so it is not lost.

    Raises:
        optuna.TrialPruned: If the training loss becomes NaN/inf. The trial is
            then recorded as pruned and the study continues, rather than
            optimising and evaluating a diverged model.
    """
    model_name = trial.suggest_categorical("model_name", ["albert-base-v2", "microsoft/deberta-v3-small"])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    wd = trial.suggest_float("wd", 1e-4, 1e-2, log=True)
    max_len = trial.suggest_categorical("max_len", [128, 256])
    # Fixed rather than searched: any other dropout value caused gradient
    # explosion, probably due to the small batch size and the complex task.
    dropout = 0
    epochs = trial.suggest_int("epochs", 3, 10)

    trial_tokenizer = get_tokenizer(model_name)

    # Module-level override, so it affects every later use of pcl_collation.
    # NOTE(review): presumably collate_fn reads MAX_LEN at call time — confirm.
    pcl_collation.MAX_LEN = max_len

    trial_train_loader = DataLoader(
        training_ds,
        batch_size=LOAD_BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn_wrapper(trial_tokenizer),
        pin_memory=PIN_MEMORY,
        num_workers=NUM_WORKERS,
    )

    trial_dev_loader = DataLoader(
        dev_ds,
        batch_size=LOAD_BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn_wrapper(trial_tokenizer),
        pin_memory=PIN_MEMORY,
        num_workers=NUM_WORKERS,
    )

    model = PCLModel(model_name, n_labels=NUM_LABELS, dropout=dropout, device=DEVICE).to(DEVICE)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

    print("Config:", {"model_name": model_name, "lr": lr, "wd": wd, "max_len": max_len, "dropout": dropout, "epochs": epochs})

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for batch in trial_train_loader:
            input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
            attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
            labels = batch["labels"].to(DEVICE, non_blocking=True)

            optim.zero_grad()
            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out["loss"]

            # A diverged run cannot recover; stop before the bad gradients are
            # applied and let Optuna record the trial as pruned.
            if not torch.isfinite(loss).all():
                raise optuna.TrialPruned(f"Non-finite training loss at epoch {epoch + 1}")

            loss.backward()
            optim.step()
            running_loss += loss.item()

        print(f"Epoch {epoch + 1} - Average Loss: {running_loss / len(trial_train_loader)}")

    metrics = evaluate_dev(model, trial_dev_loader, DEVICE)
    micro_f1 = metrics["micro_f1"]
    # Keep the secondary metric with the trial; only micro-F1 is optimised.
    trial.set_user_attr("bin_ap", float(metrics["bin_ap"]))
    print("Trial metrics:", str(metrics))
    return micro_f1

In [15]:
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not
# currently using (e.g. left over from the warm-up) before the search starts.
torch.cuda.empty_cache()

In [None]:
# TPE is Optuna's default sampler; it is created explicitly only so that it can
# be seeded, which makes the sequence of suggested hyperparameters repeatable.
# (The training runs themselves are repeatable only if the torch/numpy RNGs are
# seeded as well.)
SAMPLER_SEED = 42
study = optuna.create_study(
    direction="maximize",
    study_name="pcl_hyperparam_search",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
# gc_after_trial=True runs a garbage collection after every trial so that the
# previous trial's model and optimizer are released before the next one starts.
study.optimize(objective, n_trials=50, gc_after_trial=True)

[32m[I 2026-02-19 16:46:41,491][0m A new study created in memory with name: no-name-6798b5e8-d2f7-4788-8ecc-cd586841645b[0m


Config: {'model_name': 'albert-base-v2', 'lr': 3.6915697620432926e-05, 'wd': 0.0005918506109375099, 'max_len': 128, 'dropout': 0, 'epochs': 10}
Epoch 1 - Average Loss: 0.3933741569497715
Epoch 2 - Average Loss: 0.334785170469234
Epoch 3 - Average Loss: 0.24139006980534872
Epoch 4 - Average Loss: 0.17194928043212684
Epoch 5 - Average Loss: 0.151179707753081


In [None]:
# Report the best trial of the study: its objective value followed by one
# indented line per sampled hyperparameter.
best_trial = study.best_trial
summary_lines = ["Best trial:", f"  Value: {best_trial.value}", "  Params:"]
summary_lines.extend(
    f"    {name}: {setting}" for name, setting in best_trial.params.items()
)
print("\n".join(summary_lines))

In [None]:
# Export the search history: one row per trial holding its sampled parameters
# plus the objective value, written to a CSV file in the working directory.
results = [{**t.params, "value": t.value} for t in study.trials]
res_df = pd.DataFrame(results)
res_df.to_csv("optuna_results.csv", index=False)
print("Saved optuna_results.csv")