## Novel model approach

This notebook serves as the source code for all the model testing and training (along with hyperparam grid search) before the development/submission of the final best model. This model approach tries a variation on the transformer architecture, with different heads, as detailed in the report.

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, average_precision_score
import itertools
from tqdm import tqdm

In [2]:
from pcl_tf.dataset_manager import DatasetManager as DM
from pcl_tf.collation import collate_fn

In [3]:
NUM_LABELS = 7
LOAD_BATCH_SIZE = 16
ACCUM_STEPS = 3  # effective batch size = LOAD_BATCH_SIZE * ACCUM_STEPS = 32
LOCAL_CACHE_DIR = './models_cache'
MODEL_NAME = "albert-base-v2"
NUM_WORKERS = 0
PIN_MEMORY = False
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [4]:
train_labels_path = "data/train_semeval_parids-labels.csv"
dev_labels_path = "data/dev_semeval_parids-labels.csv"
texts_path = "data/dontpatronizeme_pcl_cleaned.csv"
test_path = "data/task4_test.tsv"
cats_path = "data/dontpatronizeme_categories.tsv"

In [5]:
texts_df = pd.read_csv(texts_path, low_memory=False)
texts_df["par_id"] = texts_df["par_id"].astype(int)
texts_df = texts_df.set_index("par_id")

In [6]:
from pcl_tf.feature_engineering import build_auxiliary_features, transform_auxiliary_features

train_labels = pd.read_csv(train_labels_path)
train_par_ids = train_labels["par_id"].astype(int).values
train_texts = texts_df.loc[train_par_ids, "text"]

aux_train, aux_meta = build_auxiliary_features(train_texts, ngram_range=(1, 3),
                                                max_features=200, min_df=5)
AUX_DIM = aux_meta["total_dim"]
print(f"Auxiliary feature dim: {AUX_DIM} (NER={aux_meta['n_ner']}, ngram={aux_meta['n_ngram']})")

Auxiliary feature dim: 218 (NER=18, ngram=200)


In [7]:
dev_labels = pd.read_csv(dev_labels_path)
dev_par_ids = dev_labels["par_id"].astype(int).values
dev_par_ids = dev_par_ids[np.isin(dev_par_ids, texts_df.index)]
dev_texts = texts_df.loc[dev_par_ids, "text"]
aux_dev = transform_auxiliary_features(dev_texts, aux_meta)

In [8]:
training_ds = DM(train_labels_path, texts_df=texts_df, aux_features=aux_train)
training_ds.print_stats()

Total samples: 8375
Binary distribution: [7581  794]
Multilabel distribution: [574. 160. 162. 192. 145. 363.  29.]


In [9]:
dev_ds = DM(dev_labels_path, texts_df=texts_df, aux_features=aux_dev)
dev_ds.print_stats()

Total samples: 2093
Binary distribution: [1894  199]
Multilabel distribution: [142.  36.  62.  38.  52. 106.  11.]


In [10]:
def collate_fn_wrapper(tokenizer):
    def collate_fn_inner(batch):
        return collate_fn(tokenizer, batch)
    return collate_fn_inner

In [None]:
def evaluate_dev(model, dataloader, device):
    """Evaluate model on dev set. Primary metric: F1 of positive (PCL) class."""
    model.eval()
    bin_probs=[]
    bin_labels=[]
    multi_probs=[]
    multi_labels=[]

    with torch.no_grad():
        for b in dataloader:
            input_ids = b["input_ids"].to(device)
            attention_mask = b["attention_mask"].to(device)
            labels = b["labels"].to(device)
            aux_features = b["aux_features"].to(device) if "aux_features" in b else None

            out = model(input_ids=input_ids, attention_mask=attention_mask, aux_features=aux_features)
            
            bin_probs.append(torch.sigmoid(out["logit_bin"]).cpu().numpy())
            multi_probs.append(torch.sigmoid(out["logit_multi"]).cpu().numpy())
            
            bin_labels.append(labels[:,0].cpu().numpy())
            multi_labels.append(labels[:,1:].cpu().numpy())
            
    bin_probs = np.concatenate(bin_probs)
    bin_labels = np.concatenate(bin_labels)
    multi_probs = np.concatenate(multi_probs)
    multi_labels = np.concatenate(multi_labels)

    # Primary task metric: F1 of positive (PCL) class
    bin_preds = (bin_probs >= 0.5).astype(int)
    bin_f1 = f1_score(bin_labels, bin_preds, pos_label=1, zero_division=0)

    # Secondary diagnostics
    multi_micro_f1 = f1_score(multi_labels.flatten(), (multi_probs >= 0.1).flatten(), zero_division=0)
    bin_ap = average_precision_score(bin_labels, bin_probs)

    return {"bin_f1": bin_f1, "multi_micro_f1": multi_micro_f1, "bin_ap": bin_ap}

In [None]:
import optuna
import pcl_tf.collation as pcl_collation
from pcl_tf.tf import PCLModel, get_tokenizer

scaler = torch.amp.GradScaler("cuda")  # for mixed-precision training

def objective(trial):
    model_name = trial.suggest_categorical("model_name", ["albert-base-v2", "microsoft/deberta-v3-small"])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    wd = trial.suggest_float("wd", 1e-4, 1e-2, log=True)
    max_len = trial.suggest_categorical("max_len", [128, 256])
    dropout = 0 # any other dropout value causes grad explosion prob due to small batch size + complex task
    epochs = trial.suggest_int("epochs", 3, 12)

    trial_tokenizer = get_tokenizer(model_name)

    pcl_collation.MAX_LEN = max_len

    trial_train_loader = DataLoader(
        training_ds,
        batch_size=LOAD_BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn_wrapper(trial_tokenizer),
        pin_memory=PIN_MEMORY,
        num_workers=NUM_WORKERS,
    )

    trial_dev_loader = DataLoader(
        dev_ds,
        batch_size=LOAD_BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn_wrapper(trial_tokenizer),
        pin_memory=PIN_MEMORY,
        num_workers=NUM_WORKERS,
    )

    model = PCLModel(model_name, n_labels=NUM_LABELS, aux_dim=AUX_DIM, dropout=dropout, device=DEVICE).to(DEVICE)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

    print("Config:", {"model_name": model_name, "lr": lr, "wd": wd, "max_len": max_len, "dropout": dropout, "epochs": epochs, "accum_steps": ACCUM_STEPS})

    try:
        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            optim.zero_grad()

            for step, batch in enumerate(trial_train_loader):
                input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
                attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
                labels = batch["labels"].to(DEVICE, non_blocking=True)
                aux_features = batch["aux_features"].to(DEVICE, non_blocking=True) if "aux_features" in batch else None

                with torch.amp.autocast("cuda"): # fp16 to halve mem usage to avoid oom
                    out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, aux_features=aux_features)
                    loss = out["loss"] / ACCUM_STEPS

                scaler.scale(loss).backward()
                running_loss += out["loss"].item()

                if (step + 1) % ACCUM_STEPS == 0 or (step + 1) == len(trial_train_loader): # handle odd cases asw
                    scaler.step(optim)
                    scaler.update()
                    optim.zero_grad()

            print(f"Epoch {epoch + 1} - Average Loss: {running_loss / len(trial_train_loader)}")

    except torch.cuda.OutOfMemoryError:
        print("OOM: pruning this trial")
        torch.cuda.empty_cache()
        raise optuna.TrialPruned()

    metrics = evaluate_dev(model, trial_dev_loader, DEVICE)
    print("Trial metrics:", str(metrics))
    del model
    torch.cuda.empty_cache()
    return metrics["bin_f1"]  # optimize for F1 of positive (PCL) class — the actual task metric

In [22]:
torch.cuda.empty_cache()

In [None]:
study = optuna.create_study(direction="maximize", study_name="pcl_hyperparam_search")
study.optimize(objective, n_trials=50, n_jobs=1)

[32m[I 2026-02-19 19:12:37,973][0m A new study created in memory with name: pcl_hyperparam_search[0m


Config: {'model_name': 'microsoft/deberta-v3-small', 'lr': 0.0009847550584412984, 'wd': 0.0026340358425033694, 'max_len': 128, 'dropout': 0, 'epochs': 12, 'accum_steps': 3}


[32m[I 2026-02-19 19:12:40,089][0m Trial 0 pruned. [0m


OOM — pruning this trial


Loading weights:   0%|          | 0/25 [00:00<?, ?it/s]

[1mAlbertModel LOAD REPORT[0m from: albert-base-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
predictions.decoder.bias     | UNEXPECTED |  | 
predictions.LayerNorm.weight | UNEXPECTED |  | 
predictions.bias             | UNEXPECTED |  | 
predictions.LayerNorm.bias   | UNEXPECTED |  | 
predictions.dense.weight     | UNEXPECTED |  | 
predictions.dense.bias       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Config: {'model_name': 'albert-base-v2', 'lr': 2.5452432531905925e-05, 'wd': 0.0003372757019917842, 'max_len': 256, 'dropout': 0, 'epochs': 7, 'accum_steps': 3}
Epoch 1 - Average Loss: 0.28548357745575653
Epoch 2 - Average Loss: 0.21269459506845145
Epoch 3 - Average Loss: 0.16650328542397336
Epoch 4 - Average Loss: 0.10750457924970191
Epoch 5 - Average Loss: 0.06740810489926295
Epoch 6 - Average Loss: 0.04370970121473305
Epoch 7 - Average Loss: 0.03496816375891355


[32m[I 2026-02-19 19:18:21,726][0m Trial 1 finished with value: 0.1590955806783145 and parameters: {'model_name': 'albert-base-v2', 'lr': 2.5452432531905925e-05, 'wd': 0.0003372757019917842, 'max_len': 256, 'epochs': 7}. Best is trial 1 with value: 0.1590955806783145.[0m


Trial metrics: {'micro_f1': 0.1590955806783145, 'bin_ap': 0.5121219553541346}
Config: {'model_name': 'albert-base-v2', 'lr': 4.4197933894302816e-05, 'wd': 0.000156022902935253, 'max_len': 256, 'dropout': 0, 'epochs': 11, 'accum_steps': 3}
Epoch 1 - Average Loss: 0.28941687121311954
Epoch 2 - Average Loss: 0.22256851913176393
Epoch 3 - Average Loss: 0.15807349091655226
Epoch 4 - Average Loss: 0.10286588676530203
Epoch 5 - Average Loss: 0.05558376051040095
Epoch 6 - Average Loss: 0.02704432423356836
Epoch 7 - Average Loss: 0.023363337560605928
Epoch 8 - Average Loss: 0.017925359905667525
Epoch 9 - Average Loss: 0.028770079984795523
Epoch 10 - Average Loss: 0.013773322793624438
Epoch 11 - Average Loss: 0.02480191040191991


[32m[I 2026-02-19 19:27:12,734][0m Trial 2 finished with value: 0.22253653936822254 and parameters: {'model_name': 'albert-base-v2', 'lr': 4.4197933894302816e-05, 'wd': 0.000156022902935253, 'max_len': 256, 'epochs': 11}. Best is trial 2 with value: 0.22253653936822254.[0m


Trial metrics: {'micro_f1': 0.22253653936822254, 'bin_ap': 0.4003592104213016}
Config: {'model_name': 'microsoft/deberta-v3-small', 'lr': 0.0009830970384743143, 'wd': 0.0002600234018347784, 'max_len': 128, 'dropout': 0, 'epochs': 6, 'accum_steps': 3}


[32m[I 2026-02-19 19:27:14,934][0m Trial 3 pruned. [0m


OOM — pruning this trial
Config: {'model_name': 'microsoft/deberta-v3-small', 'lr': 0.0009184137721469565, 'wd': 0.00498768366123752, 'max_len': 256, 'dropout': 0, 'epochs': 7, 'accum_steps': 3}


[32m[I 2026-02-19 19:27:16,636][0m Trial 4 pruned. [0m
[32m[I 2026-02-19 19:27:16,792][0m Trial 5 pruned. [0m


OOM — pruning this trial
Config: {'model_name': 'albert-base-v2', 'lr': 0.00015421821737335732, 'wd': 0.00957464662719906, 'max_len': 128, 'dropout': 0, 'epochs': 11, 'accum_steps': 3}
OOM — pruning this trial


[33m[W 2026-02-19 19:27:16,915][0m Trial 6 failed with parameters: {'model_name': 'albert-base-v2', 'lr': 1.3151297698966115e-05, 'wd': 0.0005457268146496025, 'max_len': 256, 'epochs': 12} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 7.65 GiB of which 41.19 MiB is free. Process 10951 has 21.41 MiB memory in use. Including non-PyTorch memory, this process has 6.94 GiB memory in use. Of the allocated memory 6.65 GiB is allocated by PyTorch, and 110.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)').[0m
Traceback (most recent call last):
  File [35m"/home/pranav/Code/pcl-detection/venv/lib/python3.13/site-packages/optuna/study/_optimize.py"[0m, line [35m206[0m, in [3

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 7.65 GiB of which 41.19 MiB is free. Process 10951 has 21.41 MiB memory in use. Including non-PyTorch memory, this process has 6.94 GiB memory in use. Of the allocated memory 6.65 GiB is allocated by PyTorch, and 110.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
best_trial = study.best_trial
print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

In [None]:
results = []
for trial in study.trials:
    results.append({**trial.params, "value": trial.value})
res_df = pd.DataFrame(results)
res_df.to_csv("optuna_results.csv", index=False)
print("Saved optuna_results.csv")