## Novel model approach

This notebook serves as the source code for all the model testing and training (along with hyperparam grid search) before the development/submission of the final best model. This model approach tries a variation on the transformer architecture, with different heads, as detailed in the report.

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import f1_score, average_precision_score
import itertools
from tqdm import tqdm

In [4]:
from pcl_tf.dataset_manager import DatasetManager as DM
from pcl_tf.collation import collate_fn
from pcl_tf.tf import warmup_model, get_tokenizer, _ENCODER_CACHE

In [5]:
NUM_LABELS = 7
LOAD_BATCH_SIZE = 64
LOCAL_CACHE_DIR = './models_cache'
MODEL_NAME = "roberta-base"
NUM_WORKERS = 8
PIN_MEMORY = False
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [6]:
print('Warming up tokenizer...')
tokenizer = get_tokenizer(MODEL_NAME)

print('Warming up encoder (downloads model if needed)...')

_ = warmup_model(MODEL_NAME, device=None, cache_dir=LOCAL_CACHE_DIR)
print('Model cache warmup completed.')

Warming up tokenizer...




Warming up encoder (downloads model if needed)...


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaModel LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Model cache warmup completed.


In [7]:
train_labels_path = "data/train_semeval_parids-labels.csv"
dev_labels_path = "data/dev_semeval_parids-labels.csv"
texts_path = "data/dontpatronizeme_pcl_cleaned.csv"
test_path = "data/task4_test.tsv"
cats_path = "data/dontpatronizeme_categories.tsv"

In [8]:
texts_df = pd.read_csv(texts_path, low_memory=False)
texts_df["par_id"] = texts_df["par_id"].astype(int)
texts_df = texts_df.set_index("par_id")

In [9]:
training_ds = DM(train_labels_path, texts_df=texts_df)
training_ds.print_stats()

Total samples: 8375
Binary distribution: [7581  794]
Multilabel distribution: [574. 160. 162. 192. 145. 363.  29.]


In [10]:
dev_ds = DM(dev_labels_path, texts_df=texts_df)
dev_ds.print_stats()

Total samples: 2093
Binary distribution: [1894  199]
Multilabel distribution: [142.  36.  62.  38.  52. 106.  11.]


In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


train_loader = DataLoader(training_ds, batch_size=LOAD_BATCH_SIZE, shuffle=True, 
                          collate_fn=lambda b: collate_fn(tokenizer, b), pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)

dev_loader = DataLoader(dev_ds, batch_size=LOAD_BATCH_SIZE, shuffle=False, 
                        collate_fn=lambda b: collate_fn(tokenizer, b), pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)

In [12]:
def evaluate_dev(model, dataloader, device):
    model.eval()
    bin_probs=[]
    bin_labels=[]
    multi_probs=[]
    multi_labels=[]

    with torch.no_grad():
        for b in dataloader:
            input_ids = b["input_ids"].to(device)
            attention_mask = b["attention_mask"].to(device)
            labels = b["labels"].to(device)
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            bin_probs.append(torch.sigmoid(out["logit_bin"]).cpu().numpy())
            multi_probs.append(torch.sigmoid(out["logit_multi"]).cpu().numpy())
            bin_labels.append(labels[:,0].cpu().numpy())
            multi_labels.append(labels[:,1:].cpu().numpy())
            
    bin_probs = np.concatenate(bin_probs); bin_labels = np.concatenate(bin_labels)
    multi_probs = np.concatenate(multi_probs); multi_labels = np.concatenate(multi_labels)
    micro_f1 = f1_score(multi_labels.flatten(), (multi_probs>=0.5).astype(int).flatten(), zero_division=0)
    return {"micro_f1": micro_f1, "bin_ap": average_precision_score(bin_labels, bin_probs)}


In [13]:
from pcl_tf.tf import PCLModel

def train_and_eval(config):
    model = PCLModel(config["model_name"], n_labels=NUM_LABELS, dropout=config["dropout"], device=DEVICE).to(DEVICE)
    optim = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["wd"])
    print("Model and optimizer created")

    print("Starting training...")
    for epoch in range(config["epochs"]):
        model.train()
        running = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
            attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
            labels = batch["labels"].to(DEVICE, non_blocking=True)
            optim.zero_grad()
            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out["loss"]
            loss.backward()
            optim.step()
            running += loss.item()
        # optionally print per-epoch
        print(f"Epoch {epoch+1}, Loss: {running/len(train_loader)}")
    metrics = evaluate_dev(model, dev_loader, DEVICE)
    return metrics

In [14]:
grid = {
    "model_name": ["roberta-base"],
    "lr": [2e-5, 5e-5],
    "wd": [1e-3, 1e-2, 5e-3],
    "max_len": [128, 256],
    "dropout": [0, 0.1],
    "epochs": [6]
}

In [15]:
torch.cuda.empty_cache()

In [16]:
keys, values = zip(*grid.items())
results = []
for combo in tqdm(list(itertools.product(*values)), desc="Grid"):
    cfg = dict(zip(keys, combo))
    try:
        metrics = train_and_eval(cfg)
        results.append({**cfg, **metrics})
        print("CFG:", cfg, "=>", metrics)
    except Exception as e:
        print("Error for cfg", cfg, ":", e)
        results.append({**cfg, "error": str(e)})

res_df = pd.DataFrame(results)
res_df.to_csv("grid_results.csv", index=False)
print("Saved grid_results.csv")
best_idx = res_df["micro_f1"].idxmax() if "micro_f1" in res_df.columns else None
if best_idx is not None:
    print("Best config:\n", res_df.loc[best_idx].to_dict())
else:
    print("No successful runs found.")

Grid:   0%|          | 0/24 [00:00<?, ?it/s]

Model and optimizer created
Starting training...
Epoch 1, Loss: 0.45656906318573554
Epoch 2, Loss: 0.28324812241182984
Epoch 3, Loss: 0.20947086287818792
Epoch 4, Loss: 0.14777101606187473
Epoch 5, Loss: 0.09722598522674036
Epoch 6, Loss: 0.0835050648177853


Grid:   4%|▍         | 1/24 [07:16<2:47:23, 436.67s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.001, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.273224043715847, 'bin_ap': 0.5950159538533516}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4490828883784418
Epoch 2, Loss: 0.29092072905929944
Epoch 3, Loss: 0.22693791476483563
Epoch 4, Loss: 0.15818579318641707
Epoch 5, Loss: 0.1065674459291551
Epoch 6, Loss: 0.07784430829111401


Grid:   8%|▊         | 2/24 [14:33<2:40:12, 436.95s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.001, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3474178403755869, 'bin_ap': 0.6025107426492349}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.42959114203926263
Epoch 2, Loss: 0.28572234716124206
Epoch 3, Loss: 0.20815658296337564
Epoch 4, Loss: 0.1454774436206763
Epoch 5, Loss: 0.10083209019410246
Epoch 6, Loss: 0.0794652417330569


Grid:  12%|█▎        | 3/24 [21:50<2:32:49, 436.66s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.001, 'max_len': 256, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.3678832116788321, 'bin_ap': 0.6166042886003502}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.432477657349055
Epoch 2, Loss: 0.2846253471349487
Epoch 3, Loss: 0.2073063663000824
Epoch 4, Loss: 0.13581463373230615
Epoch 5, Loss: 0.09174259743503942
Epoch 6, Loss: 0.07854629161533054


Grid:  17%|█▋        | 4/24 [29:07<2:25:35, 436.79s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.001, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3930942895086321, 'bin_ap': 0.6005527050861326}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4300794826664087
Epoch 2, Loss: 0.29072908755704646
Epoch 3, Loss: 0.22438651363130743
Epoch 4, Loss: 0.1579709914121919
Epoch 5, Loss: 0.1183913067153847
Epoch 6, Loss: 0.0841311682584404


Grid:  21%|██        | 5/24 [36:23<2:18:19, 436.79s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.01, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.3645320197044335, 'bin_ap': 0.5850008568565164}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.44270398024156804
Epoch 2, Loss: 0.2824859658952888
Epoch 3, Loss: 0.20380870815906815
Epoch 4, Loss: 0.13900417754895814
Epoch 5, Loss: 0.0955747553069173
Epoch 6, Loss: 0.07450716887095957


Grid:  25%|██▌       | 6/24 [43:40<2:11:01, 436.77s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.01, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.31186440677966104, 'bin_ap': 0.6033739355335254}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.44939536527367946
Epoch 2, Loss: 0.2958022359445805
Epoch 3, Loss: 0.22372344559954324
Epoch 4, Loss: 0.1595347821712494
Epoch 5, Loss: 0.10885829555283066
Epoch 6, Loss: 0.08455285574984915


Grid:  29%|██▉       | 7/24 [50:57<2:03:43, 436.69s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.01, 'max_len': 256, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.36123348017621143, 'bin_ap': 0.5887737848535188}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.44479305757821064
Epoch 2, Loss: 0.2892388499533857
Epoch 3, Loss: 0.20778586032963892
Epoch 4, Loss: 0.14224391972347525
Epoch 5, Loss: 0.09643616069477932
Epoch 6, Loss: 0.07566344597241806


Grid:  33%|███▎      | 8/24 [58:13<1:56:27, 436.70s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.01, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3387096774193548, 'bin_ap': 0.6214037642348211}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4254075916668841
Epoch 2, Loss: 0.28034869928396383
Epoch 3, Loss: 0.2049034678355883
Epoch 4, Loss: 0.13171733196341354
Epoch 5, Loss: 0.09289591395445453
Epoch 6, Loss: 0.08069981159713432


Grid:  38%|███▊      | 9/24 [1:05:30<1:49:08, 436.59s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.005, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.37567567567567567, 'bin_ap': 0.6095496883839383}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.45806975776457604
Epoch 2, Loss: 0.29003186523914337
Epoch 3, Loss: 0.21735542648621187
Epoch 4, Loss: 0.15448838128274633
Epoch 5, Loss: 0.10809075929053867
Epoch 6, Loss: 0.0725292632398942


Grid:  42%|████▏     | 10/24 [1:12:47<1:41:53, 436.66s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.005, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3785234899328859, 'bin_ap': 0.588893416033394}
Model and optimizer created
Starting training...


Grid:  46%|████▌     | 11/24 [1:12:49<1:05:49, 303.77s/it]

Error for cfg {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.005, 'max_len': 256, 'dropout': 0, 'epochs': 6} : CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 7.65 GiB of which 74.00 MiB is free. Process 6267 has 37.41 MiB memory in use. Process 6351 has 39.41 MiB memory in use. Process 44088 has 21.16 MiB memory in use. Including non-PyTorch memory, this process has 7.00 GiB memory in use. Of the allocated memory 6.16 GiB is allocated by PyTorch, and 684.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4572733736220207
Epoch 2, Loss: 0.3011042355126097
Epoch 3, Loss: 0.24049579664951062
Epoch 4, Loss: 0.17803256648756166
Epoch 5, Loss: 0.119833547621965

Grid:  50%|█████     | 12/24 [1:20:06<1:08:51, 344.26s/it]

CFG: {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.005, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3395061728395062, 'bin_ap': 0.5823587998462407}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.402828793489296
Epoch 2, Loss: 0.2732160648771825
Epoch 3, Loss: 0.18088675446751465
Epoch 4, Loss: 0.11908478433451125
Epoch 5, Loss: 0.09738358202610763
Epoch 6, Loss: 0.07412350273740884


Grid:  54%|█████▍    | 13/24 [1:27:23<1:08:16, 372.42s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.001, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.3559096945551129, 'bin_ap': 0.5864831265279478}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4329040566928514
Epoch 2, Loss: 0.30127211222211825
Epoch 3, Loss: 0.2200675446341056
Epoch 4, Loss: 0.15053664568726344
Epoch 5, Loss: 0.11192177881607572
Epoch 6, Loss: 0.08282499665124271


Grid:  58%|█████▊    | 14/24 [1:34:40<1:05:18, 391.85s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.001, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.30560271646859083, 'bin_ap': 0.5626739053148456}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.41875473335954067
Epoch 2, Loss: 0.2827950474870114
Epoch 3, Loss: 0.21356117148094506
Epoch 4, Loss: 0.1368024042942597
Epoch 5, Loss: 0.10277063852417788
Epoch 6, Loss: 0.06543721501779465


Grid:  62%|██████▎   | 15/24 [1:41:57<1:00:48, 405.38s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.001, 'max_len': 256, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.3122923588039867, 'bin_ap': 0.5201969615081425}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4296468081137606
Epoch 2, Loss: 0.2970080159547675
Epoch 3, Loss: 0.2352227999167588
Epoch 4, Loss: 0.1642850016648988
Epoch 5, Loss: 0.10474172828875425
Epoch 6, Loss: 0.0890571853316581


Grid:  67%|██████▋   | 16/24 [1:49:13<55:17, 414.71s/it]  

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.001, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.24285714285714285, 'bin_ap': 0.5499711742720691}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4050332345352828
Epoch 2, Loss: 0.2858610772563301
Epoch 3, Loss: 0.20574631888902825
Epoch 4, Loss: 0.14121462367759405
Epoch 5, Loss: 0.11471200666127314
Epoch 6, Loss: 0.08334310376974008


Grid:  71%|███████   | 17/24 [1:56:30<49:10, 421.49s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.01, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.3151515151515151, 'bin_ap': 0.5561576998732162}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4369251763320151
Epoch 2, Loss: 0.30090474780961757
Epoch 3, Loss: 0.2325960752618222
Epoch 4, Loss: 0.15377018279360452
Epoch 5, Loss: 0.10689818864787808
Epoch 6, Loss: 0.0707852357628805


Grid:  75%|███████▌  | 18/24 [2:03:46<42:35, 425.91s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.01, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.33921302578018997, 'bin_ap': 0.4951717095947703}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4593095365371413
Epoch 2, Loss: 0.34737776822716226
Epoch 3, Loss: 0.2825833195607171
Epoch 4, Loss: 0.2160498268674803
Epoch 5, Loss: 0.16516919235009273
Epoch 6, Loss: 0.11161234081928967


Grid:  79%|███████▉  | 19/24 [2:11:03<35:45, 429.18s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.01, 'max_len': 256, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.21830985915492956, 'bin_ap': 0.4929176806834262}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4326106676163564
Epoch 2, Loss: 0.3193895636510303
Epoch 3, Loss: 0.231781434239322
Epoch 4, Loss: 0.1541283964712656
Epoch 5, Loss: 0.10370032762298147
Epoch 6, Loss: 0.07708807863323515


Grid:  83%|████████▎ | 20/24 [2:18:20<28:46, 431.55s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.01, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.2669039145907473, 'bin_ap': 0.5224543986571739}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.43127605546521774
Epoch 2, Loss: 0.32729125136637505
Epoch 3, Loss: 0.2482476328279226
Epoch 4, Loss: 0.18014279349159648
Epoch 5, Loss: 0.1322674640091776
Epoch 6, Loss: 0.0986728637784947


Grid:  88%|████████▊ | 21/24 [2:25:38<21:39, 433.28s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.005, 'max_len': 128, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.34065934065934067, 'bin_ap': 0.5733371131403427}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4306029257656054
Epoch 2, Loss: 0.3114978073321226
Epoch 3, Loss: 0.23258725590719523
Epoch 4, Loss: 0.15439212814200926
Epoch 5, Loss: 0.09600465111292274
Epoch 6, Loss: 0.08109679958918167


Grid:  92%|█████████▏| 22/24 [2:32:54<14:28, 434.32s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.005, 'max_len': 128, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.3286384976525822, 'bin_ap': 0.5540698397455713}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4094110709679036
Epoch 2, Loss: 0.2792855397094297
Epoch 3, Loss: 0.20516147908136134
Epoch 4, Loss: 0.13374803553669506
Epoch 5, Loss: 0.09884872769524577
Epoch 6, Loss: 0.07762129542254309


Grid:  96%|█████████▌| 23/24 [2:40:11<07:14, 434.96s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.005, 'max_len': 256, 'dropout': 0, 'epochs': 6} => {'micro_f1': 0.31347962382445144, 'bin_ap': 0.5089862991043769}
Model and optimizer created
Starting training...
Epoch 1, Loss: 0.4027164536350556
Epoch 2, Loss: 0.29668836835913986
Epoch 3, Loss: 0.21055090225493636
Epoch 4, Loss: 0.15677081836202672
Epoch 5, Loss: 0.10435222879848408
Epoch 6, Loss: 0.08140688053979218


Grid: 100%|██████████| 24/24 [2:47:28<00:00, 418.68s/it]

CFG: {'model_name': 'roberta-base', 'lr': 5e-05, 'wd': 0.005, 'max_len': 256, 'dropout': 0.1, 'epochs': 6} => {'micro_f1': 0.30973451327433627, 'bin_ap': 0.5267868684231509}
Saved grid_results.csv
Best config:
 {'model_name': 'roberta-base', 'lr': 2e-05, 'wd': 0.001, 'max_len': 256, 'dropout': 0.1, 'epochs': 6, 'micro_f1': 0.3930942895086321, 'bin_ap': 0.6005527050861326, 'error': nan}



