## Set Up

In [None]:
#!pip install huggingface_hub



In [1]:
# Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import DistilBertTokenizerFast

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from transformers import (
    DistilBertModel,
    DistilBertConfig,
    DistilBertTokenizerFast,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, f1_score
# from datasets import load_metric

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
# Colab
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Load Data

In [3]:
# Path
path = '/content/drive/MyDrive/DS Project - Moderately'

In [4]:
babe_polibias_labeled = pd.read_csv(path + '/data/babe_polibias_combined_labeled.csv')
babe_polibias_labeled.head()

Unnamed: 0,text,ideology,factuality,data_source
0,NYPD Commissioner Dermot Shea on Monday expres...,3,0,babe
1,School systems across the country are adopting...,1,2,babe
2,"And then along came President Barry Obama, who...",1,2,babe
3,"The curfews, which have never before occurred ...",3,0,babe
4,"Rather than help be a part of the solution, Tr...",4,2,babe


In [None]:
print(babe_polibias_labeled.shape)

(4779, 4)


In [None]:
babe_polibias_labeled['ideology'].value_counts().sort_index()

Unnamed: 0_level_0,count
ideology,Unnamed: 1_level_1
0,705
1,700
2,1804
3,942
4,628


In [None]:
babe_polibias_labeled['factuality'].value_counts().sort_index()

Unnamed: 0_level_0,count
factuality,Unnamed: 1_level_1
0,1783
1,1500
2,1496


## Shuffle & Split

In [5]:
train_val_df, test_df = train_test_split(
    # babe_polibias_labeled['text'], babe_polibias_labeled['label'], # Two labels columns - should not specify text and label columns
    babe_polibias_labeled, # Instead specify the whole data
    test_size = 0.15,
    stratify = babe_polibias_labeled['ideology'], # Stratified on ideology
    shuffle = True,
    random_state = 42
)

train_df, val_df = train_test_split(
    train_val_df,
    test_size = 0.1765,
    stratify = train_val_df['ideology'],
    shuffle = True,
    random_state = 42
)

print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(3345, 4)
(717, 4)
(717, 4)


In [None]:
train_df['ideology'].value_counts().sort_index()

Unnamed: 0_level_0,count
ideology,Unnamed: 1_level_1
0,493
1,490
2,1262
3,660
4,440


In [None]:
train_df['factuality'].value_counts().sort_index()

Unnamed: 0_level_0,count
factuality,Unnamed: 1_level_1
0,1233
1,1050
2,1062


## Prepare Model Datasets

In [6]:
# Label encoders
ideology_enc = LabelEncoder()
factuality_enc = LabelEncoder()

train_df['ideology_label'] = ideology_enc.fit_transform(train_df['ideology'])
train_df['factuality_label'] = factuality_enc.fit_transform(train_df['factuality'])

val_df['ideology_label'] = ideology_enc.transform(val_df['ideology'])
val_df['factuality_label'] = factuality_enc.transform(val_df['factuality'])

test_df['ideology_label'] = ideology_enc.transform(test_df['ideology'])
test_df['factuality_label'] = factuality_enc.transform(test_df['factuality'])

# Save for inverse lookup later
id2ideology = dict(enumerate(ideology_enc.classes_))
id2factuality = dict(enumerate(factuality_enc.classes_))

In [7]:
# Convert to HuggingFace DatasetDict
train_ds = Dataset.from_pandas(train_df[['text', 'ideology_label', 'factuality_label']])
val_ds = Dataset.from_pandas(val_df[['text', 'ideology_label', 'factuality_label']])
test_ds = Dataset.from_pandas(test_df[['text', 'ideology_label', 'factuality_label']])

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize everything
tokenized_dataset = dataset.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/3345 [00:00<?, ? examples/s]

Map:   0%|          | 0/717 [00:00<?, ? examples/s]

Map:   0%|          | 0/717 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'ideology_label', 'factuality_label']
)

## Baseline

In [None]:
# Define multitask model

class MultiTaskDistilBert(nn.Module):
    def __init__(self, num_ideology_labels, num_factuality_labels):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.ideology_head   = nn.Linear(hidden_size, num_ideology_labels)
        self.factuality_head = nn.Linear(hidden_size, num_factuality_labels)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, ideology_label=None, factuality_label=None):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.last_hidden_state[:,0])               # [CLS] token

        ideology_logits   = self.ideology_head(pooled)
        factuality_logits = self.factuality_head(pooled)

        loss = None
        if ideology_label is not None and factuality_label is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_ideo = loss_fct(ideology_logits,   ideology_label)
            loss_fact = loss_fct(factuality_logits, factuality_label)
            loss = (loss_ideo + loss_fact) / 2

        return {
            "loss": loss,
            "ideology_logits": ideology_logits,
            "factuality_logits": factuality_logits
        }

In [10]:
# Data loaders

batch_size = 16

# Grab your pandas splits (you already have `train_df`, `val_df`, `test_df`)
train_texts = train_df['text'].tolist()
val_texts   = val_df['text'].tolist()
test_texts  = test_df['text'].tolist()

train_ideo_labels = torch.tensor(train_df['ideology_label'].values,   dtype=torch.long)
train_fact_labels = torch.tensor(train_df['factuality_label'].values, dtype=torch.long)

val_ideo_labels   = torch.tensor(val_df['ideology_label'].values,     dtype=torch.long)
val_fact_labels   = torch.tensor(val_df['factuality_label'].values,   dtype=torch.long)

test_ideo_labels  = torch.tensor(test_df['ideology_label'].values,    dtype=torch.long)
test_fact_labels  = torch.tensor(test_df['factuality_label'].values,  dtype=torch.long)

# Tokenize with return_tensors='pt'
train_enc = tokenizer( train_texts, padding='max_length', truncation=True,
                       max_length=128, return_tensors='pt' )
val_enc   = tokenizer( val_texts,   padding='max_length', truncation=True,
                       max_length=128, return_tensors='pt' )
test_enc  = tokenizer( test_texts,  padding='max_length', truncation=True,
                       max_length=128, return_tensors='pt' )

# Build TensorDatasets
train_dataset = TensorDataset(
    train_enc['input_ids'],
    train_enc['attention_mask'],
    train_ideo_labels,
    train_fact_labels
)
val_dataset = TensorDataset(
    val_enc['input_ids'],
    val_enc['attention_mask'],
    val_ideo_labels,
    val_fact_labels
)
test_dataset = TensorDataset(
    test_enc['input_ids'],
    test_enc['attention_mask'],
    test_ideo_labels,
    test_fact_labels
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)

In [None]:
# Instantiate model, optimizer, scheduler

num_ideology_labels   = len(ideology_enc.classes_)
num_factuality_labels = len(factuality_enc.classes_)

model = MultiTaskDistilBert(num_ideology_labels, num_factuality_labels)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

In [20]:
# Define metrics

def eval_epoch(model, loader):
    model.eval()
    all_ideo_preds, all_ideo_labels = [], []
    all_fact_preds, all_fact_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, ideo_lbl, fact_lbl in loader:
            input_ids      = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            out = model(input_ids=input_ids, attention_mask=attention_mask)
            # get class predictions
            ideo_preds = out["ideology_logits"].argmax(dim=-1).cpu().tolist()
            fact_preds = out["factuality_logits"].argmax(dim=-1).cpu().tolist()

            all_ideo_preds  += ideo_preds
            all_fact_preds  += fact_preds
            all_ideo_labels += ideo_lbl.tolist()
            all_fact_labels += fact_lbl.tolist()

    # compute metrics with sklearn
    ideo_acc = accuracy_score(all_ideo_labels, all_ideo_preds)
    fact_acc = accuracy_score(all_fact_labels, all_fact_preds)
    ideo_f1  = f1_score(all_ideo_labels, all_ideo_preds, average="macro")
    fact_f1  = f1_score(all_fact_labels, all_fact_preds, average="macro")

    return {
        "ideo_acc": ideo_acc,
        "fact_acc": fact_acc,
        "ideo_f1":  ideo_f1,
        "fact_f1":  fact_f1
    }

In [None]:
# Training loop

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    # unpack the tuple directly:
    for input_ids, attention_mask, ideo_lbl, fact_lbl in train_loader:
        # move to GPU
        input_ids      = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        ideo_lbl       = ideo_lbl.to(device)
        fact_lbl       = fact_lbl.to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            ideology_label=ideo_lbl,
            factuality_label=fact_lbl
        )
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    val_metrics = eval_epoch(val_loader)  # eval_epoch also needs to unpack tuples similarly

    print(
        f"Epoch {epoch:>2} | Train Loss: {avg_train_loss:.4f}"
        f" | Val Ideo Acc: {val_metrics['ideo_acc']:.4f},"
        f" Val Fact Acc: {val_metrics['fact_acc']:.4f}"
        f" | Val Ideo F1: {val_metrics['ideo_f1']:.4f},"
        f" Val Fact F1: {val_metrics['fact_f1']:.4f}"
    )

Epoch  1 | Train Loss: 0.4709 | Val Ideo Acc: 0.5676, Val Fact Acc: 0.7573 | Val Ideo F1: 0.5204, Val Fact F1: 0.7563
Epoch  2 | Train Loss: 0.3912 | Val Ideo Acc: 0.5676, Val Fact Acc: 0.7573 | Val Ideo F1: 0.5204, Val Fact F1: 0.7563
Epoch  3 | Train Loss: 0.3911 | Val Ideo Acc: 0.5676, Val Fact Acc: 0.7573 | Val Ideo F1: 0.5204, Val Fact F1: 0.7563


In [None]:
# Example: compute majority-class accuracy on val split
most_common_ideo = val_df['ideology'].mode()[0]
ideo_baseline_acc = (val_df['ideology'] == most_common_ideo).mean()

most_common_fact = val_df['factuality'].mode()[0]
fact_baseline_acc = (val_df['factuality'] == most_common_fact).mean()

print("Ideo baseline:", ideo_baseline_acc)
print("Fact baseline:", fact_baseline_acc)


Ideo baseline: 0.37796373779637377
Fact baseline: 0.38633193863319387


In [None]:
print(train_df['ideology'].value_counts(normalize=True))
print(train_df['factuality'].value_counts(normalize=True))


ideology
2    0.377280
3    0.197309
0    0.147384
1    0.146487
4    0.131540
Name: proportion, dtype: float64
factuality
0    0.368610
2    0.317489
1    0.313901
Name: proportion, dtype: float64


So far: baseline DistilBERT validation accuracies are *poor* but they exceed the naive majority class baselines (guessing the most common example), so the model learned something. It is concerning that the model didn't learn anything past the first epoch. This could be due to class imbalance (underfitting minority classes, esp. since factuality guessing is much better), and class weights could help.

In [None]:
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd

# 1) Gather predictions & labels on the validation set
all_ideo_preds, all_ideo_labels = [], []
all_fact_preds, all_fact_labels = [], []

model.eval()
with torch.no_grad():
    for input_ids, attention_mask, ideo_lbl, fact_lbl in val_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        out = model(input_ids=input_ids, attention_mask=attention_mask)
        all_ideo_preds  += out["ideology_logits"].argmax(dim=-1).cpu().tolist()
        all_fact_preds  += out["factuality_logits"].argmax(dim=-1).cpu().tolist()
        all_ideo_labels += ideo_lbl.tolist()
        all_fact_labels += fact_lbl.tolist()

# 2) Build confusion matrices
ideo_cm = pd.crosstab(
    pd.Series(all_ideo_labels, name="True"),
    pd.Series(all_ideo_preds,   name="Predicted")
)
fact_cm = pd.crosstab(
    pd.Series(all_fact_labels,   name="True"),
    pd.Series(all_fact_preds,    name="Predicted")
)

# 3) Compute per-class F1
ideo_f1 = f1_score(all_ideo_labels, all_ideo_preds, average=None)
fact_f1 = f1_score(all_fact_labels, all_fact_preds, average=None)

# 4) Map indices back to label names
ideo_names = [id2ideology[i] for i in sorted(id2ideology)]
fact_names = [id2factuality[i] for i in sorted(id2factuality)]

# 5) Display results
print("Ideology Confusion Matrix:")
print(ideo_cm, "\n")

print("Ideology per-class F1:")
for name, score in zip(ideo_names, ideo_f1):
    print(f"  {name}: {score:.3f}")

print("\nFactuality Confusion Matrix:")
print(fact_cm, "\n")

print("Factuality per-class F1:")
for name, score in zip(fact_names, fact_f1):
    print(f"  {name}: {score:.3f}")


Ideology Confusion Matrix:
Predicted   0   1    2   3   4
True                          
0          49  10   15  20  12
1           9  35   24  32   5
2          12  10  207  40   2
3          11  17   32  67  14
4           8   4    7  26  49 

Ideology per-class F1:
  0: 0.503
  1: 0.387
  2: 0.745
  3: 0.411
  4: 0.557

Factuality Confusion Matrix:
Predicted    0    1    2
True                    
0          218   57    2
1           26  160   35
2            9   45  165 

Factuality per-class F1:
  0: 0.823
  1: 0.663
  2: 0.784


Next steps: Try to improve the model and get accuracies as high as possible. Use that to build the fine tuned model. Let's have the final version of the model in a new code for finetuning.

## Improving Model

**Class-weighted Loss** can help when there is class imbalance (as with this data). With imbalanced classes, the model can minimize loss by focusing on the majority class. Class-weighted loss gives each class a multiplier inversely proportional to its frequency, so errors on less frequent classes "hurt" more.

In [12]:
num_ideology_labels   = len(ideology_enc.classes_)
num_factuality_labels = len(factuality_enc.classes_)

print(num_ideology_labels)
print(num_factuality_labels)

5
3


In [14]:
from sklearn.utils.class_weight import compute_class_weight

# Class weights
ideo_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_ideology_labels),
    y=train_df["ideology_label"]
)
fact_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_factuality_labels),
    y=train_df["factuality_label"]
)
ideo_weights = torch.tensor(ideo_weights, dtype=torch.float).to(device)
fact_weights = torch.tensor(fact_weights, dtype=torch.float).to(device)

In [23]:
# Define multitask model with class weights

class MultiTaskDistilBert(nn.Module):
    def __init__(
        self,
        num_ideology_labels: int,
        num_factuality_labels: int,
        class_weights_ideo: torch.Tensor,
        class_weights_fact: torch.Tensor,
        alpha: float = 0.5,
        beta: float = 0.5
    ):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        hidden_size = self.bert.config.hidden_size

        self.ideology_head   = nn.Linear(hidden_size, num_ideology_labels)
        self.factuality_head = nn.Linear(hidden_size, num_factuality_labels)
        self.dropout = nn.Dropout(0.1)

        # weighted CrossEntropy for each head
        self.loss_fct_ideo = nn.CrossEntropyLoss(weight=class_weights_ideo)
        self.loss_fct_fact = nn.CrossEntropyLoss(weight=class_weights_fact)

        # task‐weighting scalars
        self.alpha = alpha
        self.beta  = beta

    def forward(
        self,
        input_ids,
        attention_mask,
        ideology_label=None,
        factuality_label=None
    ):
        out    = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.last_hidden_state[:, 0])

        ideology_logits   = self.ideology_head(pooled)
        factuality_logits = self.factuality_head(pooled)

        loss = None
        if ideology_label is not None and factuality_label is not None:
            loss_ideo = self.loss_fct_ideo(ideology_logits, ideology_label)
            loss_fact = self.loss_fct_fact(factuality_logits, factuality_label)
            loss = self.alpha * loss_ideo + self.beta * loss_fact

        return {
            "loss": loss,
            "ideology_logits": ideology_logits,
            "factuality_logits": factuality_logits
        }


In [None]:
# Instantiate model, optimizer & scheduler
model = MultiTaskDistilBert(
    num_ideology_labels   = num_ideology_labels,
    num_factuality_labels = num_factuality_labels,
    class_weights_ideo    = ideo_weights,
    class_weights_fact    = fact_weights,
    alpha = 0.5,   # try 0.3 or 0.7 too
    beta  = 0.5
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
epochs    = 5
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Training loop

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    # unpack the tuple directly:
    for input_ids, attention_mask, ideo_lbl, fact_lbl in train_loader:
        # move to GPU
        input_ids      = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        ideo_lbl       = ideo_lbl.to(device)
        fact_lbl       = fact_lbl.to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            ideology_label=ideo_lbl,
            factuality_label=fact_lbl
        )
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    val_metrics = eval_epoch(model, val_loader)  # eval_epoch also needs to unpack tuples similarly

    print(
        f"Epoch {epoch:>2} | Train Loss: {avg_train_loss:.4f}"
        f" | Val Ideo Acc: {val_metrics['ideo_acc']:.4f},"
        f" Val Fact Acc: {val_metrics['fact_acc']:.4f}"
        f" | Val Ideo F1: {val_metrics['ideo_f1']:.4f},"
        f" Val Fact F1: {val_metrics['fact_f1']:.4f}"
    )

Epoch  1 | Train Loss: 1.1757 | Val Ideo Acc: 0.4742, Val Fact Acc: 0.7113 | Val Ideo F1: 0.4388, Val Fact F1: 0.7158
Epoch  2 | Train Loss: 0.8445 | Val Ideo Acc: 0.5328, Val Fact Acc: 0.7462 | Val Ideo F1: 0.4724, Val Fact F1: 0.7380
Epoch  3 | Train Loss: 0.6504 | Val Ideo Acc: 0.5537, Val Fact Acc: 0.7350 | Val Ideo F1: 0.5108, Val Fact F1: 0.7329
Epoch  4 | Train Loss: 0.4984 | Val Ideo Acc: 0.5425, Val Fact Acc: 0.7434 | Val Ideo F1: 0.5114, Val Fact F1: 0.7427
Epoch  5 | Train Loss: 0.4171 | Val Ideo Acc: 0.5495, Val Fact Acc: 0.7490 | Val Ideo F1: 0.5089, Val Fact F1: 0.7461


- Initial loss is higher (1.1757 vs 0.4709) because the loss landscape is steeper, meaning mistakes on rare classes carry a bigger penalty. This is expected.
- Accuracy and F1 didn't improve although the model is doing a better job at 'learning' (In the original, these metrics stayed stagnant)
- Class weights slowed down learning, but they eventually recover to the original standards

Ideas on model parameters
- Try weighting harder on ideology (0.7 ideology, 0.3 factuality)
- Increase epochs
- Higher learning rate

In [None]:
model = MultiTaskDistilBert(
    num_ideology_labels   = num_ideology_labels,
    num_factuality_labels = num_factuality_labels,
    class_weights_ideo    = ideo_weights,
    class_weights_fact    = fact_weights,
    alpha = 0.7,
    beta  = 0.3
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
epochs    = 7
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

In [None]:
# Training loop

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    # unpack the tuple directly:
    for input_ids, attention_mask, ideo_lbl, fact_lbl in train_loader:
        # move to GPU
        input_ids      = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        ideo_lbl       = ideo_lbl.to(device)
        fact_lbl       = fact_lbl.to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            ideology_label=ideo_lbl,
            factuality_label=fact_lbl
        )
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    val_metrics = eval_epoch(model, val_loader)  # eval_epoch also needs to unpack tuples similarly

    print(
        f"Epoch {epoch:>2} | Train Loss: {avg_train_loss:.4f}"
        f" | Val Ideo Acc: {val_metrics['ideo_acc']:.4f},"
        f" Val Fact Acc: {val_metrics['fact_acc']:.4f}"
        f" | Val Ideo F1: {val_metrics['ideo_f1']:.4f},"
        f" Val Fact F1: {val_metrics['fact_f1']:.4f}"
    )

Epoch  1 | Train Loss: 1.2895 | Val Ideo Acc: 0.4728, Val Fact Acc: 0.7322 | Val Ideo F1: 0.3710, Val Fact F1: 0.7326
Epoch  2 | Train Loss: 0.9503 | Val Ideo Acc: 0.5216, Val Fact Acc: 0.7294 | Val Ideo F1: 0.4855, Val Fact F1: 0.7216
Epoch  3 | Train Loss: 0.6447 | Val Ideo Acc: 0.5495, Val Fact Acc: 0.7476 | Val Ideo F1: 0.5059, Val Fact F1: 0.7463
Epoch  4 | Train Loss: 0.3996 | Val Ideo Acc: 0.5565, Val Fact Acc: 0.7531 | Val Ideo F1: 0.5117, Val Fact F1: 0.7515
Epoch  5 | Train Loss: 0.2370 | Val Ideo Acc: 0.5537, Val Fact Acc: 0.7420 | Val Ideo F1: 0.5297, Val Fact F1: 0.7403
Epoch  6 | Train Loss: 0.1678 | Val Ideo Acc: 0.5732, Val Fact Acc: 0.7545 | Val Ideo F1: 0.5424, Val Fact F1: 0.7508
Epoch  7 | Train Loss: 0.1284 | Val Ideo Acc: 0.5635, Val Fact Acc: 0.7490 | Val Ideo F1: 0.5297, Val Fact F1: 0.7475


**Hyperparameter Search with Optuna**

In [16]:
!pip install -q optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/246.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
import optuna

In [21]:
# Objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    lr = trial.suggest_loguniform("lr", 1e-5, 5e-5)
    alpha = trial.suggest_float("alpha", 0.3, 0.7)
    beta = 1.0 - alpha

    # Instantiate a fresh model for each trial
    model = MultiTaskDistilBert(
        num_ideology_labels   = len(ideology_enc.classes_),
        num_factuality_labels = len(factuality_enc.classes_),
        class_weights_ideo    = ideo_weights,
        class_weights_fact    = fact_weights,
        alpha = alpha,
        beta  = beta
    ).to(device)

    # Optimizer & scheduler
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    epochs = 3  # use fewer epochs per trial
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Training loop
    for _ in range(epochs):
        model.train()
        for input_ids, attention_mask, ideo_lbl, fact_lbl in train_loader:
            optimizer.zero_grad()
            input_ids      = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            ideo_lbl       = ideo_lbl.to(device)
            fact_lbl       = fact_lbl.to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                ideology_label=ideo_lbl,
                factuality_label=fact_lbl
            )
            outputs["loss"].backward()
            optimizer.step()
            scheduler.step()

    # Validation
    val_metrics = eval_epoch(model, val_loader) # <- This uses eval_epoch previously defined
    avg_f1 = (val_metrics["ideo_f1"] + val_metrics["fact_f1"]) / 2.0
    return avg_f1

# Run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Display best results
print("Best hyperparameters:", study.best_params)
print("Best average F1:", study.best_value)

[I 2025-07-10 00:29:38,221] A new study created in memory with name: no-name-7cc1e3de-8983-4763-b615-cc74e0248f87
  lr = trial.suggest_loguniform("lr", 1e-5, 5e-5)
[I 2025-07-10 00:31:28,804] Trial 0 finished with value: 0.5925721068646828 and parameters: {'lr': 1.715352268802884e-05, 'alpha': 0.3026385404368495}. Best is trial 0 with value: 0.5925721068646828.
  lr = trial.suggest_loguniform("lr", 1e-5, 5e-5)
[I 2025-07-10 00:33:17,999] Trial 1 finished with value: 0.6373129847181076 and parameters: {'lr': 2.248227732520022e-05, 'alpha': 0.6565632225985104}. Best is trial 1 with value: 0.6373129847181076.
  lr = trial.suggest_loguniform("lr", 1e-5, 5e-5)
[I 2025-07-10 00:35:07,068] Trial 2 finished with value: 0.6172758950884849 and parameters: {'lr': 1.5521072355587528e-05, 'alpha': 0.3584260447889318}. Best is trial 1 with value: 0.6373129847181076.
  lr = trial.suggest_loguniform("lr", 1e-5, 5e-5)
[I 2025-07-10 00:36:56,037] Trial 3 finished with value: 0.6164998836446073 and param

Best hyperparameters: {'lr': 2.16411857404266e-05, 'alpha': 0.5848983419474759}
Best average F1: 0.6443016824205028


Best run with parameters: {'lr': 2.16411857404266e-05, 'alpha': 0.5848983419474759}

In [26]:
# Final parameters

# Instantiate

model = MultiTaskDistilBert(
    num_ideology_labels   = num_ideology_labels,
    num_factuality_labels = num_factuality_labels,
    class_weights_ideo    = ideo_weights,
    class_weights_fact    = fact_weights,
    alpha = 0.5848983419474759,
    beta  = 1 - 0.5848983419474759
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2.16411857404266e-05, weight_decay=0.01)
epochs    = 7
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)


# Training loop

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    # unpack the tuple directly:
    for input_ids, attention_mask, ideo_lbl, fact_lbl in train_loader:
        # move to GPU
        input_ids      = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        ideo_lbl       = ideo_lbl.to(device)
        fact_lbl       = fact_lbl.to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            ideology_label=ideo_lbl,
            factuality_label=fact_lbl
        )
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    val_metrics = eval_epoch(model, val_loader)  # eval_epoch also needs to unpack tuples similarly

    print(
        f"Epoch {epoch:>2} | Train Loss: {avg_train_loss:.4f}"
        f" | Val Ideo Acc: {val_metrics['ideo_acc']:.4f},"
        f" Val Fact Acc: {val_metrics['fact_acc']:.4f}"
        f" | Val Ideo F1: {val_metrics['ideo_f1']:.4f},"
        f" Val Fact F1: {val_metrics['fact_f1']:.4f}"
    )

Epoch  1 | Train Loss: 1.2342 | Val Ideo Acc: 0.4840, Val Fact Acc: 0.7350 | Val Ideo F1: 0.4375, Val Fact F1: 0.7340
Epoch  2 | Train Loss: 0.9151 | Val Ideo Acc: 0.5300, Val Fact Acc: 0.7406 | Val Ideo F1: 0.4873, Val Fact F1: 0.7337
Epoch  3 | Train Loss: 0.6840 | Val Ideo Acc: 0.5328, Val Fact Acc: 0.7448 | Val Ideo F1: 0.4957, Val Fact F1: 0.7420
Epoch  4 | Train Loss: 0.4886 | Val Ideo Acc: 0.5774, Val Fact Acc: 0.7448 | Val Ideo F1: 0.5381, Val Fact F1: 0.7404
Epoch  5 | Train Loss: 0.3413 | Val Ideo Acc: 0.5607, Val Fact Acc: 0.7448 | Val Ideo F1: 0.5216, Val Fact F1: 0.7428
Epoch  6 | Train Loss: 0.2553 | Val Ideo Acc: 0.5718, Val Fact Acc: 0.7559 | Val Ideo F1: 0.5332, Val Fact F1: 0.7493
Epoch  7 | Train Loss: 0.2150 | Val Ideo Acc: 0.5676, Val Fact Acc: 0.7448 | Val Ideo F1: 0.5287, Val Fact F1: 0.7415


In [27]:
# 1) Gather predictions & labels on the validation set
all_ideo_preds, all_ideo_labels = [], []
all_fact_preds, all_fact_labels = [], []

model.eval()
with torch.no_grad():
    for input_ids, attention_mask, ideo_lbl, fact_lbl in val_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        out = model(input_ids=input_ids, attention_mask=attention_mask)
        all_ideo_preds  += out["ideology_logits"].argmax(dim=-1).cpu().tolist()
        all_fact_preds  += out["factuality_logits"].argmax(dim=-1).cpu().tolist()
        all_ideo_labels += ideo_lbl.tolist()
        all_fact_labels += fact_lbl.tolist()

# 2) Build confusion matrices
ideo_cm = pd.crosstab(
    pd.Series(all_ideo_labels, name="True"),
    pd.Series(all_ideo_preds,   name="Predicted")
)
fact_cm = pd.crosstab(
    pd.Series(all_fact_labels,   name="True"),
    pd.Series(all_fact_preds,    name="Predicted")
)

# 3) Compute per-class F1
ideo_f1 = f1_score(all_ideo_labels, all_ideo_preds, average=None)
fact_f1 = f1_score(all_fact_labels, all_fact_preds, average=None)

# 4) Map indices back to label names
ideo_names = [id2ideology[i] for i in sorted(id2ideology)]
fact_names = [id2factuality[i] for i in sorted(id2factuality)]

# 5) Display results
print("Ideology Confusion Matrix:")
print(ideo_cm, "\n")

print("Ideology per-class F1:")
for name, score in zip(ideo_names, ideo_f1):
    print(f"  {name}: {score:.3f}")

print("\nFactuality Confusion Matrix:")
print(fact_cm, "\n")

print("Factuality per-class F1:")
for name, score in zip(fact_names, fact_f1):
    print(f"  {name}: {score:.3f}")

Ideology Confusion Matrix:
Predicted   0   1    2   3   4
True                          
0          51  17    7  23   8
1           8  38   15  39   5
2          12  21  195  39   4
3          12  19   25  75  10
4           9   4    5  28  48 

Ideology per-class F1:
  0: 0.515
  1: 0.373
  2: 0.753
  3: 0.435
  4: 0.568

Factuality Confusion Matrix:
Predicted    0    1    2
True                    
0          221   51    5
1           32  148   41
2            6   48  165 

Factuality per-class F1:
  0: 0.825
  1: 0.632
  2: 0.767
