In [1]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score
from transformers import BertModel
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Using {device} device')

  from .autonotebook import tqdm as notebook_tqdm


Using cpu device


In [2]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")

In [3]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/"
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # 把中文標點替換掉
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [4]:
# Define the hyperparameters
lr = 3e-5
epochs = 3
train_batch_size = 8
validation_batch_size = 8

In [5]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # Get premise and hypothesis texts
    premises = [d['premise'] for d in batch]
    hypotheses = [d['hypothesis'] for d in batch]
    
    # Concatenate premise and hypothesis with [SEP] token for BERT input
    text_pairs = list(zip(premises, hypotheses))
    
    # Tokenize using BERT tokenizer
    encoded = tokenizer.batch_encode_plus(
        text_pairs,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    # Get labels - fix the key name here
    relatedness_scores = torch.tensor([d['relatedness_score'] for d in batch], dtype=torch.float)
    entailment_labels = torch.tensor([d['entailment_judgment'] for d in batch], dtype=torch.long)
    
    return (
        encoded['input_ids'], 
        encoded['attention_mask'],
        encoded['token_type_ids'],
        relatedness_scores,
        entailment_labels
    )

# Create DataLoaders
dl_train = DataLoader(
    SemevalDataset(split="train"),
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

dl_validation = DataLoader(
    SemevalDataset(split="validation"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [6]:
# TODO2: Construct your model

class MultiOutputModel(nn.Module):
    def __init__(self):
        super(MultiOutputModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # Regression task
        self.classifier = nn.Linear(self.bert.config.hidden_size, 3)  # Classification task

    def forward(self, input_ids, attention_mask, token_type_ids):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = bert_output.pooler_output
        relatedness_score = self.regressor(hidden_state)
        entailment_judgement = self.classifier(hidden_state)
        return relatedness_score, entailment_judgement


In [7]:
model = MultiOutputModel().to(device)

In [8]:
# TODO3: Define your optimizer and loss function

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)

# Loss functions
regression_loss = nn.MSELoss()  # For relatedness score regression
classification_loss = nn.CrossEntropyLoss()  # For entailment classification

# scoring functions
spc = SpearmanCorrCoef()
acc = Accuracy(task="multiclass", num_classes=3)
f1 = F1Score(task="multiclass", num_classes=3, average='macro')



In [9]:
@torch.no_grad()
def evaluate(model, dataloader):
    model.eval()
    all_rel_preds = []
    all_rel_true = []
    all_ent_preds = []
    all_ent_true = []
    
    for batch in dataloader:
        # Unpack batch and move to device
        input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
            x.to(device) for x in batch
        ]
        
        # Forward pass
        rel_pred, ent_pred = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Store predictions and true labels
        all_rel_preds.extend(rel_pred.squeeze().cpu().tolist())
        all_rel_true.extend(rel_scores.cpu().tolist())
        all_ent_preds.extend(ent_pred.argmax(dim=-1).cpu().tolist())
        all_ent_true.extend(ent_labels.cpu().tolist())
    
    # Calculate metrics
    rel_preds = torch.tensor(all_rel_preds)
    rel_true = torch.tensor(all_rel_true)
    ent_preds = torch.tensor(all_ent_preds)
    ent_true = torch.tensor(all_ent_true)
    
    spearman = spc(rel_preds, rel_true)
    accuracy = acc(ent_preds, ent_true)
    f1_macro = f1(ent_preds, ent_true)
    
    return {
        'spearman': spearman.item(),
        'accuracy': accuracy.item(),
        'f1_macro': f1_macro.item()
    }

In [11]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    
    for batch in pbar:
        # Clear gradients
        optimizer.zero_grad()
        
        # Unpack batch and move to device
        input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
            x.to(device) for x in batch
        ]
        
        # Forward pass
        rel_pred, ent_pred = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Calculate losses
        loss_rel = regression_loss(rel_pred.squeeze(), rel_scores)
        loss_ent = classification_loss(ent_pred, ent_labels)
        
        # Combine losses
        total_loss = loss_rel + loss_ent
        
        # Backward pass
        total_loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Update progress bar
        pbar.set_postfix({
            'rel_loss': f'{loss_rel.item():.4f}',
            'ent_loss': f'{loss_ent.item():.4f}'
        })
    
    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    metrics = evaluate(model, dl_validation)

    print(f"Epoch {ep+1} Validation Metrics:")
    print(f"Spearman: {metrics['spearman']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1-macro: {metrics['f1_macro']:.4f}")

    # Save the model
    torch.save(model, f'./ep{ep}.ckpt')

Validation epoch [1/3]:   0%|          | 0/63 [15:39<?, ?it/s]
Training epoch [1/3]: 100%|██████████| 563/563 [06:44<00:00,  1.39it/s, rel_loss=0.3444, ent_loss=1.6482]
Validation epoch [1/3]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 1 Validation Metrics:
Spearman: 0.8245
Accuracy: 0.8800
F1-macro: 0.8762


Validation epoch [1/3]:   0%|          | 0/63 [00:17<?, ?it/s]
Training epoch [2/3]: 100%|██████████| 563/563 [08:14<00:00,  1.14it/s, rel_loss=0.1713, ent_loss=0.0855]
Validation epoch [2/3]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 2 Validation Metrics:
Spearman: 0.8230
Accuracy: 0.8540
F1-macro: 0.8564


Validation epoch [2/3]:   0%|          | 0/63 [00:39<?, ?it/s]
Training epoch [3/3]: 100%|██████████| 563/563 [09:42<00:00,  1.03s/it, rel_loss=0.1916, ent_loss=0.0523]
Validation epoch [3/3]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 3 Validation Metrics:
Spearman: 0.8238
Accuracy: 0.8640
F1-macro: 0.8609


For test set predictions, you can write perform evaluation simlar to #TODO5.

In [12]:
# Load test dataset
test_dataset = load_dataset("sem_eval_2014_task_1", split="test", cache_dir="./cache/")

# Create test dataloader
dl_test = DataLoader(
   test_dataset,
   batch_size=validation_batch_size,
   shuffle=False,
   collate_fn=collate_fn
)

# Load best model
best_model = torch.load('./ep2.ckpt')
best_model.eval()

@torch.no_grad()
def predict_test(model, dataloader):
   pbar = tqdm(dataloader)
   pbar.set_description("Testing")
   all_rel_preds = []
   all_rel_true = []
   all_ent_preds = []
   all_ent_true = []
   
   for batch in pbar:
       input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
           x.to(device) for x in batch
       ]
       
       rel_pred, ent_pred = model(
           input_ids=input_ids,
           attention_mask=attention_mask, 
           token_type_ids=token_type_ids
       )
       
       all_rel_preds.extend(rel_pred.squeeze().cpu().tolist())
       all_rel_true.extend(rel_scores.cpu().tolist())
       all_ent_preds.extend(ent_pred.argmax(dim=-1).cpu().tolist())
       all_ent_true.extend(ent_labels.cpu().tolist())
   
   # Calculate metrics
   rel_preds = torch.tensor(all_rel_preds)
   rel_true = torch.tensor(all_rel_true)
   ent_preds = torch.tensor(all_ent_preds)
   ent_true = torch.tensor(all_ent_true)
   
   spearman = spc(rel_preds, rel_true)
   accuracy = acc(ent_preds, ent_true)
   f1_macro = f1(ent_preds, ent_true)
   
   return {
       'spearman': spearman.item(),
       'accuracy': accuracy.item(),
       'f1_macro': f1_macro.item()
   }

# Run predictions
test_metrics = predict_test(best_model, dl_test)
print("\nTest Set Results:")
print(f"Spearman Correlation: {test_metrics['spearman']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}") 
print(f"F1 Macro: {test_metrics['f1_macro']:.4f}")

  best_model = torch.load('./ep2.ckpt')
Testing: 100%|██████████| 616/616 [01:24<00:00,  7.33it/s]


Test Set Results:
Spearman Correlation: 0.8246
Accuracy: 0.8746
F1 Macro: 0.8664



