In [1]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score
from transformers import BertModel
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Using {device} device')

  from .autonotebook import tqdm as notebook_tqdm


Using cuda device


In [2]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")

In [3]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # 把中文標點替換掉
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [4]:
# Define the hyperparameters
lr = 3e-5
epochs = 10
train_batch_size = 8
validation_batch_size = 8

In [5]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # Get premise and hypothesis texts
    premises = [d['premise'] for d in batch]
    hypotheses = [d['hypothesis'] for d in batch]
    
    # Concatenate premise and hypothesis with [SEP] token for BERT input
    text_pairs = list(zip(premises, hypotheses))
    
    # Tokenize using BERT tokenizer
    encoded = tokenizer.batch_encode_plus(
        text_pairs,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    # Get labels - fix the key name here
    relatedness_scores = torch.tensor([d['relatedness_score'] for d in batch], dtype=torch.float)
    entailment_labels = torch.tensor([d['entailment_judgment'] for d in batch], dtype=torch.long)
    
    return (
        encoded['input_ids'], 
        encoded['attention_mask'],
        encoded['token_type_ids'],
        relatedness_scores,
        entailment_labels
    )

# Create DataLoaders
dl_train = DataLoader(
    SemevalDataset(split="train"),
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

dl_validation = DataLoader(
    SemevalDataset(split="validation"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [6]:
# TODO2: Construct your model

class MultiOutputModel(nn.Module):
    def __init__(self):
        super(MultiOutputModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # Regression task
        self.classifier = nn.Linear(self.bert.config.hidden_size, 3)  # Classification task

    def forward(self, input_ids, attention_mask, token_type_ids):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = bert_output.pooler_output
        relatedness_score = self.regressor(hidden_state)
        entailment_judgement = self.classifier(hidden_state)
        return relatedness_score, entailment_judgement


In [7]:
# Improve: Add dropout and deeper task-specific heads

MIDDLE_LAYER_SIZE = 2048

class EnhancedMultiOutputModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        
        # Add dropout
        self.dropout = nn.Dropout(0.3)
        
        # Deeper task-specific heads
        self.regression_head = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, MIDDLE_LAYER_SIZE),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(MIDDLE_LAYER_SIZE, 1)
        )
        
        self.classification_head = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, MIDDLE_LAYER_SIZE),
            nn.ReLU(), 
            nn.Dropout(0.2),
            nn.Linear(MIDDLE_LAYER_SIZE, 3)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        
        relatedness_score = self.regression_head(pooled_output)
        entailment_logits = self.classification_head(pooled_output)
        
        return relatedness_score, entailment_logits

In [8]:
# model = MultiOutputModel().to(device)
model = EnhancedMultiOutputModel().to(device)

In [9]:
# TODO3: Define your optimizer and loss function

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)

# Loss functions
regression_loss = nn.MSELoss()  # For relatedness score regression
classification_loss = nn.CrossEntropyLoss()  # For entailment classification

# scoring functions
spc = SpearmanCorrCoef()
acc = Accuracy(task="multiclass", num_classes=3)
f1 = F1Score(task="multiclass", num_classes=3, average='macro')



In [10]:
@torch.no_grad()
def evaluate(model, dataloader):
    model.eval()
    all_rel_preds = []
    all_rel_true = []
    all_ent_preds = []
    all_ent_true = []
    
    for batch in dataloader:
        # Unpack batch and move to device
        input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
            x.to(device) for x in batch
        ]
        
        # Forward pass
        rel_pred, ent_pred = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Store predictions and true labels
        all_rel_preds.extend(rel_pred.squeeze().cpu().tolist())
        all_rel_true.extend(rel_scores.cpu().tolist())
        all_ent_preds.extend(ent_pred.argmax(dim=-1).cpu().tolist())
        all_ent_true.extend(ent_labels.cpu().tolist())
    
    # Calculate metrics
    rel_preds = torch.tensor(all_rel_preds)
    rel_true = torch.tensor(all_rel_true)
    ent_preds = torch.tensor(all_ent_preds)
    ent_true = torch.tensor(all_ent_true)
    
    spearman = spc(rel_preds, rel_true)
    accuracy = acc(ent_preds, ent_true)
    f1_macro = f1(ent_preds, ent_true)
    
    return {
        'spearman': spearman.item(),
        'accuracy': accuracy.item(),
        'f1_macro': f1_macro.item()
    }

In [11]:
import time

start_train_time = time.strftime("%Y%m%d_%H%M%S", time.localtime())
best_spearman = -1  # Initialize best spearman correlation

for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    
    for batch in pbar:
        # Clear gradients
        optimizer.zero_grad()
        
        # Unpack batch and move to device
        input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
            x.to(device) for x in batch
        ]
        
        # Forward pass
        rel_pred, ent_pred = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Calculate losses
        loss_rel = regression_loss(rel_pred.squeeze(), rel_scores)
        loss_ent = classification_loss(ent_pred, ent_labels)
        
        # Combine losses
        total_loss = loss_rel + loss_ent
        
        # Backward pass
        total_loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Update progress bar
        pbar.set_postfix({
            'rel_loss': f'{loss_rel.item():.4f}',
            'ent_loss': f'{loss_ent.item():.4f}'
        })
    
    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    metrics = evaluate(model, dl_validation)

    print(f"Epoch {ep+1} Validation Metrics:")
    print(f"Spearman: {metrics['spearman']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1-macro: {metrics['f1_macro']:.4f}")
    
    # Save the model
    torch.save(model, f'./{start_train_time}-ep-{ep}.ckpt')
    
    # Save the best model
    if metrics['spearman'] > best_spearman:
        best_spearman = metrics['spearman']
        torch.save(model, f'./{start_train_time}-best.ckpt')
        print(f"Best model saved with Spearman: {best_spearman:.4f}")

Training epoch [1/10]: 100%|██████████| 563/563 [00:22<00:00, 25.50it/s, rel_loss=0.2759, ent_loss=0.3555]
Validation epoch [1/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 1 Validation Metrics:
Spearman: 0.8247
Accuracy: 0.8140
F1-macro: 0.8181
Best model saved with Spearman: 0.8247


Validation epoch [1/10]:   0%|          | 0/63 [00:01<?, ?it/s]
Training epoch [2/10]: 100%|██████████| 563/563 [00:22<00:00, 25.24it/s, rel_loss=0.0995, ent_loss=0.2053]
Validation epoch [2/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 2 Validation Metrics:
Spearman: 0.8315
Accuracy: 0.8560
F1-macro: 0.8532
Best model saved with Spearman: 0.8315


Validation epoch [2/10]:   0%|          | 0/63 [00:01<?, ?it/s]
Training epoch [3/10]: 100%|██████████| 563/563 [00:22<00:00, 25.52it/s, rel_loss=0.4531, ent_loss=1.2038]
Validation epoch [3/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 3 Validation Metrics:
Spearman: 0.8475
Accuracy: 0.8540
F1-macro: 0.8539
Best model saved with Spearman: 0.8475


Validation epoch [3/10]:   0%|          | 0/63 [00:01<?, ?it/s]
Training epoch [4/10]: 100%|██████████| 563/563 [00:22<00:00, 24.64it/s, rel_loss=0.2996, ent_loss=0.0379]
Validation epoch [4/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 4 Validation Metrics:
Spearman: 0.8479
Accuracy: 0.8420
F1-macro: 0.8400
Best model saved with Spearman: 0.8479


Validation epoch [4/10]:   0%|          | 0/63 [00:01<?, ?it/s]
Training epoch [5/10]: 100%|██████████| 563/563 [00:22<00:00, 25.19it/s, rel_loss=0.0395, ent_loss=0.0045]
Validation epoch [5/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 5 Validation Metrics:
Spearman: 0.8393
Accuracy: 0.8340
F1-macro: 0.8344


Validation epoch [5/10]:   0%|          | 0/63 [00:00<?, ?it/s]
Training epoch [6/10]: 100%|██████████| 563/563 [00:21<00:00, 25.75it/s, rel_loss=0.0453, ent_loss=0.0092]
Validation epoch [6/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 6 Validation Metrics:
Spearman: 0.8504
Accuracy: 0.8840
F1-macro: 0.8739
Best model saved with Spearman: 0.8504


Validation epoch [6/10]:   0%|          | 0/63 [00:01<?, ?it/s]
Training epoch [7/10]: 100%|██████████| 563/563 [00:21<00:00, 25.97it/s, rel_loss=0.0468, ent_loss=0.2261]
Validation epoch [7/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 7 Validation Metrics:
Spearman: 0.8376
Accuracy: 0.8480
F1-macro: 0.8446


Validation epoch [7/10]:   0%|          | 0/63 [00:00<?, ?it/s]
Training epoch [8/10]: 100%|██████████| 563/563 [00:21<00:00, 26.35it/s, rel_loss=0.2946, ent_loss=0.0016]
Validation epoch [8/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 8 Validation Metrics:
Spearman: 0.8436
Accuracy: 0.8700
F1-macro: 0.8635


Validation epoch [8/10]:   0%|          | 0/63 [00:00<?, ?it/s]
Training epoch [9/10]: 100%|██████████| 563/563 [00:21<00:00, 26.45it/s, rel_loss=0.0790, ent_loss=0.0041]
Validation epoch [9/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 9 Validation Metrics:
Spearman: 0.8376
Accuracy: 0.8620
F1-macro: 0.8523


Validation epoch [9/10]:   0%|          | 0/63 [00:00<?, ?it/s]
Training epoch [10/10]: 100%|██████████| 563/563 [00:21<00:00, 26.69it/s, rel_loss=0.0877, ent_loss=0.0008]
Validation epoch [10/10]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch 10 Validation Metrics:
Spearman: 0.8406
Accuracy: 0.8480
F1-macro: 0.8408


For test set predictions, you can write perform evaluation simlar to #TODO5.

In [12]:
# Load test dataset
test_dataset = load_dataset("sem_eval_2014_task_1", split="test", cache_dir="./cache/")

# Create test dataloader
dl_test = DataLoader(
   test_dataset,
   batch_size=validation_batch_size,
   shuffle=False,
   collate_fn=collate_fn
)

# Load best model
best_model = torch.load('./ep2.ckpt')
best_model.eval()

@torch.no_grad()
def predict_test(model, dataloader):
   pbar = tqdm(dataloader)
   pbar.set_description("Testing")
   all_rel_preds = []
   all_rel_true = []
   all_ent_preds = []
   all_ent_true = []
   
   for batch in pbar:
       input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
           x.to(device) for x in batch
       ]
       
       rel_pred, ent_pred = model(
           input_ids=input_ids,
           attention_mask=attention_mask, 
           token_type_ids=token_type_ids
       )
       
       all_rel_preds.extend(rel_pred.squeeze().cpu().tolist())
       all_rel_true.extend(rel_scores.cpu().tolist())
       all_ent_preds.extend(ent_pred.argmax(dim=-1).cpu().tolist())
       all_ent_true.extend(ent_labels.cpu().tolist())
   
   # Calculate metrics
   rel_preds = torch.tensor(all_rel_preds)
   rel_true = torch.tensor(all_rel_true)
   ent_preds = torch.tensor(all_ent_preds)
   ent_true = torch.tensor(all_ent_true)
   
   spearman = spc(rel_preds, rel_true)
   accuracy = acc(ent_preds, ent_true)
   f1_macro = f1(ent_preds, ent_true)
   
   return {
       'spearman': spearman.item(),
       'accuracy': accuracy.item(),
       'f1_macro': f1_macro.item()
   }

# Run predictions
test_metrics = predict_test(best_model, dl_test)
print("\nTest Set Results:")
print(f"Spearman Correlation: {test_metrics['spearman']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}") 
print(f"F1 Macro: {test_metrics['f1_macro']:.4f}")

  best_model = torch.load('./ep2.ckpt')
Testing: 100%|██████████| 616/616 [00:05<00:00, 112.18it/s]


Test Set Results:
Spearman Correlation: 0.8400
Accuracy: 0.8851
F1 Macro: 0.8777





In [13]:
# Error analysis

@torch.no_grad()
def predict_test(model, dataloader):
    pbar = tqdm(dataloader)
    pbar.set_description("Testing")
    all_rel_preds = []
    all_rel_true = []
    all_ent_preds = []
    all_ent_true = []
    error_predictions = []
    
    for batch in pbar:
        input_ids, attention_mask, token_type_ids, rel_scores, ent_labels = [
            x.to(device) for x in batch
        ]
        
        rel_pred, ent_pred = model(
            input_ids=input_ids,
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )
        
        all_rel_preds.extend(rel_pred.squeeze().cpu().tolist())
        all_rel_true.extend(rel_scores.cpu().tolist())
        all_ent_preds.extend(ent_pred.argmax(dim=-1).cpu().tolist())
        all_ent_true.extend(ent_labels.cpu().tolist())
        
        # Store incorrect predictions
        for i in range(len(ent_labels)):
            if ent_pred.argmax(dim=-1)[i] != ent_labels[i]:
                error_predictions.append({
                    'premise': tokenizer.decode(input_ids[i], skip_special_tokens=True),
                    'true_label': ent_labels[i].cpu().item(),
                    'predicted_label': ent_pred.argmax(dim=-1)[i].cpu().item()
                })
    
    # Calculate metrics
    rel_preds = torch.tensor(all_rel_preds)
    rel_true = torch.tensor(all_rel_true)
    ent_preds = torch.tensor(all_ent_preds)
    ent_true = torch.tensor(all_ent_true)
    
    spearman = spc(rel_preds, rel_true)
    accuracy = acc(ent_preds, ent_true)
    f1_macro = f1(ent_preds, ent_true)
    
    return {
        'spearman': spearman.item(),
        'accuracy': accuracy.item(),
        'f1_macro': f1_macro.item(),
        'error_predictions': error_predictions
    }

# Run predictions
test_metrics = predict_test(best_model, dl_test)
print("\nTest Set Results:")
print(f"Spearman Correlation: {test_metrics['spearman']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}") 
print(f"F1 Macro: {test_metrics['f1_macro']:.4f}")

# Error count:
False_Label = [error['true_label'] for error in test_metrics['error_predictions']]
for i in range(3):
    print(f"Error count for label {i}: {False_Label.count(i)}")

# Print error predictions
print("\nError Predictions:")
for error in test_metrics['error_predictions']:
    print(f"Premise: {error['premise']}")
    print(f"True Label: {error['true_label']}")
    print(f"Predicted Label: {error['predicted_label']}")
    print()


Testing: 100%|██████████| 616/616 [00:05<00:00, 108.75it/s]


Test Set Results:
Spearman Correlation: 0.8400
Accuracy: 0.8851
F1 Macro: 0.8777
Error count for label 0: 272
Error count for label 1: 182
Error count for label 2: 112

Error Predictions:
Premise: a person in a black jacket is doing tricks on a motorbike a person on a black motorbike is doing tricks with a jacket
True Label: 0
Predicted Label: 1

Premise: the player is missing the basket and a crowd is in background the player is dunking the basketball into the net and a crowd is in background
True Label: 2
Predicted Label: 0

Premise: there is no man dunking the ball at a basketball game the player is dunking the basketball into the net and a crowd is in background
True Label: 2
Predicted Label: 0

Premise: two people are kickboxing and spectators are watching two people are fighting and spectators are watching
True Label: 1
Predicted Label: 0

Premise: two spectators are kickboxing and some people are watching two people are kickboxing and spectators are watching
True Label: 0
Predi


