<a href="https://colab.research.google.com/github/vishesh711/NLP-HW3/blob/main/hw3_code_skeleton_Dont_know%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the necessary libraries


In [20]:
%%capture
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

## BERT Features

In this part, you will use BERT features to classify DBPedia articles.
The data is already pre-processed, and the data loader is implemented below.

In [21]:
# Basics: dataset, data loaders, Classifier
import collections
import json
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import random

SPLITS = ['train', 'dev', 'test']

class DBPediaDataset(Dataset):
  '''DBPedia dataset.
    Args:
      path[str]: path to the original data.
  '''
  def __init__(self, path):
    with open(path) as fin:
      self._data = [json.loads(l) for l in fin]
    self._n_classes = len(set([datum['label'] for datum in self._data]))

  def __getitem__(self, index):
    return self._data[index]

  def __len__(self):
    return len(self._data)

  @property
  def n_classes(self):
    return self._n_classes

  @staticmethod
  def collate_fn(tokenizer, device, batch):
    '''The collate function that compresses a training batch.
      Args:
        batch[list[dict[str, Any]]]: data in the batch.
      Returns:
        labels[torch.LongTensor]: the labels in the batch.
        sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
    '''
    labels = torch.tensor([datum['label'] for datum in batch]).long().to(device)
    sentences = tokenizer(
        [datum['sentence'] for datum in batch],
        return_tensors='pt',  # pt = pytorch style tensor
        padding=True)
    for key in sentences:
      sentences[key] = sentences[key].to(device)
    return labels, sentences

def construct_datasets(prefix, batch_size, tokenizer, device):
  '''Constructs datasets and data loaders.
    Args:
      prefix[str]: prefix of the dataset (e.g., dbpedia_).
      batch_size[int]: maximum number of examples in a batch.
      tokenizer: model tokenizer that converts sentences to integer tensors.
      device[torch.device]: the device (cpu/gpu) that the tensor should be on.
    Returns:
      datasets[dict[str, Dataset]]: a dict of constructed datasets.
      dataloaders[dict[str, DataLoader]]: a dict of constructed data loaders.
  '''
  datasets = collections.defaultdict()
  dataloaders = collections.defaultdict()
  for split in SPLITS:
    datasets[split] = DBPediaDataset(f'{prefix}{split}.json')
    dataloaders[split] = DataLoader(
        datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=lambda x:DBPediaDataset.collate_fn(tokenizer, device, x))
  return datasets, dataloaders

In [24]:
# 1.1: [CODE] put your implementation of classifer here
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        # First linear layer (input_size -> hidden_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        # ReLU activation function
        self.relu = nn.ReLU()
        # Second linear layer (hidden_size -> num_classes)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Pass input through first linear layer
        x = self.fc1(x)
        # Apply ReLU activation
        x = self.relu(x)
        # Pass through second linear layer to get logits
        x = self.fc2(x)
        return x

## Training and Evaluation

1.1

In [31]:
print("Q1.1")
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():  # use GPU if available
  bert_model = bert_model.cuda()
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device)

classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
loss_func = nn.CrossEntropyLoss()
pbar = tqdm.tqdm(dataloaders['train'])
for labels, sentences in pbar:
  with torch.no_grad():
    unpooled_features = bert_model(**sentences)['last_hidden_state'] # [B, L, D]
  # 1.1: [CODE] train your classifier here
import random
import numpy as np
from sklearn.metrics import accuracy_score

# Function to set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Lists to store development accuracies and test accuracies
dev_accuracies = []
test_accuracies = []
# List of seed values to use
seed_values = [42, 123, 999, 2021, 7]  # Example seeds
best_dev_accuracy = 0.0
best_model_state = None

# Loop over each seed value
for seed in seed_values:
    set_seed(seed)
    # Initialize the classifier for each run
    classifier = Classifier(
        input_size=bert_model.config.hidden_size,  # Input size is 768 for BERT base
        hidden_size=classifier_hidden_size,        # Hidden layer size as per requirement
        num_classes=datasets['train'].n_classes    # Number of classes (14)
    ).to(bert_model.device)
    # Initialize the optimizer with classifier parameters
    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
    # Define the loss function
    loss_func = nn.CrossEntropyLoss()
    # Training loop for one epoch
    pbar = tqdm.tqdm(dataloaders['train'])
    for labels, sentences in pbar:
        # Use torch.no_grad() since BERT parameters are frozen
        with torch.no_grad():
            # Get unpooled features from BERT (batch_size, seq_length, hidden_size)
            unpooled_features = bert_model(**sentences)['last_hidden_state']
        # 1.1: [CODE] train your classifier here
        # Extract the [CLS] token representation (batch_size, hidden_size)
        cls_embeddings = unpooled_features[:, 0, :]
        # Forward pass through the classifier to get logits
        logits = classifier(cls_embeddings)
        # Compute the cross-entropy loss between logits and labels
        loss = loss_func(logits, labels)
        # Backpropagation and optimization steps
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update progress bar with current loss
        pbar.set_description(f"Seed: {seed} | Loss: {loss.item():.4f}")
    # Evaluate the model on the development set
    classifier.eval()  # Set classifier to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            # Get BERT features
            unpooled_features = bert_model(**sentences)['last_hidden_state']
            # Extract [CLS] token representation
            cls_embeddings = unpooled_features[:, 0, :]
            # Get logits from classifier
            logits = classifier(cls_embeddings)
            # Get predicted classes
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # Calculate accuracy on the development set
    dev_accuracy = accuracy_score(all_labels, all_preds)
    dev_accuracies.append(dev_accuracy)
    print(f"Seed: {seed} | Dev Accuracy: {dev_accuracy:.4f}")

    # Save the best model based on dev accuracy
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = classifier.state_dict()
        best_seed = seed

# Compute mean and standard deviation of development accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"\nMean Dev Accuracy: {mean_dev_accuracy:.4f}")
print(f"Std Dev Accuracy: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
# Load the best model state
classifier.load_state_dict(best_model_state)
classifier.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        # Get BERT features
        unpooled_features = bert_model(**sentences)['last_hidden_state']
        # Extract [CLS] token representation
        cls_embeddings = unpooled_features[:, 0, :]
        # Get logits from classifier
        logits = classifier(cls_embeddings)
        # Get predicted classes
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
# Calculate accuracy on the test set
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"\nBest Model Seed: {best_seed} | Test Accuracy: {test_accuracy:.4f}")
  # 1.1: [CODE] ends here
print("******************************************************************************")
print("Q1.2")
#------------------------------------------------------------------------------------------------------------------------
  # Note: you can re-use this code snippet for 1.2 as well
class Classifier(nn.Module):
    # Added code for 1.2 starts here
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        # First linear layer (input_size -> hidden_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        # ReLU activation function
        self.relu = nn.ReLU()
        # Second linear layer (hidden_size -> num_classes)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Pass input through first linear layer
        x = self.fc1(x)
        # Apply ReLU activation
        x = self.relu(x)
        # Pass through second linear layer to get logits
        x = self.fc2(x)
        return x
    # Added code for 1.2 ends here

# Training and Evaluation

# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

# Initialize tokenizer and BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():  # use GPU if available
    bert_model = bert_model.cuda()

# Construct datasets and data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device
)

# Import necessary modules for setting random seeds and evaluation
import random
import numpy as np
from sklearn.metrics import accuracy_score

# Function to set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Lists to store development accuracies and test accuracies
dev_accuracies = []
test_accuracies = []
# List of seed values to use
seed_values = [42, 123, 999, 2021, 7]  # Example seeds
best_dev_accuracy = 0.0
best_model_state = None

# Loop over each seed value
for seed in seed_values:
    set_seed(seed)
    # Initialize the classifier for each run

    # 1.2: [CODE] Adjust the input size of the classifier
    # Since we are concatenating mean-pooled and max-pooled vectors, each of size 768,
    # the total input size becomes 768 * 2 = 1536
    classifier_input_size = bert_model.config.hidden_size * 2  # 768 * 2 = 1536

    classifier = Classifier(
        input_size=classifier_input_size,
        hidden_size=classifier_hidden_size,
        num_classes=datasets['train'].n_classes
    ).to(bert_model.device)

    # Initialize the optimizer with classifier parameters
    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
    # Define the loss function
    loss_func = nn.CrossEntropyLoss()

    # Training loop for one epoch
    pbar = tqdm.tqdm(dataloaders['train'])
    for labels, sentences in pbar:
        # Use torch.no_grad() since BERT parameters are frozen
        with torch.no_grad():
            # Get unpooled features from BERT
            outputs = bert_model(**sentences)
            unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

            # 1.2: [CODE] Compute mean and max pooling over content tokens
            # Added code starts here

            # Get the attention mask to identify content tokens
            attention_mask = sentences['attention_mask']  # Shape: [B, L]

            # Expand attention mask to match the dimensions of unpooled_features
            # We need to unsqueeze and expand it so that it can be used to mask the embeddings
            # Mask shape will be [B, L, D]
            mask_expanded = attention_mask.unsqueeze(-1).expand(unpooled_features.size()).float()

            # Apply the attention mask to the unpooled features
            # This zeroes out the embeddings of padding tokens
            masked_embeddings = unpooled_features * mask_expanded  # Shape: [B, L, D]

            # Compute mean pooling
            # Sum the embeddings along the sequence length dimension
            sum_embeddings = torch.sum(masked_embeddings, dim=1)  # Shape: [B, D]
            # Sum the attention mask to get the number of valid tokens for each sample
            sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)  # Shape: [B, D]
            # Compute the mean by dividing summed embeddings by the number of valid tokens
            mean_pooled = sum_embeddings / sum_mask  # Shape: [B, D]

            # Compute max pooling
            # For max pooling, we need to replace zeros in masked_embeddings with a very small value
            # so that they do not affect the max operation
            masked_embeddings[mask_expanded == 0] = -1e9  # Replace zeros with large negative number
            # Compute the max over the sequence length dimension
            max_pooled = torch.max(masked_embeddings, dim=1)[0]  # Shape: [B, D]

            # Concatenate mean-pooled and max-pooled vectors
            pooled_features = torch.cat((mean_pooled, max_pooled), dim=1)  # Shape: [B, D * 2]

            # Added code ends here

        # Forward pass through the classifier to get logits
        logits = classifier(pooled_features)
        # Compute the cross-entropy loss between logits and labels
        loss = loss_func(logits, labels)
        # Backpropagation and optimization steps
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update progress bar with current loss
        pbar.set_description(f"Seed: {seed} | Loss: {loss.item():.4f}")

    # Evaluate the model on the development set
    classifier.eval()  # Set classifier to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            # Get unpooled features from BERT
            outputs = bert_model(**sentences)
            unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

            # 1.2: [CODE] Compute mean and max pooling over content tokens
            # Same code as in training
            # Added code starts here

            # Get the attention mask
            attention_mask = sentences['attention_mask']  # Shape: [B, L]

            # Expand attention mask
            mask_expanded = attention_mask.unsqueeze(-1).expand(unpooled_features.size()).float()

            # Apply the attention mask
            masked_embeddings = unpooled_features * mask_expanded  # Shape: [B, L, D]

            # Compute mean pooling
            sum_embeddings = torch.sum(masked_embeddings, dim=1)  # Shape: [B, D]
            sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)  # Shape: [B, D]
            mean_pooled = sum_embeddings / sum_mask  # Shape: [B, D]

            # Compute max pooling
            masked_embeddings[mask_expanded == 0] = -1e9  # Replace zeros with large negative number
            max_pooled = torch.max(masked_embeddings, dim=1)[0]  # Shape: [B, D]

            # Concatenate mean-pooled and max-pooled vectors
            pooled_features = torch.cat((mean_pooled, max_pooled), dim=1)  # Shape: [B, D * 2]

            # Added code ends here

            # Get logits from classifier
            logits = classifier(pooled_features)
            # Get predicted classes
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy on the development set
    dev_accuracy = accuracy_score(all_labels, all_preds)
    dev_accuracies.append(dev_accuracy)
    print(f"Seed: {seed} | Dev Accuracy: {dev_accuracy:.4f}")

    # Save the best model based on dev accuracy
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = classifier.state_dict()
        best_seed = seed

# Compute mean and standard deviation of development accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"\nMean Dev Accuracy: {mean_dev_accuracy:.4f}")
print(f"Std Dev Accuracy: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
# Load the best model state
classifier.load_state_dict(best_model_state)
classifier.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        # Get unpooled features from BERT
        outputs = bert_model(**sentences)
        unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

        # 1.2: [CODE] Compute mean and max pooling over content tokens
        # Same code as in training
        # Added code starts here

        # Get the attention mask
        attention_mask = sentences['attention_mask']  # Shape: [B, L]

        # Expand attention mask
        mask_expanded = attention_mask.unsqueeze(-1).expand(unpooled_features.size()).float()

        # Apply the attention mask
        masked_embeddings = unpooled_features * mask_expanded  # Shape: [B, L, D]

        # Compute mean pooling
        sum_embeddings = torch.sum(masked_embeddings, dim=1)  # Shape: [B, D]
        sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)  # Shape: [B, D]
        mean_pooled = sum_embeddings / sum_mask  # Shape: [B, D]

        # Compute max pooling
        masked_embeddings[mask_expanded == 0] = -1e9  # Replace zeros with large negative number
        max_pooled = torch.max(masked_embeddings, dim=1)[0]  # Shape: [B, D]

        # Concatenate mean-pooled and max-pooled vectors
        pooled_features = torch.cat((mean_pooled, max_pooled), dim=1)  # Shape: [B, D * 2]

        # Added code ends here

        # Get logits from classifier
        logits = classifier(pooled_features)
        # Get predicted classes
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy on the test set
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"\nBest Model Seed: {best_seed} | Test Accuracy: {test_accuracy:.4f}")

Seed: 2021 | Loss: 0.8122: 100%|██████████| 313/313 [00:41<00:00,  7.56it/s]


Seed: 2021 | Dev Accuracy: 0.9310


Seed: 7 | Loss: 0.7101: 100%|██████████| 313/313 [00:41<00:00,  7.49it/s]


Seed: 7 | Dev Accuracy: 0.9080

Mean Dev Accuracy: 0.9264
Std Dev Accuracy: 0.0133

Best Model Seed: 123 | Test Accuracy: 0.9320


100%|██████████| 313/313 [00:40<00:00,  7.74it/s]
Seed: 42 | Loss: 0.5374: 100%|██████████| 313/313 [00:41<00:00,  7.57it/s]


Seed: 42 | Dev Accuracy: 0.9620


Seed: 123 | Loss: 0.2741: 100%|██████████| 313/313 [00:41<00:00,  7.62it/s]


Seed: 123 | Dev Accuracy: 0.9620


Seed: 999 | Loss: 0.3493: 100%|██████████| 313/313 [00:41<00:00,  7.58it/s]


Seed: 999 | Dev Accuracy: 0.9690


Seed: 2021 | Loss: 0.4027: 100%|██████████| 313/313 [00:41<00:00,  7.56it/s]


Seed: 2021 | Dev Accuracy: 0.9610


Seed: 7 | Loss: 0.4847: 100%|██████████| 313/313 [00:41<00:00,  7.62it/s]


Seed: 7 | Dev Accuracy: 0.9080

Mean Dev Accuracy: 0.9524
Std Dev Accuracy: 0.0224

Best Model Seed: 999 | Test Accuracy: 0.9740
******************************************************************************


Seed: 42 | Loss: 0.4006: 100%|██████████| 313/313 [00:41<00:00,  7.61it/s]


Seed: 42 | Dev Accuracy: 0.9360


Seed: 123 | Loss: 0.6728: 100%|██████████| 313/313 [00:41<00:00,  7.54it/s]


Seed: 123 | Dev Accuracy: 0.9430


Seed: 999 | Loss: 0.8128: 100%|██████████| 313/313 [00:41<00:00,  7.49it/s]


Seed: 999 | Dev Accuracy: 0.9140


Seed: 2021 | Loss: 0.8122: 100%|██████████| 313/313 [00:41<00:00,  7.55it/s]


Seed: 2021 | Dev Accuracy: 0.9310


Seed: 7 | Loss: 0.7101: 100%|██████████| 313/313 [00:41<00:00,  7.49it/s]


Seed: 7 | Dev Accuracy: 0.9080

Mean Dev Accuracy: 0.9264
Std Dev Accuracy: 0.0133

Best Model Seed: 123 | Test Accuracy: 0.9320


In [40]:
print("Q1.3")
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)

params = list()
for name, param in bert_model.named_parameters():
    if name.startswith('encoder.layer.10') or name.startswith('encoder.layer.11'):
        param.requires_grad = True
    else:
        param.requires_grad = False

# Collect parameters to optimize without duplicates
params_to_optimize = []

# Add classifier parameters
classifier_params = list(classifier.parameters())
params_to_optimize.extend(classifier_params)

# Add BERT parameters that require gradients
bert_params = [param for param in bert_model.parameters() if param.requires_grad]
params_to_optimize.extend(bert_params)

# Ensure no duplicates by checking parameter IDs
seen = set()
unique_params = []
for param in params_to_optimize:
    if id(param) not in seen:
        unique_params.append(param)
        seen.add(id(param))

optimizer = torch.optim.Adam(unique_params, lr=5e-5)
loss_func = nn.CrossEntropyLoss()
pbar = tqdm.tqdm(dataloaders['train'])
for labels, sentences in pbar:
    # No torch.no_grad() since we are fine-tuning some BERT parameters
    outputs = bert_model(**sentences)
    unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

    # Extract the [CLS] token representation
    cls_features = unpooled_features[:, 0, :]  # Shape: [B, D]

    # Forward pass through the classifier
    logits = classifier(cls_features)

    # Compute loss
    loss = loss_func(logits, labels)

    # Backpropagation and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Update progress bar with current loss
    pbar.set_description(f"Loss: {loss.item():.4f}")
print("***************************************************************************")
print("Q1.4")
# Finish your code here for 1.4. You may re-used most of your code for 1.1.
# Lists to store development accuracies and test accuracies
dev_accuracies = []
test_accuracies = []
# List of seed values to use
seed_values = [42, 123, 999, 2021, 7]  # Example seeds
best_dev_accuracy = 0.0
best_model_state = None

# Loop over each seed value
for seed in seed_values:
    set_seed(seed)
    # Initialize the classifier for each run
    classifier = Classifier(
        input_size=bert_model.config.hidden_size,
        hidden_size=classifier_hidden_size,
        num_classes=datasets['train'].n_classes
    ).to(bert_model.device)


    # Set requires_grad for BERT parameters
    for name, param in bert_model.named_parameters():
        if name.startswith('encoder.layer.10.') or name.startswith('encoder.layer.11.'):
            param.requires_grad = True  # Unfreeze last two layers
        else:
            param.requires_grad = False  # Freeze other layers

    # Collect classifier parameters
    classifier_params = list(classifier.parameters())

    # Collect BERT parameters that require gradients (last two layers)
    bert_params = [param for param in bert_model.parameters() if param.requires_grad]

    # Combine parameters and ensure no duplicates
    params_to_optimize = classifier_params + bert_params

    # Initialize the optimizer
    optimizer = torch.optim.Adam(params_to_optimize, lr=5e-5)  # Use a smaller learning rate for fine-tuning

    # Define the loss function
    loss_func = nn.CrossEntropyLoss()

    # Training loop for one epoch
    pbar = tqdm.tqdm(dataloaders['train'])
    for labels, sentences in pbar:
        # No torch.no_grad() since we are fine-tuning some BERT parameters
        # Get outputs from BERT
        outputs = bert_model(**sentences)
        unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

        # Extract [CLS] token representation
        cls_embeddings = unpooled_features[:, 0, :]  # Shape: [B, D]

        # Forward pass through the classifier to get logits
        logits = classifier(cls_embeddings)

        # Compute the cross-entropy loss between logits and labels
        loss = loss_func(logits, labels)

        # Backpropagation and optimization steps
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar with current loss
        pbar.set_description(f"Seed: {seed} | Loss: {loss.item():.4f}")

    # Evaluate the model on the development set
    classifier.eval()  # Set classifier to evaluation mode
    bert_model.eval()  # Set BERT model to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            # Get BERT features
            outputs = bert_model(**sentences)
            unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

            # Extract [CLS] token representation
            cls_embeddings = unpooled_features[:, 0, :]  # Shape: [B, D]

            # Get logits from classifier
            logits = classifier(cls_embeddings)

            # Get predicted classes
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy on the development set
    dev_accuracy = accuracy_score(all_labels, all_preds)
    dev_accuracies.append(dev_accuracy)
    print(f"Seed: {seed} | Dev Accuracy: {dev_accuracy:.4f}")

    # Save the best model based on dev accuracy
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = {
            'classifier': classifier.state_dict(),
            'bert_model': bert_model.state_dict()
        }
        best_seed = seed

    # Set models back to training mode
    classifier.train()
    bert_model.train()

# Compute mean and standard deviation of development accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"\nMean Dev Accuracy: {mean_dev_accuracy:.4f}")
print(f"Std Dev Accuracy: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
# Load the best model state
classifier.load_state_dict(best_model_state['classifier'])
bert_model.load_state_dict(best_model_state['bert_model'])
classifier.eval()
bert_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        # Get BERT features
        outputs = bert_model(**sentences)
        unpooled_features = outputs['last_hidden_state']  # Shape: [B, L, D]

        # Extract [CLS] token representation
        cls_embeddings = unpooled_features[:, 0, :]  # Shape: [B, D]

        # Get logits from classifier
        logits = classifier(cls_embeddings)

        # Get predicted classes
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy on the test set
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"\nBest Model Seed: {best_seed} | Test Accuracy: {test_accuracy:.4f}")

Q1.3


Loss: 0.0640: 100%|██████████| 313/313 [00:55<00:00,  5.63it/s]


***************************************************************************
Q1.4


Seed: 42 | Loss: 0.0321: 100%|██████████| 313/313 [00:53<00:00,  5.83it/s]


Seed: 42 | Dev Accuracy: 0.9960


Seed: 123 | Loss: 0.0343: 100%|██████████| 313/313 [00:54<00:00,  5.80it/s]


Seed: 123 | Dev Accuracy: 0.9940


Seed: 999 | Loss: 0.0324: 100%|██████████| 313/313 [00:54<00:00,  5.75it/s]


Seed: 999 | Dev Accuracy: 0.9960


Seed: 2021 | Loss: 0.0411: 100%|██████████| 313/313 [00:54<00:00,  5.71it/s]


Seed: 2021 | Dev Accuracy: 0.9940


Seed: 7 | Loss: 0.0438: 100%|██████████| 313/313 [00:54<00:00,  5.78it/s]


Seed: 7 | Dev Accuracy: 0.9950

Mean Dev Accuracy: 0.9950
Std Dev Accuracy: 0.0009

Best Model Seed: 42 | Test Accuracy: 0.0570


In [45]:
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

# Initialize tokenizer and GPT-2 model
from transformers import GPT2Tokenizer, GPT2Model

# 1.5: [CODE] Replace BERT with GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 does not have padding token by default, so we need to add one
tokenizer.pad_token = tokenizer.eos_token

gpt2_model = GPT2Model.from_pretrained('gpt2')
if torch.cuda.is_available():  # use GPU if available
    gpt2_model = gpt2_model.cuda()

# Construct datasets and data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=gpt2_model.device
)

# Import necessary modules for setting random seeds and evaluation
import random
import numpy as np
from sklearn.metrics import accuracy_score

# Function to set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Lists to store development accuracies and test accuracies
dev_accuracies = []
test_accuracies = []
# List of seed values to use
seed_values = [42, 123, 999, 2021, 7]  # Example seeds
best_dev_accuracy = 0.0
best_model_state = None

# Loop over each seed value
for seed in seed_values:
    set_seed(seed)
    # Initialize the classifier for each run
    classifier = Classifier(
        input_size=gpt2_model.config.hidden_size,  # GPT-2 hidden size is 768
        hidden_size=classifier_hidden_size,
        num_classes=datasets['train'].n_classes
    ).to(gpt2_model.device)

    # Initialize the optimizer with classifier parameters
    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
    # Define the loss function
    loss_func = nn.CrossEntropyLoss()

    # Training loop for one epoch
    pbar = tqdm.tqdm(dataloaders['train'])
    for labels, sentences in pbar:
        # Use torch.no_grad() since GPT-2 parameters are frozen
        with torch.no_grad():
            # Get outputs from GPT-2
            outputs = gpt2_model(**sentences)
            # Get hidden states
            hidden_states = outputs['last_hidden_state']  # Shape: [B, L, D]

            # 1.5: [CODE] Extract features for classification
            # Option 1: Use the last token's hidden state
            # features = hidden_states[:, -1, :]  # Shape: [B, D]

            # Option 2: Use mean pooling over all tokens
            # Since GPT-2 is autoregressive, mean pooling can be effective
            # Compute attention mask to exclude padding tokens
            attention_mask = sentences['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
            masked_hidden_states = hidden_states * attention_mask
            sum_hidden_states = torch.sum(masked_hidden_states, dim=1)
            sum_mask = attention_mask.sum(dim=1).clamp(min=1e-9)
            features = sum_hidden_states / sum_mask  # Shape: [B, D]

            # Note: You can choose either option. Here we use mean pooling.

        # Forward pass through the classifier to get logits
        logits = classifier(features)
        # Compute the cross-entropy loss between logits and labels
        loss = loss_func(logits, labels)
        # Backpropagation and optimization steps
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update progress bar with current loss
        pbar.set_description(f"Seed: {seed} | Loss: {loss.item():.4f}")

    # Evaluate the model on the development set
    classifier.eval()  # Set classifier to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            # Get outputs from GPT-2
            outputs = gpt2_model(**sentences)
            hidden_states = outputs['last_hidden_state']  # Shape: [B, L, D]

            # Extract features for classification (same as during training)
            # Compute attention mask
            attention_mask = sentences['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
            masked_hidden_states = hidden_states * attention_mask
            sum_hidden_states = torch.sum(masked_hidden_states, dim=1)
            sum_mask = attention_mask.sum(dim=1).clamp(min=1e-9)
            features = sum_hidden_states / sum_mask  # Shape: [B, D]

            # Get logits from classifier
            logits = classifier(features)
            # Get predicted classes
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # Calculate accuracy on the development set
    dev_accuracy = accuracy_score(all_labels, all_preds)
    dev_accuracies.append(dev_accuracy)
    print(f"Seed: {seed} | Dev Accuracy: {dev_accuracy:.4f}")

    # Save the best model based on dev accuracy
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = classifier.state_dict()
        best_seed = seed

    # Set classifier back to training mode
    classifier.train()

# Compute mean and standard deviation of development accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"\nMean Dev Accuracy: {mean_dev_accuracy:.4f}")
print(f"Std Dev Accuracy: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
# Load the best model state
classifier.load_state_dict(best_model_state)
classifier.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        # Get outputs from GPT-2
        outputs = gpt2_model(**sentences)
        hidden_states = outputs['last_hidden_state']  # Shape: [B, L, D]

        # Extract features for classification (same as during training)
        # Compute attention mask
        attention_mask = sentences['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
        masked_hidden_states = hidden_states * attention_mask
        sum_hidden_states = torch.sum(masked_hidden_states, dim=1)
        sum_mask = attention_mask.sum(dim=1).clamp(min=1e-9)
        features = sum_hidden_states / sum_mask  # Shape: [B, D]

        # Get logits from classifier
        logits = classifier(features)
        # Get predicted classes
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
# Calculate accuracy on the test set
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"\nBest Model Seed: {best_seed} | Test Accuracy: {test_accuracy:.4f}")

Seed: 42 | Loss: 0.6046: 100%|██████████| 313/313 [00:46<00:00,  6.80it/s]


Seed: 42 | Dev Accuracy: 0.8960


Seed: 123 | Loss: 0.7952: 100%|██████████| 313/313 [00:43<00:00,  7.14it/s]


Seed: 123 | Dev Accuracy: 0.8300


Seed: 999 | Loss: 0.7159: 100%|██████████| 313/313 [00:44<00:00,  7.07it/s]


Seed: 999 | Dev Accuracy: 0.8910


Seed: 2021 | Loss: 0.6895: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s]


Seed: 2021 | Dev Accuracy: 0.8920


Seed: 7 | Loss: 0.6747: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s]


Seed: 7 | Dev Accuracy: 0.8650

Mean Dev Accuracy: 0.8748
Std Dev Accuracy: 0.0249

Best Model Seed: 42 | Test Accuracy: 0.8900
