<a href="https://colab.research.google.com/github/vishesh711/NLP-HW3/blob/main/hw3_code_SAMPLE_NOT_CORRECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the necessary libraries


In [5]:
%%capture
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

## BERT Features

In this part, you will use BERT features to classify DBPedia articles.
The data is already pre-processed, and the data loader is implemented below.

In [6]:
# Basics: dataset, data loaders, Classifier
import collections
import json
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel


SPLITS = ['train', 'dev', 'test']

class DBPediaDataset(Dataset):
  '''DBPedia dataset.
    Args:
      path[str]: path to the original data.
  '''
  def __init__(self, path):
    with open(path) as fin:
      self._data = [json.loads(l) for l in fin]
    self._n_classes = len(set([datum['label'] for datum in self._data]))

  def __getitem__(self, index):
    return self._data[index]

  def __len__(self):
    return len(self._data)

  @property
  def n_classes(self):
    return self._n_classes

  @staticmethod
  def collate_fn(tokenizer, device, batch):
    '''The collate function that compresses a training batch.
      Args:
        batch[list[dict[str, Any]]]: data in the batch.
      Returns:
        labels[torch.LongTensor]: the labels in the batch.
        sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
    '''
    labels = torch.tensor([datum['label'] for datum in batch]).long().to(device)
    sentences = tokenizer(
        [datum['sentence'] for datum in batch],
        return_tensors='pt',  # pt = pytorch style tensor
        padding=True)
    for key in sentences:
      sentences[key] = sentences[key].to(device)
    return labels, sentences

def construct_datasets(prefix, batch_size, tokenizer, device):
  '''Constructs datasets and data loaders.
    Args:
      prefix[str]: prefix of the dataset (e.g., dbpedia_).
      batch_size[int]: maximum number of examples in a batch.
      tokenizer: model tokenizer that converts sentences to integer tensors.
      device[torch.device]: the device (cpu/gpu) that the tensor should be on.
    Returns:
      datasets[dict[str, Dataset]]: a dict of constructed datasets.
      dataloaders[dict[str, DataLoader]]: a dict of constructed data loaders.
  '''
  datasets = collections.defaultdict()
  dataloaders = collections.defaultdict()
  for split in SPLITS:
    datasets[split] = DBPediaDataset(f'{prefix}{split}.json')
    dataloaders[split] = DataLoader(
        datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=lambda x:DBPediaDataset.collate_fn(tokenizer, device, x))
  return datasets, dataloaders

In [11]:
# 1.1: [CODE] put your implementation of classifer here
class Classifier(nn.Module):


SyntaxError: incomplete input (<ipython-input-11-25a47fbfba18>, line 2)

In [12]:
#MY_CODE

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import numpy as np
import random
# 1.1: [CODE] put your implementation of classifer here
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set random seeds for reproducibility
def set_random_seeds(seed_value):
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Hyperparameters
batch_size = 32
classifier_hidden_size = 32
learning_rate = 5e-4
num_epochs = 1
input_size = 768  # BERT [CLS] token size
num_classes = 14

# Initialize tokenizer and BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():
    bert_model = bert_model.cuda()

# Prepare data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device
)

# Train and evaluate for multiple seeds
num_runs = 5
dev_accuracies = []
best_test_accuracy = 0
best_seed = None

for run in range(num_runs):
    # Set a different random seed for each run
    seed = 42 + run  # or choose any other seed values
    set_random_seeds(seed)

    # Initialize the classifier and optimizer
    classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)
    optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    # Training loop
    bert_model.eval()  # Freeze BERT model
    classifier.train()
    for epoch in range(num_epochs):
        pbar = tqdm.tqdm(dataloaders['train'], desc=f"Run {run+1}, Epoch {epoch+1}")
        for labels, sentences in pbar:
            with torch.no_grad():  # Freeze BERT and extract CLS features
                cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]  # [CLS] token

            # Forward pass through the classifier
            outputs = classifier(cls_features)
            loss = loss_func(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate on development set
    classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]
            outputs = classifier(cls_features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    dev_accuracy = correct / total
    dev_accuracies.append(dev_accuracy)

    # Check if this is the best model
    if dev_accuracy > best_test_accuracy:
        best_test_accuracy = dev_accuracy
        best_seed = seed

    print(f"Run {run+1} - Dev Accuracy: {dev_accuracy:.4f}")

# Calculate mean and standard deviation for dev set accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"Mean Dev Accuracy: {mean_dev_accuracy:.4f}, Standard Deviation: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
set_random_seeds(best_seed)  # Set the seed for best model
classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

bert_model.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]
        outputs = classifier(cls_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f"Best Test Accuracy: {test_accuracy:.4f} (Seed: {best_seed})")

Run 1, Epoch 1: 100%|██████████| 313/313 [00:41<00:00,  7.52it/s]


Run 1 - Dev Accuracy: 0.9620


Run 2, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


Run 2 - Dev Accuracy: 0.9600


Run 3, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


Run 3 - Dev Accuracy: 0.9790


Run 4, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


Run 4 - Dev Accuracy: 0.9760


Run 5, Epoch 1: 100%|██████████| 313/313 [00:41<00:00,  7.47it/s]


Run 5 - Dev Accuracy: 0.9390
Mean Dev Accuracy: 0.9632, Standard Deviation: 0.0142
Best Test Accuracy: 0.1660 (Seed: 44)


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import numpy as np
import random

# Updated Classifier Model for Mean-Pooling and Max-Pooling (Input size is doubled)
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Mean and Max Pooling Function
def mean_max_pooling(features, attention_mask):
    attention_mask_expanded = attention_mask.unsqueeze(-1).expand(features.size()).float()

    # Mean Pooling
    sum_embeddings = torch.sum(features * attention_mask_expanded, dim=1)
    sum_mask = torch.clamp(attention_mask_expanded.sum(dim=1), min=1e-9)  # Avoid division by zero
    mean_pooled = sum_embeddings / sum_mask

    # Max Pooling
    features[attention_mask == 0] = -1e9  # Mask out padding tokens for max pooling
    max_pooled, _ = torch.max(features, dim=1)

    # Concatenate Mean and Max Pooling
    pooled_output = torch.cat((mean_pooled, max_pooled), dim=1)
    return pooled_output

# Set random seeds for reproducibility
def set_random_seeds(seed_value):
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Hyperparameters
batch_size = 32
classifier_hidden_size = 32
learning_rate = 5e-4
num_epochs = 1
input_size = 1536  # Doubled due to mean and max pooling
num_classes = 14

# Initialize tokenizer and BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():
    bert_model = bert_model.cuda()

# Prepare data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device
)

# Train and evaluate for multiple seeds
num_runs = 5
dev_accuracies = []
best_test_accuracy = 0
best_seed = None

for run in range(num_runs):
    # Set a different random seed for each run
    seed = 42 + run
    set_random_seeds(seed)

    # Initialize the classifier and optimizer
    classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)
    optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    # Training loop
    bert_model.eval()  # Freeze BERT model
    classifier.train()
    for epoch in range(num_epochs):
        pbar = tqdm.tqdm(dataloaders['train'], desc=f"Run {run+1}, Epoch {epoch+1}")
        for labels, sentences in pbar:
            with torch.no_grad():  # Freeze BERT and extract features
                bert_outputs = bert_model(**sentences)
                attention_mask = sentences['attention_mask']
                cls_features = mean_max_pooling(bert_outputs['last_hidden_state'], attention_mask)  # Mean and Max Pooling

            # Forward pass through the classifier
            outputs = classifier(cls_features)
            loss = loss_func(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate on development set
    classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            bert_outputs = bert_model(**sentences)
            attention_mask = sentences['attention_mask']
            cls_features = mean_max_pooling(bert_outputs['last_hidden_state'], attention_mask)
            outputs = classifier(cls_features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    dev_accuracy = correct / total
    dev_accuracies.append(dev_accuracy)

    # Check if this is the best model
    if dev_accuracy > best_test_accuracy:
        best_test_accuracy = dev_accuracy
        best_seed = seed

    print(f"Run {run+1} - Dev Accuracy: {dev_accuracy:.4f}")

# Calculate mean and standard deviation for dev set accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"Mean Dev Accuracy: {mean_dev_accuracy:.4f}, Standard Deviation: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
set_random_seeds(best_seed)
classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

bert_model.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        bert_outputs = bert_model(**sentences)
        attention_mask = sentences['attention_mask']
        cls_features = mean_max_pooling(bert_outputs['last_hidden_state'], attention_mask)
        outputs = classifier(cls_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f"Best Test Accuracy: {test_accuracy:.4f} (Seed: {best_seed})")



Run 1, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


Run 1 - Dev Accuracy: 0.9360


Run 2, Epoch 1: 100%|██████████| 313/313 [00:41<00:00,  7.47it/s]


Run 2 - Dev Accuracy: 0.9360


Run 3, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


Run 3 - Dev Accuracy: 0.9250


Run 4, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


Run 4 - Dev Accuracy: 0.9310


Run 5, Epoch 1: 100%|██████████| 313/313 [00:42<00:00,  7.42it/s]


Run 5 - Dev Accuracy: 0.9340
Mean Dev Accuracy: 0.9324, Standard Deviation: 0.0041
Best Test Accuracy: 0.0710 (Seed: 42)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import numpy as np
import random

# Set random seeds for reproducibility
def set_random_seeds(seed_value):
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Hyperparameters
batch_size = 32
classifier_hidden_size = 32
learning_rate = 5e-4
num_epochs = 1
input_size = 768  # First-token CLS size
num_classes = 14

# Initialize tokenizer and BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():
    bert_model = bert_model.cuda()

# Prepare data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device
)

# Updated Classifier Model
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Train and evaluate for multiple seeds
num_runs = 5
dev_accuracies = []
best_test_accuracy = 0
best_seed = None

for run in range(num_runs):
    # Set a different random seed for each run
    seed = 42 + run
    set_random_seeds(seed)

    # Initialize the classifier and optimizer
    classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)

    # Gather parameters for last two layers and classifier
    params = list()
    for name, param in bert_model.named_parameters():
        if "encoder.layer.10" in name or "encoder.layer.11" in name:
            param.requires_grad = True
            params.append(param)
        else:
            param.requires_grad = False
    optimizer = optim.Adam(params + list(classifier.parameters()), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    # Training loop with fine-tuning of last two BERT layers
    bert_model.train()  # Allow training of last two layers
    classifier.train()
    for epoch in range(num_epochs):
        pbar = tqdm.tqdm(dataloaders['train'], desc=f"Run {run+1}, Epoch {epoch+1}")
        for labels, sentences in pbar:
            # Extract CLS features from BERT with grad enabled for last two layers
            cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]

            # Forward pass through the classifier
            outputs = classifier(cls_features)
            loss = loss_func(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate on development set
    classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]
            outputs = classifier(cls_features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    dev_accuracy = correct / total
    dev_accuracies.append(dev_accuracy)

    # Check if this is the best model
    if dev_accuracy > best_test_accuracy:
        best_test_accuracy = dev_accuracy
        best_seed = seed

    print(f"Run {run+1} - Dev Accuracy: {dev_accuracy:.4f}")

# Calculate mean and standard deviation for dev set accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"Mean Dev Accuracy: {mean_dev_accuracy:.4f}, Standard Deviation: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
set_random_seeds(best_seed)
classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(bert_model.device)
optimizer = optim.Adam(params + list(classifier.parameters()), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

bert_model.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        cls_features = bert_model(**sentences)['last_hidden_state'][:, 0, :]
        outputs = classifier(cls_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f"Best Test Accuracy: {test_accuracy:.4f} (Seed: {best_seed})")


Run 1, Epoch 1: 100%|██████████| 313/313 [00:52<00:00,  5.91it/s]


Run 1 - Dev Accuracy: 0.9750


Run 2, Epoch 1: 100%|██████████| 313/313 [00:55<00:00,  5.65it/s]


Run 2 - Dev Accuracy: 0.9910


Run 3, Epoch 1: 100%|██████████| 313/313 [00:54<00:00,  5.70it/s]


Run 3 - Dev Accuracy: 0.9850


Run 4, Epoch 1: 100%|██████████| 313/313 [00:54<00:00,  5.73it/s]


Run 4 - Dev Accuracy: 0.9920


Run 5, Epoch 1: 100%|██████████| 313/313 [00:54<00:00,  5.77it/s]


Run 5 - Dev Accuracy: 0.9890
Mean Dev Accuracy: 0.9864, Standard Deviation: 0.0062
Best Test Accuracy: 0.9820 (Seed: 45)


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import numpy as np
import random
import tqdm

# Updated Classifier Model
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set random seeds for reproducibility
def set_random_seeds(seed_value):
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Hyperparameters
batch_size = 32
classifier_hidden_size = 32
learning_rate = 5e-4
num_epochs = 1
input_size = 768  # Hidden size for GPT-2
num_classes = 14

# Initialize tokenizer and GPT-2 model
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Set padding token for GPT-2 tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

gpt2_model = AutoModel.from_pretrained('gpt2')
if torch.cuda.is_available():
    gpt2_model = gpt2_model.cuda()

# Prepare data loaders
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=gpt2_model.device
)

# Train and evaluate for multiple seeds
num_runs = 5
dev_accuracies = []
best_test_accuracy = 0
best_seed = None

for run in range(num_runs):
    # Set a different random seed for each run
    seed = 42 + run
    set_random_seeds(seed)

    # Initialize the classifier and optimizer
    classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(gpt2_model.device)
    optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    # Training loop with frozen GPT-2
    gpt2_model.eval()  # Freeze GPT-2
    classifier.train()
    for epoch in range(num_epochs):
        pbar = tqdm.tqdm(dataloaders['train'], desc=f"Run {run+1}, Epoch {epoch+1}")
        for labels, sentences in pbar:
            with torch.no_grad():  # Extract features from GPT-2
                gpt2_outputs = gpt2_model(**sentences)
                last_token_features = gpt2_outputs['last_hidden_state'][:, -1, :]  # Last token of each sequence

            # Forward pass through the classifier
            outputs = classifier(last_token_features)
            loss = loss_func(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate on development set
    classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, sentences in dataloaders['dev']:
            gpt2_outputs = gpt2_model(**sentences)
            last_token_features = gpt2_outputs['last_hidden_state'][:, -1, :]
            outputs = classifier(last_token_features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    dev_accuracy = correct / total
    dev_accuracies.append(dev_accuracy)

    # Check if this is the best model
    if dev_accuracy > best_test_accuracy:
        best_test_accuracy = dev_accuracy
        best_seed = seed

    print(f"Run {run+1} - Dev Accuracy: {dev_accuracy:.4f}")

# Calculate mean and standard deviation for dev set accuracies
mean_dev_accuracy = np.mean(dev_accuracies)
std_dev_accuracy = np.std(dev_accuracies)
print(f"Mean Dev Accuracy: {mean_dev_accuracy:.4f}, Standard Deviation: {std_dev_accuracy:.4f}")

# Evaluate the best model on the test set
set_random_seeds(best_seed)
classifier = Classifier(input_size, classifier_hidden_size, num_classes).to(gpt2_model.device)
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

gpt2_model.eval()
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for labels, sentences in dataloaders['test']:
        gpt2_outputs = gpt2_model(**sentences)
        last_token_features = gpt2_outputs['last_hidden_state'][:, -1, :]
        outputs = classifier(last_token_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f"Best Test Accuracy: {test_accuracy:.4f} (Seed: {best_seed})")


Run 1, Epoch 1: 100%|██████████| 313/313 [00:41<00:00,  7.50it/s]


Run 1 - Dev Accuracy: 0.3270


Run 2, Epoch 1: 100%|██████████| 313/313 [00:44<00:00,  7.11it/s]


Run 2 - Dev Accuracy: 0.3030


Run 3, Epoch 1: 100%|██████████| 313/313 [00:43<00:00,  7.25it/s]


Run 3 - Dev Accuracy: 0.4140


Run 4, Epoch 1: 100%|██████████| 313/313 [00:43<00:00,  7.19it/s]


Run 4 - Dev Accuracy: 0.2660


Run 5, Epoch 1: 100%|██████████| 313/313 [00:43<00:00,  7.25it/s]


Run 5 - Dev Accuracy: 0.3140
Mean Dev Accuracy: 0.3248, Standard Deviation: 0.0490
Best Test Accuracy: 0.0580 (Seed: 44)



## Training and Evaluation

In [None]:
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():  # use GPU if available
  bert_model = bert_model.cuda()
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device)

classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
loss_func = nn.CrossEntropyLoss()
pbar = tqdm.tqdm(dataloaders['train'])
for labels, sentences in pbar:
  with torch.no_grad():
    unpooled_features = bert_model(**sentences)['last_hidden_state'] # [B, L, D]
  # 1.1: [CODE] train your classifier here

  # 1.1: [CODE] ends here
  # Note: you can re-use this code snippet for 1.2 as well

In [None]:
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)

params = list()
for name, param in bert_model.named_parameters():
  if name.startswith... # 1.3: [CODE] this line is incomplete, you can finish this line by adding the last two layers' parameters to "params", or re-write your own code
    params.append(param)
optimizer = torch.optim.Adam(params + list(classifier.parameters()), lr=5e-4)
loss_func = nn.CrossEntropyLoss()
pbar = tqdm.tqdm(dataloaders['train'])
# Finish your code here for 1.4. You may re-used most of your code for 1.1.