<a href="https://colab.research.google.com/github/vishesh711/NLP-HW3/blob/main/hw3_code_skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the necessary libraries


In [None]:
%%capture
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

## BERT Features

In this part, you will use BERT features to classify DBPedia articles.
The data is already pre-processed, and the data loader is implemented below.

In [None]:
# Basics: dataset, data loaders, Classifier
import collections
import json
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel


SPLITS = ['train', 'dev', 'test']

class DBPediaDataset(Dataset):
  '''DBPedia dataset backed by a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object. The code
    in this class reads the 'label' and 'sentence' fields of those objects.

    Args:
      path[str]: path to the original data.
  '''
  def __init__(self, path):
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as fin:
      # Blank lines (e.g. a trailing empty line) are skipped because
      # json.loads raises on whitespace-only input.
      self._data = [json.loads(line) for line in fin if line.strip()]
    self._n_classes = len({datum['label'] for datum in self._data})

  def __getitem__(self, index):
    '''Returns the parsed JSON object stored at `index`.'''
    return self._data[index]

  def __len__(self):
    '''Returns the number of examples that were loaded.'''
    return len(self._data)

  @property
  def n_classes(self):
    '''Number of distinct label values seen in the loaded data.'''
    return self._n_classes

  @staticmethod
  def collate_fn(tokenizer, device, batch):
    '''The collate function that compresses a training batch.
      Args:
        tokenizer: callable that converts a list of sentences to a mapping of
          tensors; it is called with return_tensors, padding and truncation.
        device[torch.device]: the device the returned tensors are moved to.
        batch[list[dict[str, Any]]]: data in the batch.
      Returns:
        labels[torch.LongTensor]: the labels in the batch.
        sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
    '''
    labels = torch.tensor(
        [datum['label'] for datum in batch],
        dtype=torch.long,
        device=device)
    sentences = tokenizer(
        [datum['sentence'] for datum in batch],
        return_tensors='pt',  # pt = pytorch style tensor
        padding=True,
        # Without truncation an over-long sentence yields more tokens than
        # the encoder accepts and the forward pass fails.
        truncation=True)
    # Values are replaced in place so the tokenizer's own container type is
    # what gets returned; no keys are added or removed while iterating.
    for key in sentences:
      sentences[key] = sentences[key].to(device)
    return labels, sentences

def construct_datasets(prefix, batch_size, tokenizer, device):
  '''Constructs datasets and data loaders.

    One dataset and one data loader are built per entry of SPLITS; the file
    for a split is read from f'{prefix}{split}.json'. Only the 'train' loader
    shuffles its examples.

    Args:
      prefix[str]: prefix of the dataset (e.g., dbpedia_).
      batch_size[int]: maximum number of examples in a batch.
      tokenizer: model tokenizer that converts sentences to integer tensors.
      device[torch.device]: the device (cpu/gpu) that the tensor should be on.
    Returns:
      datasets[dict[str, Dataset]]: a dict of constructed datasets.
      dataloaders[dict[str, DataLoader]]: a dict of constructed data loaders.
  '''
  # Local import keeps this function self-contained with respect to the
  # file's import list.
  import functools

  # Bind tokenizer and device once for every split. Unlike a lambda, a partial
  # of a module-level callable can be pickled when its arguments can.
  collate = functools.partial(DBPediaDataset.collate_fn, tokenizer, device)

  # Plain dicts: a defaultdict without a default factory behaves the same way
  # (missing keys raise KeyError) but suggests otherwise.
  datasets = {}
  dataloaders = {}
  for split in SPLITS:
    datasets[split] = DBPediaDataset(f'{prefix}{split}.json')
    dataloaders[split] = DataLoader(
        datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=collate)
  return datasets, dataloaders

In [None]:
# 1.1: [CODE] put your implementation of classifier here
class Classifier(nn.Module):
  '''Two-layer feed-forward classifier over fixed-size feature vectors.

    The network is Linear -> ReLU -> Linear and returns unnormalized class
    scores (logits); no softmax is applied.

    Args:
      input_size[int]: size of the last dimension of the input features.
      hidden_size[int]: width of the hidden layer.
      n_classes[int]: number of output classes.
  '''
  def __init__(self, input_size, hidden_size, n_classes):
    super().__init__()
    self._net = nn.Sequential(
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, n_classes))

  def forward(self, features):
    '''Maps features to class logits.
      Args:
        features[torch.Tensor]: tensor whose last dimension is input_size.
      Returns:
        logits[torch.Tensor]: same leading dimensions as `features`, with the
          last dimension replaced by n_classes.
    '''
    return self._net(features)

## Training and Evaluation

In [None]:
# Cell purpose: load the pretrained encoder, build the data loaders and the
# classifier, then iterate once over the training loader extracting BERT
# features for each batch.

# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

# Tokenizer and encoder are both loaded from the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
if torch.cuda.is_available():  # use GPU if available
  bert_model = bert_model.cuda()
# The encoder's device is passed on so that batches are created on the same
# device as the model.
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device)

# Classifier arguments: encoder hidden size, classifier hidden size, and the
# number of classes found in the training split.
classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)
# Only the classifier's parameters are given to the optimizer, so the encoder
# weights are not updated by this cell.
optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
loss_func = nn.CrossEntropyLoss()
# A single pass over the training loader, wrapped in a progress bar.
pbar = tqdm.tqdm(dataloaders['train'])
for labels, sentences in pbar:
  # The encoder forward pass runs without gradient tracking.
  with torch.no_grad():
    unpooled_features = bert_model(**sentences)['last_hidden_state'] # [B, L, D]
  # 1.1: [CODE] train your classifier here

  # 1.1: [CODE] ends here
  # Note: you can re-use this code snippet for 1.2 as well

In [None]:
# Cell purpose: build a fresh classifier and an optimizer that updates the
# classifier together with the last two encoder layers of BERT.

# hyperparameters
batch_size = 32
classifier_hidden_size = 32
# hyperparameters ends

classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)

# 1.3: [CODE] collect the parameters of the last two encoder layers.
# The layer count is read from the model config instead of being hard-coded.
n_layers = bert_model.config.num_hidden_layers
# The trailing dot keeps e.g. 'encoder.layer.1.' from also matching
# 'encoder.layer.10.' and 'encoder.layer.11.'.
last_two_layers = tuple(
    f'encoder.layer.{index}.'
    for index in range(max(0, n_layers - 2), n_layers))
# str.startswith accepts a tuple of prefixes, so one call covers both layers.
params = [
    param for name, param in bert_model.named_parameters()
    if name.startswith(last_two_layers)]
optimizer = torch.optim.Adam(params + list(classifier.parameters()), lr=5e-4)
loss_func = nn.CrossEntropyLoss()
pbar = tqdm.tqdm(dataloaders['train'])
# Finish your code here for 1.4. You may reuse most of your code for 1.1.