In [1]:
# Type your full names
Student_1 = "Eman Mohamed Mahmoud"
Student_2 = "Zeinab Moawad Fayez"

# Named Entity Recognition Assignment
NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc. In this assignment, you will train a named entity recognition system and test it on a test data. \
Let's get started

In [17]:
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from utils import get_params, get_vocab
import random as rnd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Importing and discovering the data

In [3]:
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')

`vocab` is a dictionary that translates a word string to a unique number. Given a sentence, you can represent it as an array of numbers translating with this dictionary. The dictionary contains a `<PAD>` token. 

When training an LSTM using batches, all your input sentences must be the same size. To accomplish this, you set the length of your sentences to a certain number and add the generic `<PAD>` token to fill all the empty spaces. 

In [4]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["the"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 9
padded token: 35180


In [11]:
# The possible tags
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


So the coding scheme that tags the entities is a minimal one where B- indicates the first token in a multi-token entity, and I- indicates one in the middle of a multi-token entity. If you had the sentence 

**"Sharon flew to Miami on Friday"**

the outputs would look like:

```
Sharon B-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim
```

your tags would reflect three tokens beginning with B-, since there are no multi-token entities in the sequence. But if you added Sharon's last name to the sentence: 

**"Sharon Floyd flew to Miami on Friday"**

```
Sharon B-per
Floyd  I-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim
```

then your tags would change to show first "Sharon" as B-per, and "Floyd" as I-per, where I- indicates an inner token in a multi-token sequence.

In [6]:
# Exploring information about the data
print('The number of outputs is tag_map', len(tag_map))
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


# NERDataset
The class that impelements the dataset for NER

In [12]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    # Find the maximum length in x and y
    max_length_x = max(len(sentence) for sentence in x)
    max_length_y = max(len(labels) for labels in y)

    # Pad x and y
    for i in range(len(x)):
        x[i] += [pad] * (max_length_x - len(x[i]))
        y[i] += [tag_map['O']] * (max_length_y - len(y[i]))
    self.tokens_ids = torch.tensor(x)
    self.tokens_labels = torch.tensor(y)
    
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.tokens_ids)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.tokens_ids[idx], self.tokens_labels[idx]
    ##########################################################################################

In [13]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
mini_dataset = NERDataset(mini_sentences, mini_labels, vocab['<PAD>'])
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

torch.Size([5, 30]) torch.Size([5, 30]) torch.Size([3, 30]) torch.Size([3, 30])
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,     9,    15,     1,    16,    17,
           18,    19,    20,    21, 35180, 35180, 35180, 35180, 35180, 35180]) 
 tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])


#### Expected output
torch.Size([5, 30]) torch.Size([5, 30]) torch.Size([3, 30]) torch.Size([3, 30])\
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,     9,    15,     1,    16,    17,
           18,    19,    20,    21, 35180, 35180, 35180, 35180, 35180, 35180]) \
tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

# NER
The class that implementss the pytorch model for NER

In [27]:
class NER(nn.Module):
  def __init__(self, vocab_size=35181, embedding_dim=50, hidden_size=50, n_classes=len(tag_map)):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    sentences = self.embedding(sentences)
    sentences,_ = self.lstm(sentences)
    final_output = self.linear(sentences)
    ###############################################################################################
    return final_output

In [28]:
model = NER()
print(model)

NER(
  (embedding): Embedding(35181, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=17, bias=True)
)


#### Expected output
NER( \
  (embedding): Embedding(35181, 50) \
  (lstm): LSTM(50, 50, batch_first=True) \
  (linear): Linear(in_features=50, out_features=17, bias=True) \
)

# Training

In [35]:
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model to be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_label = train_label.to(device)

      # (5) move the train label to the device
      train_input = train_input.to(device)


      # (6) do the forward pass
      output = model(train_input)
      print(output.shape)
      print(train_label.shape)
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      batch_loss = criterion(output.view(-1,output.shape[-1]), train_label.view(-1))

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim =-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train/ len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [36]:
train_dataset = NERDataset(t_sentences, t_labels, vocab['<PAD>'])
val_dataset = NERDataset(v_sentences, v_labels, vocab['<PAD>'])
test_dataset = NERDataset(test_sentences, test_labels, vocab['<PAD>'])

In [37]:
train(model, train_dataset)

  0%|          | 0/66 [00:00<?, ?it/s]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  2%|▏         | 1/66 [00:05<05:47,  5.35s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  3%|▎         | 2/66 [00:09<05:11,  4.86s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  5%|▍         | 3/66 [00:13<04:36,  4.39s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  6%|▌         | 4/66 [00:17<04:16,  4.13s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  8%|▊         | 5/66 [00:21<04:03,  3.99s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  9%|▉         | 6/66 [00:24<03:54,  3.91s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 11%|█         | 7/66 [00:28<03:50,  3.90s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 12%|█▏        | 8/66 [00:32<03:43,  3.85s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 14%|█▎        | 9/66 [00:36<03:39,  3.85s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 15%|█▌        | 10/66 [00:40<03:37,  3.89s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 17%|█▋        | 11/66 [00:44<03:39,  3.99s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 18%|█▊        | 12/66 [00:48<03:30,  3.89s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 20%|█▉        | 13/66 [00:51<03:22,  3.82s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 21%|██        | 14/66 [00:55<03:16,  3.78s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 23%|██▎       | 15/66 [00:59<03:13,  3.79s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 24%|██▍       | 16/66 [01:03<03:08,  3.76s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 26%|██▌       | 17/66 [01:07<03:13,  3.95s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 27%|██▋       | 18/66 [01:13<03:35,  4.49s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 29%|██▉       | 19/66 [01:21<04:19,  5.52s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 30%|███       | 20/66 [01:28<04:39,  6.08s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 32%|███▏      | 21/66 [01:32<04:06,  5.48s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 33%|███▎      | 22/66 [01:38<04:00,  5.48s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 35%|███▍      | 23/66 [01:41<03:32,  4.95s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 36%|███▋      | 24/66 [01:45<03:12,  4.58s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 38%|███▊      | 25/66 [01:49<02:54,  4.26s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 39%|███▉      | 26/66 [01:52<02:45,  4.15s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 41%|████      | 27/66 [01:57<02:50,  4.38s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 42%|████▏     | 28/66 [02:01<02:43,  4.30s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 44%|████▍     | 29/66 [02:06<02:43,  4.42s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 45%|████▌     | 30/66 [02:10<02:37,  4.37s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 47%|████▋     | 31/66 [02:14<02:28,  4.25s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 48%|████▊     | 32/66 [02:19<02:23,  4.22s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 50%|█████     | 33/66 [02:23<02:20,  4.26s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 52%|█████▏    | 34/66 [02:27<02:16,  4.27s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 53%|█████▎    | 35/66 [02:32<02:16,  4.39s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 55%|█████▍    | 36/66 [02:37<02:19,  4.64s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 56%|█████▌    | 37/66 [02:42<02:14,  4.65s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 58%|█████▊    | 38/66 [02:46<02:06,  4.53s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 59%|█████▉    | 39/66 [02:50<01:59,  4.42s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 61%|██████    | 40/66 [02:54<01:50,  4.26s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 62%|██████▏   | 41/66 [02:59<01:53,  4.52s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 64%|██████▎   | 42/66 [03:03<01:43,  4.30s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 65%|██████▌   | 43/66 [03:07<01:38,  4.28s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 67%|██████▋   | 44/66 [03:11<01:31,  4.15s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 68%|██████▊   | 45/66 [03:15<01:23,  3.99s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 70%|██████▉   | 46/66 [03:18<01:16,  3.82s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 71%|███████   | 47/66 [03:22<01:10,  3.70s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 73%|███████▎  | 48/66 [03:25<01:07,  3.75s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 74%|███████▍  | 49/66 [03:30<01:05,  3.87s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 76%|███████▌  | 50/66 [03:35<01:07,  4.25s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 77%|███████▋  | 51/66 [03:39<01:04,  4.31s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 79%|███████▉  | 52/66 [03:44<01:04,  4.62s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 80%|████████  | 53/66 [03:49<01:01,  4.74s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 82%|████████▏ | 54/66 [03:55<00:59,  4.97s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 83%|████████▎ | 55/66 [03:59<00:52,  4.76s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 85%|████████▍ | 56/66 [04:03<00:45,  4.59s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 86%|████████▋ | 57/66 [04:07<00:38,  4.30s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 88%|████████▊ | 58/66 [04:11<00:33,  4.15s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 89%|████████▉ | 59/66 [04:15<00:28,  4.08s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 91%|█████████ | 60/66 [04:19<00:24,  4.12s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 92%|█████████▏| 61/66 [04:24<00:21,  4.31s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 94%|█████████▍| 62/66 [04:28<00:17,  4.34s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 95%|█████████▌| 63/66 [04:32<00:12,  4.20s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 97%|█████████▋| 64/66 [04:36<00:08,  4.14s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 98%|█████████▊| 65/66 [04:40<00:04,  4.04s/it]

torch.Size([290, 104, 17])
torch.Size([290, 104])


100%|██████████| 66/66 [04:43<00:00,  4.29s/it]


Epochs: 1 | Train Loss: 0.0005676914009800758         | Train Accuracy: 98.57896931784332



  0%|          | 0/66 [00:00<?, ?it/s]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  2%|▏         | 1/66 [00:04<04:45,  4.39s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  3%|▎         | 2/66 [00:09<05:22,  5.04s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  5%|▍         | 3/66 [00:14<04:51,  4.63s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  6%|▌         | 4/66 [00:18<04:31,  4.37s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  8%|▊         | 5/66 [00:22<04:38,  4.56s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  9%|▉         | 6/66 [00:27<04:25,  4.42s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 11%|█         | 7/66 [00:30<04:08,  4.22s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 12%|█▏        | 8/66 [00:35<04:15,  4.41s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 14%|█▎        | 9/66 [00:39<04:05,  4.31s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 15%|█▌        | 10/66 [00:43<03:52,  4.14s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 17%|█▋        | 11/66 [00:47<03:52,  4.23s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 18%|█▊        | 12/66 [00:51<03:41,  4.10s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 20%|█▉        | 13/66 [00:55<03:31,  3.98s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 21%|██        | 14/66 [00:59<03:22,  3.90s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 23%|██▎       | 15/66 [01:02<03:17,  3.87s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 24%|██▍       | 16/66 [01:06<03:10,  3.81s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 26%|██▌       | 17/66 [01:10<03:03,  3.75s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 27%|██▋       | 18/66 [01:14<03:00,  3.77s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 29%|██▉       | 19/66 [01:17<02:56,  3.76s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 30%|███       | 20/66 [01:21<02:56,  3.84s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 32%|███▏      | 21/66 [01:27<03:11,  4.26s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 33%|███▎      | 22/66 [01:32<03:17,  4.50s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 35%|███▍      | 23/66 [01:37<03:21,  4.68s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 36%|███▋      | 24/66 [01:43<03:41,  5.27s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 38%|███▊      | 25/66 [01:50<03:47,  5.55s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 39%|███▉      | 26/66 [01:57<04:01,  6.04s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 41%|████      | 27/66 [02:04<04:11,  6.44s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 42%|████▏     | 28/66 [02:12<04:19,  6.83s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 44%|████▍     | 29/66 [02:20<04:21,  7.08s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 45%|████▌     | 30/66 [02:25<04:02,  6.74s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 47%|████▋     | 31/66 [02:29<03:26,  5.89s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 48%|████▊     | 32/66 [02:34<03:05,  5.46s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 50%|█████     | 33/66 [02:38<02:51,  5.21s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 52%|█████▏    | 34/66 [02:46<03:12,  6.02s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 53%|█████▎    | 35/66 [02:52<03:06,  6.02s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 55%|█████▍    | 36/66 [03:00<03:13,  6.45s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 56%|█████▌    | 37/66 [03:07<03:10,  6.58s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 58%|█████▊    | 38/66 [03:14<03:08,  6.73s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 59%|█████▉    | 39/66 [03:20<02:57,  6.57s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 61%|██████    | 40/66 [03:27<02:54,  6.71s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 62%|██████▏   | 41/66 [03:34<02:52,  6.90s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 64%|██████▎   | 42/66 [03:42<02:51,  7.13s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 65%|██████▌   | 43/66 [03:49<02:44,  7.16s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 67%|██████▋   | 44/66 [03:57<02:39,  7.26s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 68%|██████▊   | 45/66 [04:04<02:33,  7.31s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 70%|██████▉   | 46/66 [04:12<02:28,  7.41s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 71%|███████   | 47/66 [04:19<02:20,  7.42s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 73%|███████▎  | 48/66 [04:27<02:14,  7.48s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 74%|███████▍  | 49/66 [04:35<02:09,  7.61s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 76%|███████▌  | 50/66 [04:42<02:01,  7.57s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 77%|███████▋  | 51/66 [04:49<01:50,  7.38s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 79%|███████▉  | 52/66 [04:56<01:41,  7.23s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 80%|████████  | 53/66 [05:04<01:35,  7.34s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 82%|████████▏ | 54/66 [05:12<01:29,  7.49s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 83%|████████▎ | 55/66 [05:19<01:23,  7.59s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 85%|████████▍ | 56/66 [05:27<01:16,  7.68s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 86%|████████▋ | 57/66 [05:34<01:07,  7.50s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 88%|████████▊ | 58/66 [05:42<01:00,  7.60s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 89%|████████▉ | 59/66 [05:50<00:53,  7.59s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 91%|█████████ | 60/66 [05:57<00:45,  7.56s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 92%|█████████▏| 61/66 [06:05<00:38,  7.61s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 94%|█████████▍| 62/66 [06:11<00:28,  7.10s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 95%|█████████▌| 63/66 [06:16<00:19,  6.38s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 97%|█████████▋| 64/66 [06:20<00:11,  5.65s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 98%|█████████▊| 65/66 [06:23<00:05,  5.14s/it]

torch.Size([290, 104, 17])
torch.Size([290, 104])


100%|██████████| 66/66 [06:25<00:00,  5.85s/it]


Epochs: 2 | Train Loss: 0.00016232448201965         | Train Accuracy: 101.53893357164135



  0%|          | 0/66 [00:00<?, ?it/s]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  2%|▏         | 1/66 [00:03<04:08,  3.83s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  3%|▎         | 2/66 [00:07<03:58,  3.73s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  5%|▍         | 3/66 [00:11<03:54,  3.73s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  6%|▌         | 4/66 [00:14<03:48,  3.69s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  8%|▊         | 5/66 [00:18<03:47,  3.73s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


  9%|▉         | 6/66 [00:22<03:40,  3.68s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 11%|█         | 7/66 [00:26<03:58,  4.04s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 12%|█▏        | 8/66 [00:31<03:53,  4.03s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 14%|█▎        | 9/66 [00:34<03:40,  3.88s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 15%|█▌        | 10/66 [00:38<03:39,  3.91s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 17%|█▋        | 11/66 [00:42<03:40,  4.01s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 18%|█▊        | 12/66 [00:46<03:37,  4.03s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 20%|█▉        | 13/66 [00:51<03:40,  4.16s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 21%|██        | 14/66 [00:55<03:34,  4.12s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 23%|██▎       | 15/66 [00:59<03:25,  4.03s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 24%|██▍       | 16/66 [01:02<03:15,  3.91s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 26%|██▌       | 17/66 [01:06<03:08,  3.84s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 27%|██▋       | 18/66 [01:10<03:01,  3.79s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 29%|██▉       | 19/66 [01:13<02:56,  3.75s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 30%|███       | 20/66 [01:17<02:53,  3.78s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 32%|███▏      | 21/66 [01:22<03:03,  4.07s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 33%|███▎      | 22/66 [01:27<03:16,  4.48s/it]

torch.Size([512, 104, 17])
torch.Size([512, 104])


 33%|███▎      | 22/66 [01:32<03:04,  4.19s/it]


KeyboardInterrupt: 

#### Expected train accuracy after 5 epochs to be above 0.99

# Evaluation

In [None]:
def evaluate(model, test_dataset, batch_size=512):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = None

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with None:

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label =  test_label.to(device)


      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model(test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (output.argmax(dim =-1) == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= len(test_input)
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [None]:
evaluate(model, test_dataset)

#### Expected test accuracy to be above 0.98

# Thank you