# Assignment 3
Training a simple neural named entity recognizer (NER)

In [None]:
import torch
import torch.nn as nn

# our additional imports:
import numpy as np
from random import shuffle
from sklearn.metrics import classification_report

# DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In this assignment you are required to build a full training and testing pipeline for a neural sequential tagger for named entities, using LSTM.

The dataset that you will be working on is called ReCoNLL 2003, which is a corrected version of the CoNLL 2003 dataset: https://www.clips.uantwerpen.be/conll2003/ner/


The three files (train, test and eval) are available from the course git repository (https://github.com/kfirbar/nlp-course)

As you can see, the annotated texts are labeled according to the IOB annotation scheme, for 3 entity types: Person, Organization, Location.

**Task 1:** Write a function *read_data* for reading the data from a single file (either train, test or eval). This function receives a filepath and returns a list of sentences. Every sentence is encoded as a pair of lists: one list contains the words and the other contains the labels.

In [None]:
# Clone the course repository; the connl03_{train,test,dev}.txt files read in Task 1 live in it.
!git clone https://github.com/kfirbar/nlp-course

Cloning into 'nlp-course'...
remote: Enumerating objects: 71, done.[K
remote: Counting objects: 100% (71/71), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 71 (delta 29), reused 40 (delta 11), pack-reused 0[K
Unpacking objects: 100% (71/71), done.


In [None]:
def read_data(filepath):
    """Read a two-column, IOB-tagged NER file into a list of sentences.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``word label``. Blank (or whitespace-only) lines separate sentences.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list with one ``(words, labels)`` pair per sentence, where ``words``
        and ``labels`` are equally long lists of strings. Empty sentences are
        never emitted, and the final sentence is kept even when the file does
        not end with a blank line.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    data = []
    words = []
    labels = []

    with open(filepath, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()

            if fields:
                if len(fields) != 2:
                    raise ValueError(
                        f"{filepath}:{line_number}: expected 'word label', got {line!r}"
                    )
                word, label = fields
                words.append(word)
                labels.append(label)
            elif words:
                # Sentence boundary; runs of blank lines must not yield empty sentences.
                data.append((words, labels))
                words = []
                labels = []

    # The file may end without a trailing blank line: flush the pending sentence.
    if words:
        data.append((words, labels))

    return data


# Load the three splits (in the order train, test, dev) from the cloned course repository.
train, test, dev = (
    read_data(split_path)
    for split_path in (
        '/content/nlp-course/connl03_train.txt',
        '/content/nlp-course/connl03_test.txt',
        '/content/nlp-course/connl03_dev.txt',
    )
)

The following Vocab class can serve as a dictionary that maps words and tags into IDs. The UNK_TOKEN should be used for words that are not part of the training data.

In [None]:
UNK_TOKEN = 0


class Vocab:
    """Bidirectional word<->id and tag<->id mappings for the NER tagger.

    Word ids are handed out in first-seen order, starting at 1; id
    ``UNK_TOKEN`` (0) is reserved for the ``"__unk__"`` placeholder.
    The tag set is fixed to the seven IOB tags for PER / LOC / ORG.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # word -> id and id -> word; always kept in sync by index_word.
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        # Number of known words; also the id the next new word will receive.
        self.n_words = 1

        self.tag2id = {"O":0, "B-PER":1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
        # Derived from tag2id so the two tag tables cannot drift apart.
        self.id2tag = {tag_id: tag for tag, tag_id in self.tag2id.items()}


    def index_words(self, words, add_new=True):
        """Map a sequence of words to their ids.

        Args:
            words: Iterable of word strings.
            add_new: Forwarded to ``index_word`` for every word.

        Returns:
            A list of integer ids, one per word.
        """
        return [self.index_word(w, add_new) for w in words]


    def index_tags(self, tags):
        """Map a sequence of tag strings to their ids.

        Raises:
            KeyError: If a tag is not one of the seven known tags.
        """
        return [self.tag2id[t] for t in tags]


    def index_word(self, w, add_new=True):
        """Return the id of ``w``.

        Args:
            w: The word to look up.
            add_new: When True (the default, and the historical behaviour) an
                unseen word is registered and receives a fresh id. When False
                the vocabulary is left untouched and ``UNK_TOKEN`` is returned
                for unseen words, which is what is needed to treat words
                outside the training data as unknown.

        Returns:
            The integer id of the word.
        """
        word_id = self.word2id.get(w)
        if word_id is not None:
            return word_id

        if not add_new:
            return UNK_TOKEN

        word_id = self.n_words
        self.word2id[w] = word_id
        self.id2word[word_id] = w
        self.n_words += 1
        return word_id

**Task 2:** Write a function *prepare_data* that takes one of the [train, dev, test] and the Vocab instance, for converting each pair of (words,labels) to a pair of indexes (from Vocab). Each pair should be added to *data_sequences*, which is returned back from the function.

In [None]:
# One shared vocabulary instance; it is passed to every prepare_data call below.
vocab = Vocab()


def prepare_data(data, vocab):
    """Convert ``(words, labels)`` sentences into pairs of index tensors.

    Args:
        data: Iterable of ``(words, labels)`` pairs.
        vocab: Object providing ``index_words`` and ``index_tags``.

    Returns:
        A ``(data_sequences, vocab)`` tuple. ``data_sequences`` holds one
        ``(word_ids, tag_ids)`` pair of ``torch.long`` tensors per sentence;
        ``vocab`` is the very object that was passed in.
    """

    def as_long_tensor(indexes):
        # Sequences are stored as tensors (not plain lists) so they can be fed to the model directly.
        return torch.tensor(indexes, dtype=torch.long)

    data_sequences = [
        (as_long_tensor(vocab.index_words(sentence_words)),
         as_long_tensor(vocab.index_tags(sentence_labels)))
        for sentence_words, sentence_labels in data
    ]

    return data_sequences, vocab


# Index the three splits with the shared vocab. The call order matters: Vocab hands out
# word ids in first-seen order, so the training words are numbered first.
# NOTE(review): Vocab.index_word registers every unseen word, so words that occur only in
# dev/test receive ids of their own here instead of UNK_TOKEN - confirm this is intended.
train_sequences, vocab = prepare_data(train, vocab)
dev_sequences, vocab = prepare_data(dev, vocab)
test_sequences, vocab = prepare_data(test, vocab)

**Task 3:** Write NERNet, a PyTorch Module for labeling words with NER tags. 

*input_size:* the size of the vocabulary

*embedding_size:* the size of the embeddings

*hidden_size:* the LSTM hidden size

*output_size:* the number of tags we are predicting for

*n_layers:* the number of layers we want to use in LSTM

*directions:* can be 1 or 2, indicating unidirectional or bidirectional LSTM, respectively

The input for your forward function should be a single sentence tensor.

In [None]:
class NERNet(nn.Module):
    """LSTM tagger: embedding -> (bi)LSTM -> linear layer scoring every tag.

    Args:
        input_size: Size of the vocabulary (number of embedding rows).
        embedding_size: Dimension of the word embeddings.
        hidden_size: Hidden size of the LSTM.
        output_size: Number of tags to score.
        n_layers: Number of stacked LSTM layers.
        directions: 2 for a bidirectional LSTM, otherwise unidirectional.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions

        use_both_directions = directions == 2
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=use_both_directions)
        # A bidirectional LSTM concatenates both directions, hence the factor.
        self.out = nn.Linear(hidden_size * directions, output_size)

    def forward(self, input_sentence):
        """Score every tag for every word of a single sentence.

        Args:
            input_sentence: 1-D tensor of word ids.

        Returns:
            Tensor of shape ``(len(input_sentence), output_size)`` with
            unnormalised tag scores.
        """
        seq_len = len(input_sentence)

        # (seq_len, emb) -> (seq_len, 1, emb): the LSTM sees a batch holding one sentence.
        lstm_input = self.embedding(input_sentence).view(seq_len, 1, -1)

        # No initial state is passed, so h_0 and c_0 default to zeros.
        lstm_output, _ = self.lstm(lstm_input)

        # Drop the batch axis again and project each time step onto the tag scores.
        return self.out(lstm_output.view(seq_len, -1))

**Task 4:** write a training loop, which takes a model (instance of NERNet) and number of epochs to train on. The loss is always CrossEntropyLoss and the optimizer is always Adam.

In [None]:
def train_loop(model, n_epochs):
    """Train ``model`` in place on the global ``train_sequences``.

    One optimisation step is taken per sentence, using CrossEntropyLoss and
    Adam (lr=0.0001). Inputs are moved to whatever device the model's
    parameters live on, so the loop works on CPU as well as on GPU.

    Args:
        model: Module mapping a 1-D tensor of word ids to a
            ``(n_words, n_tags)`` score tensor.
        n_epochs: Number of passes over the training data.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer (ADAM is a fancy version of SGD)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Follow the model instead of assuming a GPU is present.
    device = next(model.parameters()).device

    # Shuffle a private copy so the shared global list keeps its order.
    sequences = list(train_sequences)

    model.train()
    for _ in range(n_epochs):
        # A fresh order every epoch, not only once before training.
        shuffle(sequences)

        for sentence, labels in sequences:
            # Empty sentences carry no training signal and cannot be reshaped by the model.
            if len(sentence) == 0:
                continue

            sentence_tensor = torch.as_tensor(sentence, dtype=torch.long, device=device)
            labels_tensor = torch.as_tensor(labels, dtype=torch.long, device=device)

            model.zero_grad()
            scores = model(sentence_tensor)
            criterion(scores, labels_tensor).backward()
            optimizer.step()

**Task 5:** Write an evaluation loop on a trained model, using the dev and test datasets. This function prints the true positive rate (TPR), also known as recall, and the opposite to false positive rate (FPR), also known as precision, of each label separately (7 labels in total), and for all the 6 labels (except O) together. The caption argument of the function is used for printing: include it as a prefix of whatever you print.

In [None]:
def _predict_tags(model, sequences, device):
    """Return ``(true_tags, predicted_tags)`` as flat lists of tag ids over all words in ``sequences``."""
    true_tags = []
    predicted_tags = []

    for sentence, labels in sequences:
        # Nothing to score; skipping keeps both lists aligned.
        if len(sentence) == 0:
            continue

        sentence_tensor = torch.as_tensor(sentence, dtype=torch.long, device=device)
        predicted_tags += model(sentence_tensor).argmax(dim=1).tolist()
        true_tags += torch.as_tensor(labels).tolist()

    return true_tags, predicted_tags


def _to_binary(tag_ids):
    """Collapse tag ids to 0 for "O" and 1 for any entity tag."""
    return [1 if tag_id >= 1 else 0 for tag_id in tag_ids]


def evaluate(model, caption):
    """Print per-tag and entity-vs-O classification reports for the test and dev sets.

    Reads the global ``test_sequences`` and ``dev_sequences``. Predictions are
    made without gradient tracking and with the model in eval mode; the
    model's previous train/eval mode is restored afterwards.

    Args:
        model: Module mapping a 1-D tensor of word ids to a
            ``(n_words, n_tags)`` score tensor.
        caption: Text identifying the model in the printed header.

    Returns:
        None. Four reports are printed: all tags (test, dev) and the binary
        "O" vs. "OTHERS" view (test, dev).
    """
    # from Piazza: https://piazza.com/class/klxc3m1tzqz2o8?cid=59

    all_target_names = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
    binary_target_names = ["O", "OTHERS"]
    # Explicit label ids keep the report rows aligned with the names even if a tag never occurs.
    all_label_ids = list(range(len(all_target_names)))
    binary_label_ids = list(range(len(binary_target_names)))

    print(f"****************    Results for {caption}    ****************")

    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            all_test_words_true, all_test_words_pred = _predict_tags(model, test_sequences, device)
            all_dev_words_true, all_dev_words_pred = _predict_tags(model, dev_sequences, device)
    finally:
        model.train(was_training)

    # Built once from the complete lists, so every word is counted exactly once.
    binary_test_words_true = _to_binary(all_test_words_true)
    binary_test_words_pred = _to_binary(all_test_words_pred)
    binary_dev_words_true = _to_binary(all_dev_words_true)
    binary_dev_words_pred = _to_binary(all_dev_words_pred)

    print("ALL Test Results:")
    print(classification_report(all_test_words_true, all_test_words_pred,
                                labels=all_label_ids, target_names=all_target_names))

    print("ALL Dev Results:")
    print(classification_report(all_dev_words_true, all_dev_words_pred,
                                labels=all_label_ids, target_names=all_target_names))

    print("BINARY Test Results:")
    print(classification_report(binary_test_words_true, binary_test_words_pred,
                                labels=binary_label_ids, target_names=binary_target_names))

    print("BINARY Dev Results:")
    print(classification_report(binary_dev_words_true, binary_dev_words_pred,
                                labels=binary_label_ids, target_names=binary_target_names))

**Task 6:** Train and evaluate a few models, all with embedding_size=300, and with the following hyper parameters (you may use that as captions for the models as well):

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)

Model 2: (hidden_size: 500, n_layers: 2, directions: 1)

Model 3: (hidden_size: 500, n_layers: 3, directions: 1)

Model 4: (hidden_size: 500, n_layers: 1, directions: 2)

Model 5: (hidden_size: 500, n_layers: 2, directions: 2)

Model 6: (hidden_size: 500, n_layers: 3, directions: 2)

Model 7: (hidden_size: 800, n_layers: 1, directions: 2)

Model 8: (hidden_size: 800, n_layers: 2, directions: 2)

Model 9: (hidden_size: 800, n_layers: 3, directions: 2)

In [None]:
# Train the nine baseline models (randomly initialised embeddings) of Task 6.
EMBEDDING_SIZE = 300
EPOCHS = 10
INPUT_SIZE = len(vocab.word2id) # 8955
OUTPUT_SIZE = len(vocab.tag2id) # 7

# Use the GPU when there is one; otherwise fall back to the CPU instead of failing in .cuda().
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# (hidden_size, n_layers, directions) for Model 1 .. Model 9, in the order listed in the task.
MODEL_CONFIGS = [
    (500, 1, 1),
    (500, 2, 1),
    (500, 3, 1),
    (500, 1, 2),
    (500, 2, 2),
    (500, 3, 2),
    (800, 1, 2),
    (800, 2, 2),
    (800, 3, 2),
]

# Each model is built and trained before the next one is created, as before.
models = []
for hidden_size, n_layers, directions in MODEL_CONFIGS:
    model = NERNet(INPUT_SIZE, EMBEDDING_SIZE, hidden_size, OUTPUT_SIZE, n_layers, directions).to(DEVICE)
    train_loop(model, EPOCHS)
    models.append(model)

# Keep the individual names available for any cell that refers to a specific model.
(model_1, model_2, model_3, model_4, model_5,
 model_6, model_7, model_8, model_9) = models

In [None]:
# Report test/dev metrics for every baseline model, captioned by its 1-based position.
for i, model in enumerate(models, start=1):
    model_name = f"model_{i}"
    evaluate(model, caption=model_name)

****************    Results for model_1    ****************
ALL Test Results:
              precision    recall  f1-score   support

           O       0.92      0.97      0.94      6567
       B-PER       0.72      0.62      0.67       434
       I-PER       0.79      0.71      0.75       296
       B-LOC       0.82      0.69      0.75       343
       I-LOC       0.85      0.62      0.72        53
       B-ORG       0.58      0.54      0.56       350
       I-ORG       0.66      0.30      0.42       200

    accuracy                           0.89      8243
   macro avg       0.76      0.64      0.69      8243
weighted avg       0.88      0.89      0.88      8243

ALL Dev Results:
              precision    recall  f1-score   support

           O       0.92      0.97      0.94      3096
       B-PER       0.75      0.66      0.70       200
       I-PER       0.89      0.69      0.77       157
       B-LOC       0.77      0.67      0.72       183
       I-LOC       1.00      0.43    

**Task 7:** Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ (use the 300-dim vectors from glove.6B.zip). Then initialize the nn.Embedding module in your NERNet with these embeddings, so that you can start your training with pre-trained vectors. Repeat Task 6 and print the results for each model.

Note: make sure that the vectors are aligned with the IDs in your Vocab; in other words, make sure that, for example, the word with ID 0 is the first vector in the GloVe matrix of vectors that you initialize nn.Embedding with. For a discussion on how to do that, see this link:
https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222

In [None]:
# TODO - your code goes here...
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
GLOVE_PATH = 'glove.6B.300d.txt'

--2021-06-20 18:10:05--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-06-20 18:10:05--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-06-20 18:10:06--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# Sanity check: list the working directory to confirm the GloVe text files were extracted.
!ls

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip  sample_data
glove.6B.200d.txt  glove.6B.50d.txt   nlp-course


In [None]:
def get_glove_pre_trained_embeddings_weights(input_size, embedding_size, word2id=None, glove_path=None):
    """Build an embedding matrix whose rows follow the vocabulary ids, filled from a GloVe file.

    Every line of the GloVe file is inspected (the file is streamed, not loaded
    at once). A vocabulary word receives the vector of the identically spelled
    GloVe token; a word without an exact match falls back to the vector of its
    lower-cased form, since the glove.6B vectors are uncased while the NER
    vocabulary is not. Rows of words found in neither way stay all-zero.

    Args:
        input_size: Number of rows of the matrix (vocabulary size).
        embedding_size: Number of columns (dimension of the GloVe vectors).
        word2id: Mapping from word to row index. Defaults to the current
            ``vocab.word2id``, looked up at call time.
        glove_path: Path of the GloVe text file. Defaults to ``GLOVE_PATH``.

    Returns:
        A ``torch.float32`` tensor of shape ``(input_size, embedding_size)``.

    Raises:
        ValueError: If a matching GloVe line does not carry exactly
            ``embedding_size`` values.
    """
    # Resolve the defaults at call time so a re-created vocab is picked up.
    if word2id is None:
        word2id = vocab.word2id
    if glove_path is None:
        glove_path = GLOVE_PATH

    weights = np.zeros((input_size, embedding_size), dtype=np.float32)

    # lower-cased spelling -> ids of the cased vocabulary words that may fall back to it
    fallback_ids_by_lowercase = {}
    for word, word_id in word2id.items():
        lowered = word.lower()
        if lowered != word:
            fallback_ids_by_lowercase.setdefault(lowered, []).append(word_id)

    # Ids already filled from an exact match; a fallback must never overwrite those.
    exact_ids = set()

    with open(glove_path, encoding='utf-8') as glove:
        for line_number, line in enumerate(glove, start=1):
            token, *values = line.rstrip().split(' ')

            exact_id = word2id.get(token)
            fallback_ids = [
                word_id
                for word_id in fallback_ids_by_lowercase.get(token, ())
                if word_id not in exact_ids
            ]
            if exact_id is None and not fallback_ids:
                continue

            if len(values) != embedding_size:
                raise ValueError(
                    f"{glove_path}:{line_number}: expected {embedding_size} values "
                    f"for {token!r}, found {len(values)}"
                )
            vector = np.asarray(values, dtype=np.float32)

            for word_id in fallback_ids:
                weights[word_id] = vector
            # `is not None` rather than truthiness: id 0 is a valid row index.
            if exact_id is not None:
                weights[exact_id] = vector
                exact_ids.add(exact_id)

    return torch.from_numpy(weights)

In [None]:
class GloveNERNet(nn.Module):
    """LSTM tagger whose embedding layer starts from pre-trained GloVe vectors.

    Same architecture as the plain tagger (embedding -> (bi)LSTM -> linear);
    only the initial embedding weights differ. The embeddings stay trainable.

    Args:
        input_size: Size of the vocabulary (number of embedding rows).
        embedding_size: Dimension of the word embeddings.
        hidden_size: Hidden size of the LSTM.
        output_size: Number of tags to score.
        n_layers: Number of stacked LSTM layers.
        directions: 2 for a bidirectional LSTM, otherwise unidirectional.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super(GloveNERNet, self).__init__()

        # Build the layer straight from the GloVe matrix instead of creating a
        # randomly initialised one and overwriting it; freeze=False keeps the
        # vectors trainable.
        pre_trained_weights = get_glove_pre_trained_embeddings_weights(input_size, embedding_size)
        self.embedding = nn.Embedding.from_pretrained(pre_trained_weights, freeze=False)

        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=(directions == 2))
        # A bidirectional LSTM concatenates both directions, hence the factor.
        self.out = nn.Linear(hidden_size*directions, output_size)

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions


    def forward(self, input_sentence):
        """Score every tag for every word of a single sentence.

        Args:
            input_sentence: 1-D tensor of word ids.

        Returns:
            Tensor of shape ``(len(input_sentence), output_size)`` with
            unnormalised tag scores.
        """
        dimension = len(input_sentence)

        # 1. embed the sentence
        embedded = self.embedding(input_sentence)

        # 2. give the embedding to LSTM as a batch holding one sentence.
        # "If (h_0, c_0) is not provided, both h_0 and c_0 default to zero" (https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
        lstm_output, _ = self.lstm(embedded.view(dimension, 1, -1))

        # 3. drop the batch axis and project every time step onto the tag scores
        output = self.out(lstm_output.view(dimension, -1))

        return output

In [None]:
# Train the same nine configurations again, this time starting from the GloVe embeddings.
EMBEDDING_SIZE = 300
EPOCHS = 10
INPUT_SIZE = len(vocab.word2id) # 8955
OUTPUT_SIZE = len(vocab.tag2id) # 7

# Use the GPU when there is one; otherwise fall back to the CPU instead of failing in .cuda().
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# (hidden_size, n_layers, directions) for Model 1 .. Model 9, in the order listed in the task.
MODEL_CONFIGS = [
    (500, 1, 1),
    (500, 2, 1),
    (500, 3, 1),
    (500, 1, 2),
    (500, 2, 2),
    (500, 3, 2),
    (800, 1, 2),
    (800, 2, 2),
    (800, 3, 2),
]

# Each model is built and trained before the next one is created, as before.
models_glove = []
for hidden_size, n_layers, directions in MODEL_CONFIGS:
    model_glove = GloveNERNet(INPUT_SIZE, EMBEDDING_SIZE, hidden_size, OUTPUT_SIZE, n_layers, directions).to(DEVICE)
    train_loop(model_glove, EPOCHS)
    models_glove.append(model_glove)

# Keep the individual names available for any cell that refers to a specific model.
(model_glove_1, model_glove_2, model_glove_3, model_glove_4, model_glove_5,
 model_glove_6, model_glove_7, model_glove_8, model_glove_9) = models_glove

In [None]:
# Report test/dev metrics for every GloVe-initialised model, captioned by its 1-based position.
for i, model in enumerate(models_glove, start=1):
    model_name = f"model_glove_{i}"
    evaluate(model, caption=model_name)

****************    Results for model_glove_1    ****************
ALL Test Results:
              precision    recall  f1-score   support

           O       0.98      0.90      0.94      6567
       B-PER       0.88      0.69      0.78       434
       I-PER       0.86      0.68      0.76       296
       B-LOC       0.81      0.79      0.80       343
       I-LOC       0.66      0.72      0.68        53
       B-ORG       0.48      0.80      0.60       350
       I-ORG       0.25      0.83      0.38       200

    accuracy                           0.87      8243
   macro avg       0.70      0.77      0.71      8243
weighted avg       0.92      0.87      0.89      8243

ALL Dev Results:
              precision    recall  f1-score   support

           O       0.98      0.91      0.95      3096
       B-PER       0.89      0.72      0.80       200
       I-PER       0.87      0.66      0.75       157
       B-LOC       0.85      0.85      0.85       183
       I-LOC       0.67      0.

**Good luck!**

In [None]:
# Thank you! :)