# Assignment 4, task 1

In this task, we will have a final crack at the NER problem, using recurrent neural networks, or Gated Recurrent Units (GRUs) to be more exact.

We want to consider both the context of the word (the surrounding words) and the contents of the word (the letters and other symbols that make up the actual word). Therefore we are using two bi-directional GRUs, one world-level GRU for the words in the sentence, and one character-level GRU for the letters and other symbols in a word.

We will process one sentence at a time. Each hidden state vector in the word-level GRU represents that word in relation to the other words in the sentence, whereas the final state vector(s) in the character-level RNN represent morphological and typographical information about the word. We will concatenate these vectors to obtain a single information-rich representation of the word.


In [17]:
# First run this cell
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [18]:
# Run this cell to init mappings from characters to IDs and back again,
# from words to IDs and back again, and from labels to IDs and back again

UNKNOWN = '<UNK>'  # Unknown char or unknown word
CHARS = [UNKNOWN, '’', '—'] + \
    list(string.punctuation) + list(string.ascii_letters) + list(string.digits)
char_to_id = {c: i for i, c in enumerate(CHARS)}
PADDING_WORD = '<PAD>'
id_to_label = ['noname', 'name']


def label_to_id(label):
    return 0 if label == 'O' else 1

We want to have a vector representation of the syntactic and semantic properties of words, and in order to avoid having to train these from scratch, we are going to re-use pre-trained Glove vectors.


In [19]:
def load_glove_embeddings(embedding_file,
                          padding_word=PADDING_WORD,
                          unknown_word=UNKNOWN):
    """
    Reads Glove embeddings from a file.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).
    """
    word_to_id = {}  # Dictionary to store word-to-ID mapping
    word_to_id[padding_word] = 0
    word_to_id[unknown_word] = 1
    embeddings = []
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            word = data[0]
            vec = [float(x) for x in data[1:]]
            embeddings.append(vec)
            word_to_id[word] = len(word_to_id)
    D = len(embeddings[0])

    # <PAD> has an embedding of just zeros
    embeddings.insert(word_to_id[padding_word], [0]*D)
    # <UNK> has an embedding of just minus-ones
    embeddings.insert(word_to_id[unknown_word], [-1]*D)

    return D, word_to_id, embeddings

We can now create our dataset. Each datapoint will consist of a sentence and its associated labels for each word in the sentence. The label is either 1 (a name) or 0 (not a name).


In [20]:
class NERDataset(Dataset):
    """
    A class loading NER dataset from a CSV file to be used as an input 
    to PyTorch DataLoader.

    The CSV file has 4 fields: sentence number (only at the start of a new
    sentence), word, POS tag (ignored), and label.

    Datapoints are sentences + associated labels for each word. If the 
    words have not been seen before (i.e, they are not found in the 
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    """

    def __init__(self, filename, word_to_id):
        reader = csv.reader(codecs.open(filename, encoding='ascii',
                                        errors='ignore'), delimiter=',')

        self.sentences = []
        self.labels = []

        sentence, labels = [], []
        for row in reader:
            if row:
                if row[0].strip():  # A new sentence begins
                    if sentence and labels:
                        self.sentences.append(sentence)
                        self.labels.append(labels)
                    sentence = [row[1].strip()]
                    labels = [label_to_id(row[3].strip())]
                else:
                    sentence.append(row[1].strip())
                    labels.append(label_to_id(row[3].strip()))

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

In [21]:
# Let's check out some of these data structures
dim, word_to_id, embeddings = load_glove_embeddings(
    '/datasets/dd2417/glove.6B.50d.txt')
print("The embedding for the word 'good' looks like this:")
print(embeddings[word_to_id['good']])
print()

# Read the data we are going to use for testing the model
test_set = NERDataset('/datasets/dd2417/ner_test.csv', word_to_id)
print("There are", len(test_set), "data points in the test_set")
dp = 1600
sentence, labels = test_set[dp]
print("Data point", dp, "is", sentence)
print("It has the labels", labels)

The embedding for the word 'good' looks like this:
[-0.35586, 0.5213, -0.6107, -0.30131, 0.94862, -0.31539, -0.59831, 0.12188, -0.031943, 0.55695, -0.10621, 0.63399, -0.4734, -0.075895, 0.38247, 0.081569, 0.82214, 0.2222, -0.0083764, -0.7662, -0.56253, 0.61759, 0.20292, -0.048598, 0.87815, -1.6549, -0.77418, 0.15435, 0.94823, -0.3952, 3.7302, 0.82855, -0.14104, 0.016395, 0.21115, -0.036085, -0.15587, 0.86583, 0.26309, -0.71015, -0.03677, 0.0018282, -0.17704, 0.27032, 0.11026, 0.14133, -0.057322, 0.27207, 0.31305, 0.92771]

There are 4542 data points in the test_set
Data point 1600 is ['Isolated', 'grass', 'fires', 'continue', 'to', 'burn', 'in', 'the', 'southern', 'U.S.', 'states', 'of', 'Oklahoma', 'and', 'Texas', ',', 'but', 'they', 'have', 'weakened', 'since', 'killing', 'one', 'elderly', 'woman', 'and', 'scorching', 'dozens', 'of', 'homes', 'on', 'Tuesday', '.']
It has the labels [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [22]:
# Run this cell. The function below will take care of the case of
# sequences of unequal lengths.

def pad_sequence(batch, padding_word=PADDING_WORD, padding_label=0):
    batch_data, batch_labels = zip(*batch)
    max_len = max(map(len, batch_labels))
    padded_data = [[b[i] if i < len(b) else padding_word for i in range(
        max_len)] for b in batch_data]
    padded_labels = [[l[i] if i < len(l) else padding_label for i in range(
        max_len)] for l in batch_labels]
    return padded_data, padded_labels

In [23]:
# This is how it works
x = [([1, 2, 3], [0, 0, 1]), ([4, 5], [1, 0]), ([6, 7, 8, 9], [0, 1, 1, 0])]
pad_sequence(x)

([[1, 2, 3, '<PAD>'], [4, 5, '<PAD>', '<PAD>'], [6, 7, 8, 9]],
 [[0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 1, 0]])

Here is the actual classifier, as a class extending the Pytorch 'nn.Module' class. Your task is to write the forward function (look for "YOUR CODE HERE" below).


In [37]:
class NERClassifier(nn.Module):

    def __init__(self, word_embeddings,  # Pre-trained word embeddings
                 char_to_id,             # Mapping from chars to ids
                 word_to_id,             # Mapping from words to ids
                 char_emb_size=16,
                 char_hidden_size=25,    # Hidden size of the character-level biRNN
                 word_hidden_size=100,   # Hidden size of the word-level biRNN
                 padding_word=PADDING_WORD,
                 unknown_word=UNKNOWN,
                 char_bidirectional=True,
                 word_bidirectional=True,
                 device='cpu'
                 ):

        super(NERClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.char_to_id = char_to_id
        self.word_to_id = word_to_id
        self.char_emb_size = char_emb_size
        self.char_hidden_size = char_hidden_size
        self.word_hidden_size = word_hidden_size
        self.char_bidirectional = char_bidirectional
        self.word_bidirectional = word_bidirectional
        self.device = device

        # Create an embedding tensor for the words and import the Glove
        # embeddings. The embeddings are frozen (i.e., they will not be
        # updated during training).
        vocabulary_size = len(word_embeddings)
        self.word_emb_size = len(word_embeddings[0])

        self.word_emb = nn.Embedding(vocabulary_size, self.word_emb_size)
        self.word_emb.weight = nn.Parameter(torch.tensor(embeddings, dtype=torch.float),
                                            requires_grad=False)

        # Create an embedding tensor for character embeddings. These embeddings
        # are learnt from scratch (i.e., they are not frozen).
        if self.char_emb_size > 0:
            self.char_emb = nn.Embedding(len(char_to_id), char_emb_size)
            self.char_birnn = nn.GRU(
                self.char_emb_size,
                self.char_hidden_size,
                bidirectional=char_bidirectional,
                batch_first=True
            )
        else:
            self.char_hidden_size = 0

        multiplier = 2 if self.char_bidirectional else 1
        self.word_birnn = nn.GRU(
            self.word_emb_size + multiplier * self.char_hidden_size,  # input size
            self.word_hidden_size,
            bidirectional=word_bidirectional,
            batch_first=True
        )

        # Binary classification - 0 if not part of the name, 1 if a name
        multiplier = 2 if self.word_bidirectional else 1
        self.final_pred = nn.Linear(multiplier * self.word_hidden_size, 2)

    def forward(self, x):
        """
        Performs a forward pass of a NER classifier
        Takes as input a 2D list `x` of dimensionality (B, T),
        where B is the batch size;
              T is the max sentence length in the batch (shorter sentences
              are already padded with the special token <PAD>)

        Returns logits, i.e. the output of the last linear layer before applying softmax.

        :param      x:    A batch of sentences
        :type       x:    list of strings
        """

        # First find all word IDs of all words in all sentences in the batch
        # and the character IDs of all characters in all words in all sentences
        word_ids = [self.word_to_id[word if word in self.word_to_id else self.unknown_word] for seq in x for word in seq]
        char_ids = [[self.char_to_id[char] for char in word] for seq in x for word in seq]

        # The 'to(self.device)' below is necessary for making sure that
        # the model and the data are on the same device (CPU or CUDA).
        word_tensor = torch.tensor(word_ids).to(self.device)
        char_tensor = torch.tensor(char_ids).to(self.device)

        # YOUR CODE HERE

        # Get character-level embeddings and pass them through the character-level biRNN
        char_embeds = self.char_emb(char_tensor)
        char_embeds = torch.nn.functional.pad(char_embeds, (1, 0), "constant", 0)  # Pad on the left
        packed_char_embeds = torch.nn.utils.rnn.pack_padded_sequence(char_embeds, [len(seq) for seq in char_ids], batch_first=True)
        char_outputs, _ = self.char_birnn(packed_char_embeds)
        char_outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(char_outputs, batch_first=True)
        char_outputs = char_outputs[:, :, :self.char_hidden_size] + char_outputs[:, :, self.char_hidden_size:]  # Sum if bidirectional

        # Concatenate character-level and word-level embeddings and pass them through the word-level biRNN
        word_embeds = self.word_emb(word_tensor)
        combined_embeds = torch.cat((word_embeds, char_outputs), dim=-1)
        packed_combined_embeds = torch.nn.utils.rnn.pack_padded_sequence(combined_embeds, [len(seq) for seq in x], batch_first=True)
        word_outputs, _ = self.word_birnn(packed_combined_embeds)
        word_outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(word_outputs, batch_first=True)
        word_outputs = word_outputs[:, :, :self.word_hidden_size] + word_outputs[:, :, self.word_hidden_size:]  # Sum if bidirectional

        # Pass the outputs through the final linear layer to get the logits
        logits = self.final_pred(word_outputs)

        return logits

In [38]:
# ================== Hyper-parameters ==================== #

learning_rate = 0.001
epochs = 5

# ======================= Training ======================= #

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

dim, word_to_id, embeddings = load_glove_embeddings('/datasets/dd2417/glove.6B.50d.txt')
training_set = NERDataset('/datasets/dd2417/ner_training.csv', word_to_id)
training_loader = DataLoader(training_set, batch_size=128, collate_fn=pad_sequence)

ner = NERClassifier(embeddings, char_to_id, word_to_id, device=device).to(device)

optimizer = optim.Adam(ner.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

ner.train()
for epoch in range(epochs):   
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        optimizer.zero_grad()
        logits = ner(x)
            
        loss = criterion(logits.reshape(-1, logits.shape[2]), torch.tensor(y).to(device).reshape(-1,))
        loss.backward()
        
        clip_grad_norm_(ner.parameters(), 5)
        optimizer.step()


Running on cuda


Epoch 1:   0%|          | 0/340 [00:00<?, ?it/s]


ValueError: expected sequence of length 13 at dim 1 (got 8)

In [None]:
# Evaluation
import numpy as np
%pip install terminaltables
from terminaltables import AsciiTable

ner.eval()
confusion_matrix = [[0, 0],
                    [0, 0]]
test_set = NERDataset('/datasets/dd2417/ner_test.csv', word_to_id)
for x, y in test_set:
    pred = torch.argmax(ner([x]), dim=-1).detach().cpu().numpy().reshape(-1,)
    y = np.array(y)
    tp = np.sum(pred[y == 1])
    tn = np.sum(1 - pred[y == 0])
    fp = np.sum(1 - y[pred == 1])
    fn = np.sum(y[pred == 0])

    confusion_matrix[0][0] += tn
    confusion_matrix[1][1] += tp
        
    confusion_matrix[0][1] += fp
    confusion_matrix[1][0] += fn

    
table = [['', 'Predicted no name', 'Predicted name'],
             ['Real no name', confusion_matrix[0][0], confusion_matrix[0][1]],
             ['Real name', confusion_matrix[1][0], confusion_matrix[1][1]]]

t = AsciiTable(table)
print(t.table)
print("Accuracy: {}".format(
    round((confusion_matrix[0][0] + confusion_matrix[1][1]) / np.sum(confusion_matrix), 4))
)



Collecting terminaltables
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)
Installing collected packages: terminaltables
Successfully installed terminaltables-3.1.10
Note: you may need to restart the kernel to use updated packages.


NameError: name 'ner' is not defined