In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pprint

pp = pprint.PrettyPrinter()

### Mini Project: Word Window Classification

In this NLP task, we will train a model to recognize words within a sentence that correspond to the name of a `LOCATION`, e.g. in the sentence "I went to France last year", the word "France" is a LOCATION. We will use a window that scans over each word in a sentence and classifies the center word as either a LOCATION or not.

In [4]:
# raw dataset/corpus
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara"
]

### Pre-processing

We will tokenize each sentence into a list of words and make all words lowercase.

In [9]:
def preprocess_sentence(sentence):
    tokenized_sent = sentence.lower().split(" ")
    return tokenized_sent

# create pre-processed training instances
train_sentences = [preprocess_sentence(sentence) for sentence in corpus]
pp.pprint(train_sentences)

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]


In [11]:
# create labels for training instances, i.e. in each instance each word is labeled 1 if it's a location and 0 otherwise
locations = ['paris', 'australia', 'stanford', 'taiwan', 'turkey', 'ankara']

train_labels = [[1 if word in locations else 0 for word in sentence] for sentence in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

### Now we create the vocabulary set

In [13]:
vocabulary = set(word for sentence in train_sentences for word in sentence)
# also add an unknown token '<unk>' for out-of-vocab words
vocabulary.add("<unk>")

vocabulary

{'<unk>',
 'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

Pad the beginning of end of each sentence with a special token so that the window always contains the same number of words (even at beginning and end of a sentence)

In [14]:
# add a special padding token
vocabulary.add("<pad>")

# pad the beginning and end of each sentence
def pad_sentence(sentence, window_size, pad_token="<pad>"):
    padding = [pad_token]*window_size
    padded_sentence = padding + sentence + padding
    return padded_sentence

# example
window_size = 2
pad_sentence(train_sentences[0], window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

Index the words in our vocabulary

In [16]:
# sort the tokens so that padding token is the first one in the list
ind2word = sorted(list(vocabulary))
word2ind = {word:i for i,word in enumerate(ind2word)}
word2ind

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

Convert training sentences into sequences of word indices

In [20]:
def sentence_to_indices(sentence, word2ind):
    indices = [word2ind.get(token, word2ind["<unk>"]) for token in sentence]
    return indices

def indices_to_sentence(indices, ind2word):
    sentence = [ind2word[ix] for ix in indices]
    return sentence

# example
example_sentence = ["i", "live", "in", "argentina"]
example_indices = sentence_to_indices(example_sentence, word2ind)
sentence_restored = indices_to_sentence(example_indices, ind2word)

print(f"example sentence: {example_sentence}")
print(f"converted to indices: {example_indices}")
print(f"sentence restored from indices: {sentence_restored}")


example sentence: ['i', 'live', 'in', 'argentina']
converted to indices: [10, 13, 11, 1]
sentence restored from indices: ['i', 'live', 'in', '<unk>']


In [21]:
# now convert the training sentences to indices
train_indices = [sentence_to_indices(s, word2ind) for s in train_sentences]
train_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

### Creating embedding vectors using a pytorch `nn.Embedding` layer

In [23]:
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

# show the embedding layer parameters (randomly initialized at the moment), i.e. the embedding matrix in which each row is the embedding vector for the corresponding word from the vocabulary
list(embeds.parameters()) 

[Parameter containing:
 tensor([[-1.6204, -1.3235, -0.3489,  0.3866,  0.0549],
         [-0.4126,  1.9121,  0.5493, -0.0802, -0.7693],
         [ 1.6380,  2.0960, -0.9376, -0.2087,  0.3668],
         [ 0.0562,  1.3598, -1.4467, -1.1859, -0.0695],
         [ 0.6596,  0.4296, -0.7453, -0.9501,  0.1531],
         [-0.1683,  1.4943, -1.0681,  2.1593,  1.1089],
         [ 0.5112,  0.4445,  0.1515, -1.3416, -0.3067],
         [ 1.5297, -1.7061, -1.0710, -0.4877, -0.7624],
         [ 1.4409, -0.8704,  1.3974,  1.6310,  0.3263],
         [ 0.6451, -0.6095,  1.9080,  0.3958, -0.2163],
         [ 0.6885,  1.2283,  0.4844,  0.7660,  0.3550],
         [-0.6197, -0.0463, -0.2521, -0.2903, -0.6396],
         [-0.8334, -0.3446, -0.3361,  1.4775, -0.2071],
         [-1.9010, -0.6506, -1.7711, -1.4585, -0.9989],
         [ 1.3669,  0.9970, -0.0068,  0.1039,  1.9751],
         [ 0.3550,  0.4291, -0.1219, -1.6771,  0.7565],
         [-0.6452, -0.8114, -1.2496,  0.2212, -0.1379],
         [ 1.0280,  0.503

In [27]:
# to index into the embedding vector for a particular word, we need to use an index tensor of type torch.long
index = word2ind["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
print(paris_embed)

# can also get embeddings for multiple word indices
index_paris = word2ind["paris"]
index_ankara = word2ind["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
print(embeddings)


tensor([ 0.3550,  0.4291, -0.1219, -1.6771,  0.7565],
       grad_fn=<EmbeddingBackward0>)
tensor([[ 0.3550,  0.4291, -0.1219, -1.6771,  0.7565],
        [ 0.0562,  1.3598, -1.4467, -1.1859, -0.0695]],
       grad_fn=<EmbeddingBackward0>)


### Creating batches of sentences using the `torch.util.data.DataLoader` class

`DataLoader(data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)`

The `collate_fn` is a user defined function that can be used to print stats about each batch and/or do some extra pre-processing to the data

In [28]:
from torch.utils.data import DataLoader
from functools import partial

In [36]:
def _custom_collate_fn(batch, window_size, word2ind):
    # split batch into sentences and labels
    x, y = zip(*batch)

    # pad the beginngin and end of sentences for window and convert to indices
    x = [pad_sentence(s, window_size=window_size) for s in x]
    x = [sentence_to_indices(s, word2ind) for s in x]

    # convert each sequence of indices to pytorch tensor
    x = [torch.LongTensor(indices) for indices in x]

    # additional padding to make all sentences the same length using the pytorch nn.utils.rnn.pad_sequences function
    pad_token_ix = word2ind["<pad>"]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # also pad the labels and record original sentence length, i.e. length without padding
    lengths = torch.LongTensor([len(label) for label in y])
    y = [torch.LongTensor(label) for label in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_padded, y_padded, lengths

In [37]:
# dataloader parameters
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word2ind=word2ind)

# instantiate dataloader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# show the batches
counter = 0 
for batched_x, batched_y, batched_lengths in loader:
    print(f"Batch # {counter}")
    print("Batched input: ")
    print(batched_x)
    print("Batched labels: ")
    print(batched_y)
    print("Batched lengths: ")
    print(batched_lengths)
    print()
    counter += 1


Batch # 0
Batched input: 
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0],
        [ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched labels: 
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1]])
Batched lengths: 
tensor([5, 5])

Batch # 1
Batched input: 
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched labels: 
tensor([[0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 1]])
Batched lengths: 
tensor([4, 6])

Batch # 2
Batched input: 
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])
Batched labels: 
tensor([[0, 0, 0, 1]])
Batched lengths: 
tensor([4])



In [38]:
# creating all windows of an indexed sentence, i.e. all chunks of size 2*window_size +1
print(f"Original tensor: ")
print(batched_x)
print()

# we will use the unfold function of pytorch to create the chunks
chunks = batched_x.unfold(dimension=1, size=window_size*2+1, step=1)
print("Windows: ")
print(chunks)

Original tensor: 
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])

Windows: 
tensor([[[ 0,  0,  9,  7,  8],
         [ 0,  9,  7,  8, 18],
         [ 9,  7,  8, 18,  0],
         [ 7,  8, 18,  0,  0]]])


### Building the model as a custom module

In [49]:
class WordWindowClassifier(nn.Module):

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super(WordWindowClassifier, self).__init__()

        # instance variables 
        self.vocab_size = vocab_size
        self.pad_ix = pad_ix
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        # define layers of the neural network
        self.embed_layer = nn.Embedding(self.vocab_size, self.embed_dim, padding_idx=self.pad_ix)
        if self.freeze_embeddings:
            # make the embeddings untrainable if the freeze_embeddings setting is on
            self.embed_layer.weight.requires_grad=False

        # input of hidden layer is the contatenated embedding vectors of all words in the full window 
        full_window_size = 2*self.window_size+1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()    
        )

        # output layer for binary classification
        self.output_layer = nn.Sequential(
            nn.Linear(self.hidden_dim, 1),
            nn.Sigmoid()
        )

    # implement forward pass
    def forward(self, inputs): 
        # input is a batch of padded indexed sentences
        # i.e. a tensor of shape (B, L)
        # where B = batch_size and L = window padded sentence length

        B, L = inputs.shape
        S = 2*self.window_size+1

        # we need to create token windows for the sentences
        # converts input of shape (B, L) into shape (B, W, S)
        # where W=number of windows and S=full window length
        token_windows = inputs.unfold(1, S, 1)

        # do a sanity check to make sure the token_windows tensor has the right shape
        _, W , _ = token_windows.shape
        assert token_windows.shape == (B, W, S), "Error in token_windows!" 

        # convert token indices to embedding vectors
        # converts input of shape (B, W, S) into shape (B, W, S, D)
        # where D=embedding dims
        embedded_windows = self.embed_layer(token_windows)

        # reshape so that all embedding vectors of words within a windo get concatenated
        # converts input of shape (B, W, S, D) into shape (B, W, S*D)
        embedded_windows = embedded_windows.view(B, W, -1)

        # pass the inputs into hidden layer
        # converts input of shape (B, W, S*D) into shape (B, W, H)
        # where H=hidden dim
        hidden_layer_output = self.hidden_layer(embedded_windows)

        # pass to output layer 
        # converts input of shape (B, W, H) into shape (B, W, 1)
        output = self.output_layer(hidden_layer_output)

        # reshape output from (B, W, 1) into shape (B, W*1)
        output = output.view(B,-1)

        return output

### Training the model

In [50]:
# prep the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word2ind=word2ind)

# instantiate dataloader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# instantaite the model
model_hyperparameters = {
    "batch_size" : 4,
    "window_size" : 2,
    "embed_dim" : 25, 
    "hidden_dim" : 25,
    "freeze_embeddings" : False
}
vocab_size = len(ind2word)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# define optimizer
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# define a custom loss function
def loss_function(batch_outputs, batch_labels, batch_lengths):
    # compute binary crossentropy loss for whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # rescale the loss by diving by total number of words in batch (original/unpadded sentences)
    loss = loss / batch_lengths.sum().float()

    return loss    

### Training loop

In [51]:
def train_epoch(loss_function, optimizer, model, loader):
    
    total_batch_loss = 0.0
    for batch_inputs, batch_labels, batch_lengths in loader:
        # clear the gradients
        optimizer.zero_grad()
        # forward pass
        batch_outputs = model(batch_inputs)
        # compute loss
        loss = loss_function(batch_outputs, batch_labels, batch_lengths)
        # backpropagate the grads
        loss.backward()
        # optimizer step to update the parameters
        optimizer.step()
        total_batch_loss += loss.item()

    return total_batch_loss

def train(loss_function, optimizer, model, loader, num_epochs=10000):

    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loss_function, optimizer, model, loader)
        if epoch % 100 == 0:
            print(f"Epoch# {epoch}, loss = {epoch_loss}")    

Now train the model

In [52]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

Epoch# 0, loss = 0.3357775807380676
Epoch# 100, loss = 0.2630639150738716
Epoch# 200, loss = 0.2061525546014309
Epoch# 300, loss = 0.15910308063030243
Epoch# 400, loss = 0.12577852047979832
Epoch# 500, loss = 0.10344464145600796
Epoch# 600, loss = 0.0634592603892088
Epoch# 700, loss = 0.05704061035066843
Epoch# 800, loss = 0.045995863154530525
Epoch# 900, loss = 0.03718583658337593


### Making predictions on test data using the trained model

In [55]:
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0,0,0,1]]

# create test dataset loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(_custom_collate_fn, window_size=window_size, word2ind=word2ind)

# instantiate dataloader
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

In [57]:
for test_instance, labels, _ in test_loader:
    # compute predictions
    outputs = model(test_instance)
    print(f"Predictions: {outputs}")
    print(f"Ground truth: {labels}")    

Predictions: tensor([[0.0270, 0.0695, 0.0371, 0.9002]], grad_fn=<ViewBackward0>)
Ground truth: tensor([[0, 0, 0, 1]])


It works!