In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

#### *Toy example (model needs to find words corresponding to the location)*
---

##### Data & Preprocessing

In [2]:
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

    - tokenization
    - stop words removal
    - lowercasing
    - noise removal

In [3]:
def preprocessing(sent):
    return sent.lower().split()
 
train_sentences = [preprocessing(sentence) for sentence in corpus]

In [4]:
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

    location set:

In [5]:
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])

train_labels = [[1 if word in locations else 0 for word in sentence] for sentence in train_sentences]

In [6]:
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

    vocabulary from sentences:

In [7]:
vocab = set(word for sentence in train_sentences for word in sentence)

vocab

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [8]:
vocab.add("<unk>")

    Padding for window classification

In [9]:
vocab.add('<pad>')

In [10]:
def add_padding(sentence, window_size, pad='<pad>'):
    window = [pad] * window_size
    return window + sentence + window

In [11]:
words = sorted(list(vocab))

words_to_indx = {word:indx for indx, word in enumerate(words)}

In [12]:
words_to_indx

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

    converting sentences into sequence of indices

In [13]:
def sent_to_ind(sentence, words_indexes):
    indexed_sentence = []
    for word in sentence:
        if word in words_indexes:
            index = words_to_indx[word]
        else:
            index = words_to_indx['<unk>']
        indexed_sentence.append(index)

    return indexed_sentence

In [14]:
indexed_sent = sent_to_ind(train_sentences[0], words_to_indx)
restored_sent = [words[indx] for indx in indexed_sent]

print('indexed sentence from train corpus: ', indexed_sent)
print('from indecis to words: ', restored_sent)

indexed sentence from train corpus:  [22, 2, 6, 20, 15]
from indecis to words:  ['we', 'always', 'come', 'to', 'paris']


In [15]:
train_indecies = [sent_to_ind(sentence, words_to_indx) for sentence in train_sentences]
train_indecies

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [16]:
import pprint

In [17]:
print('Train tokens:')
pprint.pprint(train_sentences)
print('-------------')

print('Train labels for locations:')
pprint.pprint(train_labels)
print('-------------')

print('Train sentences turned into sequence of indecies:')
pprint.pprint(train_indecies)
print('-------------')

Train tokens:
[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]
-------------
Train labels for locations:
[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]
-------------
Train sentences turned into sequence of indecies:
[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]
-------------


##### Batching

In [18]:
from torch.utils.data import DataLoader
from functools import partial

In [19]:
def custom_collate_fn(batch, window_size, words_to_indx):
    x, y = zip(*batch)

    #add padding to x
    x = [add_padding(sentence, window_size) for sentence in x]

    #convert sentences into indecies
    x = [sent_to_ind(sentence, words_to_indx) for sentence in x]

    #make sure that the lengths of all the example in one batch are the same
    pad_token = words_to_indx['<pad>']
    x = [torch.LongTensor(indx_sent) for indx_sent in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token)

    #save initial sizes
    lengths = [len(label) for label in y]
    lengths = torch.LongTensor(lengths)

    #add padding to y
    y = [torch.LongTensor(i) for i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_padded, y_padded, lengths


In [20]:
data = list(zip(train_sentences, train_labels))

In [21]:
pprint.pprint(data)

[(['we', 'always', 'come', 'to', 'paris'], [0, 0, 0, 0, 1]),
 (['the', 'professor', 'is', 'from', 'australia'], [0, 0, 0, 0, 1]),
 (['i', 'live', 'in', 'stanford'], [0, 0, 0, 1]),
 (['he', 'comes', 'from', 'taiwan'], [0, 0, 0, 1]),
 (['the', 'capital', 'of', 'turkey', 'is', 'ankara'], [0, 0, 0, 1, 0, 1])]


In [22]:
batch_size = 2
shuffle = True
window_size = 2 
collate_fn = partial(custom_collate_fn, window_size=window_size, words_to_indx=words_to_indx)

loader = DataLoader(
    data,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn
)

counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"iteration {counter}")
  print("batched input:")
  print(batched_x)
  print("batched labels:")
  print(batched_y)
  print("batched lengths:")
  print(batched_lengths)
  print("")
  counter += 1

iteration 0
batched input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
batched labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]])
batched lengths:
tensor([5, 6])

iteration 1
batched input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0],
        [ 0,  0,  9,  7,  8, 18,  0,  0]])
batched labels:
tensor([[0, 0, 0, 1],
        [0, 0, 0, 1]])
batched lengths:
tensor([4, 4])

iteration 2
batched input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
batched labels:
tensor([[0, 0, 0, 0, 1]])
batched lengths:
tensor([5])



##### Model

In [23]:
class WordWindowClassifier(nn.Module):
    
    def __init__(self, hyperparameters, vocab_size, pad_index=0):
        super(WordWindowClassifier, self).__init__()
        self.window_size = hyperparameters['window_size']
        self.embed_dim = hyperparameters['embed_dim']
        self.hidden_dim = hyperparameters['hidden_dim']
        self.freeze_embed = hyperparameters['freeze_embed']
        
        # 1. Embedding Layer
        self.embed = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_index)
        if self.freeze_embed == True:
            self.embed.weight.requires_grad = False

        # 2. Hidden Layer
        full_window_size = 2 * window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )

        # 3. Output Layer
        self.output_layer = nn.Linear(self.hidden_dim, 1)

        # 4. Probability
        self.probabilities = nn.Sigmoid()



    def forward(self, input):
        # SIZE1 = (BATCH, SENTENCE LENGTH)

        batch_size, sentence_length = input.size()

        words_windows = input.unfold(1, 2 * self.window_size + 1, 1)
                                    # (dimension, size, step)
        _, adjusted_length, _ = words_windows.size()

        # SIZE2 = (BATCH, WINDOWS QUANTITY, WINDOW SIZE)

        # check size
        assert words_windows.size() == (batch_size, adjusted_length, 2 * self.window_size + 1)

        # embedding 
        embedded_windows = self.embed(words_windows)

        # SIZE3 = (BATCH, WINDOWS QUANTITY, WINDOW SIZE, EMBEDDINGS)

        #reshaping for linear layer
        embedded_windows = embedded_windows.view(batch_size, adjusted_length, -1)

        # SIZE4 = (BATCH, WINDOWS QUANTITY, WINDOW SIZE * EMBEDDINGS)

        #hidden layer
        hid_layer = self.hidden_layer(embedded_windows)

        # SIZE5 = (BATCH, WINDOWS QUANTITY, HIDDEN)

        #output layer
        output = self.output_layer(hid_layer)

        # SIZE6 = (BATCH, WINDOWS QUANTITY, 1)

        #probabilities layer
        output = self.probabilities(output)
        output = output.view(batch_size, -1)

        # SIZE6 = (BATCH, WINDOWS QUANTITY * 1)  
        return output

##### Training

In [24]:
data = list(zip(train_sentences, train_labels))
shuffle = True
window_size = 2
batch_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, words_to_indx=words_to_indx)

loader = DataLoader(
    dataset=data,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn
)

model_hyperparameters = {
    'batch_size': 4,
    'window_size': 2,
    'embed_dim': 25,
    'hidden_dim': 25,
    'freeze_embed': False
}

vocab_size = len(words_to_indx)

# model
model = WordWindowClassifier(
    hyperparameters=model_hyperparameters, 
    vocab_size=vocab_size)

# optim
learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# loss function
def loss_func(batch_outputs, batch_lables, lenghts):
    bceloss= nn.BCELoss()
    loss = bceloss(batch_outputs, batch_lables.float())
    loss = loss / lenghts.sum().float()

    return loss

In [25]:
def train_epoch(model, optimizer, loader, loss_func):
    total_loss = 0

    for batch_x, batch_y, batch_lenghts in loader:
        optimizer.zero_grad()

        outputs = model.forward(batch_x)
        loss = loss_func(outputs, batch_y, batch_lenghts)
        loss.backward()

        optimizer.step()    

        total_loss += loss.item()
        
    return total_loss

def train(model, optimizer, loader, loss_func, n_epochs=10000):
    for epoch in range(n_epochs):
        epoch_loss = train_epoch(model, optimizer, loader, loss_func)
        if epoch % 100 == 0: 
            print(epoch_loss)

In [26]:
num = 1000
train(
    model,
    optimizer,
    loader,
    loss_func,
    n_epochs=num
)

0.2997586205601692
0.21746812388300896
0.18579839542508125
0.1254400946199894
0.10261280462145805
0.07098693773150444
0.05055864527821541
0.047555478289723396
0.038901240564882755
0.03216703236103058


##### Prediction

In [27]:
example_corpus = ['She comes from Warsaw']
tokens = [s.lower().split() for s in example_corpus]
labels = [[0, 0, 0, 1]]

In [28]:
example_data = list(zip(tokens, labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, words_to_indx=words_to_indx)

loader = DataLoader(
    dataset=example_data,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn
)

In [29]:
for x, y, _ in loader:
    output = model.forward(x)
    print(labels)
    print(output)

[[0, 0, 0, 1]]
tensor([[0.0278, 0.0718, 0.1278, 0.8953]], grad_fn=<ViewBackward0>)
