In [1]:
import nltk
from nltk.corpus import treebank
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from torch import nn
from torch import optim
import math
import torch
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm

In [2]:
# Store the Treebank sample under <cwd>/data/nltk_data and register that
# directory on NLTK's search path so the corpus reader can locate it.
cwd = os.getcwd()
nltk_data_dir = cwd + '/data/nltk_data'
nltk.download('treebank', download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)

[nltk_data] Downloading package treebank to
[nltk_data]     /home/vladimir/Workspace/bnn/data/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
# Peek at the corpus as a flat sequence of (word, tag) pairs.
treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [4]:
# Number of tagged sentences available in the corpus.
len(treebank.tagged_sents())

3914

In [5]:
# Turn every tagged sentence into a (words, tags) pair of parallel lists so
# that inputs and targets can be encoded independently later on.
data = [
    ([word for word, _ in tagged_sent], [tag for _, tag in tagged_sent])
    for tagged_sent in treebank.tagged_sents()
]

In [6]:
# Positional (unshuffled) split: the first TRAIN_PRCENT percent of the
# sentences are used for training, the remainder is held out for testing.
TRAIN_PRCENT = 80
split_idx = len(data) * TRAIN_PRCENT // 100
training_data, test_data = data[:split_idx], data[split_idx:]
print("Sentenses in training set:", len(training_data))
print("Sentenses in test set:", len(test_data))

Sentenses in training set: 3131
Sentenses in test set: 783


In [7]:
# Give every distinct word and every distinct tag a dense integer id, in
# order of first appearance. The ids are built from the whole corpus
# (treebank.tagged_words()), so held-out sentences are covered as well.
word_to_idx = {}
tag_to_idx = {}
for word, tag in treebank.tagged_words():
    # setdefault only inserts when the key is new, so existing ids are stable.
    word_to_idx.setdefault(word, len(word_to_idx))
    tag_to_idx.setdefault(tag, len(tag_to_idx))

In [8]:
def prepare_sequence(seq, to_ix):
    """Encode a sequence of symbols as a 1-D ``torch.long`` tensor of ids.

    Args:
        seq: iterable of symbols (e.g. the words or tags of one sentence).
        to_ix: mapping from symbol to integer id.

    Returns:
        A tensor holding ``to_ix[s]`` for every ``s`` in ``seq``, in order.

    Raises:
        KeyError: if a symbol is missing from ``to_ix``.
    """
    return torch.tensor([to_ix[symbol] for symbol in seq], dtype=torch.long)

In [9]:
# Number of distinct tags seen in the corpus.
len(tag_to_idx)

46

In [10]:
# Number of distinct word forms seen in the corpus.
len(word_to_idx)

12408

In [11]:
# Model hyperparameters handed to the LSTMTagger constructor.
EMBEDDING_DIM = 300  # size of each word-embedding vector
HIDDEN_DIM = 200  # size of the LSTM hidden state

In [12]:
class LSTMTagger(nn.Module):
    """Per-token tagger: embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table from word id to a dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent encoder: consumes embeddings, emits one hidden state of
        # size hidden_dim per time step.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projection from hidden-state space to unnormalised tag scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over tags for every token.

        Args:
            sentence: tensor of word ids for a single sentence.

        Returns:
            Tensor with ``len(sentence)`` rows and one column per tag; each
            row is a log-softmax distribution.
        """
        seq_len = len(sentence)
        # The sentence is fed to the LSTM as a batch of size one.
        lstm_in = self.word_embeddings(sentence).view(seq_len, 1, -1)
        hidden_states, _ = self.lstm(lstm_in)
        # Drop the singleton batch axis before projecting onto the tag set.
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [18]:
# Build the tagger over the full vocabulary and tag set. The model emits
# log-probabilities, so it is paired with negative log-likelihood loss and
# optimised with plain SGD.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_idx),
    tagset_size=len(tag_to_idx),
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [21]:
# Train the tagger on the training split and inspect its scores for the first
# training sentence before and after training.
# Note that element i,j of the output is the score for tag j for word i.

# Use the GPU only when CUDA is actually available; otherwise stay on the CPU
# so the notebook also runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

# Scores before training. No gradients are needed for this forward pass, so
# it is wrapped in torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)

for epoch in tqdm(range(5)):  # 5 full passes over the training sentences
    for sentence, tags in training_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them
        # into tensors of word / tag indices on the chosen device.
        sentence_in = prepare_sequence(sentence, word_to_idx).to(device)
        targets = prepare_sequence(tags, tag_to_idx).to(device)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss and gradients, then update the parameters
        # by calling optimizer.step().
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)

    # tag_scores has one row per token of the first training sentence and one
    # column per tag. Each row holds log-probabilities (the model ends in
    # log_softmax), so the predicted tag for token i is the index of the
    # largest value in row i.
    print(tag_scores)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

tensor([[-3.1419e-02, -9.7085e+00, -6.7459e+00, -4.6970e+00, -6.1373e+00,
         -7.4405e+00, -8.5361e+00, -9.3167e+00, -4.9332e+00, -1.0355e+01,
         -9.6318e+00, -8.2196e+00, -9.1874e+00, -8.2341e+00, -8.6118e+00,
         -9.5217e+00, -9.6572e+00, -5.9154e+00, -1.0521e+01, -7.1255e+00,
         -9.8906e+00, -9.6603e+00, -8.9358e+00, -9.8526e+00, -9.4892e+00,
         -9.0704e+00, -9.5246e+00, -1.0336e+01, -9.4239e+00, -9.4228e+00,
         -9.4124e+00, -1.0245e+01, -1.0045e+01, -8.2837e+00, -9.5789e+00,
         -5.5713e+00, -9.3702e+00, -9.5667e+00, -9.8943e+00, -9.1967e+00,
         -1.0472e+01, -9.6066e+00, -1.0013e+01, -9.7150e+00, -8.7808e+00,
         -1.0006e+01],
        [-1.6940e-02, -1.3448e+01, -6.6455e+00, -8.5861e+00, -1.0640e+01,
         -1.0825e+01, -9.1086e+00, -1.3483e+01, -9.9356e+00, -1.3109e+01,
         -1.1823e+01, -4.4056e+00, -9.9120e+00, -1.1941e+01, -7.8904e+00,
         -1.0478e+01, -8.3229e+00, -9.9537e+00, -1.2098e+01, -8.5656e+00,
         -1.074

In [22]:
# Shape of the score matrix for the first training sentence: one row per
# token, one column per tag.
print(tag_scores.shape)

torch.Size([18, 46])
