In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10687f258>

In [2]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[e] for e in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

In [3]:
import string
# simple training data
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
    ("The boy wrote the letter".split(), ["DET", "NN", "V", "DET", "NN"]),
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
char_to_ix = {c: i for i, c in enumerate(string.ascii_letters + ' ')}
print(char_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8, 'boy': 9, 'wrote': 10, 'letter': 11}
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, ' ': 52}


In [4]:
word_embeddings = nn.Embedding(len(word_to_ix), 5)
char_embeddings = nn.Embedding(len(char_to_ix), 5)
print(prepare_sequence(['Everybody', 'read', 'that', 'book'], word_to_ix))
embeds = word_embeddings(prepare_sequence(['Everybody', 'read', 'that', 'book'], word_to_ix))
print(embeds)
print(torch.cat((embeds, torch.randn(4, 5)), 1))
prepped_chars = prepare_sequence('Everybody', char_to_ix)
print(prepped_chars)
char_embeds = char_embeddings(prepped_chars)
print(char_embeds)
print(char_embeds.view(len('Everybody'),1,-1))

Variable containing:
 5
 6
 7
 8
[torch.LongTensor of size 4]

Variable containing:
 1.0386  0.5206 -0.5006  1.2182  0.2117
-1.0613 -1.9441 -0.9596  0.5489 -0.9901
-0.3826  1.5037  1.8267  0.5561  1.6445
 0.4973 -1.5067  1.7661 -0.3569 -0.1713
[torch.FloatTensor of size 4x5]

Variable containing:
 1.0386  0.5206 -0.5006  1.2182  0.2117 -1.9366  1.0067 -1.8593  0.9329  1.4066
-1.0613 -1.9441 -0.9596  0.5489 -0.9901  1.4414  0.1690  0.2575  0.1212 -1.8270
-0.3826  1.5037  1.8267  0.5561  1.6445  0.1571 -1.3312 -1.0505 -1.0007 -0.4621
 0.4973 -1.5067  1.7661 -0.3569 -0.1713 -0.5060  1.1233  0.4800 -0.0344 -0.4928
[torch.FloatTensor of size 4x10]

Variable containing:
 30
 21
  4
 17
 24
  1
 14
  3
 24
[torch.LongTensor of size 9]

Variable containing:
-0.8018 -0.7855  0.7877  0.0786  1.7053
-0.9347 -0.9882  1.3801 -0.1173  0.9317
 1.4666 -0.1028 -0.0097 -0.8420 -0.2067
 1.6200  0.3436 -0.9112 -0.9952  0.7455
 1.3210  1.1608  0.3457 -0.1136 -0.8910
-0.2694 -0.6491 -0.1373 -0.2954 -0.7725


In [5]:
class CharEncoder(nn.Module):

    def __init__(self, embedding_dim, hidden_dim):
        super(CharEncoder, self).__init__()
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(len(string.ascii_letters + ' '), embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, word, hidden):
        embeds = self.embeddings(word)
        return self.lstm(embeds.view(len(word), 1, -1), hidden)

In [6]:
ce = CharEncoder(6, 6)
ce_out = ce(prepare_sequence('Everybody', char_to_ix), ce.init_hidden())
print(ce_out)
print(ce_out[0].view(len('Everybody'), -1))
print(ce_out[1][0].view(6))  # hidden state
print(torch.cat((ce_out[1][0].view(6), ce_out[1][0].view(6))).view(-1, 6))

(Variable containing:
(0 ,.,.) = 
  0.0694  0.2588  0.2725 -0.0762  0.0271  0.1901

(1 ,.,.) = 
  0.1410 -0.0108  0.0867 -0.0295  0.0695  0.0723

(2 ,.,.) = 
 -0.0600 -0.0699  0.3287 -0.2876  0.1991  0.0357

(3 ,.,.) = 
 -0.1751 -0.1662  0.2829 -0.1596  0.0308  0.0775

(4 ,.,.) = 
 -0.2352 -0.0011  0.3157 -0.1436  0.0696  0.1769

(5 ,.,.) = 
  0.0229  0.2280  0.3494 -0.0650  0.1643  0.1937

(6 ,.,.) = 
  0.2333  0.1481  0.1651  0.2835  0.1776  0.2084

(7 ,.,.) = 
  0.0856  0.0761  0.3421  0.1467  0.1772  0.2845

(8 ,.,.) = 
 -0.0732  0.1365  0.3774 -0.0013  0.1512  0.2672
[torch.FloatTensor of size 9x1x6]
, (Variable containing:
(0 ,.,.) = 
 -0.0732  0.1365  0.3774 -0.0013  0.1512  0.2672
[torch.FloatTensor of size 1x1x6]
, Variable containing:
(0 ,.,.) = 
 -0.1613  0.2766  0.6521 -0.0031  0.5743  0.6119
[torch.FloatTensor of size 1x1x6]
))
Variable containing:
 0.0694  0.2588  0.2725 -0.0762  0.0271  0.1901
 0.1410 -0.0108  0.0867 -0.0295  0.0695  0.0723
-0.0600 -0.0699  0.3287 -0.287

In [7]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, char_hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_hidden_dim = char_hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim + char_hidden_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence, char_hidden_states):
        embeds = self.word_embeddings(sentence)
                
        # augment the word embeddings
        aug = torch.cat(char_hidden_states).view(-1, self.char_hidden_dim)
        input = torch.cat((embeds, aug), 1)
        lstm_out, self.hidden = self.lstm(
            input.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [8]:
def tag(sentence, char_model, tagger):
    # get the char hidden states for each word
    char_hidden_states = []
    for word in sentence:
        word_in = prepare_sequence(word, char_to_ix)
        _, (hidden, _) = char_model(word_in, char_model.init_hidden())
        char_hidden_states.append(hidden.view(char_model.hidden_dim))
        
    # prepare the inputs
    sentence_in = prepare_sequence(sentence, word_to_ix)
    
    # run forward pass for the tagger
    return tagger(sentence_in, char_hidden_states)

In [9]:
import itertools
char_model = CharEncoder(EMBEDDING_DIM, HIDDEN_DIM)
tagger = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
char_optimizer = optim.SGD(char_model.parameters(), lr=0.1)
tag_optimizer = optim.SGD(tagger.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
tag_scores = tag(training_data[0][0], char_model, tagger)
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag[p] for p in predicted])
for epoch in range(300):
    for sentence, tags in training_data:
        # zero out gradients
        char_model.zero_grad()
        tagger.zero_grad()
        
        targets = prepare_sequence(tags, tag_to_ix)
        tag_scores = tag(sentence, char_model, tagger)
        
        # calculate loss, gradients and update the params
        loss = loss_function(tag_scores, targets)
        loss.backward()
        char_optimizer.step()
        tag_optimizer.step()
        
    if epoch % 100 == 0:
        print(loss.data)
            
# See what the scores are after training
tag_scores = tag(training_data[0][0], char_model, tagger)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag[p] for p in predicted])

Variable containing:
-0.8922 -1.4439 -1.0378
-0.9071 -1.4206 -1.0364
-0.9318 -1.3157 -1.0851
-0.8566 -1.2834 -1.2097
-0.9763 -1.3206 -1.0319
[torch.FloatTensor of size 5x3]

['DET', 'DET', 'DET', 'DET', 'DET']


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [10]:
class LSTMTaggerExample(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTaggerExample, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

model = LSTMTaggerExample(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag[p] for p in predicted])

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    if epoch % 100 == 0:
        print(loss.data)

# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag for p in predicted])

Variable containing:
-1.1086 -1.3025 -0.9210
-1.0946 -1.2369 -0.9807
-1.1936 -1.3858 -0.8058
-1.1100 -1.2606 -0.9494
-1.1319 -1.2421 -0.9447
[torch.FloatTensor of size 5x3]

['V', 'V', 'V', 'V', 'V']

 1.0753
[torch.FloatTensor of size 1]


 0.2795
[torch.FloatTensor of size 1]


1.00000e-02 *
  4.5409
[torch.FloatTensor of size 1]

Variable containing:
-0.0647 -3.0983 -4.0455
-4.5395 -0.0112 -7.5848
-4.1240 -7.4205 -0.0169
-0.0131 -4.7049 -5.5385
-4.3413 -0.0139 -7.1342
[torch.FloatTensor of size 5x3]

[{0: 'DET', 1: 'NN', 2: 'V'}, {0: 'DET', 1: 'NN', 2: 'V'}, {0: 'DET', 1: 'NN', 2: 'V'}, {0: 'DET', 1: 'NN', 2: 'V'}, {0: 'DET', 1: 'NN', 2: 'V'}]


In [12]:
class LSTMTaggerCombined(nn.Module):

    def __init__(self, word_embedding_dim, char_embedding_dim, word_hidden_dim, char_hidden_dim, vocab_size, tagset_size):
        super(LSTMTaggerCombined, self).__init__()
        self.word_hidden_dim = word_hidden_dim
        self.char_hidden_dim = char_hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.char_embeddings = nn.Embedding(len(string.ascii_letters + ' '), char_embedding_dim)

        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)
        # The LSTM takes word embeddings and char representation as inputs, 
        # and outputs hidden states with dimensionality hidden_dim.
        self.word_lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, word_hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(word_hidden_dim, tagset_size)

    def init_char_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.char_hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.char_hidden_dim)))
    
    def init_word_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.word_hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.word_hidden_dim)))

    def forward(self, sentence, word_hidden, char_hidden):
        # get the char hidden states for each word
        char_hidden_states = []
        for word in sentence:
            word_in = prepare_sequence(word, char_to_ix)
            embeds = self.char_embeddings(word_in)
            _, (hidden, _) = self.char_lstm(embeds.view(len(word), 1, -1), char_hidden)
            char_hidden_states.append(hidden.view(self.char_hidden_dim))

        # prepare the inputs
        sentence_in = prepare_sequence(sentence, word_to_ix)
        embeds = self.word_embeddings(sentence_in)
                
        # augment the word embeddings
        aug = torch.cat(char_hidden_states).view(-1, self.char_hidden_dim)
        input = torch.cat((embeds, aug), 1)
        lstm_out, _ = self.word_lstm(
            input.view(len(sentence), 1, -1), word_hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [14]:
model = LSTMTaggerCombined(EMBEDDING_DIM, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
tag_scores = model(training_data[0][0], model.init_word_hidden(), model.init_char_hidden())
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag[p] for p in predicted])

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get the targets ready
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence, model.init_word_hidden(), model.init_char_hidden())

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    if epoch % 100 == 0:
        print(loss.data)

# See what the scores are after training
tag_scores = model(training_data[0][0], model.init_word_hidden(), model.init_char_hidden())
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)
_, predicted = torch.max(tag_scores.data, 1)
print([ix_to_tag[p] for p in predicted])

Variable containing:
-1.5590 -1.1455 -0.7517
-1.7298 -1.0822 -0.7260
-1.9817 -1.2273 -0.5637
-1.9563 -1.1564 -0.6088
-1.7868 -1.1616 -0.6549
[torch.FloatTensor of size 5x3]

['V', 'V', 'V', 'V', 'V']

 1.2574
[torch.FloatTensor of size 1]


 0.1827
[torch.FloatTensor of size 1]


1.00000e-02 *
  3.1274
[torch.FloatTensor of size 1]

Variable containing:
-0.0159 -4.2114 -7.0014
-5.8940 -0.0049 -6.1528
-6.5028 -4.5698 -0.0119
-0.0077 -4.9708 -7.1924
-5.0401 -0.0106 -5.4953
[torch.FloatTensor of size 5x3]

['DET', 'NN', 'V', 'DET', 'NN']
