# References

* [SEQUENCE MODELS AND LONG-SHORT TERM MEMORY NETWORKS](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html)
* [Implementation: Augmenting the LSTM part-of-speech tagger with character-level features](https://discuss.pytorch.org/t/implementation-augmenting-the-lstm-part-of-speech-tagger-with-character-level-features/10221)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10c1be5b0>

In [3]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[ 0.2486, -0.0525, -0.2524]],

        [[ 0.1750, -0.0048, -0.1143]],

        [[-0.0102,  0.0536, -0.1400]],

        [[-0.0357,  0.0877, -0.0192]],

        [[ 0.2145,  0.0192, -0.0337]]], grad_fn=<StackBackward>)
(tensor([[[ 0.2145,  0.0192, -0.0337]]], grad_fn=<StackBackward>), tensor([[[ 0.2984,  0.0952, -0.1647]]], grad_fn=<StackBackward>))


## Example: An LSTM for Part-of-Speech Tagging



In [4]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [5]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [6]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [7]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [8]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-0.7874, -1.2815, -1.3193],
        [-0.7907, -1.2226, -1.3783],
        [-0.7569, -1.2333, -1.4290],
        [-0.7544, -1.2828, -1.3766],
        [-0.8054, -1.2002, -1.3784]])
tensor([[-1.8430e-01, -1.8953e+00, -4.0148e+00],
        [-4.6627e+00, -3.1775e-02, -3.8243e+00],
        [-2.9806e+00, -3.9329e+00, -7.2944e-02],
        [-5.3053e-02, -3.7181e+00, -3.5976e+00],
        [-5.4043e+00, -5.6400e-03, -6.7883e+00]])


# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features

In the example above, each word had an embedding, which served as the inputs to our sequence model. Let’s augment the word embeddings with a representation derived from the characters of the word. We expect that this should help significantly, since character-level information like affixes have a large bearing on part-of-speech. For example, words with the affix -ly are almost always tagged as adverbs in English.

In [9]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from time import time

torch.manual_seed(1)

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
char_to_ix = {}
for sent,_ in training_data:
    for w in sent:
        for char in w:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)

EMBEDDING_DIM = 6
HIDDEN_DIM = 6
CHAR_EMBEDDING = 3
CHAR_LEVEL_REPRESENTATION_DIM = 3

def prepare_both_sequences(sentence, word_to_ix, char_to_ix):
    chars = [prepare_sequence(w, char_to_ix) for w in sentence]
    return prepare_sequence(sentence, word_to_ix), chars

class LSTMCharTagger(nn.Module):
    '''
    Augmented model, takes both sequence of words and char to predict tag.
    Characters are embedded and then get their own representation for each WORD.
    It is this representation that is merged with word embeddings and then fed to the sequence
    LSTM which decodes the tags.
    '''
    def __init__(self, word_embedding_dim, char_embedding_dim, hidden_dim,
                 hidden_char_dim, vocab_size, charset_size, tagset_size):
        super(LSTMCharTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.hidden_char_dim = hidden_char_dim

        # Word embedding:
        self.word_embedding = nn.Embedding(vocab_size, word_embedding_dim)

        # Char embedding and encoding into char-lvl representation of words (c_w):
        self.char_embedding = nn.Embedding(charset_size, char_embedding_dim)
        self.char_lstm = nn.LSTM(char_embedding_dim, hidden_char_dim)

        # Sequence model:
        self.lstm = nn.LSTM(word_embedding_dim + hidden_char_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Init hidden state for lstms
        self.hidden = self.init_hidden(self.hidden_dim)
        self.hidden_char = self.init_hidden(self.hidden_char_dim)

    def init_hidden(self, size, batch_size=1):
        "Batch size argument used when PackedSequence are used"
        return (autograd.Variable(torch.zeros(1, batch_size, size)),
                autograd.Variable(torch.zeros(1, batch_size, size)))

    def forward_one_word(self, word_sequence, char_sequence):
        ''' For a word by word processing.
        '''
        # Word Embedding
        word_embeds = self.word_embedding(word_sequence)
        # Char lvl representation of each words with 1st LSTM
        char_embeds = self.char_embedding(char_sequence)
        char_lvl, self.hidden_char = self.char_lstm(char_embeds.view(len(char_sequence),1,-1), self.hidden_char)
        # Merge
        merged = torch.cat([word_embeds.view(1,1,-1), char_lvl[-1].view(1,1,-1)], dim=2)
        # Predict tag with 2nd LSTM:
        lstm_out, self.hidden = self.lstm(merged, self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(1, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

    def forward(self, word_sequence, char_sequence):
        ''' Importantly, char_sequence is a list of tensors, one per word, and one tensor 
        must represent a whole sequence of character for a given word.
        E.g.: is word_sequence has length 4, char_seq must be of length 4, thus char_lstm
        will output 4 char-level word representations (c_w).

        Here we deal with variable lengths of character tensors sequence using nn.utils.rnn.pack_sequence
        '''
        # Word Embedding
        word_embeds = self.word_embedding(word_sequence)

        # Char lvl representation of each words with 1st LSTM
        # We will pack variable length embeddings in PackedSequence. Must sort by decreasing length first.
        sorted_length = np.argsort([char_sequence[k].size()[0] for k in range(len(char_sequence))])
        sorted_length = sorted_length[::-1] # decreasing order
        char_embeds = [self.char_embedding(char_sequence[k]) for k in sorted_length]
        packed = nn.utils.rnn.pack_sequence(char_embeds) # pack variable length sequence
        out, self.hidden_char = self.char_lstm(packed, self.hidden_char)
        encodings_unpacked, seqlengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True) # unpack and pad
        # We need to take only last element in sequence of lstm char output for each word:
        unsort_list = np.argsort(sorted_length) # indices to put list of encodings in orginal word order
        char_lvl = torch.stack([encodings_unpacked[k][seqlengths[k]-1] for k in unsort_list])

        # Merge
        merged = torch.cat([word_embeds, char_lvl], dim=1) # gives tensor of size (#words, #concatenated features)

        # Predict tag with 2nd LSTM:
        lstm_out, self.hidden = self.lstm(merged.view(len(word_sequence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(word_sequence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

def get_batch_size(seq2pack):
    "Need this to correctly initialize batch lstm hidden states when packing variable length sequences..."
    sorted_length = np.argsort([seq2pack[k].size()[0] for k in range(len(seq2pack))])
    sorted_length = sorted_length[::-1] # decreasing order
    packed = nn.utils.rnn.pack_sequence([seq2pack[k] for k in sorted_length]) 
    return max(packed.batch_sizes)

model = LSTMCharTagger(EMBEDDING_DIM, CHAR_EMBEDDING, HIDDEN_DIM, CHAR_LEVEL_REPRESENTATION_DIM,
                       len(word_to_ix), len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
words_in, chars_in = prepare_both_sequences(training_data[0][0], word_to_ix, char_to_ix)
model.hidden_char = model.init_hidden(model.hidden_char_dim, batch_size=get_batch_size(chars_in))
tag_score = model(words_in, chars_in)
print(tag_score)

t0 = time()
for epoch in range(300): 
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        model.zero_grad()

        # Step 2. Get our inputs ready
        sentence_in, chars_in = prepare_both_sequences(sentence, word_to_ix, char_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        model.hidden = model.init_hidden(model.hidden_dim)
        model.hidden_char = model.init_hidden(model.hidden_char_dim, batch_size=get_batch_size(chars_in))

        # Step 3. Run our forward pass.
        tag_score = model(sentence_in, chars_in)

        # Step 4. Compute the loss, gradients, and update the parameters
        loss = loss_function(tag_score, targets)
        loss.backward()
        optimizer.step()
print("300 epochs in %.2f sec for model with packed sequences"%(time()-t0))

model = LSTMCharTagger(EMBEDDING_DIM, CHAR_EMBEDDING, HIDDEN_DIM, CHAR_LEVEL_REPRESENTATION_DIM,
                       len(word_to_ix), len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

t0 = time()
for epoch in range(300):
    for sentence, tags in training_data:
        sentence_score = []
        # Step 1. Remember that Pytorch accumulates gradients.
        model.zero_grad()

        # Step 2. Get our inputs ready
        sentence_in, chars_in = prepare_both_sequences(sentence, word_to_ix, char_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        model.hidden = model.init_hidden(model.hidden_dim)
        #model.hidden_char = model.init_hidden(model.hidden_char_dim)

        # Step 3. Run our forward pass on each word
        for k in range(len(sentence)):
            # Clear hidden state between EACH word (char level representation must be independent of previous word)
            model.hidden_char = model.init_hidden(model.hidden_char_dim)
            tag_score = model.forward_one_word(sentence_in[k], chars_in[k])
            sentence_score.append(tag_score)
            loss = loss_function(tag_score, targets[k].view(1,))
            loss.backward(retain_graph=True) # accumulate gradients now
            #tag_score = autograd.Variable(torch.cat(sentence_score), requires_grad=True)

        # Step 4. Update parameters at the end of sentence
        optimizer.step()
print("300 epochs in %.2f sec for model at word level"%(time()-t0))

# See what the scores are after training
words_in, chars_in = prepare_both_sequences(training_data[0][0], word_to_ix, char_to_ix)
model.hidden_char = model.init_hidden(model.hidden_char_dim, batch_size=get_batch_size(chars_in))
tag_score = model(words_in, chars_in)
print(tag_score)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
tensor([[-1.0479, -1.3389, -0.9489],
        [-1.0570, -1.3522, -0.9318],
        [-1.0992, -1.2888, -0.9384],
        [-1.1452, -1.3915, -0.8366],
        [-1.0716, -1.2387, -1.0002]], grad_fn=<LogSoftmaxBackward>)
300 epochs in 2.03 sec for model with packed sequences
300 epochs in 6.41 sec for model at word level
tensor([[-4.2012e-02, -3.3351e+00, -5.1973e+00],
        [-5.7950e+00, -9.9068e-03, -4.9886e+00],
        [-5.4003e+00, -4.5270e+00, -1.5447e-02],
        [-2.4004e-03, -7.0694e+00, -6.4716e+00],
        [-7.1866e+00, -1.5168e-03, -7.1835e+00]], grad_fn=<LogSoftmaxBackward>)
