In [1]:
import torch
from torch import nn
import math

In [2]:
from torch import functional as F
import torch.optim as optim

$
    \begin{array}{ll} \\
        i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
        f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
        o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
        c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
        h_t = o_t \odot \tanh(c_t) \\
    \end{array}
$

In [43]:
def get_uniform_param(shape, extent):
    return nn.Parameter(torch.FloatTensor(*shape).uniform_(*extent))

def MVb(Mat, vecs):
    '''Matrix multiply Mat by batched vectors vecs'''
    vecs = vecs.unsqueeze(-1)
    out = Mat @ vecs
    out = out.squeeze(-1)
    return out

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size,
                 #num_layers = 1, bias = True,
                 #batch_first = False, dropout = 0,
                 #bidirectional = False
                ):
        super().__init__()
        self.hidden_size = hidden_size
        #range of weight values for initialization:
        k = 1/hidden_size
        extent = (-math.sqrt(k), math.sqrt(k))
        #shapes of weights and biases:
        in_hid_shape = (4*hidden_size, input_size)
        hid_hid_shape = (4*hidden_size, hidden_size)
        bias_shape = (4*hidden_size,)
        #Create initialized weights and biases:
        self.weight_ih_l0 = get_uniform_param(in_hid_shape, extent)
        self.weight_hh_l0 = get_uniform_param(hid_hid_shape, extent)
        self.bias_ih_l0 = get_uniform_param(bias_shape, extent)
        self.bias_hh_l0 = get_uniform_param(bias_shape, extent)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.tanh2 = nn.Tanh()

    def single_forward(self, x, hc):
        '''
        Compute h and c given the current input x and previous hc
        (where x is a single element of the sequence)
        '''
        h,c = hc
        a = MVb(self.weight_ih_l0, x) + self.bias_ih_l0 #weighted activations
        a = a + MVb(self.weight_hh_l0, h) + self.bias_hh_l0
        a[:self.hidden_size] = self.tanh(a[:self.hidden_size]) #g
        a[self.hidden_size:] = self.sigmoid(a[self.hidden_size:]) #i,f, and o
        g,i,f,o = a.split(self.hidden_size, -1)
        c = f * c + i * g
        h = o * self.tanh2(c)
        return h, c

    def forward(self, x, hc = None):
        '''
        Operate over sequence x of dim: seq x batch x features
        '''
        if hc is None: #initialize with hidden of all zeros:
            h_shape = (1, x.shape[1], self.hidden_size)
            hc = (torch.zeros(h_shape), torch.zeros(h_shape))
        #allocate seq_len x batch x hidden for output:
        outputs = torch.empty(x.shape[:2] + (self.hidden_size,))
        #Iterate over input sequence, saving hidden state
        for i, xi in enumerate(x):
            hc = self.single_forward(xi, hc)
            outputs[i] = hc[0] #h
        return outputs, hc
                


# TESTING

In [44]:
# lstm = nn.LSTM(4,3)
my_lstm = LSTM(4,3)

inputs = torch.randn(5,2,4)

hidden = (torch.randn(1, 2, 3),
          torch.randn(1, 2, 3))

out, hidden = my_lstm(inputs, hidden)
print(out)
print(hidden[1].shape)

tensor([[[ 2.5651e-01,  5.7178e-01, -5.3439e-01],
         [-1.2970e-02, -6.0097e-02,  1.6398e-01]],

        [[ 1.6066e-04, -9.1151e-02,  8.4208e-02],
         [-4.4458e-03,  6.0881e-02,  3.7133e-02]],

        [[-1.2920e-01, -1.8123e-01, -6.0872e-02],
         [ 1.2895e-01, -7.4025e-02,  1.5713e-01]],

        [[-6.1014e-02, -7.2531e-02,  4.3606e-02],
         [-3.3204e-02, -3.9318e-03, -2.5229e-02]],

        [[-1.4493e-01, -3.1395e-01,  7.1604e-02],
         [-1.6903e-02,  2.8890e-02,  6.4808e-02]]], grad_fn=<CopySlices>)
torch.Size([1, 2, 3])


### Actual training Example

From https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [45]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = [
    # Tags are: DET - determiner; NN - noun; V - verb
    # For example, the word "The" is a determiner
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}  # Assign each tag with a unique index

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [46]:
def train_LSTM(LSTM_class):
    class LSTMTagger(nn.Module):

        def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
            super(LSTMTagger, self).__init__()
            self.hidden_dim = hidden_dim

            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = LSTM_class(embedding_dim, hidden_dim)

            # The linear layer that maps from hidden state space to tag space
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
            self.log_softmax = nn.LogSoftmax(1)

        def forward(self, sentence):
            embeds = self.word_embeddings(sentence)
            lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
            tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
            tag_scores = self.log_softmax(tag_space)
            return tag_scores

    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # See what the scores are before training
    # Note that element i,j of the output is the score for tag j for word i.
    # Here we don't need to train, so the code is wrapped in torch.no_grad()
    with torch.no_grad():
        inputs = prepare_sequence(training_data[0][0], word_to_ix)
        tag_scores = model(inputs)
        print(tag_scores)

    for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
        for sentence, tags in training_data:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

    # See what the scores are after training
    with torch.no_grad():
        inputs = prepare_sequence(training_data[0][0], word_to_ix)
        tag_scores = model(inputs)

        # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
        # for word i. The predicted tag is the maximum scoring tag.
        # Here, we can see the predicted sequence below is 0 1 2 0 1
        # since 0 is index of the maximum value of row 1,
        # 1 is the index of maximum value of row 2, etc.
        # Which is DET NOUN VERB DET NOUN, the correct sequence!
        print(tag_scores)
        print(tag_scores.argmax(1))

In [47]:
train_LSTM(nn.LSTM)

tensor([[-0.9517, -1.3514, -1.0355],
        [-0.9099, -1.3568, -1.0790],
        [-0.9057, -1.3901, -1.0594],
        [-0.9018, -1.3869, -1.0662],
        [-0.8958, -1.3902, -1.0709]])
tensor([[-0.1139, -2.8609, -2.9873],
        [-1.7924, -0.1944, -4.5919],
        [-2.0898, -5.1299, -0.1388],
        [-0.0945, -2.8945, -3.3563],
        [-3.5632, -0.0304, -6.4241]])
tensor([0, 1, 2, 0, 1])


In [48]:
train_LSTM(LSTM)

tensor([[-0.8505, -1.2281, -1.2732],
        [-0.8412, -1.2122, -1.3046],
        [-0.8751, -1.2147, -1.2505],
        [-0.8707, -1.1938, -1.2792],
        [-0.9126, -1.2212, -1.1918]])
tensor([[-3.6089e-02, -4.6551e+00, -3.6523e+00],
        [-5.3321e+00, -1.4396e-02, -4.6608e+00],
        [-3.9737e+00, -5.5045e+00, -2.3138e-02],
        [-6.5820e-03, -6.0480e+00, -5.4732e+00],
        [-6.7747e+00, -1.5989e-03, -7.6947e+00]])
tensor([0, 1, 2, 0, 1])


Gets the right answer, but somehow is more confident than pytorch implementation? (also slower)