In [1]:
import torch
from torch import nn
import math

In [2]:
from torch import functional as F
import torch.optim as optim

$
    \begin{array}{ll} \\
        i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
        f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
        o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
        c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
        h_t = o_t \odot \tanh(c_t) \\
    \end{array}
$

In [65]:
# Scratch check: unpacking a 2-D tensor walks its first dimension,
# so each of the four names receives one row.
a = torch.zeros((4, 4))
d, e, f, g = a.unbind(0)

In [71]:
# Chunks of size 1 along dim 0: one (1, 4) slice per row.
torch.split(a, 1)

(tensor([[0., 0., 0., 0.]]),
 tensor([[0., 0., 0., 0.]]),
 tensor([[0., 0., 0., 0.]]),
 tensor([[0., 0., 0., 0.]]))

In [67]:
# A single unpacked row is 1-D.
d.size()

torch.Size([4])

In [92]:
def get_uniform_param(shape, extent):
    """Create a trainable parameter filled with uniformly distributed values.

    Args:
        shape: iterable of ints giving the tensor dimensions.
        extent: ``(low, high)`` bounds forwarded to ``Tensor.uniform_``.

    Returns:
        An ``nn.Parameter`` of the requested shape.

    The tensor is allocated with ``torch.empty`` rather than the legacy
    ``torch.FloatTensor`` constructor, so it follows the current default
    dtype (float32 unless changed) like other ``torch.nn`` layers do.
    """
    values = torch.empty(tuple(shape)).uniform_(*extent)
    return nn.Parameter(values)

def MVb(Mat, vecs):
    '''Apply matrix Mat to every vector in the batch vecs.

    Each vector (last axis of vecs) is turned into a column, multiplied by
    Mat with broadcasting over the leading axes, and flattened back so the
    result has the same leading axes as vecs.
    '''
    columns = torch.unsqueeze(vecs, -1)
    return torch.matmul(Mat, columns).squeeze(-1)

class LSTM(nn.Module):
    """Single-layer, unidirectional LSTM written with plain tensor operations.

    Parameter names, shapes and the gate packing order follow
    ``torch.nn.LSTM``: the four ``hidden_size``-row chunks of each stacked
    weight/bias are, in order, the input (i), forget (f), cell (g) and
    output (o) gates. A state dict from a one-layer ``nn.LSTM`` can
    therefore be loaded directly.

    The other ``nn.LSTM`` options (``num_layers``, ``bias``, ``batch_first``,
    ``dropout``, ``bidirectional``) are not implemented.

    Args:
        input_size: number of features in each input element.
        hidden_size: number of features in the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Stacked gate parameters (4 gates x hidden_size rows each).
        self.weight_ih_l0 = nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh_l0 = nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.bias_ih_l0 = nn.Parameter(torch.empty(4 * hidden_size))
        self.bias_hh_l0 = nn.Parameter(torch.empty(4 * hidden_size))
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.reset_parameters()

    def reset_parameters(self):
        """Draw every parameter from U(-sqrt(k), sqrt(k)) with k = 1/hidden_size."""
        bound = math.sqrt(1 / self.hidden_size)
        # parameters() yields in registration order: w_ih, w_hh, b_ih, b_hh.
        for param in self.parameters():
            nn.init.uniform_(param, -bound, bound)

    def single_forward(self, x, hc):
        '''
        Compute h and c given the current input x and previous hc
        (where x is a single element of the sequence).

        x is batch x features; h and c are (1 x) batch x hidden.
        Returns the new (h, c), broadcast to the shape of the inputs.
        '''
        h, c = hc
        # All four gate pre-activations at once: two matmuls instead of
        # eight per-gate matrix-vector products.
        gates = (x @ self.weight_ih_l0.t() + self.bias_ih_l0
                 + h @ self.weight_hh_l0.t() + self.bias_hh_l0)
        # Chunk order matches the packing used by torch.nn.LSTM.
        i_pre, f_pre, g_pre, o_pre = gates.chunk(4, dim=-1)
        i = self.sigmoid(i_pre)
        f = self.sigmoid(f_pre)
        g = self.tanh(g_pre)
        o = self.sigmoid(o_pre)
        c = f * c + i * g
        h = o * self.tanh(c)
        return h, c

    def forward(self, x, hc = None):
        '''
        Operate over sequence x of dim: seq x batch x features.

        hc is an optional (h, c) pair of initial states, each
        1 x batch x hidden; zeros are used when it is omitted.
        Returns (outputs, (h, c)) where outputs is seq x batch x hidden and
        (h, c) is the state after the last element.
        '''
        seq_len, batch = x.shape[:2]
        if hc is None: #initialize with hidden of all zeros:
            h_shape = (1, batch, self.hidden_size)
            # new_zeros keeps the state on x's device and in x's dtype.
            hc = (x.new_zeros(h_shape), x.new_zeros(h_shape))
        #Iterate over input sequence, saving hidden state
        steps = []
        for t in range(seq_len):
            hc = self.single_forward(x[t], hc)
            # Drop the leading singleton axis so steps stack to seq x batch x hidden.
            steps.append(hc[0].reshape(batch, self.hidden_size))
        if steps:
            outputs = torch.stack(steps)
        else:
            # Empty sequence: nothing to stack, return an empty result.
            outputs = x.new_empty((0, batch, self.hidden_size))
        return outputs, hc
                


# TESTING

In [93]:
# Smoke test: push random data through the custom LSTM.
# (Swap in nn.LSTM(4, 3) here to run the built-in layer instead.)
my_lstm = LSTM(input_size=4, hidden_size=3)

# 5 sequence elements, batch of 2, 4 features each
inputs = torch.randn(5, 2, 4)

# initial (h, c) pair, each 1 x batch x hidden
hidden = tuple(torch.randn(1, 2, 3) for _ in range(2))

out, hidden = my_lstm(inputs, hidden)
print(out)
print(hidden[1].shape)

tensor([[[ 0.3147,  0.2199, -0.0194],
         [-0.4538, -0.1486, -0.1851]],

        [[ 0.5117,  0.3117, -0.0852],
         [-0.0708, -0.1156,  0.0874]],

        [[ 0.6136,  0.2184,  0.0378],
         [ 0.1698, -0.1697,  0.1435]],

        [[ 0.6082,  0.3992, -0.1496],
         [ 0.0355, -0.0972,  0.0979]],

        [[ 0.4133,  0.1604,  0.0671],
         [ 0.1286, -0.1735,  0.2223]]], grad_fn=<CopySlices>)
torch.Size([1, 2, 3])


### Actual Training Example

Adapted from the PyTorch sequence models tutorial: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [94]:
def prepare_sequence(seq, to_ix):
    """Translate a sequence of tokens into a 1-D long tensor of indices.

    Every item of ``seq`` is looked up in the mapping ``to_ix``; an item
    that is missing from the mapping raises ``KeyError``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus: each entry pairs a tokenised sentence with one tag per token.
# Tags are: DET - determiner; NN - noun; V - verb
# (for example, the word "The" is a determiner).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Vocabulary: every distinct word receives the next free integer index,
# in order of first appearance across the corpus.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # setdefault leaves already-indexed words untouched
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

# Fixed index for each tag.
tag_to_ix = {
    "DET": 0,
    "NN": 1,
    "V": 2,
}

# Real models usually use something like 32 or 64 dimensions; these are
# kept small so the weights are easy to inspect while training.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [95]:
def train_LSTM(LSTM_class):
    """Train a toy part-of-speech tagger built around ``LSTM_class``.

    ``LSTM_class`` is instantiated as ``LSTM_class(embedding_dim, hidden_dim)``
    and its instances are called on a (sentence length, 1, embedding_dim)
    tensor; the first element of the returned pair is used as the per-word
    hidden states.

    Prints the tag scores of the first training sentence before training,
    then, after 300 epochs of SGD, the scores again together with the
    arg-max tag index per word. Uses the module-level ``training_data``,
    ``word_to_ix``, ``tag_to_ix``, ``EMBEDDING_DIM`` and ``HIDDEN_DIM``.
    """

    class LSTMTagger(nn.Module):
        """Embedding -> LSTM -> linear -> log-softmax tagger."""

        def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
            super().__init__()
            self.hidden_dim = hidden_dim

            # Sub-modules are created in this order: embedding, recurrent
            # layer (word embeddings in, hidden states of size hidden_dim
            # out), then the projection from hidden space to tag space.
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.lstm = LSTM_class(embedding_dim, hidden_dim)
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
            self.log_softmax = nn.LogSoftmax(1)

        def forward(self, sentence):
            n_words = len(sentence)
            # Shape the embeddings as a sequence with a batch of one.
            as_sequence = self.word_embeddings(sentence).view(n_words, 1, -1)
            hidden_states, _ = self.lstm(as_sequence)
            tag_space = self.hidden2tag(hidden_states.view(n_words, -1))
            return self.log_softmax(tag_space)

    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    def score_first_sentence():
        # Evaluation only, so no gradients are tracked.
        with torch.no_grad():
            return model(prepare_sequence(training_data[0][0], word_to_ix))

    # Scores before training: element i,j is the score of tag j for word i.
    print(score_first_sentence())

    # 300 epochs is far more than one would normally run; this is toy data.
    for _ in range(300):
        for words, gold_tags in training_data:
            # PyTorch accumulates gradients, so clear them for each instance.
            model.zero_grad()

            # Turn words and tags into tensors of indices.
            word_ids = prepare_sequence(words, word_to_ix)
            gold_ids = prepare_sequence(gold_tags, tag_to_ix)

            # Forward pass, loss, backward pass, parameter update.
            loss = loss_function(model(word_ids), gold_ids)
            loss.backward()
            optimizer.step()

    # Scores after training. The sentence is "the dog ate the apple"; the
    # predicted tag of each word is the index of the row maximum. The
    # correct sequence is 0 1 2 0 1, i.e. DET NOUN VERB DET NOUN.
    trained_scores = score_first_sentence()
    print(trained_scores)
    print(trained_scores.argmax(1))

In [96]:
# Baseline run: train the tagger with PyTorch's built-in nn.LSTM.
train_LSTM(nn.LSTM)

tensor([[-1.3887, -0.8282, -1.1591],
        [-1.4371, -0.8561, -1.0860],
        [-1.4444, -0.7989, -1.1574],
        [-1.3369, -0.8006, -1.2439],
        [-1.4216, -0.8232, -1.1406]])
tensor([[-0.0429, -3.7287, -4.0211],
        [-4.0891, -0.0540, -3.3296],
        [-2.1677, -2.5137, -0.2174],
        [-0.0175, -4.6791, -4.8214],
        [-4.5135, -0.0223, -4.4985]])
tensor([0, 1, 2, 0, 1])


In [97]:
# Same experiment, this time with the hand-written LSTM class.
train_LSTM(LSTM)

tensor([[-1.2070, -1.4539, -0.7609],
        [-1.2626, -1.4106, -0.7484],
        [-1.2714, -1.3578, -0.7715],
        [-1.2227, -1.3853, -0.7868],
        [-1.2283, -1.4457, -0.7516]])
tensor([[-0.0377, -3.3528, -6.2028],
        [-2.8416, -0.0918, -3.5260],
        [-3.9234, -3.9229, -0.0404],
        [-0.0428, -3.4167, -4.6988],
        [-5.0078, -0.0087, -6.2402]])
tensor([0, 1, 2, 0, 1])


The custom LSTM gives a reasonable-looking answer (it predicts the same tag sequence as `nn.LSTM` on this example), but it is rather slow, and more testing is needed.