In [610]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from torch.autograd import Variable
torch.manual_seed(1)

<torch._C.Generator at 0x114a27230>

In [417]:
import sys
sys.path.append('../../src/')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [302]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [397]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [398]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))

(tensor(11.9120), [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])


In [399]:
for epoch in tqdm(range(300)):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))




In [400]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    _, tag_seq = model(precheck_sent)
    print(training_data[0][0])
    print(tag_seq)

['the', 'wall', 'street', 'journal', 'reported', 'today', 'that', 'apple', 'corporation', 'made', 'money']
[0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2]


## テスト

In [366]:
import CRF

In [410]:
class Model(nn.Module):
    def __init__(self, tag_to_ix):
        super(Model, self).__init__()
        self.embedding_dim = 5
        self.hidden_dim = 4
        
        self.emb = nn.Embedding(len(word_to_ix), (len(tag_to_ix) + 2))
        
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim // 2, num_layers=1, bidirectional=True)
        
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim, len(tag_to_ix) + 2)

        self.crf = CRF.CRF(tag_to_ix)
        
        self.hidden = self.init_hidden()

    def forward(self, x):
        lstm_feats = self._get_lstm_features(x)
        score, tag_seq = self.crf(lstm_feats)
        return lstm_feats, tag_seq
    
    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))
    
    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.emb(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

In [413]:
tag_to_ix = {"B": 0, "I": 1, "O": 2}
model = Model(tag_to_ix)

optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [414]:
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(training_data[0][0])
    
    h_lstm, pred = model(precheck_sent)
    tag_seq = torch.functional.argmax(h_lstm, 1)
    print(score, tag_seq)

['the', 'wall', 'street', 'journal', 'reported', 'today', 'that', 'apple', 'corporation', 'made', 'money']
tensor(10.3681) tensor([2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2])


In [415]:
for epoch in tqdm(range(300)):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        
        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        h_lstm, pred = model(sentence_in)
        loss = model.crf.loss(h_lstm, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))




In [416]:
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(training_data[0][0])
    
    _, pred = model(precheck_sent)
    tag_seq = torch.functional.argmax(pred, 1)
    print(tag_seq)

['the', 'wall', 'street', 'journal', 'reported', 'today', 'that', 'apple', 'corporation', 'made', 'money']
tensor([0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])


In [419]:
seqs = ['gigantic_string','tiny_str','medium_str']

# make <pad> idx 0
vocab = ['<pad>'] + sorted(set(''.join(seqs)))

# make model
embed = nn.Embedding(len(vocab), 10)
lstm = nn.LSTM(10, 5)

vectorized_seqs = [[vocab.index(tok) for tok in seq] for seq in seqs]

# get the length of each seq in your batch
seq_lengths = torch.LongTensor([len(seq) for seq in vectorized_seqs])

In [434]:
seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()

In [420]:
seq_lengths

tensor([15,  8, 10])

In [421]:
vectorized_seqs

[[6, 7, 6, 2, 9, 12, 7, 3, 1, 11, 12, 10, 7, 9, 6],
 [12, 7, 9, 14, 1, 11, 12, 10],
 [8, 5, 4, 7, 13, 8, 1, 11, 12, 10]]

In [437]:
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
	seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

In [441]:
seq_tensor

tensor([[ 6,  7,  6,  2,  9, 12,  7,  3,  1, 11, 12, 10,  7,  9,  6],
        [12,  7,  9, 14,  1, 11, 12, 10,  0,  0,  0,  0,  0,  0,  0],
        [ 8,  5,  4,  7, 13,  8,  1, 11, 12, 10,  0,  0,  0,  0,  0]])

In [442]:
seq_tensor = seq_tensor.transpose(0,1)

In [444]:
seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)

In [445]:
torch.nn.utils.rnn.pack_padded_sequence(seq_tensor, seq_lengths)

PackedSequence(data=tensor([ 6, 12,  8,  7,  7,  5,  6,  9,  4,  2, 14,  7,  9,  1, 13, 12, 11,  8,
         7, 12,  1,  3, 10, 11,  1,  0, 11,  0, 12, 10,  7,  9,  6]), batch_sizes=tensor([3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1]))

In [722]:
# char_dim = 3
chars = \
torch.LongTensor(
    [
        [1, 5, 0, 0]
        , [1, 2, 3, 4]
    ]
)
chars

tensor([[1, 5, 0, 0],
        [1, 2, 3, 4]])

In [540]:
embed = nn.Embedding(6, 3)

In [541]:
char_emb = embed(chars)

In [577]:
print(char_emb.shape)
char_emb

torch.Size([2, 4, 3])


tensor([[[ 0.0036, -0.2383, -1.8412],
         [-0.6197, -1.1801, -0.8510],
         [ 1.2851, -0.6701,  0.9620],
         [-1.6931, -0.7102,  2.8641]],

        [[ 0.0036, -0.2383, -1.8412],
         [ 1.0953,  0.7799,  1.7714],
         [ 1.3923, -0.3023,  0.8974],
         [ 1.3923, -0.3023,  0.8974]]], grad_fn=<EmbeddingBackward>)

In [687]:
chars[[1, 0]]

tensor([[1, 5, 0, 0],
        [1, 2, 3, 4]])

In [748]:
torch.nn.utils.rnn.pack_padded_sequence(char_emb, torch.Tensor([0, 0]), batch_first=True)

RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [601]:
packed = torch.nn.utils.rnn.pack_padded_sequence(char_emb, torch.Tensor([4, 2]), batch_first=True)

In [602]:
c_lstm = nn.LSTM(input_size=3, hidden_size=10, bidirectional=True)

In [666]:
l_out, state = c_lstm(packed)
output, o_lengths = torch.nn.utils.rnn.pad_packed_sequence(l_out)

In [670]:
#output = output.transpose(0, 1)
output.transpose_(0, 1)

tensor([[[ 0.0182,  0.0828,  0.0733, -0.0670, -0.0591, -0.0608, -0.0757,
          -0.0538,  0.0959, -0.1278,  0.1077, -0.1012, -0.1781, -0.1004,
           0.0272, -0.1288, -0.1596, -0.1384,  0.2392,  0.1334],
         [ 0.0502,  0.0987,  0.0704, -0.0847, -0.1048, -0.0070, -0.1416,
           0.0180,  0.1537, -0.1874,  0.0389,  0.0148, -0.1099, -0.1018,
          -0.0837, -0.2622, -0.0675, -0.1174,  0.0868,  0.0858],
         [-0.1344,  0.0015,  0.0799, -0.1870,  0.0455, -0.0596, -0.1230,
           0.1527,  0.0482, -0.1459, -0.1010,  0.0735,  0.1124, -0.0628,
          -0.2038, -0.2028,  0.1586, -0.0849, -0.1440, -0.0589],
         [-0.2557,  0.0215,  0.1709, -0.1721,  0.2028,  0.0111, -0.1549,
           0.2178, -0.0704, -0.1796, -0.2083,  0.0193,  0.0088,  0.0242,
          -0.0908, -0.2623,  0.0917, -0.1537, -0.0797, -0.1687]],

        [[ 0.0182,  0.0828,  0.0733, -0.0670, -0.0591, -0.0608, -0.0757,
          -0.0538,  0.0959, -0.1278,  0.0400, -0.1114, -0.0280, -0.0997,
        

In [624]:
chars_embeds_temp = Variable(torch.FloatTensor(torch.zeros((output.size(0), output.size(2)))))
chars_embeds_temp

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [641]:
o_lengths

tensor([4, 2])

In [745]:
[[i, l] for i, l in zip(output, o_lengths)]

[[tensor([[ 0.0182,  0.0828,  0.0733, -0.0670, -0.0591, -0.0608, -0.0757, -0.0538,
            0.0959, -0.1278,  0.1077, -0.1012, -0.1781, -0.1004,  0.0272, -0.1288,
           -0.1596, -0.1384,  0.2392,  0.1334],
          [ 0.0502,  0.0987,  0.0704, -0.0847, -0.1048, -0.0070, -0.1416,  0.0180,
            0.1537, -0.1874,  0.0389,  0.0148, -0.1099, -0.1018, -0.0837, -0.2622,
           -0.0675, -0.1174,  0.0868,  0.0858],
          [-0.1344,  0.0015,  0.0799, -0.1870,  0.0455, -0.0596, -0.1230,  0.1527,
            0.0482, -0.1459, -0.1010,  0.0735,  0.1124, -0.0628, -0.2038, -0.2028,
            0.1586, -0.0849, -0.1440, -0.0589],
          [-0.2557,  0.0215,  0.1709, -0.1721,  0.2028,  0.0111, -0.1549,  0.2178,
           -0.0704, -0.1796, -0.2083,  0.0193,  0.0088,  0.0242, -0.0908, -0.2623,
            0.0917, -0.1537, -0.0797, -0.1687]], grad_fn=<SelectBackward>),
  tensor(4)],
 [tensor([[ 0.0182,  0.0828,  0.0733, -0.0670, -0.0591, -0.0608, -0.0757, -0.0538,
            0.0959,

In [671]:
index = 1
output[index, o_lengths[index] - 1, :10], output[index, 0, 10:]

(tensor([-0.2348,  0.0016,  0.1730, -0.1228,  0.0832, -0.0612, -0.0797,  0.1041,
         -0.0075, -0.0405], grad_fn=<SliceBackward>),
 tensor([ 0.0400, -0.1114, -0.0280, -0.0997, -0.0134, -0.0454, -0.0569, -0.1046,
          0.1304,  0.1267], grad_fn=<SliceBackward>))

In [647]:
torch.cat((output[1, 1, :10], output[1, 1, 10:]))

tensor([-0.2348,  0.0016,  0.1730, -0.1228,  0.0832, -0.0612, -0.0797,  0.1041,
        -0.0075, -0.0405,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000], grad_fn=<CatBackward>)

In [673]:
o_lengths

tensor([4, 2])

In [733]:
for i, index in enumerate(o_lengths):
    chars_embeds_temp[i] = \
        torch.cat((output[i, index-1, :10], output[i, 0, 10:]))


In [724]:
prev_lengths = torch.LongTensor([2, 4])
lengths, perm_index = prev_lengths.sort(0, descending=True)

In [738]:
tmp_chars = Variable(torch.FloatTensor(torch.zeros((output.size(0), output.size(2)))))
tmp_chars[perm_index] = chars_embeds_temp

In [739]:
tmp_chars

tensor([[-0.2348,  0.0016,  0.1730, -0.1228,  0.0832, -0.0612, -0.0797,  0.1041,
         -0.0075, -0.0405,  0.0400, -0.1114, -0.0280, -0.0997, -0.0134, -0.0454,
         -0.0569, -0.1046,  0.1304,  0.1267],
        [-0.2557,  0.0215,  0.1709, -0.1721,  0.2028,  0.0111, -0.1549,  0.2178,
         -0.0704, -0.1796,  0.1077, -0.1012, -0.1781, -0.1004,  0.0272, -0.1288,
         -0.1596, -0.1384,  0.2392,  0.1334]], grad_fn=<IndexPutBackward>)

In [747]:
torch.Tensor(torch.zeros((output.size(0), output.size(2))))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])