In [36]:
def prepare_sequence(seq, to_ix):
    """Map a sequence of tokens to a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: iterable of tokens (words or tags).
        to_ix: mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor holding ``to_ix[token]`` for every token,
        in the original order.

    Raises:
        KeyError: if a token is not present in ``to_ix`` (plain ``dict``
            lookup is used, so unknown tokens are not handled).
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

# Toy corpus: each entry is a (tokens, tags) pair with one tag per token.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

# Give every distinct word the next free integer id, in order of first
# appearance. setdefault leaves the id of an already-seen word untouched.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

# Tag ids follow the order DET, NN, V.
tag_to_ix = {tag: ix for ix, tag in enumerate(("DET", "NN", "V"))}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = HIDDEN_DIM = 6

######################################################################
# Create the model:

class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Consumes the embeddings and emits one hidden_dim-sized state per step.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Projects each hidden state into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-token log-probabilities over the tag set.

        Args:
            sentence: tensor of word indices; callers in this notebook pass
                the 1-D ``torch.long`` tensor built by ``prepare_sequence``.

        Returns:
            For a 1-D input, a ``(len(sentence), tagset_size)`` tensor whose
            rows are log-softmax scores (element i, j is the score of tag j
            for word i).
        """
        n_tokens = len(sentence)
        # Reshape to (n_tokens, 1, -1): the middle axis is a batch of size one.
        lstm_input = self.word_embeddings(sentence).view(n_tokens, 1, -1)
        lstm_out, _ = self.lstm(lstm_input)
        # Drop the singleton batch axis again before the linear projection.
        tag_space = self.hidden2tag(lstm_out.view(n_tokens, -1))
        return F.log_softmax(tag_space, dim=1)

######################################################################
# Train the model:

# Build the tagger, its loss and its optimizer.
# NLLLoss is applied to the log-softmax scores returned by the model.
# NOTE(review): no random seed is set in this cell, so the printed scores
# will differ from run to run -- confirm whether a seed is set elsewhere.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training.
# Note that element i,j of the output is the score for tag j for word i.
# No gradients are needed for this inspection, so it runs under torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # tensors of word indices (and the gold tags into tensors of tag indices).
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step().
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training (same first sentence as above).
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "The dog ate the apple". Element i,j is the score of
    # tag j for word i, and the predicted tag is the maximum-scoring one.
    # In the recorded output the predicted sequence is 0 1 2 0 1:
    # 0 is the index of the maximum value in the first row,
    # 1 is the index of the maximum value in the second row, etc.
    # Via tag_to_ix that is DET NN V DET NN, the correct sequence!
    print(tag_scores)

######################################################################
# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In the example above, each word had an embedding, which served as the
# inputs to our sequence model. Let's augment the word embeddings with a
# representation derived from the characters of the word. We expect that
# this should help significantly, since character-level information like
# affixes has a large bearing on part-of-speech. For example, words with
# the affix *-ly* are almost always tagged as adverbs in English.
#
# To do this, let :math:`c_w` be the character-level representation of
# word :math:`w`. Let :math:`x_w` be the word embedding as before. Then
# the input to our sequence model is the concatenation of :math:`x_w` and
# :math:`c_w`. So if :math:`x_w` has dimension 5, and :math:`c_w`
# dimension 3, then our LSTM should accept an input of dimension 8.
#
# To get the character level representation, do an LSTM over the
# characters of a word, and let :math:`c_w` be the final hidden state of
# this LSTM. Hints:
#
# * There are going to be two LSTMs in your new model.
#   The original one that outputs POS tag scores, and the new one that
#   outputs a character-level representation of each word.
# * To do a sequence model over characters, you will have to embed characters.
#   The character embeddings will be the input to the character LSTM.
#

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
tensor([[-1.3852, -0.9624, -1.0003],
        [-1.3570, -1.1749, -0.8354],
        [-1.3794, -1.2678, -0.7618],
        [-1.3699, -1.1893, -0.8177],
        [-1.3667, -1.2508, -0.7792]])
tensor([[-0.0341, -4.1614, -4.0227],
        [-4.5262, -0.0158, -5.3256],
        [-3.7815, -4.2559, -0.0377],
        [-0.0226, -4.4558, -4.5316],
        [-4.6019, -0.0128, -5.9298]])


In [1]:
def answer_vocab(answer_vocab_dir, output_path="../data/vocabs/answer_index.pkl"):
    """Build answer <-> index lookup tables from a text file and pickle them.

    Each line of the input file is normalized with ``word_tokenize`` and
    assigned its zero-based line number as index.

    Args:
        answer_vocab_dir: path of the text file to read (despite the name it
            is opened as a file, not a directory); one answer per line.
        output_path: where the ``(index2answer, answer2index)`` tuple is
            pickled. Defaults to the location this function always wrote to.

    Returns:
        ``(index2answer, answer2index)`` where ``index2answer`` is the list of
        normalized answers in file order and ``answer2index`` maps each
        normalized answer to the index of its last occurrence.
    """
    index2answer = []
    answer2index = {}
    with open(answer_vocab_dir, 'r') as file:
        for i, line in enumerate(file):
            w = word_tokenize(line)
            index2answer.append(w)
            if w in answer2index:
                # Two lines normalize to the same answer: report the earlier
                # index, the current index and the answer. The later index wins.
                print(answer2index[w], i, w)
            answer2index[w] = i
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(output_path, 'wb') as out_file:
        pickle.dump((index2answer, answer2index), out_file)
    return index2answer, answer2index

def word_tokenize(word):
    """Normalize an answer string.

    Lower-cases the text, removes every ``?``, inserts a space before each
    ``'s`` and trims leading/trailing whitespace.

    Args:
        word: the raw string (may include a trailing newline).

    Returns:
        The normalized string.
    """
    return word.lower().replace("?", "").replace("'s", " 's").strip()

In [2]:
import pickle
# Build the index->answer list and answer->index dict from the answers file.
# answer_vocab also pickles both tables to disk as a side effect.
i2a, a2i = answer_vocab("../data/vocabs/answers_textvqa_more_than_1.txt")

In [3]:
# Compare the list length with the dict size: the dict would be smaller if
# several lines had normalized to the same answer.
len(i2a), len(a2i)

(3995, 3995)

#### 统计单词列表

In [18]:
import numpy as np

In [31]:
# Give np.load the path (not an open file object) so NumPy opens and closes
# the file itself and no file handle is leaked.
# NOTE(review): allow_pickle=True unpickles Python objects, which can execute
# arbitrary code -- only load .npy files that come from a trusted source.
dataset = np.load("../data/imdb/textvqa_0.5/imdb_textvqa_train.npy", allow_pickle=True)

## ------

In [10]:
# !CUDA_VISIBLE_DEVICES=1 python run.py --config options/al/exp_1_15_9.yaml  --eval_name test #--is_train True

In [9]:
# !CUDA_VISIBLE_DEVICES=1 python run.py --config options/al/exp_1_17_1.yaml  --is_train True --eval_name test

In [8]:
# !CUDA_VISIBLE_DEVICES=1 python run.py --config options/al/exp_1_17_1.yaml  --eval_name test #--is_train True

In [11]:
!CUDA_VISIBLE_DEVICES=1 python run.py --config options/al/exp_1_20_1.yaml --is_train True

training . . .
Total 34602 train samples.
Use 34602 train samples.
no existing answer 9241
Total 5000 val samples.
Use 5000 val samples.
no existing answer 1345
nParams: 61824341
sucees to create model.
train with train dataset
Epoch 1 of Train:
[1/541/9] iter:10 accuracy:0.00 loss:17.26013 lr: 0.000375
[1/541/19] iter:20 accuracy:0.00 loss:6.37488 lr: 0.000375
[1/541/29] iter:30 accuracy:0.52 loss:2.86010 lr: 0.000375
[1/541/39] iter:40 accuracy:7.81 loss:1.58068 lr: 0.000375
[1/541/49] iter:50 accuracy:7.29 loss:1.00685 lr: 0.000375
[1/541/59] iter:60 accuracy:10.94 loss:0.75308 lr: 0.000375
[1/541/69] iter:70 accuracy:10.94 loss:0.68325 lr: 0.000375
[1/541/79] iter:80 accuracy:7.29 loss:0.74481 lr: 0.000375
[1/541/89] iter:90 accuracy:4.69 loss:0.64164 lr: 0.000375
[1/541/99] iter:100 accuracy:17.71 loss:0.62559 lr: 0.000375
[1/541/109] iter:110 accuracy:6.77 loss:0.58874 lr: 0.000375
[1/541/119] iter:120 accuracy:9.90 loss:0.50894 lr: 0.000375
[1/541/129] iter:130 accuracy:18.75 lo