In [1]:
# !pip install visdom

In [1]:
import unicodedata
import string
import re
import random
import time
import datetime
import math
import socket

hostname = socket.gethostname()

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence  #, masked_cross_entropy
from masked_cross_entropy import *

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
# %matplotlib inline
# import visdom
# vis = visdom.Visdom()
import os

In [2]:
USE_CUDA = True

In [3]:
# https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
PAD_token = 0
SOS_token = 1
EOS_token = 2

In [4]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3  # Count default tokens

    def index_words(self, sentence):
        for word in sentence.split(' '):
            word = str(word)
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
#             print(word)
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        if self.trimmed: return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words %s / %s = %.4f' % (
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3  # Count default tokens

        for word in keep_words:
            self.index_word(word)

To read the data file we will split the file into lines, and then split lines into pairs. The files are all English → Other Language, so if we want to translate from Other Language → English I added the reverse flag to reverse the pairs.

In [5]:
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    filename = './%s-%s' % (lang1, lang2)
    print(filename)
    lines = open(filename).read().strip().split('\n')
#     print(lines)
    # pairs = []
    # input_lang = []
    # output_lang = []

    pairs = [[s for s in l.split(':')] for l in lines]
    print(pairs)

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
#     print(input_lang)
#     print(output_lang)

    return input_lang, output_lang, pairs

In [6]:
MIN_LENGTH = 10
MAX_LENGTH = 5760

def filter_pairs(pairs):
    filtered_pairs = []
    for pair in pairs:
        filtered_pairs.append(pair)
    return filtered_pairs

In [7]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %d sentence pairs" % len(pairs))

    pairs = filter_pairs(pairs)
#     print(pairs)
    print("Filtered to %d pairs" % len(pairs))

    print("Indexing words...")
    for pair in pairs:
#         print(pair)
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])

#     print('Indexed %d words in input language, %d words in output' % (input_lang.n_words, output_lang.n_words))
    return input_lang, output_lang, pairs

In [8]:
input_lang, output_lang, pairs = prepare_data('way', 'base', False)
output_lang.index2word[4]

Reading lines...
./way-base
[['13800389 13848825 13848825 13848825 13848825 13848825 13848825 13848825 14326174 14326256 30405557 30405557 128131496 128131496 128463450 128463451 128463455 128463455 130666508 167487519 200072875 207634523 207634523 207634523 207634523 262803358 262803520 262803520 262803520 346994183 434882583 434882592 482296562 507522243 507522243 507522243 571755274 571755274 571755274 571755274 571755274 571755274 743461569 743461569 743461569 769985994 769985994 769985994 769985995 769985995 769985995 927900710 927900710 927900710 927900710 927900710 927900710 927900710 1008096491 1008096491 1008096491 1008096491 1008096491 1008096491 1008096492 1008096492 1008096492 1008096492 1008096492 1008096492 1016567885 1016994265 1018551322 1028336211 1028336212 1071925032 ', '13800389 200072875 207634523 262803358 482296562 207634523 262803520 769985994 769985995 207634523 262803520 769985994 769985995 207634523 262803520 769985994 769985995 30405557 346994183 30405557 10

'200072875'

In [9]:
keep_pairs = []

In [10]:
for pair in pairs:
    input_sentence = pair[0]
    output_sentence = pair[1]
    keep_input = True
    keep_output = True
    for word in input_sentence.split(' '):
        if word not in input_lang.word2index:
            keep_input=False
            break
    for word in output_sentence.split(' '):
        if word not in output_lang.word2index:
            keep_output=False
            break
    if keep_input and keep_output:
        keep_pairs.append(pair)
        
    


In [11]:
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [12]:
def pad_seq(seq, max_length):
    seq += [PAD_token for i in range(max_length - len(seq))]
    return seq

In [13]:
def random_batch(batch_size):
    input_seqs = []
    target_seqs = []
    for i in range(batch_size):
        pair = random.choice(pairs)
#         print(pair)
        input_seqs.append(indexes_from_sentence(input_lang, pair[0]))
        target_seqs.append(indexes_from_sentence(output_lang, pair[1]))

    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)

    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s, max(target_lengths)) for s in target_seqs]

    input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1)

    if USE_CUDA:
        input_var = input_var.cuda()
        target_var = target_var.cuda()

    return input_var, input_lengths, target_var, target_lengths

In [14]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # Sum bidirectional outputs
        return outputs, hidden

In [15]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        attn_energies = Variable(torch.zeros(this_batch_size, max_len))  # B x S

        if USE_CUDA:
            attn_energies = attn_energies.cuda()

        for b in range(this_batch_size):
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        return F.softmax(attn_energies).unsqueeze(1)

    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy = torch.dot(hidden.view(-1), encoder_output.view(-1))
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.dot(hidden.view(-1), energy.view(-1))
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = torch.dot(self.v.view(-1), energy.view(-1))
        return energy



In [16]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(BahdanauAttnDecoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        word_embedded = self.embedding(word_input).view(1, 1, -1)  # S=1 x B x N
        word_embedded = self.dropout(word_embedded)

        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        context = context.transpose(0, 1)

        rnn_input = torch.cat((word_embedded, context), 2)
        output, hidden = self.gru(rnn_input, last_hidden)

        output = output.squeeze(0)  # B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)))

        return output, hidden, attn_weights

In [17]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)

        rnn_output, hidden = self.gru(embedded, last_hidden)

        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))

        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = F.tanh(self.concat(concat_input))

        output = self.out(concat_output)

        return output, hidden, attn_weights

In [18]:
encoder_test = EncoderRNN(10, 10, 2)
decoder_test = LuongAttnDecoderRNN('general', 10, 10, 2)
print(encoder_test)
print(decoder_test)

EncoderRNN(
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=2, dropout=0.1, bidirectional=True)
)
LuongAttnDecoderRNN(
  (embedding): Embedding(10, 10)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(10, 10, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=20, out_features=10, bias=True)
  (out): Linear(in_features=10, out_features=10, bias=True)
  (attn): Attn(
    (attn): Linear(in_features=10, out_features=10, bias=True)
  )
)


In [19]:
# Configure models
attn_model = 'dot'
hidden_size = 100
n_layers = 2
dropout = 0.1
batch_size = 1

clip = 50.0
teacher_forcing_ratio = 0.5
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_epochs = 100
epoch = 0
plot_every = 1
print_every = 1
evaluate_every = 1

In [20]:
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers, dropout=dropout)
decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.n_words, n_layers, dropout=dropout)

In [21]:
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()

if USE_CUDA:
    encoder.cuda()
    decoder.cuda()
start = time.time()
plot_losses = []
print_loss_total = 0
plot_loss_total = 0

In [22]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [23]:
def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers]  # Use last (forward) hidden state from encoder

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    print("The max target length this time is \t", max_target_length)
    for t in range(max_target_length):
        decoder_output, decoder_hidden, decoder_attn = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t]

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        target_batches.transpose(0, 1).contiguous(),
        target_lengths
    )
    loss.backward()

    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data, ec, dc

In [25]:
ecs = []
dcs = []
eca = 0
dca = 0
epoch=0
# while epoch < n_epochs:
#     epoch += 1
#     print('epoch \t', epoch)
#     input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)

#     loss, ec, dc = train(
#         input_batches, input_lengths, target_batches, target_lengths,
#         encoder, decoder,
#         encoder_optimizer, decoder_optimizer, criterion
#     )

#     print_loss_total += loss
#     plot_loss_total += loss
#     eca += ec
#     dca += dc
#     if epoch==0:
#         continue
#     if epoch % print_every == 0:
#         print_loss_avg = print_loss_total / print_every
#         print_loss_total = 0
#         print_summary = '%s (%d %d%%) %.4f' % (
#         time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
#         print(print_summary)

#     if epoch % evaluate_every == 0:
#         print("Hello")
#         # evaluate_randomly()

#     if epoch % plot_every == 0:
#         plot_loss_avg = plot_loss_total / plot_every
#         plot_losses.append(plot_loss_avg)
#         plot_loss_total = 0
#         ecs.append(eca / plot_every)
#         dcs.append(dca / plot_every)
#         ecs_win = 'encoder grad (%s)' % hostname
#         dcs_win = 'decoder grad (%s)' % hostname
#         eca = 0
#         dca = 0

epoch 	 1
The max target length this time is 	 58


  return F.softmax(attn_energies).unsqueeze(1)
  ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
  dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)


0m 1s (- 2m 24s) (1 1%) 7.2268
Hello
epoch 	 2
The max target length this time is 	 136
0m 6s (- 5m 35s) (2 2%) 7.2490
Hello
epoch 	 3
The max target length this time is 	 136
0m 12s (- 6m 31s) (3 3%) 7.2310
Hello
epoch 	 4
The max target length this time is 	 87
0m 14s (- 5m 48s) (4 4%) 7.2553
Hello
epoch 	 5
The max target length this time is 	 85
0m 16s (- 5m 17s) (5 5%) 7.2187
Hello
epoch 	 6
The max target length this time is 	 156
0m 23s (- 6m 11s) (6 6%) 7.2348
Hello
epoch 	 7
The max target length this time is 	 75
0m 25s (- 5m 37s) (7 7%) 7.1794
Hello
epoch 	 8
The max target length this time is 	 101
0m 28s (- 5m 27s) (8 8%) 7.2307
Hello
epoch 	 9
The max target length this time is 	 76
0m 30s (- 5m 5s) (9 9%) 7.2162
Hello
epoch 	 10
The max target length this time is 	 16
0m 30s (- 4m 32s) (10 10%) 7.1908
Hello
epoch 	 11
The max target length this time is 	 76
0m 32s (- 4m 19s) (11 11%) 7.2113
Hello
epoch 	 12
The max target length this time is 	 96
0m 34s (- 4m 16s) (12 12

11m 43s (- 1m 1s) (92 92%) 7.4024
Hello
epoch 	 93
The max target length this time is 	 329
12m 35s (- 0m 56s) (93 93%) 7.0420
Hello
epoch 	 94
The max target length this time is 	 148
12m 42s (- 0m 48s) (94 94%) 6.4877
Hello
epoch 	 95
The max target length this time is 	 54
12m 44s (- 0m 40s) (95 95%) 6.2553
Hello
epoch 	 96
The max target length this time is 	 106
13m 8s (- 0m 32s) (96 96%) 6.1505
Hello
epoch 	 97
The max target length this time is 	 42
13m 9s (- 0m 24s) (97 97%) 7.3226
Hello
epoch 	 98
The max target length this time is 	 73
13m 11s (- 0m 16s) (98 98%) 6.3203
Hello
epoch 	 99
The max target length this time is 	 47
13m 12s (- 0m 8s) (99 99%) 7.3562
Hello
epoch 	 100
The max target length this time is 	 50
13m 13s (- 0m 0s) (100 100%) 6.6729
Hello


In [26]:
# input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)
# print(input_batches.shape)
# print(input_lengths)
# # encoder(input_batches, input_lengths, None)
# # input_lengths = [len(input_seq)]


# # print(target_batches)
# # print(target_lengths)

In [27]:
input_seq=["42195876"]
input_lengths = [len(input_seq)]
input_seqs = [indexes_from_sentence(input_lang, input_seq[0])]
input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
# print(input_batches.shape)
# print(input_lengths)
if USE_CUDA:
        input_batches = input_batches.cuda()
# print(input_seqs)
encoder_outputs, encoder_hidden=encoder(input_batches, input_lengths, None)
decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True)
decoder_hidden = encoder_hidden[:decoder.n_layers]
if USE_CUDA:
        decoder_input = decoder_input.cuda()
print(decoder_hidden)

KeyError: '42195876'

In [28]:
# torch.save(encoder.state_dict(), 'encoder100.dict')
# torch.save(decoder.state_dict(), 'decoder100.dict')

In [24]:
encoder.load_state_dict(torch.load('encoder100.dict'))
decoder.load_state_dict(torch.load('decoder100.dict'))

<All keys matched successfully>

In [25]:
def show_attention(input_sentence, output_words, attentions):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

In [26]:
def evaluate_and_show_attention(input_sentence, target_sentence=None):
    output_words, attentions = evaluate(input_sentence)
    output_sentence = ' '.join(output_words)
    print('>', input_sentence)
    if target_sentence is not None:
        print('=', target_sentence)
    print('<', output_sentence)

    show_attention(input_sentence, output_words, attentions)
    win = 'evaluted (%s)' % hostname
    text = '<p>&gt; %s</p><p>= %s</p><p>&lt; %s</p>' % (input_sentence, target_sentence, output_sentence)
    vis.text(text, win=win, opts={'title': win})

In [34]:
def evaluate(input_seq, max_length=MAX_LENGTH):
#     print(max_length)
    input_lengths = [len(input_seq)]
    input_seqs = [indexes_from_sentence(input_lang, input_seq[0])]
    input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    print(input_batches)
    print(input_lengths)
    print(input_seqs)
    if USE_CUDA:
        input_batches = input_batches.cuda()
#     encoder.train(False)
#     decoder.train(False)

    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True)  # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers]  # Use last (forward) hidden state from encoder

    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    for di in range(max_length):
#         print("I am here")
#         print(di)
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        decoder_attentions[di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        topv, topi = decoder_output.data.topk(1)
        print(topv)
        ni = topi[0][0]
        ni=ni.item()
        print(ni)
#         print(decoded_words)
        if ni == EOS_token:
#             decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni])

        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA:
            decoder_input = decoder_input.cuda()

#     encoder.train(True)
#     decoder.train(True)

    return decoded_words, decoder_attentions[:di + 1, :len(encoder_outputs)]

In [35]:
evaluate([ "583836282 587512684 608418573"])

tensor([[124],
        [552],
        [ 59],
        [  2]])
[1]
[[124, 552, 59, 2]]
tensor([[0.7042]], device='cuda:0')
36
tensor([[1.5367]], device='cuda:0')
2


  input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
  decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True)  # SOS
  return F.softmax(attn_energies).unsqueeze(1)


([''],
 tensor([[1.],
         [1.]]))

In [33]:
evaluate_and_show_attention(["37049916 42112939 37049916 37049916 37049916 37049916"])

NameError: name 'evaluate_and_show_attention' is not defined

In [None]:
input_lengths = [len("42112939")]
print(input_lengths)

In [None]:
input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
print(input_batches)