In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from data_utils import prep_data
from model import EncoderRNN, LuongAttnDecoderRNN, GreedySearchDecoder, loss_function
from global_hparams import voc_hparams, data_hparams


import torch
from torch.jit import script, trace

from torch import optim

import csv
import random

import os
import codecs

import itertools
import math

import torch.nn as nn


# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [2]:
# Name of the pre-formatted sentence-pair file and the corpus directory that holds it.
filename = "formatted_movie_lines.txt"
corpus_name = "data"
corpus = os.path.join(corpus_name)

In [3]:
# Full path of the formatted sentence-pair file inside the corpus directory
datafile = os.path.join(corpus, filename)

# Delimiter string obtained by running the literal through the unicode_escape codec
delimiter = str(codecs.decode('\n', "unicode_escape"))

# Start with an empty lines dict and an empty conversations list
lines = dict()
conversations = list()


In [4]:
# Directory passed to the loader here and later used as the checkpoint root
save_dir = os.path.join("data", "save")
# Data-prep helper configured with the maximum sequence length from the hyper-parameters
prep_data_obj = prep_data(data_hparams['MAX_LENGTH'])
# Load the vocabulary object and the list of sentence pairs
voc, pairs = prep_data_obj.loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print the first pair as a quick sanity check
for pair in pairs[:1]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 442564 sentence pairs
Trimmed to 428758 sentence pairs
Counting words...
['canwemakethisquick?roxannekorrineandandrewbarrettarehavinganincrediblyhorrendouspublicbreakuponthequad.again.', 'can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .']


In [5]:
# Total of all per-token counts stored in the vocabulary
all_c = sum(voc.word2count.values())

In [6]:
# Display the index -> token mapping held by the vocabulary
voc.index2word

{0: 'PAD',
 1: 'SOS',
 2: 'EOS',
 3: 'c',
 4: 'a',
 5: 'n',
 6: ' ',
 7: 'w',
 8: 'e',
 9: 'm',
 10: 'k',
 11: 't',
 12: 'h',
 13: 'i',
 14: 's',
 15: 'q',
 16: 'u',
 17: '?',
 18: 'r',
 19: 'o',
 20: 'x',
 21: 'd',
 22: 'b',
 23: 'v',
 24: 'g',
 25: 'l',
 26: 'y',
 27: 'p',
 28: '.',
 29: 'f',
 30: 'j',
 31: 'z',
 32: '!'}

In [7]:
# Share of the total token count (all_c) that belongs to the space character
voc.word2count[' ']/all_c

0.2160765254803255

In [8]:
# Sequential split: the leading share of the pairs is used for training,
# the remainder is held out for testing.
train_ratio = 0.8
_split_at = int(len(pairs) * train_ratio)
train_pairs, test_pairs = pairs[:_split_at], pairs[_split_at:]

In [9]:
# Build one small random batch and inspect each of its pieces
small_batch_size = 50
sample_pairs = [random.choice(pairs) for _ in range(small_batch_size)]
batches = prep_data_obj.batch2TrainData(voc, sample_pairs)
(input_variable,
 lengths,
 target_variable,
 mask,
 max_target_len) = batches

print("input_variable:", input_variable.shape)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: torch.Size([154, 50])
lengths: tensor([154, 134, 119, 107, 105,  93,  91,  80,  77,  75,  55,  52,  38,  32,
         32,  28,  28,  28,  26,  26,  26,  26,  26,  26,  25,  25,  23,  22,
         21,  20,  20,  18,  17,  17,  16,  14,  14,  12,  12,  12,  11,  11,
         11,  10,  10,   8,   6,   5,   5,   5])
target_variable: tensor([[26,  4, 26,  ..., 26,  7, 26],
        [19,  5,  8,  ...,  8, 12,  8],
        [16, 21,  4,  ..., 14, 26, 14],
        ...,
        [ 9,  0,  0,  ...,  0,  0,  0],
        [ 8,  0,  0,  ...,  0,  0,  0],
        [ 2,  0,  0,  ...,  0,  0,  0]])
mask: tensor([[ True,  True,  True,  ...,  True,  True,  True],
        [ True,  True,  True,  ...,  True,  True,  True],
        [ True,  True,  True,  ...,  True,  True,  True],
        ...,
        [ True, False, False,  ..., False, False, False],
        [ True, False, False,  ..., False, False, False],
        [ True, False, False,  ..., False, False, False]])
max_target_len: 198


In [10]:
# Token indices in the first column of the input batch
input_variable[:,0]

tensor([26, 19, 16, 21, 19,  5, 11,  7, 18,  8, 14, 11, 25,  8, 13, 29, 26, 19,
        16, 21, 19,  5, 11, 12,  4, 23,  8, 14, 16, 27, 18,  8,  9,  8,  3, 19,
         5, 29, 13, 21,  8,  5,  3,  8,  4,  5, 21, 13,  7, 18,  8, 14, 11, 25,
         8, 21, 28, 22, 16, 11,  7, 12,  8,  5, 13, 24,  8, 11,  7, 13, 11, 12,
        26, 19, 16,  4,  5, 21, 13, 14,  4, 26, 11, 19,  9, 26, 14,  8, 25, 29,
         9, 26, 24, 19, 21, 12,  8, 18,  8, 13,  4,  9,  7, 13, 11, 12, 11, 12,
         8,  7, 13, 29,  8, 19, 29, 12,  4, 18, 19, 25, 21, 18, 26,  4,  5, 19,
         5,  8, 19, 29, 11, 12,  8, 24, 18,  8,  4, 11, 12,  8, 18, 19,  8, 14,
        19, 29,  4, 25, 25, 11, 13,  9,  8,  2])

In [11]:
# Token indices in the first column of the target batch
target_variable[:, 0]

tensor([26, 19, 16,  6, 21, 19,  5,  6, 11,  6,  7, 18,  8, 14, 11, 25,  8,  6,
        13, 29,  6, 26, 19, 16,  6, 21, 19,  5,  6, 11,  6, 12,  4, 23,  8,  6,
        14, 16, 27, 18,  8,  9,  8,  6,  3, 19,  5, 29, 13, 21,  8,  5,  3,  8,
         6,  4,  5, 21,  6, 13,  6,  7, 18,  8, 14, 11, 25,  8, 21,  6, 28,  6,
        22, 16, 11,  6,  7, 12,  8,  5,  6, 13,  6, 24,  8, 11,  6,  7, 13, 11,
        12,  6, 26, 19, 16,  6,  4,  5, 21,  6, 13,  6, 14,  4, 26,  6, 11, 19,
         6,  9, 26, 14,  8, 25, 29,  6,  9, 26,  6, 24, 19, 21,  6, 12,  8, 18,
         8,  6, 13,  6,  4,  9,  6,  7, 13, 11, 12,  6, 11, 12,  8,  6,  7, 13,
        29,  8,  6, 19, 29,  6, 12,  4, 18, 19, 25, 21,  6, 18, 26,  4,  5,  6,
        19,  5,  8,  6, 19, 29,  6, 11, 12,  8,  6, 24, 18,  8,  4, 11,  6, 12,
         8, 18, 19,  8, 14,  6, 19, 29,  6,  4, 25, 25,  6, 11, 13,  9,  8,  2])

In [12]:
# Input sentence: map the first input column back to tokens and glue them together
''.join(voc.index2word[idx] for idx in input_variable[:, 0].tolist())

'youdontwrestleifyoudonthavesupremeconfidenceandiwrestled.butwhenigetwithyouandisaytomyselfmygodhereiamwiththewifeofharoldryanoneofthegreatheroesofalltimeEOS'

In [13]:
# Output sentence: map the first target column back to tokens and glue them together
''.join(voc.index2word[idx] for idx in target_variable[:, 0].tolist())

'you don t wrestle if you don t have supreme confidence and i wrestled . but when i get with you and i say to myself my god here i am with the wife of harold ryan one of the great heroes of all timeEOS'

In [14]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          loss_fns, encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=data_hparams['MAX_LENGTH']):
    """Run one optimisation step of the encoder/decoder on a single batch.

    The batch is encoded once and then decoded one time step at a time for
    ``max_target_len`` steps. Whether the decoder is fed the ground-truth
    token (teacher forcing) or its own top-1 prediction is decided once per
    call by comparing ``random.random()`` with the module-level
    ``teacher_forcing_ratio``.

    Args:
        input_variable: Batch of input token indices; moved to ``device``.
        lengths: Input sequence lengths; kept on the CPU for RNN packing.
        target_variable: Target token indices, indexed by time step.
        mask: Mask indexed by time step, forwarded to ``loss_fns.maskNLLLoss``.
        max_target_len: Number of decoding steps to run.
        encoder: Encoder model, called as ``encoder(input_variable, lengths)``.
        decoder: Decoder model; ``decoder.n_layers`` selects how many encoder
            hidden layers seed the decoder state.
        embedding: Not used in this function; kept for call compatibility.
        loss_fns: Object exposing ``maskNLLLoss(output, target, mask)`` that
            returns ``(loss, n_items)``.
        encoder_optimizer: Optimizer stepped for the encoder parameters.
        decoder_optimizer: Optimizer stepped for the decoder parameters.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm, applied separately to encoder and decoder.
        max_length: Not used in this function; kept for call compatibility.

    Returns:
        The sum of per-step losses, each weighted by its item count, divided
        by the total item count.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Running totals
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Initial decoder input: one SOS token per sentence, shaped (1, batch_size)
    decoder_input = torch.LongTensor([[voc_hparams['SOS_token'] for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Seed the decoder with the leading layers of the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time.
    # A single loop serves both modes; only the choice of next input differs.
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is the decoder's own top-1 output.
            # Slicing the index tensor directly avoids rebuilding it element by
            # element through a Python list (one host sync per item on GPU).
            _, topi = decoder_output.topk(1)
            decoder_input = topi[:batch_size, 0].unsqueeze(0).to(device)
        # Calculate and accumulate loss
        mask_loss, nTotal = loss_fns.maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [15]:
def trainIters(model_name, voc, pairs, encoder, decoder, loss_fn, encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
               print_every, save_every, clip, corpus_name, loadFilename):
    """Run the training loop, printing progress and saving checkpoints.

    Each iteration samples ``batch_size`` random pairs, converts them with the
    module-level ``prep_data_obj.batch2TrainData`` and hands the batch to
    ``train``. Batches are built lazily, one per iteration, so no batches are
    materialised for iterations that are skipped when resuming.

    Args:
        model_name: Name used as a path component of the checkpoint directory.
        voc: Vocabulary object; its ``__dict__`` is stored in each checkpoint.
        pairs: Sequence of sentence pairs to sample training batches from.
        encoder, decoder: Models being trained.
        loss_fn: Loss helper forwarded to ``train``.
        encoder_optimizer, decoder_optimizer: Optimizers forwarded to ``train``.
        embedding: Embedding module; its state dict is stored in each checkpoint.
        encoder_n_layers, decoder_n_layers: Used only to name the checkpoint
            directory (together with the module-level ``hidden_size``).
        save_dir: Root directory for checkpoints.
        n_iteration: Last iteration number to run (inclusive).
        batch_size: Number of pairs sampled per iteration.
        print_every: Print the average loss whenever the iteration number is a
            multiple of this value.
        save_every: Save a checkpoint whenever the iteration number is a
            multiple of this value.
        clip: Gradient-norm limit forwarded to ``train``.
        corpus_name: Name used as a path component of the checkpoint directory.
        loadFilename: When truthy, training resumes after the iteration stored
            in the module-level ``checkpoint`` dict.

    Returns:
        None.
    """
    # Initializations
    print('Initializing ...')
    print(f"total training batches that you will need are {len(pairs)//batch_size}")
    start_iteration = 1
    if loadFilename:
        # `checkpoint` is the module-level dict loaded alongside loadFilename
        start_iteration = checkpoint['iteration'] + 1

    print_loss = 0
    # Counts the iterations accumulated in print_loss so the reported average
    # stays correct when a resumed run does not start on a print boundary.
    iterations_since_print = 0

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # Sample and prepare the batch for this iteration only
        training_batch = prep_data_obj.batch2TrainData(
            voc, [random.choice(pairs) for _ in range(batch_size)]
        )
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, loss_fn, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss
        iterations_since_print += 1

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / iterations_since_print
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0
            iterations_since_print = 0

        # Save checkpoint
        if iteration % save_every == 0:
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [16]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=data_hparams['MAX_LENGTH']):
    """Decode a single sentence with ``searcher`` and return the output tokens.

    Args:
        encoder: Not used directly here; kept for call compatibility.
        decoder: Not used directly here; kept for call compatibility.
        searcher: Callable invoked as ``searcher(input_batch, lengths, max_length)``
            that returns ``(tokens, scores)``.
        voc: Vocabulary; ``voc.index2word`` maps each returned token index to a token.
        sentence: Input sentence, converted to indexes with the module-level
            ``prep_data_obj.indexesFromSentence``.
        max_length: Maximum decode length forwarded to ``searcher``.

    Returns:
        List of decoded tokens, one per element of ``tokens``.

    Raises:
        KeyError: If a returned index is missing from ``voc.index2word``.
            NOTE(review): ``indexesFromSentence`` presumably raises it as well
            for unknown input tokens, since callers catch it; confirm.
    """
    ### Format input sentence as a batch
    # tokens -> indexes (batch of one sentence)
    indexes_batch = [prep_data_obj.indexesFromSentence(voc, sentence)]
    # Lengths tensor; it is deliberately left on the CPU
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations, then move to device
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    # Decode sentence with searcher; no gradients are needed for inference,
    # so skip building the autograd graph.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> tokens
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive loop: read a line, run it through the model, print the reply.

    Entering ``q`` or ``quit`` ends the loop. A ``KeyError`` raised while
    handling a line is reported and the loop moves on to the next prompt.
    """
    input_sentence = ''
    while True:
        try:
            # Read the next line from the user
            input_sentence = input('> ')
            # Stop on either quit keyword
            if input_sentence in ('q', 'quit'):
                break
            # Normalise, then decode with the model
            input_sentence = prep_data_obj.normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop the special tokens before printing the reply
            reply = [word for word in output_words if word != 'EOS' and word != 'PAD']
            print('Model Output:', ''.join(reply))

        except KeyError:
            print("Error: Encountered unknown word.")

In [17]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot' #  'general' or 'concat'
hidden_size = 512
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
#checkpoint_iter = 200
loadFilename = None
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps the stored tensors onto the device selected
    # for this session, so a checkpoint saved on a GPU machine also loads on a
    # CPU-only machine without editing this cell.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary exactly as it was when the checkpoint was written
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize the embedding table: one hidden_size vector per vocabulary entry
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models (both receive the same embedding module)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Build the loss helper and move the models to the selected device
loss_fn = loss_function(device)
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [18]:
# Configure training/optimization
clip = 50.0
# Read by train() as a module-level global
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
# The decoder learning rate is learning_rate multiplied by this factor
decoder_learning_ratio = 5.0
n_iteration = 200
print_every = 10
save_every = 50

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any tensors held in the optimizer state to the selected device.
# Using `device` instead of an unconditional .cuda() keeps this cell working
# on CPU-only machines when optimizer state was restored from a checkpoint.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Training is started!")
trainIters(model_name, voc, train_pairs, encoder, decoder, loss_fn , encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Training is started!
Initializing ...
total training batches that you will need are 5359
Training...


KeyboardInterrupt: 

In [19]:
# Input side of the first 30 held-out pairs, keeping only inputs longer than 5 characters
test_sentences = [
    pair[0]
    for pair in test_pairs[:30]
    if len(pair[0]) > 5
]

In [20]:
# Switch both models to evaluation mode before decoding
encoder.eval()
decoder.eval()

searcher = GreedySearchDecoder(encoder, decoder, device)

# Collect [normalized input, decoded output] for every test sentence.
# The KeyError guard is applied per sentence so that one sentence with an
# unknown character is reported and skipped instead of silently dropping all
# the sentences that come after it.
outputs = []
for input_sentence in test_sentences:
    try:
        input_sentence = prep_data_obj.normalizeString(input_sentence)
        output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
    except KeyError:
        print("Error: Encountered unknown character.")
        continue
    # Drop the special tokens before joining the decoded characters
    output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
    outputs.append([input_sentence, ''.join(output_words)])

In [21]:
# Display the collected [input sentence, decoded output] pairs
outputs

[['aboutasgoodascanbeexpectedwithonefootinthegrave .goodtoseeyouson .didyoubringanyofthatyankeewhiskywithyou ?',
  '                                                                                                                                                                                                        '],
 ['itisntyankeewhiskydaddyitsscotch .',
  '                                                                                                                                                                                                        '],
 ['itisntyankeewhiskydaddyitsscotch .',
  '                                                                                                                                                                                                        '],
 ['itsyankeewhiskytome .',
  '                                                                                                                                                            

In [None]:
# test_input = ["thismodelputspacesbetweencharacters",
#               "ithinkicansolvethisriddle",
#               'wellyouareamazing',
#               'adogisverypissedatme!',
#               'ithinkilovethismovie.charactersfromthismotionpictureisawesome',
#               'thisisabeautifulcap',
#               'icansolvethispuzzleveryeasily',
#               'usuallyachairhasfourlegs',
#               'tablealsohasfourlegs',
#               'thisismysignature.',
#               'knowlegeisnotsameaswisdom.',
#               "iamsorryidon'twanttoosoundrudesbutareyouplanningtodothisactivity?",
#               'themorningsunlightgivesyoumorevitamindthantablets'
#              ] 