# IDS 576 - SP22 Project - Group2

**Let’s import some necessities**

In [1]:
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import pickle
import unicodedata
import codecs
import re
import random
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import itertools
import os

In [2]:
# Detect whether CUDA is available on this machine.
USE_CUDA = torch.cuda.is_available()
# device = torch.device("cuda" if USE_CUDA else "cpu")
# NOTE(review): the device is pinned to the CPU below, so USE_CUDA is computed
# but does not influence `device` in this cell.
device = "cpu"
device

'cpu'

## Load Cleaned Data

In [3]:
# Directory where training checkpoints are written and read.
# Raw string: the Windows path contains backslashes followed by letters
# (\S, \I, \P, \C, \M), which are invalid escape sequences in a normal literal
# and trigger a DeprecationWarning/SyntaxWarning. The resulting value is unchanged.
save_dir = r'D:\Spring22 - UIC\SP22 Notebooks\IDS 576\Project\Chatbot - Generative\Models'
# Name of the corpus; passed through to trainIters.
corpus_name = 'Yahoo_subdata'

In [4]:
# f=open('yahoo_data_subset.pkl','wb')
# pickle.dump(df_conv,f)
# Load the cleaned question/answer subset that was pickled earlier.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('yahoo_data_subset.pkl', 'rb') as f:
    df_subset = pickle.load(f)

## Process Data

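First, we convert the Unicode strings to ASCII using unicodeToAscii (which also expands common English contractions via uncontract). Next, we convert all letters to lowercase and strip every non-letter character, punctuation included (normalizeString). Finally, to aid in training convergence, the usual recipe filters out sentences longer than the MAX_LENGTH threshold (filterPairs); note that this notebook defines MAX_LENGTH but never defines or calls filterPairs, so here MAX_LENGTH only serves as the default maximum decoding length in `evaluate`.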

In [5]:
# One tab-joined string per row of the loaded frame (the fields of a row are
# later split apart again on '\t' to form a question/response pair).
lines = ['\t'.join(df_subset.iloc[row_idx]) for row_idx in range(len(df_subset))]

# Contraction -> expanded form, looked up token-by-token in `uncontract`.
# Only whitespace-delimited tokens that match a key exactly are expanded.
contractions = dict((
    ("don't", "do not"),
    ("didn't", "did not"),
    ("isn't", "is not"),
    ("can't", "cannot"),
    ("wasn't", "was not"),
    ("won't", "will not"),
    ("doesn't", "does not"),
    ("couldn't", "could not"),
    ("haven't", "have not"),
    ("shouldn't", "should not"),
    ("wouldn't", "would not"),
    ("hasn't", "has not"),
    ("aren't", "are not"),
    ("weren't", "were not"),
))

# Default maximum sequence length used by `train` / `evaluate`.
MAX_LENGTH = 40

def uncontract(s):
    """Expand known contractions in ``s``.

    The string is split on whitespace; every token that is a key of the
    module-level ``contractions`` mapping is replaced by its expansion and the
    tokens are re-joined with single spaces (so runs of whitespace collapse).
    """
    expanded = []
    for token in s.split():
        expanded.append(contractions.get(token, token))
    return ' '.join(expanded)

def unicodeToAscii(s):
    """Expand contractions in ``s`` and strip combining accent marks.

    The text is first passed through ``uncontract``, then NFD-normalised so
    accents become separate combining characters, and finally every character
    of Unicode category ``Mn`` (non-spacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', uncontract(s))
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase ``s`` and reduce it to letters separated by single spaces.

    Steps: lowercase + strip, ``unicodeToAscii``, then four regex passes applied
    in order -- pad ``.!?`` with a leading space, turn every run of characters
    other than letters and ``.!?`` into one space, delete remaining
    non-word/non-space characters (which removes ``.!?`` again), and collapse
    whitespace. The result is stripped before being returned.
    """
    text = unicodeToAscii(s.lower().strip())
    # Order matters: each pass works on the output of the previous one.
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r'[^\w\s]', ''),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

### Load and trim data

Our next order of business is to create a vocabulary and load
query/response sentence pairs into memory.

Note that we are dealing with sequences of **words**, which do not have
an implicit mapping to a discrete numerical space. Thus, we must create
one by mapping each unique word that we encounter in our dataset to an
index value.

For this we define a ``Voc`` class, which keeps a mapping from words to
indexes, a reverse mapping of indexes to words, a count of each word and
a total word count. The class provides methods for adding a word to the
vocabulary (``addWord``), adding all words in a sentence
(``addSentence``) and trimming infrequently seen words (``trim``). More
on trimming later.


In [6]:
# Default word tokens
# These three indices are reserved; Voc numbers real words starting at 3.
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary: word <-> index mappings plus per-word occurrence counts.

    Indices 0-2 are reserved for the PAD/SOS/EOS tokens, so the first real word
    receives index 3.
    """

    def __init__(self, name):
        """Create an empty vocabulary called ``name``."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Add ``word`` with the next free index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Drop every word seen fewer than ``min_count`` times and re-index the rest.

        Only the first call has an effect; later calls return immediately because
        ``self.trimmed`` is set. Kept words are re-added through ``addWord``, so
        their counts restart at 1 after trimming.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        total_words = len(self.word2index)
        keep_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, keep_ratio
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

*Now we can assemble our vocabulary and query/response sentence pairs.*

In [7]:
# gloveindex = set(eng_glove_300_dim.index)
# Split every raw line on tabs and normalise each field.
pairs = []
for raw_line in lines:
    pairs.append([normalizeString(field) for field in raw_line.split('\t')])
# print(f"{len(pairs)} sentences before removing those with words missing in GloVe")
# pairs = [pair for pair in pairs if len(set(pair[0].split()) - gloveindex) == 0 & len(set(pair[1].split()) - gloveindex) == 0]
# print(f"{len(pairs)} sentences after removing those with words missing in GloVe")

# Build the vocabulary from the first two fields (query, response) of every pair.
voc = Voc('Yahoo_subdata')
for pair in pairs:
    for sentence in (pair[0], pair[1]):
        voc.addSentence(sentence)
print("Counted words:", voc.num_words)

Counted words: 45700


**Inspect some Question-Response pairs**

In [8]:
# Show the first ten normalised [query, response] pairs as a sanity check.
print("\npairs:")
for sample_pair in itertools.islice(pairs, 10):
    print(sample_pair)


pairs:
['how to get rid of water stain in a stall shower', 'sounds like hard water deposits i find that lime away takes them right off']
['how do you remove permanent marker on appliances counter tops like store receipt s blue ink', 'just use hand sanitizer to get the stain out it will take that permanent marker right off the surface']
['how to fix vertical blinds', 'click on the link and follow the steps']
['how to clean window screens', 'nylon covered sponges are great for cleaning window screens']
['how to get rid of rodents', 'use glue traps for small mice and spring traps for rats you can also spread peanut butter']
['how can i make love last and my marriage stronger', 'focus on your marriage be fully committed maintain intimacy and compliment each other']
['how do house chores get divided in your family', 'depending on ability my husband cooks because he s good at it and i do the cleaning because i m particular about cleanliness']
['how do i convert metric to u s units', 'i reco

Another tactic that is beneficial to achieving faster convergence during training is trimming rarely used words out of our vocabulary. Decreasing the feature space will also soften the difficulty of the function that the model must learn to approximate. We will do this as a two-step process:

1) Trim words used under MIN_COUNT threshold using the voc.trim function.

2) Filter out pairs with trimmed words.

In [9]:
# Words seen fewer than MIN_COUNT times are removed from the vocabulary by voc.trim.
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop pairs that use a trimmed word.

    Args:
        voc: vocabulary object exposing ``trim(min_count)`` and ``word2index``.
        pairs: list of ``[input_sentence, output_sentence, ...]``; only the first
            two entries are inspected.
        MIN_COUNT: count threshold forwarded to ``voc.trim``.

    Returns:
        A new list holding only the pairs whose input and output sentences consist
        entirely of words still present in ``voc.word2index``.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _all_known(sentence):
        # True when every single-space-separated token survived trimming.
        return all(word in voc.word2index for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    # Guard the ratio: an empty `pairs` list used to raise ZeroDivisionError here.
    kept_fraction = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_fraction))
    return keep_pairs


# Trim voc and pairs
# NOTE: this rebinds `pairs` to the filtered list, so the untrimmed pairs are
# no longer reachable under that name after this line.
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 13815 / 45697 = 0.3023
Trimmed from 35854 pairs to 16246, 0.4531 of total


## Constructing the Training Data

In [10]:
def indexesFromSentence(voc, sentence):
    """Map a sentence to its list of word indices, terminated by ``EOS_token``.

    Tokens are obtained by splitting on single spaces; a token missing from
    ``voc.word2index`` raises ``KeyError``.
    """
    token_ids = []
    for word in sentence.split(' '):
        token_ids.append(voc.word2index[word])
    token_ids.append(EOS_token)
    return token_ids


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fillvalue``.

    Returns a list of tuples: tuple ``t`` holds the ``t``-th token of every
    sequence in ``l`` (or ``fillvalue`` where a sequence has already ended).
    """
    time_major = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(time_major)

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as ``l``.

    Args:
        l: iterable of token sequences (e.g. the output of ``zeroPadding``).
        value: token treated as padding. Previously this argument was ignored and
            the global ``PAD_token`` was always used; the default keeps that behaviour.

    Returns:
        List of lists with 0 where the token equals ``value`` and 1 elsewhere.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Convert a list of sentences into a padded index tensor plus their lengths.

    Returns:
        ``(padVar, lengths)``: ``padVar`` is a LongTensor built from the
        ``zeroPadding`` output (one row per time step), ``lengths`` is a tensor
        with the unpadded length (EOS included) of each sentence.
    """
    indexes_batch = []
    sentence_lengths = []
    for sentence in l:
        token_ids = indexesFromSentence(voc, sentence)
        indexes_batch.append(token_ids)
        sentence_lengths.append(len(token_ids))
    lengths = torch.tensor(sentence_lengths)
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Convert target sentences into a padded tensor, a padding mask and the max length.

    Returns:
        ``(padVar, mask, max_target_len)``: ``padVar`` is a LongTensor built from
        the ``zeroPadding`` output, ``mask`` is a BoolTensor of the same layout
        built from ``binaryMatrix``, and ``max_target_len`` is the longest
        index sequence (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(token_ids) for token_ids in indexes_batch)
    padList = zeroPadding(indexes_batch)
    mask = torch.BoolTensor(binaryMatrix(padList))
    return torch.LongTensor(padList), mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of ``[query, response]`` pairs into training tensors.

    The pairs are ordered by query length (number of space-separated tokens),
    longest first, because ``pack_padded_sequence`` in the encoder is called with
    its default ``enforce_sorted=True``. A sorted copy is used so the caller's
    list is no longer reordered in place.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``inputVar`` and ``outputVar``.
    """
    ordered = sorted(pair_batch, key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


In [11]:
# Example for validation: build one tiny random batch and show each piece.
small_batch_size = 5
sample_pairs = [random.choice(pairs) for _ in range(small_batch_size)]
batches = batch2TrainData(voc, sample_pairs)
input_variable, lengths, target_variable, mask, max_target_len = batches

for label, value in (
    ("input_variable:", input_variable),
    ("lengths:", lengths),
    ("target_variable:", target_variable),
    ("mask:", mask),
    ("max_target_len:", max_target_len),
):
    print(label, value)

input_variable: tensor([[    3,     3,     3,     3,     3],
        [   27,    27,    75,     4,  1096],
        [   18,    18,    18,    80, 10659],
        [ 1307,    19,    19,    87, 13487],
        [   83,    46,  1003,   229,    38],
        [ 3873,   486,    32,  2304,   997],
        [   20,   122,    83,  5346,   385],
        [  411,  3075,   441,     2,     2],
        [  847,   248,    66,     0,     0],
        [    4,   165,    45,     0,     0],
        [ 1185,    11,    82,     0,     0],
        [   46,  5075,   403,     0,     0],
        [    7,   701,     2,     0,     0],
        [   83,     2,     0,     0,     0],
        [ 3087,     0,     0,     0,     0],
        [ 1679,     0,     0,     0,     0],
        [    2,     0,     0,     0,     0]])
lengths: tensor([17, 14, 13,  8,  8])
target_variable: tensor([[ 1531,   637,   329,   435,   243],
        [ 3441,    28,   366,     3,   120],
        [   47,  2169,   337,    56,  2601],
        [   38,   527,  4798

## Building the Encoder RNN

In [12]:
class EncoderRNN(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded word indices.

    The embedding dimension must equal ``hidden_size`` because the GRU is built
    with ``input_size == hidden_size``.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Dropout is only passed on when there is more than one layer;
        # with a single layer it is forced to 0.
        inter_layer_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: padded index tensor; the time step is the first dimension
                (pack_padded_sequence is called with its default batch_first=False).
            input_lengths: unpadded lengths, sorted in decreasing order.
            hidden: optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is the sum of the forward and
            backward direction features and ``hidden`` is the GRU's final state.
        """
        packed = nn.utils.rnn.pack_padded_sequence(self.embedding(input_seq), input_lengths)
        packed_out, hidden = self.gru(packed, hidden)
        padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        # The last dimension holds [forward | backward] features; add the halves.
        forward_half = padded_out[..., :self.hidden_size]
        backward_half = padded_out[..., self.hidden_size:]
        return forward_half + backward_half, hidden

## Constructing the Attention

In [13]:
# Luong attention layer
class Attn(nn.Module):
    """Luong-style attention producing weights over encoder time steps.

    ``method`` selects the scoring function: ``'dot'``, ``'general'`` (a linear
    map of the encoder output) or ``'concat'`` (a linear map of the concatenated
    states, tanh, then a learned vector ``v``).

    Raises:
        ValueError: if ``method`` is not one of the three supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            # Formatted message; the exception used to receive a (method, text) tuple.
            raise ValueError("{} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory (possibly NaN/inf);
            # draw the scoring vector from a small uniform range instead.
            bound = 1.0 / max(hidden_size, 1) ** 0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Dot product of the decoder state with every encoder output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot product of the decoder state with a linear map of the encoder output."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from tanh(Linear([hidden; encoder_output])) weighted by ``v``."""
        # Repeat the decoder state along the first (time) dimension before concatenating.
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights with an added middle dimension.

        NOTE(review): no padding mask is applied here, so padded encoder positions
        also receive attention weight -- confirm whether that is intended.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

## Constructing the Decoder

In [14]:
class LuongAttnDecoderRNN(nn.Module):
    """Unidirectional GRU decoder with attention, run one time step per call.

    ``forward`` returns softmax probabilities over ``output_size`` classes, not
    logits or log-probabilities.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept on the instance for later inspection
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            input_step: word indices for the current step.
            last_hidden: previous GRU hidden state.
            encoder_outputs: encoder outputs attended over.

        Returns:
            ``(output, hidden)``: softmax distribution over the vocabulary and
            the updated GRU hidden state.
        """
        # Embed the current input words and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        rnn_output, hidden = self.gru(step_embedding, last_hidden)

        # Attention weights from the current GRU output, then the weighted-sum context
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # Luong eq. 5: combine GRU output and context
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(combined))

        # Luong eq. 6: project to the vocabulary and normalise
        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden

## Defining Loss

In [15]:
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood over the unmasked entries of one time step.

    Args:
        inp: per-class probabilities (the decoder applies softmax), one row per
            batch element.
        target: target class index for each row.
        mask: boolean tensor; only rows where it is True contribute to the mean.

    Returns:
        ``(loss, nTotal)``: scalar loss tensor moved to ``device`` and the number
        of unmasked entries as a Python number.
    """
    nTotal = mask.sum()
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    # A probability that underflowed to exactly 0 would give log(0) = -inf and
    # poison the gradients; clamp to the smallest positive normal value first.
    picked = torch.clamp(picked, min=torch.finfo(picked.dtype).tiny)
    crossEntropy = -torch.log(picked)
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

## Training 

In [16]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip,max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch.

    The batch is encoded, then decoded one time step at a time. Whether the
    decoder is fed the ground-truth token (teacher forcing) or its own top
    prediction is decided once per call from the module-level
    ``teacher_forcing_ratio``. ``embedding`` and ``max_length`` are accepted but
    not used inside this function.

    Returns:
        The average loss per unmasked target token for this batch.
    """
    # Zero gradients
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Move the batch to the compute device; lengths for rnn packing stay on the cpu
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    lengths = lengths.to("cpu")

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every sentence starts decoding from the SOS token
    decoder_input = torch.LongTensor([[SOS_token] * batch_size]).to(device)

    # Seed the decoder with the first `decoder.n_layers` slices of the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    weighted_losses = []
    n_totals = 0

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _, best = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[best[i][0] for i in range(batch_size)]]).to(device)

        # Calculate and accumulate loss
        step_loss, step_tokens = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += step_loss
        weighted_losses.append(step_loss.item() * step_tokens)
        n_totals += step_tokens

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    for module in (encoder, decoder):
        nn.utils.clip_grad_norm_(module.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(weighted_losses) / n_totals

## Training Iterations

In [17]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, \
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, \
               batch_size, print_every, save_every, clip, corpus_name, loadFilename, checkpoint=None):
    """Train for up to ``n_iteration`` iterations, printing progress and saving checkpoints.

    Each iteration trains on one batch of ``batch_size`` pairs sampled with
    replacement from ``pairs``. Checkpoints are written every ``save_every``
    iterations to ``<save_dir>/<enc_layers>-<dec_layers>_<hidden_size>/<iteration>_checkpoint.tar``.
    ``model_name`` and ``corpus_name`` are accepted but not used in the path.

    Args:
        loadFilename: truthy when resuming; training then restarts at the
            checkpoint's ``'iteration'`` + 1.
        checkpoint: optional already-loaded checkpoint dict. When omitted while
            ``loadFilename`` is set, the file is loaded here instead of relying on
            a notebook-level ``checkpoint`` variable.
    """
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        if checkpoint is None:
            checkpoint = torch.load(loadFilename, map_location='cpu')
        start_iteration = checkpoint['iteration'] + 1

    # Resuming at or beyond n_iteration used to do nothing without any message.
    if start_iteration > n_iteration:
        print("Nothing to train: resume point {} is already >= n_iteration {}".format(
            start_iteration - 1, n_iteration))
        return

    # Load batches only for the iterations that will actually run
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(start_iteration, n_iteration + 1)]

    # Training loop
    print("Training...")
    for iteration, training_batch in enumerate(training_batches, start=start_iteration):
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            # Take the hidden size from the encoder instead of a notebook global.
            directory = os.path.join(save_dir, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, encoder.hidden_size))
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

## Decoding the input

In [18]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step pick the single most probable next token."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode exactly ``max_length`` tokens for one input sequence.

        Decoding does not stop at EOS; callers filter the returned tokens.

        Returns:
            ``(all_tokens, all_scores)``: the chosen token index and its score
            for each of the ``max_length`` steps.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use this searcher's own decoder rather than a notebook-level `decoder` variable.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

## Evaluation

In [19]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for one already-normalised sentence.

    ``encoder`` and ``decoder`` are accepted but not used here; all decoding
    goes through ``searcher``. A word missing from ``voc`` raises ``KeyError``.

    Returns:
        List of decoded words, one per token returned by ``searcher``.
    """
    # words -> indexes (a batch holding this single sentence)
    token_ids = indexesFromSentence(voc, sentence)
    lengths = torch.tensor([len(token_ids)]).to("cpu")
    # Transpose so the sequence dimension comes first, then move to the device
    input_batch = torch.LongTensor([token_ids]).transpose(0, 1).to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    return [voc.index2word[token.item()] for token in tokens]


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop reading from ``input`` until the user types ``q`` or ``quit``.

    Each line is normalised, decoded with ``evaluate`` and printed with the
    ``EOS`` and ``PAD`` markers removed. A ``KeyError`` (word not in the
    vocabulary) is reported and the loop continues.
    """
    input_sentence = ''
    while True:
        try:
            input_sentence = input('> ')
            # Quit case
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop the special markers before printing the reply
            reply_words = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(reply_words))

        except KeyError:
            print("Error: Encountered unknown word.")

## Training the Model

In [21]:
# Configure models
model_name = 'chatbot'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
# loadFilename = None
# NOTE: trainIters resumes at checkpoint['iteration'] + 1, so resuming from
# checkpoint_iter == n_iteration leaves no iterations to run.
checkpoint_iter = 2000
loadFilename = os.path.join(save_dir,
                           '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                           '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps stored tensors onto the device chosen at the top
    # of the notebook, so a checkpoint written on a GPU machine also loads on CPU.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the in-memory vocabulary state with the one saved in the checkpoint.
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (the same embedding object is passed to encoder and decoder)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [31]:
# Configure training/optimization
clip = 50.0                  # gradient-norm clipping threshold passed to train
teacher_forcing_ratio = 1    # read as a global inside train
learning_rate = 0.0001
decoder_learning_ratio = 5   # decoder learning rate = learning_rate * this factor
n_iteration = 2000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
decoder_lr = learning_rate * decoder_learning_ratio
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)
if loadFilename:
    # Restore the optimizer state saved alongside the model weights
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
# for state in encoder_optimizer.state.values():
#     for k, v in state.items():
#         if isinstance(v, torch.Tensor):
#             state[k] = v.cuda()

# for state in decoder_optimizer.state.values():
#     for k, v in state.items():
#         if isinstance(v, torch.Tensor):
#             state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(
    model_name=model_name,
    voc=voc,
    pairs=pairs,
    encoder=encoder,
    decoder=decoder,
    encoder_optimizer=encoder_optimizer,
    decoder_optimizer=decoder_optimizer,
    embedding=embedding,
    encoder_n_layers=encoder_n_layers,
    decoder_n_layers=decoder_n_layers,
    save_dir=save_dir,
    n_iteration=n_iteration,
    batch_size=batch_size,
    print_every=print_every,
    save_every=save_every,
    clip=clip,
    corpus_name=corpus_name,
    loadFilename=loadFilename,
)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.1%; Average loss: 4.4798
Iteration: 2; Percent complete: 0.1%; Average loss: 4.2618
Iteration: 3; Percent complete: 0.1%; Average loss: 4.2264
Iteration: 4; Percent complete: 0.2%; Average loss: 4.2104
Iteration: 5; Percent complete: 0.2%; Average loss: 4.2381
Iteration: 6; Percent complete: 0.3%; Average loss: 4.3343
Iteration: 7; Percent complete: 0.4%; Average loss: 4.3618
Iteration: 8; Percent complete: 0.4%; Average loss: 4.5357
Iteration: 9; Percent complete: 0.4%; Average loss: 4.1299
Iteration: 10; Percent complete: 0.5%; Average loss: 4.2910
Iteration: 11; Percent complete: 0.5%; Average loss: 4.4975
Iteration: 12; Percent complete: 0.6%; Average loss: 4.1625
Iteration: 13; Percent complete: 0.7%; Average loss: 4.0911
Iteration: 14; Percent complete: 0.7%; Average loss: 4.3982
Iteration: 15; Percent complete: 0.8%; Average loss: 4.4928
Iteration: 16; Percent complete: 0.8%

Iteration: 136; Percent complete: 6.8%; Average loss: 4.2453
Iteration: 137; Percent complete: 6.9%; Average loss: 4.1017
Iteration: 138; Percent complete: 6.9%; Average loss: 4.2810
Iteration: 139; Percent complete: 7.0%; Average loss: 4.2081
Iteration: 140; Percent complete: 7.0%; Average loss: 3.9257
Iteration: 141; Percent complete: 7.0%; Average loss: 4.5348
Iteration: 142; Percent complete: 7.1%; Average loss: 4.4747
Iteration: 143; Percent complete: 7.1%; Average loss: 4.3215
Iteration: 144; Percent complete: 7.2%; Average loss: 4.4755
Iteration: 145; Percent complete: 7.2%; Average loss: 4.4312
Iteration: 146; Percent complete: 7.3%; Average loss: 4.1672
Iteration: 147; Percent complete: 7.3%; Average loss: 4.2180
Iteration: 148; Percent complete: 7.4%; Average loss: 4.4191
Iteration: 149; Percent complete: 7.4%; Average loss: 4.2411
Iteration: 150; Percent complete: 7.5%; Average loss: 4.0908
Iteration: 151; Percent complete: 7.5%; Average loss: 4.1651
Iteration: 152; Percent 

Iteration: 270; Percent complete: 13.5%; Average loss: 4.3519
Iteration: 271; Percent complete: 13.6%; Average loss: 4.0246
Iteration: 272; Percent complete: 13.6%; Average loss: 4.2066
Iteration: 273; Percent complete: 13.7%; Average loss: 4.4465
Iteration: 274; Percent complete: 13.7%; Average loss: 3.9394
Iteration: 275; Percent complete: 13.8%; Average loss: 4.1548
Iteration: 276; Percent complete: 13.8%; Average loss: 4.0825
Iteration: 277; Percent complete: 13.9%; Average loss: 4.2624
Iteration: 278; Percent complete: 13.9%; Average loss: 4.0621
Iteration: 279; Percent complete: 14.0%; Average loss: 4.0148
Iteration: 280; Percent complete: 14.0%; Average loss: 4.4320
Iteration: 281; Percent complete: 14.1%; Average loss: 4.3405
Iteration: 282; Percent complete: 14.1%; Average loss: 4.3782
Iteration: 283; Percent complete: 14.1%; Average loss: 4.3863
Iteration: 284; Percent complete: 14.2%; Average loss: 4.1772
Iteration: 285; Percent complete: 14.2%; Average loss: 3.9867
Iteratio

Iteration: 403; Percent complete: 20.2%; Average loss: 4.2381
Iteration: 404; Percent complete: 20.2%; Average loss: 3.9581
Iteration: 405; Percent complete: 20.2%; Average loss: 4.0873
Iteration: 406; Percent complete: 20.3%; Average loss: 4.1962
Iteration: 407; Percent complete: 20.3%; Average loss: 4.0119
Iteration: 408; Percent complete: 20.4%; Average loss: 4.0777
Iteration: 409; Percent complete: 20.4%; Average loss: 4.3228
Iteration: 410; Percent complete: 20.5%; Average loss: 3.8911
Iteration: 411; Percent complete: 20.5%; Average loss: 4.0813
Iteration: 412; Percent complete: 20.6%; Average loss: 4.1032
Iteration: 413; Percent complete: 20.6%; Average loss: 4.2639
Iteration: 414; Percent complete: 20.7%; Average loss: 4.0751
Iteration: 415; Percent complete: 20.8%; Average loss: 4.1211
Iteration: 416; Percent complete: 20.8%; Average loss: 4.1195
Iteration: 417; Percent complete: 20.8%; Average loss: 3.9938
Iteration: 418; Percent complete: 20.9%; Average loss: 4.0909
Iteratio

Iteration: 536; Percent complete: 26.8%; Average loss: 3.7678
Iteration: 537; Percent complete: 26.9%; Average loss: 3.8424
Iteration: 538; Percent complete: 26.9%; Average loss: 3.9349
Iteration: 539; Percent complete: 27.0%; Average loss: 4.0128
Iteration: 540; Percent complete: 27.0%; Average loss: 4.0422
Iteration: 541; Percent complete: 27.1%; Average loss: 4.0506
Iteration: 542; Percent complete: 27.1%; Average loss: 3.8157
Iteration: 543; Percent complete: 27.2%; Average loss: 3.9204
Iteration: 544; Percent complete: 27.2%; Average loss: 3.9452
Iteration: 545; Percent complete: 27.3%; Average loss: 4.1923
Iteration: 546; Percent complete: 27.3%; Average loss: 3.9119
Iteration: 547; Percent complete: 27.4%; Average loss: 4.0332
Iteration: 548; Percent complete: 27.4%; Average loss: 4.0853
Iteration: 549; Percent complete: 27.5%; Average loss: 4.0957
Iteration: 550; Percent complete: 27.5%; Average loss: 4.0121
Iteration: 551; Percent complete: 27.6%; Average loss: 3.9957
Iteratio

Iteration: 669; Percent complete: 33.5%; Average loss: 3.8048
Iteration: 670; Percent complete: 33.5%; Average loss: 3.8775
Iteration: 671; Percent complete: 33.6%; Average loss: 3.5731
Iteration: 672; Percent complete: 33.6%; Average loss: 3.9433
Iteration: 673; Percent complete: 33.7%; Average loss: 3.4991
Iteration: 674; Percent complete: 33.7%; Average loss: 4.0190
Iteration: 675; Percent complete: 33.8%; Average loss: 3.8289
Iteration: 676; Percent complete: 33.8%; Average loss: 3.8508
Iteration: 677; Percent complete: 33.9%; Average loss: 3.9810
Iteration: 678; Percent complete: 33.9%; Average loss: 3.8901
Iteration: 679; Percent complete: 34.0%; Average loss: 3.8764
Iteration: 680; Percent complete: 34.0%; Average loss: 3.9082
Iteration: 681; Percent complete: 34.1%; Average loss: 3.8376
Iteration: 682; Percent complete: 34.1%; Average loss: 3.9873
Iteration: 683; Percent complete: 34.2%; Average loss: 3.8321
Iteration: 684; Percent complete: 34.2%; Average loss: 3.9124
Iteratio

Iteration: 802; Percent complete: 40.1%; Average loss: 3.6809
Iteration: 803; Percent complete: 40.2%; Average loss: 3.9359
Iteration: 804; Percent complete: 40.2%; Average loss: 3.9225
Iteration: 805; Percent complete: 40.2%; Average loss: 3.7952
Iteration: 806; Percent complete: 40.3%; Average loss: 3.6856
Iteration: 807; Percent complete: 40.4%; Average loss: 4.0228
Iteration: 808; Percent complete: 40.4%; Average loss: 3.6899
Iteration: 809; Percent complete: 40.5%; Average loss: 3.4670
Iteration: 810; Percent complete: 40.5%; Average loss: 3.8100
Iteration: 811; Percent complete: 40.6%; Average loss: 3.9692
Iteration: 812; Percent complete: 40.6%; Average loss: 3.8438
Iteration: 813; Percent complete: 40.6%; Average loss: 3.7076
Iteration: 814; Percent complete: 40.7%; Average loss: 3.6242
Iteration: 815; Percent complete: 40.8%; Average loss: 3.6469
Iteration: 816; Percent complete: 40.8%; Average loss: 3.8415
Iteration: 817; Percent complete: 40.8%; Average loss: 3.7378
Iteratio

Iteration: 935; Percent complete: 46.8%; Average loss: 3.7591
Iteration: 936; Percent complete: 46.8%; Average loss: 3.4290
Iteration: 937; Percent complete: 46.9%; Average loss: 3.9438
Iteration: 938; Percent complete: 46.9%; Average loss: 3.5619
Iteration: 939; Percent complete: 46.9%; Average loss: 3.5771
Iteration: 940; Percent complete: 47.0%; Average loss: 3.7294
Iteration: 941; Percent complete: 47.0%; Average loss: 3.4906
Iteration: 942; Percent complete: 47.1%; Average loss: 3.7626
Iteration: 943; Percent complete: 47.1%; Average loss: 3.7159
Iteration: 944; Percent complete: 47.2%; Average loss: 3.6526
Iteration: 945; Percent complete: 47.2%; Average loss: 3.7189
Iteration: 946; Percent complete: 47.3%; Average loss: 3.4121
Iteration: 947; Percent complete: 47.3%; Average loss: 3.8054
Iteration: 948; Percent complete: 47.4%; Average loss: 3.5385
Iteration: 949; Percent complete: 47.4%; Average loss: 3.6571
Iteration: 950; Percent complete: 47.5%; Average loss: 3.5822
Iteratio

Iteration: 1067; Percent complete: 53.3%; Average loss: 3.5655
Iteration: 1068; Percent complete: 53.4%; Average loss: 3.3520
Iteration: 1069; Percent complete: 53.4%; Average loss: 3.5840
Iteration: 1070; Percent complete: 53.5%; Average loss: 3.6515
Iteration: 1071; Percent complete: 53.5%; Average loss: 3.3106
Iteration: 1072; Percent complete: 53.6%; Average loss: 3.6423
Iteration: 1073; Percent complete: 53.6%; Average loss: 3.4797
Iteration: 1074; Percent complete: 53.7%; Average loss: 3.5460
Iteration: 1075; Percent complete: 53.8%; Average loss: 3.4320
Iteration: 1076; Percent complete: 53.8%; Average loss: 3.5462
Iteration: 1077; Percent complete: 53.8%; Average loss: 3.7294
Iteration: 1078; Percent complete: 53.9%; Average loss: 3.6475
Iteration: 1079; Percent complete: 53.9%; Average loss: 3.5688
Iteration: 1080; Percent complete: 54.0%; Average loss: 3.4711
Iteration: 1081; Percent complete: 54.0%; Average loss: 3.4182
Iteration: 1082; Percent complete: 54.1%; Average loss:

Iteration: 1198; Percent complete: 59.9%; Average loss: 3.4195
Iteration: 1199; Percent complete: 60.0%; Average loss: 3.4045
Iteration: 1200; Percent complete: 60.0%; Average loss: 3.5767
Iteration: 1201; Percent complete: 60.1%; Average loss: 3.4969
Iteration: 1202; Percent complete: 60.1%; Average loss: 3.3548
Iteration: 1203; Percent complete: 60.2%; Average loss: 3.3931
Iteration: 1204; Percent complete: 60.2%; Average loss: 3.5405
Iteration: 1205; Percent complete: 60.2%; Average loss: 3.4405
Iteration: 1206; Percent complete: 60.3%; Average loss: 3.6144
Iteration: 1207; Percent complete: 60.4%; Average loss: 3.7893
Iteration: 1208; Percent complete: 60.4%; Average loss: 3.6923
Iteration: 1209; Percent complete: 60.5%; Average loss: 3.6308
Iteration: 1210; Percent complete: 60.5%; Average loss: 3.3728
Iteration: 1211; Percent complete: 60.6%; Average loss: 3.7880
Iteration: 1212; Percent complete: 60.6%; Average loss: 3.7072
Iteration: 1213; Percent complete: 60.7%; Average loss:

Iteration: 1329; Percent complete: 66.5%; Average loss: 3.5055
Iteration: 1330; Percent complete: 66.5%; Average loss: 3.3843
Iteration: 1331; Percent complete: 66.5%; Average loss: 3.3498
Iteration: 1332; Percent complete: 66.6%; Average loss: 3.4960
Iteration: 1333; Percent complete: 66.6%; Average loss: 3.5489
Iteration: 1334; Percent complete: 66.7%; Average loss: 3.3674
Iteration: 1335; Percent complete: 66.8%; Average loss: 3.2798
Iteration: 1336; Percent complete: 66.8%; Average loss: 3.4196
Iteration: 1337; Percent complete: 66.8%; Average loss: 3.3651
Iteration: 1338; Percent complete: 66.9%; Average loss: 3.2749
Iteration: 1339; Percent complete: 67.0%; Average loss: 3.2918
Iteration: 1340; Percent complete: 67.0%; Average loss: 3.4490
Iteration: 1341; Percent complete: 67.0%; Average loss: 3.3479
Iteration: 1342; Percent complete: 67.1%; Average loss: 3.3637
Iteration: 1343; Percent complete: 67.2%; Average loss: 3.4330
Iteration: 1344; Percent complete: 67.2%; Average loss:

Iteration: 1460; Percent complete: 73.0%; Average loss: 3.2023
Iteration: 1461; Percent complete: 73.0%; Average loss: 3.3747
Iteration: 1462; Percent complete: 73.1%; Average loss: 3.4382
Iteration: 1463; Percent complete: 73.2%; Average loss: 3.4038
Iteration: 1464; Percent complete: 73.2%; Average loss: 3.3913
Iteration: 1465; Percent complete: 73.2%; Average loss: 3.2477
Iteration: 1466; Percent complete: 73.3%; Average loss: 3.4692
Iteration: 1467; Percent complete: 73.4%; Average loss: 3.1872
Iteration: 1468; Percent complete: 73.4%; Average loss: 3.4512
Iteration: 1469; Percent complete: 73.5%; Average loss: 3.1884
Iteration: 1470; Percent complete: 73.5%; Average loss: 3.4354
Iteration: 1471; Percent complete: 73.6%; Average loss: 3.3552
Iteration: 1472; Percent complete: 73.6%; Average loss: 3.2888
Iteration: 1473; Percent complete: 73.7%; Average loss: 3.1584
Iteration: 1474; Percent complete: 73.7%; Average loss: 3.4778
Iteration: 1475; Percent complete: 73.8%; Average loss:

Iteration: 1591; Percent complete: 79.5%; Average loss: 3.0945
Iteration: 1592; Percent complete: 79.6%; Average loss: 3.3010
Iteration: 1593; Percent complete: 79.7%; Average loss: 3.2369
Iteration: 1594; Percent complete: 79.7%; Average loss: 3.2522
Iteration: 1595; Percent complete: 79.8%; Average loss: 3.0250
Iteration: 1596; Percent complete: 79.8%; Average loss: 3.2670
Iteration: 1597; Percent complete: 79.8%; Average loss: 3.3630
Iteration: 1598; Percent complete: 79.9%; Average loss: 3.1411
Iteration: 1599; Percent complete: 80.0%; Average loss: 3.3815
Iteration: 1600; Percent complete: 80.0%; Average loss: 3.2788
Iteration: 1601; Percent complete: 80.0%; Average loss: 3.2306
Iteration: 1602; Percent complete: 80.1%; Average loss: 3.2331
Iteration: 1603; Percent complete: 80.2%; Average loss: 3.4404
Iteration: 1604; Percent complete: 80.2%; Average loss: 3.0967
Iteration: 1605; Percent complete: 80.2%; Average loss: 3.1549
Iteration: 1606; Percent complete: 80.3%; Average loss:

Iteration: 1722; Percent complete: 86.1%; Average loss: 3.0336
Iteration: 1723; Percent complete: 86.2%; Average loss: 3.1759
Iteration: 1724; Percent complete: 86.2%; Average loss: 3.1882
Iteration: 1725; Percent complete: 86.2%; Average loss: 3.0445
Iteration: 1726; Percent complete: 86.3%; Average loss: 3.1577
Iteration: 1727; Percent complete: 86.4%; Average loss: 3.0158
Iteration: 1728; Percent complete: 86.4%; Average loss: 3.2559
Iteration: 1729; Percent complete: 86.5%; Average loss: 3.0137
Iteration: 1730; Percent complete: 86.5%; Average loss: 3.2372
Iteration: 1731; Percent complete: 86.6%; Average loss: 3.1816
Iteration: 1732; Percent complete: 86.6%; Average loss: 3.3401
Iteration: 1733; Percent complete: 86.7%; Average loss: 3.4212
Iteration: 1734; Percent complete: 86.7%; Average loss: 3.2365
Iteration: 1735; Percent complete: 86.8%; Average loss: 3.1021
Iteration: 1736; Percent complete: 86.8%; Average loss: 3.0842
Iteration: 1737; Percent complete: 86.9%; Average loss:

Iteration: 1853; Percent complete: 92.7%; Average loss: 3.1457
Iteration: 1854; Percent complete: 92.7%; Average loss: 3.0669
Iteration: 1855; Percent complete: 92.8%; Average loss: 2.9643
Iteration: 1856; Percent complete: 92.8%; Average loss: 2.9986
Iteration: 1857; Percent complete: 92.8%; Average loss: 3.0160
Iteration: 1858; Percent complete: 92.9%; Average loss: 3.0124
Iteration: 1859; Percent complete: 93.0%; Average loss: 3.1941
Iteration: 1860; Percent complete: 93.0%; Average loss: 3.2718
Iteration: 1861; Percent complete: 93.0%; Average loss: 3.1412
Iteration: 1862; Percent complete: 93.1%; Average loss: 3.1162
Iteration: 1863; Percent complete: 93.2%; Average loss: 3.2365
Iteration: 1864; Percent complete: 93.2%; Average loss: 3.0706
Iteration: 1865; Percent complete: 93.2%; Average loss: 3.0980
Iteration: 1866; Percent complete: 93.3%; Average loss: 3.1219
Iteration: 1867; Percent complete: 93.3%; Average loss: 3.0031
Iteration: 1868; Percent complete: 93.4%; Average loss:

Iteration: 1984; Percent complete: 99.2%; Average loss: 2.8864
Iteration: 1985; Percent complete: 99.2%; Average loss: 2.9628
Iteration: 1986; Percent complete: 99.3%; Average loss: 3.0923
Iteration: 1987; Percent complete: 99.4%; Average loss: 2.9709
Iteration: 1988; Percent complete: 99.4%; Average loss: 3.0161
Iteration: 1989; Percent complete: 99.5%; Average loss: 2.9678
Iteration: 1990; Percent complete: 99.5%; Average loss: 3.0377
Iteration: 1991; Percent complete: 99.6%; Average loss: 3.1103
Iteration: 1992; Percent complete: 99.6%; Average loss: 3.0926
Iteration: 1993; Percent complete: 99.7%; Average loss: 2.8771
Iteration: 1994; Percent complete: 99.7%; Average loss: 2.8842
Iteration: 1995; Percent complete: 99.8%; Average loss: 3.0736
Iteration: 1996; Percent complete: 99.8%; Average loss: 3.2255
Iteration: 1997; Percent complete: 99.9%; Average loss: 2.9569
Iteration: 1998; Percent complete: 99.9%; Average loss: 2.8867
Iteration: 1999; Percent complete: 100.0%; Average loss

In [22]:
# Put both networks into evaluation mode before running inference.
# NOTE(review): presumably these are torch nn.Module instances, in which case
# .eval() disables dropout — confirm against the cells that build them.
encoder.eval()
decoder.eval()

# Initialize the search module from the encoder and decoder
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting: this call is live (not commented out), so running the cell
# starts the interactive session whose "> " / "Bot:" transcript appears in the
# cell output.
evaluateInput(encoder, decoder, searcher, voc)

> what are you
Bot: the link below is a good place to start good luck
> How to handle baby
Bot: just be yourself and just do not understand the question to do with it and then take the flow
> how do house chores get divided in your family
Bot: i do not know but i do not know why i do not understand the question
> how do you feel about the show cheaters
Bot: i do not know but i do not know what the answer is
> nice 
Bot: the same way you do it yourself you ll be a nice boy
> how to love your partner
Bot: you need to know what about sex is to be
> how to express love
Bot: you can find someone else to love you in love with you about yourself and love the rest will be love
> how to hookup
Bot: just go to a clinic and buy a new one like you like of a day
> am i a good boy
Bot: a boy boy is a lot of us is a lot you can do better really
> who is avinash
Error: Encountered unknown word.
> who is zohaib
Error: Encountered unknown word.
> who is sanjay
Bot: the link is to get a new one in the co