# LSTM Bot

## Project Overview

In this project, you will build a chatbot that can converse with you at the command line. The chatbot will use a Sequence to Sequence text generation architecture with an LSTM as its memory unit. You will also learn to use pretrained word embeddings to improve the performance of the model. At the conclusion of the project, you will be able to show your chatbot to potential employers.

Additionally, you have the option to use pretrained word embeddings in your model. The code below loads pretrained GloVe Twitter embeddings (`glove-twitter-100`) through Gensim's downloader. You can compare the performance of your model with pre-trained embeddings against a model without the embeddings.



---



A sequence to sequence model (Seq2Seq) has two components:
- An Encoder consisting of an embedding layer and LSTM unit.
- A Decoder consisting of an embedding layer, LSTM unit, and linear output unit.

The Seq2Seq model works by accepting an input into the Encoder, passing the hidden state from the Encoder to the Decoder, which the Decoder uses to output a series of token predictions.

## Dependencies

- Pytorch
- Numpy
- Pandas
- NLTK
- Gzip
- Gensim


Please choose a dataset from the Torchtext website. We recommend looking at the Squad dataset first. Here is a link to the website where you can view your options:

- https://pytorch.org/text/stable/datasets.html





In [1]:
# !pip install torchdata==0.3.0

In [2]:
import gensim
import gensim.downloader
import math
import nltk
import numpy as np
import pandas as pd
import gzip
import re
import random
import torch
import torch.nn as nn
import torch.nn.utils.rnn
import torch.utils.data
import requests
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import torchtext.datasets

In [3]:
# Flags to avoid repeat work
# Each flag gates one preparation cell below: True redoes that step and
# overwrites its saved file, False loads the file saved by an earlier run.

get_embeddings = False  # download the pretrained embeddings instead of loading the saved .kv file
get_input_data = False  # rebuild the raw question/answer dataframe instead of loading its pickle
augment_embed_data = False  # rebuild the embeddings with sos/eos entries instead of loading them
preprocess_input_data = False  # re-tokenize, prune and split the data instead of loading the results

In [4]:
# Global constants
embedding_name = 'glove-twitter-100'  # gensim-downloader model name; also the stem of the saved .kv files
validation_frac = 0.2  # trailing fraction of the preprocessed rows held out for validation
sosToken = 'soseq'  # start-of-sequence marker placed in front of every question and answer
eosToken = 'eoseq'  # end-of-sequence marker placed after every question and answer
loader_qty = 1 # DataLoader num_workers value
layer_count = 1  # LSTM layer count for the Seq2Seq constructor call (currently commented out below)
hidden_unit_dim = 256  # requested hidden size; Encoder/Decoder raise it to the embedding width if smaller
rep_int = 1000 # Samples per status printout
val_int = 5000 # Samples per validation (with printout); divided by the batch size in train_model

In [5]:
# Pre-trained embeddings
# Either download the vectors named by embedding_name and save them beside the
# notebook, or reload the copy saved by an earlier run.

if get_embeddings:

    base_embeddings = gensim.downloader.load(embedding_name)
    base_embeddings.save(embedding_name+'.kv')

else:

    base_embeddings = KeyedVectors.load(embedding_name+'.kv')

In [6]:
# Raw question/answer text
# Build one row per answer from the SQuAD1 training split, or reload the rows
# pickled by an earlier run.
if get_input_data:

    train_squad, dev_squad = torchtext.datasets.SQuAD1()

    # NOTE(review): dP[0], dP[1], dP[2] are presumably (context, question,
    # answers) as yielded by torchtext's SQuAD1 — confirm against its docs.
    # Both strings are wrapped in the boundary tokens so the markers pass
    # through tokenization like ordinary words.
    base_data = []
    for dP in train_squad:
        question_text = " ".join([sosToken,dP[0],dP[1],eosToken])
        for dpAns in dP[2]:
            base_data.append((question_text," ".join([sosToken,dpAns,eosToken])))

    qa_df = pd.DataFrame(base_data,columns = ['qTxt','aTxt'])
    qa_df.to_pickle("rawQuestAnsData.pkl")

else:
    qa_df = pd.read_pickle("rawQuestAnsData.pkl")

In [7]:
# Add sequence boundary tokens, make all keywords lowercase, rebuild and save keyedVectors
# Every vector grows by two components reserved for the boundary tokens:
# original words are zero there, while sosToken/eosToken are one-hot there and
# zero in all of the original components.

if augment_embed_data:

    base_keys = base_embeddings.index_to_key

    # Prepare numpy array to hold new embedding matrix with sosToken and eosToken added as one-hot
    aug_words = []
    aug_embed = np.zeros((len(base_keys)+2,len(base_embeddings[0])+2))
    for i, key in enumerate(base_keys):
        aug_words.append(key.lower())
        aug_embed[i,:-2] = base_embeddings[key]

    # Add sosToken and eosToken as the last two rows
    aug_words.append(sosToken)
    aug_embed[-2,-2:] = np.array([1,0])
    aug_words.append(eosToken)
    aug_embed[-1,-2:] = np.array([0,1])

    # Create new KeyedVectors instance with the extra dimensions for sos, eos.
    # It is created empty, the same way the pruned vocabulary is built later:
    # NOTE(review): in gensim 4 add_vectors appears to append after any rows
    # pre-allocated through the count argument, which would leave that many
    # unused rows in front of the real vectors — confirm for the installed version.
    aug_kv = KeyedVectors(aug_embed.shape[1])
    aug_kv.add_vectors(aug_words,aug_embed)

    # aug_kv.unit_normalize_all()

    # Save
    aug_kv.save(embedding_name+'-aug.kv')

else:
    aug_kv = KeyedVectors.load(embedding_name+'-aug.kv')

In [8]:
# Preprocessing functions

# Pre-Reqs
# Download the NLTK 'punkt' package (the captured output below shows it is
# left alone when already up to date). Presumably required by word_tokenize,
# which prepare_text calls — confirm against the NLTK documentation.
nltk.download('punkt')

# Remove tokens from list not present in embedding
def scrubTokens(inTokenList,emb_kv):
    """Return the tokens of inTokenList that emb_kv has an index for.

    Order and duplicates are preserved; tokens for which
    emb_kv.has_index_for(token) is falsy are dropped.
    """
    kept = []
    for candidate in inTokenList:
        if emb_kv.has_index_for(candidate):
            kept.append(candidate)
    return kept

# Tokenization
def prepare_text(sentence,emb_kv):
    """Tokenize sentence with word_tokenize and keep only tokens known to emb_kv.

    Returns the list produced by scrubTokens for the tokenized sentence.
    """
    raw_tokens = word_tokenize(sentence)
    return scrubTokens(raw_tokens,emb_kv)

# Prepend to token list
def token_prepend(inList,preItem):
    """Return a new list holding preItem followed by the items of inList.

    inList itself is not modified.
    """
    head = [preItem]
    return head + inList

# Append to token list
def token_append(inList,postItem):
    """Return a new list holding the items of inList followed by postItem.

    inList itself is not modified.
    """
    tail = [postItem]
    return inList + tail

# Transform list of tokens to their indices in embedding
def tokensToIndices(tokens,emb_kv):
    """Map every token to emb_kv.get_index(token), preserving order.

    Whatever get_index raises for a token it cannot resolve propagates to the
    caller.
    """
    return list(map(emb_kv.get_index, tokens))
    

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def repeat_ans(token_list,repeat_times):
    """Return a list with the interior of token_list repeated repeat_times times.

    The first and last tokens each appear once, at the two ends of the result;
    the tokens strictly between them are tiled repeat_times times in order.
    A one-element input yields that element twice, and an empty input raises
    IndexError.
    """
    return [token_list[0], *(repeat_times * token_list[1:-1]), token_list[-1]]

In [10]:
# Tokenize strings
# Remove tokens that are not in the embedding
# Perform training and validation split
# Prepare new KeyedVectors that does not have tokens absent from training data

if preprocess_input_data:

    proc_qa_df = qa_df.copy()
    proc_qa_df['qTxtClean'] = proc_qa_df['qTxt'].str.lower().apply(prepare_text, emb_kv = aug_kv)
    proc_qa_df['aTxtClean'] = proc_qa_df['aTxt'].str.lower().apply(prepare_text, emb_kv = aug_kv)

    # Keep only answers with more than two surviving tokens, i.e. at least one
    # token besides the two boundary markers. The copy gives the filtered frame
    # its own data so the column assignments below do not write through a
    # slice of the unfiltered frame.
    proc_qa_df['aTxtLen'] = proc_qa_df['aTxtClean'].apply(len)
    proc_qa_df = proc_qa_df[proc_qa_df['aTxtLen']>2].copy()

    print("Cleaned Text")

    pr_kv = KeyedVectors(aug_kv[sosToken].size)

    # Collect each distinct token once, in first-seen order, with its vector
    keyDict = {}
    for series in [proc_qa_df['qTxtClean'],proc_qa_df['aTxtClean']]:
        for tokenList in series:
            for token in tokenList:
                if token not in keyDict:
                    keyDict[token] = aug_kv[token]

    keyList = list(keyDict)
    valList = list(keyDict.values())

    pr_kv.add_vectors(keyList,valList)

    print("Pruned vocabulary")

    # Add dataframe columns with tokens by their numerical indices
    proc_qa_df['qIdxs'] = proc_qa_df['qTxtClean'].apply(tokensToIndices, emb_kv = pr_kv)
    proc_qa_df['aIdxs'] = proc_qa_df['aTxtClean'].apply(tokensToIndices, emb_kv = pr_kv)

    print("Converted to indices")

    # Save tokenized dataframes with training and validation split.
    # The split is positional: the leading rows train, the trailing
    # validation_frac of the rows validate.
    training_size = math.floor((1-validation_frac)*proc_qa_df.shape[0])
    train_qa_df = proc_qa_df[:training_size]
    train_qa_df.to_pickle('tokenizedTrainingData.pkl')
    validation_qa_df = proc_qa_df[training_size:]
    validation_qa_df.to_pickle('tokenizedValidationData.pkl')
    pr_kv.save(embedding_name+'-prn.kv')
    print("Saved preprocessed data")

else:

    train_qa_df = pd.read_pickle('tokenizedTrainingData.pkl')
    validation_qa_df = pd.read_pickle('tokenizedValidationData.pkl')
    pr_kv = KeyedVectors.load(embedding_name+'-prn.kv')

In [11]:
# Dataset object

class qaWithContextDataset(torch.utils.data.Dataset):
    """Dataset over a dataframe with index-encoded 'qIdxs' and 'aIdxs' columns.

    Each item is a (question, answer) pair of tensors built from the lists
    stored in the row at the requested position.
    """

    def __init__(self, questionAndAnswer_df):
        # Only a reference is kept; rows are converted to tensors on access.
        self.qa_df = questionAndAnswer_df
        # Series.count() is the number of non-null 'qIdxs' entries.
        self.length = questionAndAnswer_df['qIdxs'].count()

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # idx is positional (.iat), independent of the dataframe's index labels
        question = torch.tensor(self.qa_df['qIdxs'].iat[idx])
        answer = torch.tensor(self.qa_df['aIdxs'].iat[idx])
        return (question, answer)
    
# Batch collation - pad each side of the batch into one tensor and report the unpadded lengths
# Result layout: ( (questionTensor,questionLengthList), (answerTensor,answerLengthList) )
def collate_qa_samples(batch):
    """Collate (question, answer) tensor pairs into padded int64 batch tensors.

    Samples are ordered by decreasing question length; samples with equal
    question lengths keep their incoming order, and the caller's list is left
    untouched. Shorter sequences are right-padded with the value 1. The two
    length lists hold the unpadded lengths in the same order as the tensor rows.
    """

    ordered = sorted(batch, key = lambda pair: pair[0].size()[0], reverse = True)

    questionLengths = [pair[0].size()[0] for pair in ordered]
    answerLengths = [pair[1].size()[0] for pair in ordered]

    # Start from all-ones tensors so every position past a sequence's end
    # already holds the padding value.
    questions = torch.ones([len(questionLengths),max(questionLengths)], dtype=torch.int64)
    answers = torch.ones([len(answerLengths),max(answerLengths)], dtype=torch.int64)

    # Overwrite the leading part of each row with the real sequence
    for row, pair in enumerate(ordered):
        questions[row,:questionLengths[row]] = pair[0]
        answers[row,:answerLengths[row]] = pair[1]

    return ( (questions,questionLengths) , (answers,answerLengths) )

In [12]:
# Build the training and validation datasets from the leading rows of each split.
# Only the first 5000 rows (by position) of each are used to keep training time manageable.
train_dataset = qaWithContextDataset(train_qa_df.iloc[:5000])
validation_dataset = qaWithContextDataset(validation_qa_df.iloc[:5000])

In [13]:
# Dataloader factory, so the batch size can differ between training sessions
def make_dataloader(d_set,batch_qty):
    """Return a shuffling DataLoader over d_set that yields batches of batch_qty samples.

    Batches are assembled by collate_qa_samples, a final short batch is
    dropped, and loader_qty worker processes are kept alive between epochs.
    """
    loader_options = dict(
        shuffle = True,
        num_workers = loader_qty,
        collate_fn = collate_qa_samples,
        drop_last = True,
        persistent_workers = True,
    )
    return torch.utils.data.DataLoader(d_set, batch_qty, **loader_options)

In [14]:
class Encoder(nn.Module):
    """Question encoder: an embedding layer followed by an LSTM.

    The embedding width and the LSTM hidden size are both the larger of
    hidden_size and the pretrained embedding width. The leading columns of the
    embedding matrix are initialised from pretrained_embed and any extra
    columns from a standard normal distribution.
    """

    def __init__(self, hidden_size, layer_qty, pretrained_embed):
        """
        Inputs: hidden_size, requested hidden size (raised to the pretrained width if smaller)
                layer_qty, number of stacked LSTM layers
                pretrained_embed, float tensor of shape [vocab_size, pretrained_dim]
        """

        super(Encoder, self).__init__()

        vocab_size = pretrained_embed.size(dim=0)
        pretrained_dim = pretrained_embed.size(dim=1)
        adjusted_hidden = max(hidden_size, pretrained_dim)

        # self.embedding provides a vector representation of the inputs to our model
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = adjusted_hidden)

        # Load the pretrained vectors into the leading columns; columns beyond
        # the pretrained width keep their random initialisation.
        init_weights = torch.randn(vocab_size,adjusted_hidden)
        init_weights[:,:pretrained_dim] = pretrained_embed
        self.embedding.weight = torch.nn.Parameter(init_weights)

        # self.lstm accepts the vectorized input and passes a hidden state.
        # Dropout is only requested for stacked LSTMs.
        self.lstm = nn.LSTM(
            adjusted_hidden,
            adjusted_hidden,
            num_layers = layer_qty,
            batch_first = True,
            dropout = 0.3 if layer_qty > 1 else 0.0,
        )

    def forward(self, i):

        '''
        Inputs: i, the src tuple (questions, questionLengths) for batch;
                questions has shape [batch_size, sequence_length]
        Outputs: o, the packed LSTM outputs, in decreasing-question-length order
                 h, the hidden state, in the original batch order
                 c, the cell state, in the original batch order
        '''

        questions, question_lengths = i[0], i[1]

        # Permutation that orders the questions by decreasing length. The sort
        # is stable, so a batch that is already sorted keeps its order.
        sorted_idx_list = sorted(
            range(len(question_lengths)),
            key = lambda row: question_lengths[row],
            reverse = True)
        sorted_lengths = [int(question_lengths[row]) for row in sorted_idx_list]

        sorted_idx_tensor = questions.new_tensor(sorted_idx_list,dtype=torch.int64)
        decreasing_length_questions = torch.index_select(questions,0,sorted_idx_tensor)

        # Shape of embed_rslt is [batch_size,sequence_length,embedding_dim]
        embed_rslt = self.embedding(decreasing_length_questions)

        # Pack with the lengths in the SAME order as the permuted questions.
        # Passing the caller's unsorted lengths here would either be rejected
        # by enforce_sorted or pair each row with another row's length.
        packed_questions = torch.nn.utils.rnn.pack_padded_sequence(
            embed_rslt,sorted_lengths,batch_first=True,enforce_sorted=True)

        o,(h,c) = self.lstm(packed_questions)

        # Undo the permutation on the states so questions and answers remain
        # aligned. The encoder output o is left in sorted order.
        inverted_sort_list = [0] * len(sorted_idx_list)
        for sorted_pos, original_pos in enumerate(sorted_idx_list):
            inverted_sort_list[original_pos] = sorted_pos
        inverted_sort_tensor = questions.new_tensor(inverted_sort_list,dtype=torch.int64)
        h = torch.index_select(h,1,inverted_sort_tensor)
        c = torch.index_select(c,1,inverted_sort_tensor)

        # h.shape == c.shape == [num_layers, batch_size, hidden_size]
        return o, (h, c)
    

class Decoder(nn.Module):
    """Answer decoder: embedding -> LSTM -> linear layer -> log-softmax over the vocabulary.

    The embedding width and the LSTM hidden size are both the larger of
    hidden_size and the pretrained embedding width, matching what Encoder
    computes from the same arguments.
    """

    def __init__(self, hidden_size, layer_qty, pretrained_embed):
        """
        Inputs: hidden_size, requested hidden size (raised to the pretrained width if smaller)
                layer_qty, number of stacked LSTM layers
                pretrained_embed, float tensor of shape [vocab_size, pretrained_dim]
        """

        super(Decoder, self).__init__()

        vocab_size = pretrained_embed.size(dim=0)
        pretrained_dim = pretrained_embed.size(dim=1)
        adjusted_hidden = max(hidden_size, pretrained_dim)

        # self.embedding provides a vector representation of the inputs to our model
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = adjusted_hidden)

        # Load the pretrained vectors into the leading columns; columns beyond
        # the pretrained width keep their random initialisation.
        init_weights = torch.randn(vocab_size,adjusted_hidden)
        init_weights[:,:pretrained_dim] = pretrained_embed
        self.embedding.weight = torch.nn.Parameter(init_weights)

        # Output dimension used to construct outputs
        self.out_dim = vocab_size

        # self.lstm accepts the embeddings and outputs a hidden state.
        # Dropout is only requested for stacked LSTMs.
        self.lstm = nn.LSTM(
            adjusted_hidden,
            adjusted_hidden,
            num_layers = layer_qty,
            batch_first = True,
            dropout = 0.3 if layer_qty > 1 else 0.0,
        )

        # self.output predicts on the LSTM output with a linear layer
        self.output = nn.Linear(adjusted_hidden,vocab_size)
        self.lsftmx = nn.LogSoftmax(dim=2)

    def forward(self, i, enc_state, teach_freq):

        '''
        Inputs: i, the target tuple (answers, answerLengths) for batch
                enc_state, the (h, c) pair from the encoder, in batch order
                teach_freq, probability that this call uses teacher forcing
        Outputs: o, log-probabilities of shape
                 [batch_size, max(answerLengths)-1, vocab_size], in batch order.
                 In the non-teaching branch, positions past a sequence's last
                 prediction are filled with ones.
        '''

        # One draw decides the mode for the whole batch
        are_teaching = random.random() < teach_freq

        answers, answer_lengths = i[0], i[1]

        # Each answer contributes one decoder step fewer than its length: the
        # eos token is never fed in, it is only used in the loss calculation.
        dec_lengths = [int(length) - 1 for length in answer_lengths]

        # Permutation that orders the answers by decreasing length (stable)
        sorted_idx_list = sorted(
            range(len(dec_lengths)),
            key = lambda row: dec_lengths[row],
            reverse = True)
        sorted_dec_lengths = [dec_lengths[row] for row in sorted_idx_list]
        max_answer_length = sorted_dec_lengths[0]

        # Sort the answer padded tensor by decreasing answer length
        sorted_idx_tensor = answers.new_tensor(sorted_idx_list,dtype=torch.int64)
        decreasing_length_answers = torch.index_select(answers,0,sorted_idx_tensor)

        # Encoder states need to be permuted in same manner as answers (decreasing answer length)
        h_enc_decreasing_ans_length = torch.index_select(enc_state[0],1,sorted_idx_tensor)
        c_enc_decreasing_ans_length = torch.index_select(enc_state[1],1,sorted_idx_tensor)

        # Teaching:
        #  -Can pack answer sequence and run full sequence with single lstm call
        # Not Teaching:
        #  -Step one token at a time, feeding previous prediction tokens as input.
        #  -Don't run sequences through decoder beyond their labeled answer length
        if are_teaching:

            embed_rslt = self.embedding(decreasing_length_answers[:,:-1]) # Do not pass eos token
            packed_answers = torch.nn.utils.rnn.pack_padded_sequence(
                embed_rslt,sorted_dec_lengths,batch_first=True,enforce_sorted=True)
            teach_pred_out,(h_decode,c_decode) = self.lstm(
                packed_answers,
                (h_enc_decreasing_ans_length,c_enc_decreasing_ans_length))
            unpacked_ans, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(teach_pred_out,batch_first=True)

            # Pass through linear layer
            out_preds = self.output(unpacked_ans)
            out_preds = self.lsftmx(out_preds)

        else:

            prev_h = h_enc_decreasing_ans_length
            prev_c = c_enc_decreasing_ans_length

            # feed in sos token on first iteration to prime
            prev_o = torch.unsqueeze(self.embedding(decreasing_length_answers[:,0]),1)

            # Accumulator list and alias
            out_pred_list = []
            total_seq = len(sorted_dec_lengths)

            for step in range(max_answer_length):

                # Number of sequences that still have a prediction to make at
                # this step. Rows are sorted by decreasing length, so these
                # are always the leading rows.
                seq_left = sum(1 for ans_len in sorted_dec_lengths if step < ans_len)

                ltsm_out, (prev_h,prev_c) = self.lstm(
                    prev_o[:seq_left,:,:].contiguous(),
                    (prev_h[:,:seq_left,:].contiguous(),prev_c[:,:seq_left,:].contiguous()))
                net_out = self.output(ltsm_out)
                net_out = self.lsftmx(net_out)

                # Greedy choice becomes the next step's input
                prev_o = self.embedding(torch.argmax(net_out,dim=2))

                # Pad the finished sequences' rows with ones (same dtype and
                # device as the predictions) so every step has batch_size rows
                out_pred_list.append(
                    torch.cat([net_out,net_out.new_ones([total_seq-seq_left,1,self.out_dim])], dim = 0))

            # Concatenate all the sequence outputs from every sequence step
            out_preds = torch.cat(out_pred_list,dim=1)

        # Reorder predictions to original order for loss computation
        inverted_sort_list = [0] * len(sorted_idx_list)
        for sorted_pos, original_pos in enumerate(sorted_idx_list):
            inverted_sort_list[original_pos] = sorted_pos
        o = torch.index_select(out_preds,0,answers.new_tensor(inverted_sort_list,dtype=torch.int64))

        return o

class Seq2Seq(nn.Module):
    """Encoder/decoder pair sharing one pretrained embedding matrix as initialisation."""

    def __init__(self, hidden_size, layer_qty, pretrained_kv):
        """
        Inputs: hidden_size, requested LSTM hidden size
                layer_qty, number of stacked LSTM layers
                pretrained_kv, object whose .vectors array holds the pretrained embeddings
        """

        super(Seq2Seq, self).__init__()

        # Convert keyedvector's numpy array to tensor
        pretrained_embed = torch.tensor(pretrained_kv.vectors,dtype=torch.float32)

        self.seq2seqEncoder = Encoder(hidden_size, layer_qty, pretrained_embed)
        self.seq2seqDecoder = Decoder(hidden_size, layer_qty, pretrained_embed)

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Encode src, then decode trg from the encoder's final (h, c) state.

        Inputs: src, the (questions, questionLengths) tuple
                trg, the (answers, answerLengths) tuple
                teacher_forcing_ratio, forwarded to the decoder as teach_freq
        Outputs: the decoder's predictions
        """

        # Call the sub-modules rather than their .forward methods so that any
        # registered hooks run; the encoder's output sequence is not needed.
        _, enc_state = self.seq2seqEncoder(src)
        o = self.seq2seqDecoder(trg, enc_state, teacher_forcing_ratio)

        return o

    # Pass tensor to embedding
    def embed_tensor(self,inTensor):
        """Return the encoder-side embedding vectors for the indices in inTensor."""
        return self.seq2seqEncoder.embedding(inTensor)

In [15]:
# Initialize network object

# seqToseqNet = Seq2Seq(hidden_unit_dim,layer_count,pr_kv)

In [16]:
# Function to compute loss for variable length sequence data
# net_output = (batch size, max sequence length, vocab dimension)
# target_seqs = (batch size, max sequence length) indices in vocabulary space
# target_lengths = list of lengths for each target in batch
# loss_criterion = Loss which can be computed on pair of 1d tensors
def computeMaskedLoss(net_output,target_seqs,target_lengths,loss_criterion):
    """Return loss_criterion over the predictions, with padded target positions set to -1.

    The leading (sos) column of target_seqs is skipped, so row r contributes
    target_lengths[r]-1 real targets; every later position in that row is
    replaced by -1 so that a criterion built with ignore_index=-1 skips it.
    target_seqs itself is left unmodified.
    """

    preds = torch.flatten(net_output,start_dim=0,end_dim=1)

    # Clone before masking: the slice is a view, and writing -1 through it
    # would overwrite the caller's target tensor.
    targets = target_seqs[:,1:].clone() # Skip sos token
    for row, length in enumerate(target_lengths):
        targets[row,max(length-1,0):] = -1

    targets = torch.flatten(targets,start_dim=0,end_dim=1)
    loss = loss_criterion(preds,targets)

    return loss

In [17]:
# Training routine
def train_model(net, train_dset, val_dset,ses_lrn = 0.01,ses_tea = 0.5,ses_epochs = 1,ses_batch_size = 16):
    """Train net for ses_epochs epochs, validating on one batch at a fixed interval.

    Inputs: net, the model; called as net(inputs, labels, teacher_forcing_ratio=...)
            train_dset / val_dset, datasets accepted by make_dataloader
            ses_lrn, Adam learning rate for this session
            ses_tea, teacher forcing ratio used for training batches
            ses_epochs, number of passes over train_dset
            ses_batch_size, batch size for both loaders

    A status line is printed about every rep_int training samples and one
    validation batch is evaluated (without teacher forcing) about every val_int
    training samples. Whenever that validation loss is the lowest seen during
    THIS call, the whole model is saved to "Min-Validation-Loss-Model-512.pt";
    the running minimum starts over on every call, so a later session can
    overwrite a better checkpoint from an earlier one.
    NOTE(review): a later cell loads "Min-Validation-Loss-Model.pt", which this
    function never writes — confirm which file name is intended.
    """

    train_loader = make_dataloader(train_dset,ses_batch_size)
    val_loader = make_dataloader(val_dset,ses_batch_size)
    report_interval = rep_int // ses_batch_size
    validation_interval = val_int // ses_batch_size

    least_validation_loss = float("inf")
    report_interval_counter = 0
    validation_interval_counter = 0
    val_iter = iter(val_loader)

    gpu_avail = torch.cuda.is_available()

    if (gpu_avail):
        net.cuda()

    loss_criterion = nn.NLLLoss(ignore_index=-1) # Batches padded with -1's

    optimizer = torch.optim.Adam(net.parameters(),lr=ses_lrn)

    for epoch in range(ses_epochs):

        net.train()

        for i, train_data in enumerate(train_loader):

            train_inputs, train_labels = train_data

            # Only the padded tensors move to the GPU; the length lists stay on the host
            if (gpu_avail):
                train_inputs = (train_inputs[0].cuda(), train_inputs[1])
                train_labels = (train_labels[0].cuda(), train_labels[1])

            # Zero out the gradients of the optimizer
            optimizer.zero_grad()

            # Get model outputs
            train_outputs = net(train_inputs,train_labels,teacher_forcing_ratio=ses_tea)

            # Compute loss
            train_loss = computeMaskedLoss(train_outputs,train_labels[0],train_labels[1],loss_criterion)

            # Compute the loss gradient using the backward method and have the optimizer take a step
            train_loss.backward()
            optimizer.step()

            # Report status if is time
            report_interval_counter += 1
            if report_interval_counter >= report_interval:
                report_interval_counter = 0
                print(f"{i+1} batches of epoch {epoch+1} completed.  Last Training Loss: {train_loss: .6f}")

            # Perform validation run if is time
            validation_interval_counter += 1
            if validation_interval_counter >= validation_interval:
                validation_interval_counter = 0
                net.eval()

                # Get validation batch and evaluate model
                # Need try block to handle iterator terminating. See
                # https://github.com/pytorch/pytorch/issues/1917#issuecomment-433698337
                try:
                    val_inputs, val_labels = next(val_iter)
                except StopIteration:
                    val_iter = iter(val_loader)
                    val_inputs, val_labels = next(val_iter)

                if (gpu_avail):
                    val_inputs = (val_inputs[0].cuda(), val_inputs[1])
                    val_labels = (val_labels[0].cuda(), val_labels[1])

                # No gradients are needed for validation, so no autograd graph
                # is built, and the loss is kept as a plain float so the best
                # value does not hold on to tensors between validations.
                with torch.no_grad():
                    # Evaluate validation batch outputs against labels
                    val_outputs = net(val_inputs, val_labels,teacher_forcing_ratio=0)

                    # Compute loss
                    val_loss = computeMaskedLoss(
                        val_outputs,val_labels[0],val_labels[1],loss_criterion).item()

                # Update min val loss
                if val_loss < least_validation_loss:
                    least_validation_loss = val_loss
                    print("Saving model . . .")
                    torch.save(net,"Min-Validation-Loss-Model-512.pt")

                # Report
                print(f"Last Validation Loss: {val_loss: .6f}, Lowest Validation Loss: {least_validation_loss: .6f}")

                # Cleanup after validation
                net.train()

In [18]:
# First training session: learning rate 0.001, teacher forcing on every batch.
# NOTE(review): seqToseqNet must already exist; its constructor call above is commented out.
train_model(
    seqToseqNet,
    train_dataset,
    validation_dataset,
    ses_lrn = 0.001,
    ses_tea = 1.0,
    ses_epochs = 20,
    ses_batch_size = 128,
)

7 batches of epoch 1 completed.  Last Training Loss:  9.962809
14 batches of epoch 1 completed.  Last Training Loss:  7.303122
21 batches of epoch 1 completed.  Last Training Loss:  7.047356
28 batches of epoch 1 completed.  Last Training Loss:  6.844414
35 batches of epoch 1 completed.  Last Training Loss:  6.639485
Saving model . . .
Last Validation Loss:  7.582885, Lowest Validation Loss:  7.582885
3 batches of epoch 2 completed.  Last Training Loss:  5.755796
10 batches of epoch 2 completed.  Last Training Loss:  5.765522
17 batches of epoch 2 completed.  Last Training Loss:  5.990032
24 batches of epoch 2 completed.  Last Training Loss:  6.128414
31 batches of epoch 2 completed.  Last Training Loss:  6.042053
38 batches of epoch 2 completed.  Last Training Loss:  5.783677
Last Validation Loss:  7.887328, Lowest Validation Loss:  7.582885
6 batches of epoch 3 completed.  Last Training Loss:  5.536327
13 batches of epoch 3 completed.  Last Training Loss:  5.603015
20 batches of epoc

In [21]:
# Snapshot the model as it stands after the first session
torch.save(seqToseqNet,'afterRunOne.pt')
# Second training session: learning rate lowered to 0.0001, teacher forcing still on every batch.
train_model(
    seqToseqNet,
    train_dataset,
    validation_dataset,
    ses_lrn = 0.0001,
    ses_tea = 1.0,
    ses_epochs = 20,
    ses_batch_size = 128,
)

7 batches of epoch 1 completed.  Last Training Loss:  3.234920
14 batches of epoch 1 completed.  Last Training Loss:  3.202101
21 batches of epoch 1 completed.  Last Training Loss:  3.233509
28 batches of epoch 1 completed.  Last Training Loss:  3.348517
35 batches of epoch 1 completed.  Last Training Loss:  3.252787
Saving model . . .
Last Validation Loss:  9.689758, Lowest Validation Loss:  9.689758
3 batches of epoch 2 completed.  Last Training Loss:  3.150636
10 batches of epoch 2 completed.  Last Training Loss:  3.329682
17 batches of epoch 2 completed.  Last Training Loss:  3.153099
24 batches of epoch 2 completed.  Last Training Loss:  3.233053
31 batches of epoch 2 completed.  Last Training Loss:  3.143806
38 batches of epoch 2 completed.  Last Training Loss:  3.119638
Last Validation Loss:  9.699584, Lowest Validation Loss:  9.689758
6 batches of epoch 3 completed.  Last Training Loss:  3.147513
13 batches of epoch 3 completed.  Last Training Loss:  3.060886
20 batches of epoc

In [22]:
# Snapshot the model as it stands after the second session
torch.save(seqToseqNet,'afterRunTwo.pt')
# Third training session: learning rate raised to 0.0005, teacher forcing still on every batch.
train_model(
    seqToseqNet,
    train_dataset,
    validation_dataset,
    ses_lrn = 0.0005,
    ses_tea = 1.0,
    ses_epochs = 20,
    ses_batch_size = 128,
)

7 batches of epoch 1 completed.  Last Training Loss:  2.844584
14 batches of epoch 1 completed.  Last Training Loss:  2.900691
21 batches of epoch 1 completed.  Last Training Loss:  3.041736
28 batches of epoch 1 completed.  Last Training Loss:  2.972529
35 batches of epoch 1 completed.  Last Training Loss:  3.015214
Saving model . . .
Last Validation Loss:  10.372571, Lowest Validation Loss:  10.372571
3 batches of epoch 2 completed.  Last Training Loss:  2.777930
10 batches of epoch 2 completed.  Last Training Loss:  2.867497
17 batches of epoch 2 completed.  Last Training Loss:  2.907537
24 batches of epoch 2 completed.  Last Training Loss:  2.807493
31 batches of epoch 2 completed.  Last Training Loss:  2.879120
38 batches of epoch 2 completed.  Last Training Loss:  2.900671
Last Validation Loss:  10.401089, Lowest Validation Loss:  10.372571
6 batches of epoch 3 completed.  Last Training Loss:  2.755357
13 batches of epoch 3 completed.  Last Training Loss:  2.728160
20 batches of 

In [18]:
# torch.save(seqToseqNet,'afterRunThree.pt')
# Fourth training session: learning rate stays at 0.0005, teacher forcing on every batch.
# A CUDA memory error forced a reload from the saved model; the batch size is reduced to 96.
seqToseqNet = torch.load('afterRunThree.pt')
train_model(
    seqToseqNet,
    train_dataset,
    validation_dataset,
    ses_lrn = 0.0005,
    ses_tea = 1.0,
    ses_epochs = 20,
    ses_batch_size = 96,
)

10 batches of epoch 1 completed.  Last Training Loss:  1.754943
20 batches of epoch 1 completed.  Last Training Loss:  1.693097
30 batches of epoch 1 completed.  Last Training Loss:  1.807624
40 batches of epoch 1 completed.  Last Training Loss:  1.780202
50 batches of epoch 1 completed.  Last Training Loss:  1.713104
Saving model . . .
Last Validation Loss:  13.993649, Lowest Validation Loss:  13.993649
8 batches of epoch 2 completed.  Last Training Loss:  1.529749
18 batches of epoch 2 completed.  Last Training Loss:  1.568932
28 batches of epoch 2 completed.  Last Training Loss:  1.645175
38 batches of epoch 2 completed.  Last Training Loss:  1.647937
48 batches of epoch 2 completed.  Last Training Loss:  1.673804
Saving model . . .
Last Validation Loss:  12.735620, Lowest Validation Loss:  12.735620
6 batches of epoch 3 completed.  Last Training Loss:  1.575722
16 batches of epoch 3 completed.  Last Training Loss:  1.355341
26 batches of epoch 3 completed.  Last Training Loss:  1.5

In [19]:
# Snapshot the model as it stands after the fourth session
torch.save(seqToseqNet,'afterRunFour.pt')
# Fifth training session: the teacher-forced training loss is low enough that the model
# is expected to make reasonable one-token-ahead predictions, so teacher forcing is turned off.

train_model(
    seqToseqNet,
    train_dataset,
    validation_dataset,
    ses_lrn = 0.0005,
    ses_tea = 0.0,
    ses_epochs = 20,
    ses_batch_size = 96,
)

10 batches of epoch 1 completed.  Last Training Loss:  5.544827
20 batches of epoch 1 completed.  Last Training Loss:  5.635997
30 batches of epoch 1 completed.  Last Training Loss:  4.969920
40 batches of epoch 1 completed.  Last Training Loss:  5.720712
50 batches of epoch 1 completed.  Last Training Loss:  5.105931
Saving model . . .
Last Validation Loss:  9.903163, Lowest Validation Loss:  9.903163
8 batches of epoch 2 completed.  Last Training Loss:  4.501165
18 batches of epoch 2 completed.  Last Training Loss:  3.681868
28 batches of epoch 2 completed.  Last Training Loss:  4.268641
38 batches of epoch 2 completed.  Last Training Loss:  4.115623
48 batches of epoch 2 completed.  Last Training Loss:  3.652003
Last Validation Loss:  10.084297, Lowest Validation Loss:  9.903163
6 batches of epoch 3 completed.  Last Training Loss:  3.594330
16 batches of epoch 3 completed.  Last Training Loss:  3.061910
26 batches of epoch 3 completed.  Last Training Loss:  3.777947
36 batches of ep

In [22]:
# Snapshot the model as it stands after the fifth session
torch.save(seqToseqNet,'afterRunFive.pt')
# Sixth training session: the model does not seem to be generalizing, so it trains without
# teacher forcing on the next 5000 training rows instead of the first 5000.
train_dataset_2 = qaWithContextDataset(train_qa_df[5000:10000])

train_model(
    seqToseqNet,
    train_dataset_2,
    validation_dataset,
    ses_lrn = 0.0005,
    ses_tea = 0.0,
    ses_epochs = 20,
    ses_batch_size = 96,
)

10 batches of epoch 1 completed.  Last Training Loss:  9.115084
20 batches of epoch 1 completed.  Last Training Loss:  7.586810
30 batches of epoch 1 completed.  Last Training Loss:  7.292883
40 batches of epoch 1 completed.  Last Training Loss:  6.997441
50 batches of epoch 1 completed.  Last Training Loss:  6.683289
Saving model . . .
Last Validation Loss:  7.563903, Lowest Validation Loss:  7.563903
8 batches of epoch 2 completed.  Last Training Loss:  6.366477
18 batches of epoch 2 completed.  Last Training Loss:  6.256734
28 batches of epoch 2 completed.  Last Training Loss:  6.255616
38 batches of epoch 2 completed.  Last Training Loss:  5.929092
48 batches of epoch 2 completed.  Last Training Loss:  5.870816
Saving model . . .
Last Validation Loss:  7.369114, Lowest Validation Loss:  7.369114
6 batches of epoch 3 completed.  Last Training Loss:  5.714532
16 batches of epoch 3 completed.  Last Training Loss:  5.622810
26 batches of epoch 3 completed.  Last Training Loss:  5.62249

In [26]:
# Checkpoint the network as it stands after the sixth run.
torch.save(seqToseqNet, 'afterRunSix.pt')

# Report how many questions the training frame holds in total.
print(f"How many questions available: {train_qa_df['qIdxs'].count()}")

# Training run 7.
# Run six gave some validation improvement but still tended to overfit, so:
#   1. discard the overfit weights and restart from the saved network with the
#      lowest validation loss, and
#   2. train (without teaching) on a new batch of 10000 examples, the slice
#      train_qa_df[10000:20000].
seqToseqNet = torch.load("Min-Validation-Loss-Model.pt")

train_dataset_3 = qaWithContextDataset(train_qa_df[10000:20000])

train_model(
    seqToseqNet,
    train_dataset_3,
    validation_dataset,
    ses_lrn=0.0005,
    ses_tea=0.0,
    ses_epochs=20,
    ses_batch_size=96,
)

How many questions available: 59317
10 batches of epoch 1 completed.  Last Training Loss:  6.715934
20 batches of epoch 1 completed.  Last Training Loss:  6.702463
30 batches of epoch 1 completed.  Last Training Loss:  6.370927
40 batches of epoch 1 completed.  Last Training Loss:  6.442051
50 batches of epoch 1 completed.  Last Training Loss:  6.280188
Saving model . . .
Last Validation Loss:  7.047170, Lowest Validation Loss:  7.047170
60 batches of epoch 1 completed.  Last Training Loss:  6.330679
70 batches of epoch 1 completed.  Last Training Loss:  6.321343
80 batches of epoch 1 completed.  Last Training Loss:  6.341338
90 batches of epoch 1 completed.  Last Training Loss:  5.837990
100 batches of epoch 1 completed.  Last Training Loss:  5.876680
Saving model . . .
Last Validation Loss:  6.840302, Lowest Validation Loss:  6.840302
6 batches of epoch 2 completed.  Last Training Loss:  5.641878
16 batches of epoch 2 completed.  Last Training Loss:  5.732736
26 batches of epoch 2 co

In [18]:
# Checkpoint after the seventh run (currently disabled).
# torch.save(seqToseqNet,'afterRunSeven.pt')

# Go back to lowest validation network due to overfitting on last run
# (currently disabled).
# seqToseqNet = torch.load("Min-Validation-Loss-Model.pt") 

# Build a dataset over all questions in train_qa_df, so that a run can use every
# question for fewer epochs in an attempt to get the best generalization.
# This is the only live statement in the cell; train_dataset_4 is consumed by the
# larger-model training cell that follows.
train_dataset_4 = qaWithContextDataset(train_qa_df)

# Full-dataset training run on the existing network (currently disabled).
# Fewer epochs since run will be longer.
# train_model(seqToseqNet, \
#            train_dataset_4,validation_dataset, \
#            ses_lrn = 0.0005, ses_tea = 0.0, ses_epochs = 10, ses_batch_size = 96)

In [19]:
# Training run with a freshly constructed, larger model (first constructor
# argument 512); the validation output file name was also changed.
# Rationale: the training code is believed to be right, because earlier runs
# could drive the training loss very low, but the model capacity was not there.
# NOTE(review): the original note said the loss could be worked "below zero";
# presumably "below one" was meant -- confirm.
seqToseqNet = Seq2Seq(512, layer_count, pr_kv)
train_model(
    seqToseqNet,
    train_dataset_4,
    validation_dataset,
    ses_lrn=0.01,
    ses_tea=0.5,
    ses_epochs=5,
    ses_batch_size=96,
)

10 batches of epoch 1 completed.  Last Training Loss:  9.761504
20 batches of epoch 1 completed.  Last Training Loss:  7.624062
30 batches of epoch 1 completed.  Last Training Loss:  8.439412
40 batches of epoch 1 completed.  Last Training Loss:  6.849150
50 batches of epoch 1 completed.  Last Training Loss:  8.349634
Saving model . . .
Last Validation Loss:  8.515495, Lowest Validation Loss:  8.515495
60 batches of epoch 1 completed.  Last Training Loss:  6.973281
70 batches of epoch 1 completed.  Last Training Loss:  6.728694
80 batches of epoch 1 completed.  Last Training Loss:  7.642511
90 batches of epoch 1 completed.  Last Training Loss:  7.599713
100 batches of epoch 1 completed.  Last Training Loss:  7.068876
Saving model . . .
Last Validation Loss:  7.582142, Lowest Validation Loss:  7.582142
110 batches of epoch 1 completed.  Last Training Loss:  6.967543
120 batches of epoch 1 completed.  Last Training Loss:  7.328267
130 batches of epoch 1 completed.  Last Training Loss:  6

In [20]:
# Return inference given model, keyvectors vocabulary, prompt, maximum response length
def return_inference(model,emb_kv,prompt,maxLen):
    """Generate the model's reply to ``prompt`` as a space-separated string.

    The prompt is terminated with ``eosToken``, converted to vocabulary
    indices, and passed through ``model`` with ``teacher_forcing_ratio=0``.
    At each output position the highest-scoring vocabulary entry is taken as
    the next word; generation stops after ``eosToken`` is emitted or after
    every output position has been read.

    Args:
        model: Sequence-to-sequence network.  It is moved to the CPU (as in
            the original implementation) and is called as
            ``model((inputs, input_lengths), (targets, target_lengths),
            teacher_forcing_ratio=0)``.
        emb_kv: Keyed-vectors vocabulary providing ``get_index`` and
            ``index_to_key``.
        prompt: Raw prompt text.
        maxLen: Length of the placeholder target sequence handed to the model.

    Returns:
        The generated words, each preceded by a single space (so the string
        starts with a space).  The end-of-sequence token is included when the
        model produces it.

    Side effects:
        ``model`` is left on the CPU.  Its train/eval mode is restored to
        whatever it was on entry.
    """
    # Bounded prompt: mark where the question ends
    bounded_prompt = prompt + ' ' + eosToken

    # Convert prompt to a batch of one index sequence plus its length
    prompt_indices = tokensToIndices(prepare_text(bounded_prompt, emb_kv), emb_kv)
    prompt_model_input = torch.tensor(prompt_indices, dtype=torch.int64).unsqueeze(dim=0)
    prompt_question_lengths = torch.tensor([prompt_model_input.size(dim=1)], dtype=torch.int64)

    # Prepare unused target for forward method, except to feed in soseq token at start
    unused_target = emb_kv.get_index(sosToken)*torch.ones(maxLen,dtype=torch.int64).unsqueeze(dim=0)
    unused_lengths = torch.tensor([maxLen],dtype=torch.int64)

    # Inference should not depend on the mode the caller left the model in:
    # evaluate in eval mode (so train-only layers such as dropout are inactive)
    # and without building an autograd graph, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model.to("cpu")((prompt_model_input, prompt_question_lengths),
                                     (unused_target, unused_lengths),
                                     teacher_forcing_ratio=0)
    finally:
        model.train(was_training)

    # Convert per-position scores to words, stopping once eosToken is emitted
    words = []
    for position_scores in output[0]:
        next_word = emb_kv.index_to_key[int(torch.argmax(position_scores))]
        words.append(next_word)
        if next_word == eosToken:
            break

    # Each word is preceded by a space, matching the original output format
    return "".join(" " + word for word in words)

In [21]:
# Load the saved network with the lowest validation loss and switch it to
# evaluation mode for inference.
lowest_val_model = torch.load("Min-Validation-Loss-Model.pt")
lowest_val_model.eval()

# Sample prompts to probe the model.
prompt_one = "What was the size of the notre dame endowment when theodore hesburgh became president?"
prompt_two = "Tigers are nice but badgers are mean.  Which of these animals is kind?"
prompt_three = "Some good foods for breakfast are pancakes, waffles, and french toast.  " + \
               "Some good foods for dinner are chicken, soup, and pizza.  Would it be " + \
               "better to serve pizza or pancakes tonight?"

# Query the checkpoint loaded above.  The original passed seqToseqNet here, which
# left lowest_val_model loaded and put in eval mode but never used.
for prompt in [prompt_one, prompt_two, prompt_three]:
    print(return_inference(lowest_val_model,pr_kv,prompt,40))
    

 the eoseq
 the eoseq
 the eoseq
