In [1]:
#import
#torch lib
import torch
#neural network
import torch.nn as nn
#optimizer ex. momentum,SGD,RMSprop,Adagrad,Adadelta,Adam
from torch import optim
#loss function ex. softmax *** Activation func?
import torch.nn.functional as F
#others
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
#check cuda is available or not
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [3]:
#processing datasets
#dataset download from http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
#datasets file path
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [4]:
#Visualize some lines
with open(lines_filepath, "r", encoding="iso-8859-1") as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [5]:
#Split each line of the file into a dictionary of fields (lineID, characterID, movieID, character, text)
line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, "r", encoding="iso-8859-1") as f:
    for line in f:
        values = line.split(" +++$+++ ")
        #Extract fields
        lineObj = {}
        if len(values)==5:
            for i, field in enumerate(line_fields):
                lineObj[field] = values[i]
            lines[lineObj["lineID"]] = lineObj

In [6]:
list(lines.items())[0]

('L1045',
 {'lineID': 'L1045',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'They do not!\n'})

In [7]:
conv_fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
conversations = []
with open(conv_filepath, "r", encoding="iso-8859-1") as f:
    for line in f:
        values = line.split(" +++$+++ ")
        #Extract fields
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] = values[i]
        #convert string type array data to real array
        lineIds = eval(convObj["utteranceIDs"])
        #query lines's data then append to convObj["lines"]
        convObj["lines"] = []
        for lineId in lineIds:
            convObj["lines"].append(lines[lineId])
        conversations.append(convObj)
#Done for extract and combine conversation data to an array...

In [8]:
conversations[0]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'lineID': 'L194',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'lineID': 'L195',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'lineID': 'L196',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'lineID': 'L197',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [9]:
#Extract conversation Q&A
qa_pairs = []
for conversation in conversations:
    #Iterate over all the lines of the conversation
    for i in range(len(conversation["lines"]) - 1):
        inputLine = conversation["lines"][i]["text"].strip()
        targetLine = conversation["lines"][i+1]["text"].strip()
        #Filter wrong samples (if one of the lists is empty)
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])

In [10]:
qa_pairs[:4]
len(qa_pairs)

221282

In [11]:
#Save array to a csv file
#Define path to new file
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
delimiter = '\t'
#Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
#Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, "w", encoding="utf-8") as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing to file")


Writing newly formatted file...
Done writing to file


In [12]:
#Visualize some lines
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
with open(datafile, "rb") as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

In [13]:
#processing word
PAD_token = 0 #Used for padding short sentences
SOS_token = 1 #Start-of-sentence token <START>
EOS_token = 2 #End-of-sentence token <END>

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count SOS, EOS, PAD
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
    #let the word will not repeat to many times
    #Remove words below a certain count threshold
    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print("keep_words {} / {} = {:.4f}".format(len(keep_words), len(self.word2index), len(keep_words)/len(self.word2index)))
        #Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count SOS, EOS, PAD
        
        for word in keep_words:
            self.addWord(word)
#End of processing word

In [14]:
#processing text
#Convert unicode to ascii code
#NFD=normal form decomposed, Mn=non-marking space
def unicodeToAscii(s):
    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")

In [15]:
#Test **unicodeToAscii** function
unicodeToAscii("Montreal,Francoise...")

'Montreal,Francoise...'

In [16]:
#Lowercase, trim white space, lines...etc., and remove non-letter characters
def normalizeString(s):
    #to lowercase and strip string
    s = unicodeToAscii(s.lower().strip())
    #replace "!" -> " !"
    s = re.sub(r"([.!?])",r" \1", s)
    #remove any characters that is not a sequence of lower or upper case letters
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    #remove a sequence of whitespace characters
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [17]:
#Test **normalizeString** function
normalizeString("aa123aa!s's     dd?")

'aa aa !s s dd ?'

In [18]:
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
#Read the file and split into lines
print("Reading and processing file... Please wait")
lines = open(datafile, encoding="utf-8").read().strip().split('\n')
#Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Done Reading txt file into array!")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file... Please wait
Done Reading txt file into array!


In [19]:
#Return True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10 #Maximum sentence length to consider
def filterPair(p):
    #Input sequences need to preserve the last word for EOS token
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

#Filter pairs using **filterPair** condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [20]:
pairs = [pair for pair in pairs if len(pair) > 1]
print("There are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/converations".format(len(pairs)))
#End for processing text

There are 221282 pairs/conversations in the dataset
After filtering, there are 64271 pairs/converations


In [21]:
#Getting rid of rare words
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words: ", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted words:  18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [22]:
MIN_COUNT = 3 #
def trimRareWords(voc, pairs, MIN_COUNT):
    #Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    #Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        #Check input sentence
        for word in input_sentence.split(" "):
            if word not in voc.word2index:
                keep_input = False
                break
        #Check output sentence
        for word in output_sentence.split(" "):
            if word not in voc.word2index:
                keep_output = False
                break
        #Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))
    return keep_pairs

#Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)
#End for processing datasets

keep_words 7823 / 18005 = 0.4345
Trimmed from pairs to 64271, 53165.0000 of total


In [23]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [24]:
#Test the function
indexesFromSentence(voc, pairs[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [25]:
pairs[1][0]

'you have my word . as a gentleman'

In [26]:
#Define some samples for testing
inp = []
out = []
i = 0
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

In [27]:
def zeroPadding(l, fillvalue = 0):
    # * is used for transpose matrix
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [28]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [29]:
#Test the function
test_result = zeroPadding(indexes)
print(len(test_result)) #The max length is now the number of rows
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [30]:
def binaryMatrix(l, value = 0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [31]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [32]:
#Returns padded input sequence tensor and as well as a tensor of lengths for each of the sequence in the batch
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [33]:
#Return padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [34]:
#Return all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    #Sort the questions in descending length
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    #assert len(inp) == lengths[0]
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len
#Done for pre-processing data

In [35]:
#Example fot validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:")
print(input_variable)
print("lengths: ", lengths)
print("target_variable:")
print(target_variable)
print("mask:")
print(mask)
print("mask_target_len: ", max_target_len)

input_variable:
tensor([[   7,   25,   25,   65,   50],
        [  73,   94,  296,   14,    6],
        [ 380, 1262,    7,  187,    2],
        [1418,   76,    4,    6,    0],
        [   6,    2,    2,    2,    0],
        [   2,    0,    0,    0,    0]])
lengths:  tensor([6, 5, 5, 5, 3])
target_variable:
tensor([[  59,    7,  190,  318,   68],
        [  83,   94,   51,   65,    7],
        [ 158, 1262,   98,   92,  236],
        [ 111,   76,   12,    4,   50],
        [   4,    6,  180,    2,  101],
        [   2,    2, 2730,    0,  215],
        [   0,    0, 4780,    0,    6],
        [   0,    0,   23,    0,    2],
        [   0,    0,    6,    0,    0],
        [   0,    0,    2,    0,    0]])
mask:
tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True, False,  True],
        [False,

In [36]:
#ENCODER
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
    
    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [37]:
#Attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)
    def forward(self, hidden, encoder_outputs):
        attn_energies = self.dot_score(hidden, encoder_outputs)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [38]:
#DECODER
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        #Defne layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers ==1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.attn = Attn(attn_model, hidden_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        outout = F.softmax(output, dim=1)
        return output, hidden

In [39]:
def maskNLLLoss(decoder_out, target, mask):
    nTotal = mask.sum()
    target = target.view(-1, 1)
    gathered_tensor = torch.gather(decoder_out, 1, target)
    crossEntropy = -torch.log(gathered_tensor)
    loss = crossEntropy.masked_select(mask)
    loss = loss.mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [42]:
small_batch_size = 5
batched = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, length, target_variable, mask, max_target_len = batches

print("input_variable shape: ", input_variable.shape)
print("lengths shape: ", lengths.shape)
print("target_variable shape: ", target_variable.shape)
print("mask shape: ", mask.shape)
print("max_target_len: ", max_target_len)

#Define the parameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

##Define the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
#Ensure dropout layers are in train mode
encoder.train()
decoder.train()

#Initialize optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("Encoder Outputs Shape: ", encoder_outputs.shape)
print("Last Encoder Hidden Shape: ", encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_token for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("Initial Decoder Input Shape: ", decoder_input.shape)
print(decoder_input)

#Set initial decoder hidden state to the encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("Initial Decoder hidden statr shape: ", decoder_hidden.shape)
print("\n")
print("--------------------------------------------------")
print("Now Let's look what's happening in every timestep of the GRU!")
print("--------------------------------------------------")
print("\n")

#Assume we are using Teacher Forcing
for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    print("Decoder Output Shape: ", decoder_output.shape)
    print("Decoder Hidden Shape: ", decoder_hidden.shape)
    #Teacher forcing: next input is current target
    decoder_input = target_variable[t].view(1, -1)
    print("The target variable at the current timestep before reshaping: ", target_variable[t])
    print("The target variable at the current timestep shape before reshaping: ", target_variable[t].shape)
    print("The Decoder input shape (reshape the target variable): ", decoder_input.shape)
    #Calculate and accumulate loss
    print("The mask at the current timestep: ", mask[t])
    print("The mask at the curremt timestep shape: ", mask[t].shape)
    mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    print("Mask Loss: ", mask_loss)
    print("Total: ", nTotal)
    loss += mask_loss
    print_losses.append(mask_loss.item() * nTotal)
    print(print_losses)
    n_totals += nTotal
    encoder_optimizer.step()
    decoder_optimizer.step()
    returned_loss = sum(print_losses) / n_totals
    print("Returned Loss: ", returned_loss)
    print("\n")
    print("--------------------DONE ONE TIMESTEP--------------------")
    print("\n")

input_variable shape:  torch.Size([6, 5])
lengths shape:  torch.Size([5])
target_variable shape:  torch.Size([10, 5])
mask shape:  torch.Size([10, 5])
max_target_len:  10
Encoder Outputs Shape:  torch.Size([6, 5, 500])
Last Encoder Hidden Shape:  torch.Size([4, 5, 500])
Initial Decoder Input Shape:  torch.Size([1, 5])
tensor([[1, 1, 1, 1, 1]], device='cuda:0')
Initial Decoder hidden statr shape:  torch.Size([2, 5, 500])


--------------------------------------------------
Now Let's look what's happening in every timestep of the GRU!
--------------------------------------------------


Decoder Output Shape:  torch.Size([5, 7826])
Decoder Hidden Shape:  torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping:  tensor([ 59,   7, 190, 318,  68], device='cuda:0')
The target variable at the current timestep shape before reshaping:  torch.Size([5])
The Decoder input shape (reshape the target variable):  torch.Size([1, 5])
The mask at the current timestep:  tensor(