## Name: Yusuf Elnady
## Project: Chatbot - RNN

In [1]:
import re
import csv
import unicodedata
import codecs
import random
import torch
import itertools
from torch import nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Reading From Files

In [2]:
# filename="/projects/3e080ba6-1476-4b72-953b-1b591cbf600a/Notebooks/Yusuf/cornell movie-dialogs corpus/movie_lines.txt"
filename="movie_lines.txt"

with open(filename, 'r', encoding='iso-8859-1') as datafile:
     lines = datafile.readlines()

all_text_and_fields = {}

for line in lines:
    temp_dict = {}
    temp_list = line.split(' +++$+++ ')
    all_text_and_fields[temp_list[0]] ={'lineID':temp_list[0], 'characterID':temp_list[1],
                                       'movieID':temp_list[2], 'characterName':temp_list[3],
                                       'text':temp_list[4]}
 

In [3]:
# filename2="/projects/3e080ba6-1476-4b72-953b-1b591cbf600a/Notebooks/Yusuf/cornell movie-dialogs corpus/movie_conversations.txt"
filename2="movie_conversations.txt"
with open(filename2, 'r', encoding='iso-8859-1') as datafile:
     lines = datafile.readlines()
        
        
conversations = []
pattern = re.compile('L[0-9]+')
for line in lines:
    temp_list = line.split(' +++$+++ ')
    temp_dict = {'Character1ID':temp_list[0], 'Character2ID':temp_list[1],
                  'movieID':temp_list[2], 'lineIDs':temp_list[3]}
    
    my_line_ids = pattern.findall(temp_dict['lineIDs'])
    my_lines_text = []
    for line_id in my_line_ids:
        my_lines_text.append(all_text_and_fields[line_id]['text'])
    
    temp_dict['lines'] = my_lines_text
    conversations.append(temp_dict)
    

In [4]:
conversations[0]

{'Character1ID': 'u0',
 'Character2ID': 'u2',
 'movieID': 'm0',
 'lineIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': ['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n',
  "Well, I thought we'd start with pronunciation, if that's okay with you.\n",
  'Not the hacking and gagging and spitting part.  Please.\n',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"]}

## Building QA Pairs

In [5]:
def make_qa_pairs(conversations):

    qa_pairs = []

    for conversation in conversations:
        for i in range(0,len(conversation['lines'])-1,2):
            q = conversation['lines'][i].strip()
            a = conversation['lines'][i+1].strip()

            if q and a:
                qa_pairs.append([q,a])
                
    return qa_pairs

In [6]:
qa_pairs = make_qa_pairs(conversations)

## Saving clear Version of qa_pairs

In [7]:
datafile = 'qa_pairs.txt'
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

with open(datafile , 'w',encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in qa_pairs:
        writer.writerow(pair)

In [8]:
datafile = 'qa_pairs.txt'
with open(datafile,"r") as outputfile:  # Had to add encoding to split the list
    lines=outputfile.readlines()


In [9]:
lines[0]

"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"

## Normalizing and Adding The Vocabulary

In [10]:
PAD_TOKEN = 0 
SOS_TOKEN = 1
EOS_TOKEN = 2
class Voc:
    def __init__(self,name):
        self.name = name
        self.word2index = {'PAD':0,'SOS':1,'EOS':2 }
        self.index2word = {0:'PAD', 1:'SOS', 2:'EOS'}
        self.num_words = 3
        self.word2count = {'PAD':1,'SOS':1,'EOS':1}
    def add_sentence(self,sentence): # sentence is just a long string, and I will split it into many strings
        for word in sentence.split(' '):
            self.add_word(word.strip())
        
    def add_word(self,word):
        if word in self.word2index:
            self.word2count[word]+=1
        elif word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1

In [11]:
# https://stackoverflow.com/a/518232/2809427
def unicode2ascii(s): # Turn a Unicode string to plain ASCII
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_word(word):
    
    contractions_dict = { "ain't": "are not", "'s":" is", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "that'd": "that would", "that'd've": "that would have", "there'd": "there would", "there'd've": "there would have", "they'd": "they would", "they'd've": "they would have","they'll": "they will",
 "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not","what'll": "what will", "what'll've": "what will have", "what're": "what are", "what've": "what have", "when've": "when have", "where'd": "where did", "where've": "where have",
 "who'll": "who will", "who'll've": "who will have", "who've": "who have", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
    
    word=unicode2ascii(word.lower().strip())
    
    for key in contractions_dict.keys():
        if key in word:
            word=word.replace(key,contractions_dict[key])
    
    word=re.sub(r"([.!,?'])",r" \1 ",word)
    word=re.sub(r"[^a-zA-Z.!,?']+",r" ",word)
    word=re.sub(r"\s+",r" ",word).strip()
    
    return word

def normalize_sentence(sentence):
    normalized_sentence = ''
    for word in sentence.split(' '):
        normalized_sentence += normalize_word(word)+' '
    return normalized_sentence

In [12]:
MAX_SENTENCE_LENGTH = 10  # Maximum sentence length to consider

In [73]:
voc = Voc('My Vocabulary')

In [86]:
#This function takes the qa_pairs, and normalize everyword, and then adds the words to the vocabulary class
def build_vocabulary(voc, qa_pairs):
    keep_pairs = [ ]
    qa_pairs = [[normalize_sentence(sentence) for sentence in pair] for pair in qa_pairs]
    for pair in qa_pairs:
        #roxaane wasn't in the vocabulary, because the sentence length was larger than MAX_SENTENCE_LENGTH
        if (len(pair[0].split(' '))<=MAX_SENTENCE_LENGTH and len(pair[1].split(' '))<=MAX_SENTENCE_LENGTH):
            voc.add_sentence(pair[0])
            voc.add_sentence(pair[1])
            keep_pairs.append(pair)
    return voc, keep_pairs

In [111]:
voc, normalized_qa_pairs = build_vocabulary(voc, qa_pairs)

In [112]:
"Values without deleting anything" ,len(voc.index2word.keys()), voc.num_words

('Values without deleting anything', 14199, 14199)

## Trim Words

In [113]:
# Problem I had --> When we trim words, we should change the index of the other words to be in the range of the num_vocabulary

# Which means if Vocab is 20, then trimmed to be 10, so no word should have index [11-20]

In [114]:
def trim_words(voc):
    words_kept = []
    trimmed_words = []
    for word,count in voc.word2count.items():
        if count >= WORD_FREQ_THRESHOLD:
            words_kept.append(word)
        elif count<WORD_FREQ_THRESHOLD:
            trimmed_words.append(word)
                
     # Reinitialize dictionaries
    voc.word2index = {}
    voc.word2count = {}
    voc.index2word = {PAD_TOKEN: "PAD", SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
    voc.num_words = 3 # Count default tokens

    for word in words_kept:
        voc.add_word(word)           
    return voc, trimmed_words

In [115]:
voc, trimmed_words = trim_words(voc)

In [116]:
def remove_trimmed_pairs(qa_pairs,trimmed_words):
    keep_pairs = []
    for pair in qa_pairs:
        flag_q = True
        flag_a = True
        
        question = pair[0]
        answer = pair[1]
        
        for word in question.split(' '):
            if word in trimmed_words:
                flag_q = False 
                break 
    
        if flag_q == True:
            for word in answer.split(' '):
                if word in trimmed_words:
                    flag_a = False 
                    break 
    
        if flag_q and flag_a :
            keep_pairs.append(pair)
            
    return keep_pairs 

In [117]:
final_qa_pairs = remove_trimmed_pairs(normalized_qa_pairs,trimmed_words)

In [118]:
final_qa_pairs

[['there . ', 'where ? '],
 ['you have my word .  as a gentleman ', 'you are sweet . '],
 ['hi . ', 'looks like things worked out tonight , huh ? '],
 ['have fun tonight ? ', 'tons '],
 ['well , no . . . ', 'then that is all you had to say . '],
 ['but ', 'you always been this selfish ? '],
 ['do you listen to this crap ? ', 'what crap ? '],
 ['wow ', 'let is go . '],
 ['she okay ? ', 'i hope so . '],
 ['they do to ! ', 'they do not ! '],
 ['did you change your hair ? ', 'no . '],
 ['it is more ', 'expensive ? '],
 ['where have you been ? ', 'nowhere . . . hi , daddy . '],
 ['in th .  for a month ', 'why ? '],
 ['he was , like , a total babe ', 'but you hate joey '],
 ['you looked beautiful last night , you know . ', 'so did you '],
 ['let go ! ', 'you set me up . '],
 ['but she does not want to date . ', 'exactly my point '],
 ['daddy , i  ', "and where ' re you going ? "],
 ['oh , god .  it is starting . ', 'it is just a party . daddy . '],
 ['you the new guy ? ', 'so they tell me . 

## Vectorization

In [119]:
# Convert words of the sentence to numbers --> This our vectorization
def sentence2vector(voc, sentence):
    vector = [voc.word2index[word] for word in sentence.split(' ')] + [EOS_TOKEN]
    return vector 

def zero_padding(vectors, fillvalue=PAD_TOKEN):
    return torch.LongTensor(list(itertools.zip_longest(*vectors, fillvalue=fillvalue))).t()

def create_mask_matrix(lengths,max_length): # 1 means Value, 0 means Padding
    mask_matrix = []
    for length in lengths :
        mask = sum([[0]*length,[1]*(max_length-length)],[])
        mask_matrix.append(mask)
    return  torch.BoolTensor(mask_matrix)

In [120]:
def questions2vectors(voc, questions):
    
    #Convert sentence to vector using words indexed
    question_vectors = [sentence2vector(voc, sentence) for sentence in questions]
    
    #Get Length of each sentence
    questions_lengths = torch.tensor([len(indexes) for indexes in question_vectors])
    
    #max_length can be 11 as we added EOS_TOKEN
    max_length = max(questions_lengths) 
    
    #Create mask matrix based on the lengths array
    questions_mask = create_mask_matrix(questions_lengths, max_length)
    
    #Pad the sentences to the maximum length with PAD_TOKEN
    question_vectors_padded = zero_padding(question_vectors)
    
    return question_vectors_padded, questions_mask, questions_lengths

In [121]:
def answers2vectors(voc, answers):
    
    #Convert sentence to vector using words indexed
    answers_vectors = [sentence2vector(voc, sentence) for sentence in answers]
    #Get Length of each sentence
    answers_lengths = torch.tensor([len(indexes) for indexes in answers_vectors])
    
    #max_length can be 11 as we added EOS_TOKEN
    max_length = max(answers_lengths) 
    
    #Create mask matrix based on the lengths array
    answers_mask = create_mask_matrix(questions_lengths, max_length)
    
    #Pad the sentences to the maximum length with PAD_TOKEN
    answers_vectors_padded = zero_padding(answers_vectors)
    
    return answers_vectors_padded, answers_mask, answers_lengths

In [122]:
questions = [pair[0] for pair in final_qa_pairs]
question_vectors_padded, questions_mask, questions_lengths  = questions2vectors(voc,questions)
print()
answers = [pair[1] for pair in final_qa_pairs]
answers_vectors_padded, answers_mask, answers_lengths  = answers2vectors(voc,answers)




## Dataset and Dataloader

In [123]:
# In Input, we don't need the mask, as we will use the lengths function in pack_padded_sequence to determine in paddings
# In Output, we need the mask function, as it will be used in the loss (maskNLLLoss)
batch_size = 64
dataset = torch.utils.data.TensorDataset(question_vectors_padded, questions_lengths , answers_vectors_padded, answers_mask,answers_lengths)
train_dataloader  = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)
# max_target_len = max(answer_lengths) ---> Will be needed later in the train_iter

In [124]:
question_vectors_padded.shape


torch.Size([26875, 11])

In [125]:
question_vectors_padded.shape

torch.Size([26875, 11])

## Building The Model

In [139]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        print("Embedded from Encoder:",embedded.shape)
        print("Input Lengths from Encoder:",len(input_lengths))
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths,batch_first=True,enforce_sorted=False)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs,batch_first=True)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [140]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super( ).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [141]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super( ).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [142]:
x = torch.rand([6,11,5])
l = [1,2,11,10,5,7]
nn.utils.rnn.pack_padded_sequence(x, l,batch_first=True,enforce_sorted=False)

PackedSequence(data=tensor([[0.0575, 0.2809, 0.7025, 0.7735, 0.8638],
        [0.8009, 0.4077, 0.0554, 0.4601, 0.7082],
        [0.5057, 0.2061, 0.4572, 0.8015, 0.1893],
        [0.4699, 0.8882, 0.5105, 0.9807, 0.7253],
        [0.0139, 0.4963, 0.5086, 0.6076, 0.6502],
        [0.8538, 0.6370, 0.7128, 0.4015, 0.3457],
        [0.8452, 0.1311, 0.8235, 0.7654, 0.5286],
        [0.6603, 0.7052, 0.1643, 0.3576, 0.6263],
        [0.1488, 0.6658, 0.7640, 0.9306, 0.4674],
        [0.6604, 0.7372, 0.0576, 0.0771, 0.9560],
        [0.8724, 0.7236, 0.6678, 0.2173, 0.3163],
        [0.3437, 0.7498, 0.5143, 0.7214, 0.6581],
        [0.1668, 0.4992, 0.1329, 0.9904, 0.5818],
        [0.8351, 0.9924, 0.5670, 0.1276, 0.7069],
        [0.5289, 0.8241, 0.5616, 0.8331, 0.3413],
        [0.4890, 0.2811, 0.3797, 0.7544, 0.2458],
        [0.5535, 0.2952, 0.6204, 0.6572, 0.6800],
        [0.1511, 0.8952, 0.6000, 0.2579, 0.9322],
        [0.9165, 0.1233, 0.5141, 0.3705, 0.1709],
        [0.4647, 0.7057, 0.753

## Training

In [143]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss#.to(device)
    return loss, nTotal.item()

In [155]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable#.to(device)
    target_variable = target_variable#.to(device)
    mask = mask#.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.tolist()

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0
    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_TOKEN for _ in range(batch_size)]])
    decoder_input = decoder_input#.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input#.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [156]:
def trainIters(model_name, voc, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, 
               decoder_n_layers, n_iteration, batch_size, print_every, save_every, clip):

    # Load batches for each iteration
    #training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])for _ in range(n_iteration)]
                      

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
   

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        
        # Extract fields from batch
        # input_variable, lengths, target_variable, mask, max_target_len = training_batch
        input_variable_padded, input_lengths , target_vectors_padded, target_mask,target_lengths =  next(iter(train_dataloader))  
        max_target_len = max(target_lengths)
        # Run a training iteration with batch
        loss = train(input_variable_padded, input_lengths, target_vectors_padded, target_mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

In [157]:
input_variable_padded, input_lengths , target_vectors_padded, target_mask,target_lengths =  next(iter(train_dataloader))

In [158]:
input_variable_padded.shape,input_lengths.shape

(torch.Size([64, 11]), torch.Size([64]))

In [159]:
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64


embedding = nn.Embedding(voc.num_words, hidden_size)

# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)

# Use appropriate device
encoder = encoder#.to(device)
decoder = decoder#.to(device)
print('Models built and ready to go!')

Models built and ready to go!


In [160]:
voc.num_words, len(voc.word2count)

(4976, 4973)

In [161]:
len(voc.index2word.keys())

4976

In [162]:
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
 
# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, n_iteration, batch_size,
           print_every, save_every, clip)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Embedded from Encoder: torch.Size([64, 11, 500])
Input Lengths from Encoder: 64


RuntimeError: The size of tensor a (64) must match the size of tensor b (11) at non-singleton dimension 1

In [153]:
CUDA_LAUNCH_BLOCKING="1"