## Todo:
- Add Xavier initialisation for all the weights of the networks
- Look up on how to structure the auxillary function?
- Update the training function to include all the parts of the model
- What is KL Annealing???

In [125]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os
import bcolz
import numpy as np
import pickle

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import time
import math

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [129]:
max_length = 10

In [130]:
"""
Dataset processing functions
"""

# start_of_sentence and end_of_sentence token
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            0: "SOS",
            1: "EOS"
        }
        self.n_words = len(self.index2word.keys())
    
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
def norm(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines..")
    
    # read the file and split into lines
    directory = "../Datasets/Tutorials/seq2seq"
    filename = lang1 + "-" + lang2 + ".txt"
    filepath = os.path.join(directory, filename)
    lines = open(filepath, encoding='utf-8').\
        read().strip().split("\n")
    
    # split every line into pairs
    # note that the language phrases are split by a tab.
    pairs = [[norm(s) for s in l.split('\t')] for l in lines]
    
    # reverse the pairs
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    
    return input_lang, output_lang, pairs

englishPrefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
    )

def filterPair(p):
    check1 = len(p[0].split(' ')) < max_length
    check2 = len(p[1].split(' ')) < max_length
    check3 = p[1].startswith(englishPrefixes)
    return check1 and check2 and check3

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepare(lang1, lang2, reverse=False):
    inputLang, outputLang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs." % len(pairs))
    
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs." % len(pairs))
    
    print("Counting words..")
    
    for pair in pairs:
        inputLang.addSentence(pair[0])
        outputLang.addSentence(pair[1])
        
    print("Counted words:")
    print(inputLang.name, inputLang.n_words)
    print(outputLang.name, outputLang.n_words)
    
    return inputLang, outputLang, pairs

inputLang, outputLang, pairs = prepare("eng", "fra", True)
print(random.choice(pairs))

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1,1)

def tensorsFromPair(pair):
    inputTensor = tensorFromSentence(inputLang, pair[0])
    targetTensor = tensorFromSentence(outputLang, pair[1])
    return (inputTensor, targetTensor)

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

Reading lines..
Read 135842 sentence pairs.
Trimmed to 13465 sentence pairs.
Counting words..
Counted words:
fra 5392
eng 3552
['vous etes depourvue d ambition .', 'you re unambitious .']


In [103]:
"""
Need a function to load the GloVe word embeddings.
Based on this method:
https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
"""
glovePath = "/media/data/Datasets/glove"

def loadGlove(glove_path, dim=50):
    acceptedDimensions = [50, 100, 200, 300]
    if dim not in acceptedDimensions:
        print("You didn't choose a right dimension.")
        print("Try one of these:", acceptedDimensions)
        return None
    pickleWordFile = f'{glove_path}/6B.'+str(dim)+'_words.pkl'
    pickleIdFile   = f'{glove_path}/6B.'+str(dim)+'_idx.pkl'
    pickleDatFile  = f'{glove_path}/glove.6B.'+str(dim)+'.dat'
    pickleDataset  = f'{glove_path}/glove.6B.'+str(dim)+'d.txt'
    
    if os.path.isfile(pickleWordFile):
        # check if we've made the outputs before
        print("Preloading files..", end=" ")
        vectors = bcolz.open(pickleDatFile)[:]
        words = pickle.load(open(pickleWordFile, 'rb'))
        word2idx = pickle.load(open(pickleIdFile, 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}
        print("Done.")
        return glove
    else:
        print("Creating new files..", end=" ")
        words    = []
        idx      = 0
        word2idx = {}
        vectors = bcolz.carray(np.zeros(1), rootdir=pickleDatFile, mode='w')

        with open(pickleDataset, 'rb') as f:
            for l in f:
                line = l.decode().split()
                word = line[0]
                words.append(word)
                word2idx[word] = idx
                idx += 1
                vect = np.array(line[1:]).astype(np.float)
                vectors.append(vect)

        vectors = bcolz.carray(vectors[1:].reshape((400000, dim)),
                               rootdir=pickleDatFile, mode='w')
        vectors.flush()
        # save the outputs
        pickle.dump(words, open(pickleWordFile, 'wb'))
        pickle.dump(word2idx, open(pickleIdFile, 'wb'))
        # create the dataset
        glove = {w: vectors[word2idx[w]] for w in words}
        print("Done.")
        return glove

glove = loadGlove(glovePath)

Preloading files.. Done.


In [111]:
def createWeightMatrix(targetVocab, glove):
    """
    For each word in the dataset's vocabulary,
    check if it's also in the @glove vocab.
    If it does, then we load the pre-trained word vector.
    Otherwise we use some random vector.
    """
    length  = len(targetVocab)
    dim = glove['fail'].shape
    wMatrix = np.zeros((length, dim[0]))
    wordsFound = 0
    for i, word in enumerate(targetVocab):
        try:
            wMatrix[i] = glove[word]
            wordsFound += 1
        except KeyError:
            wMatrix[i] = np.random.normal(scale=0.6, size=dim)
    return wMatrix

createWeightMatrix(['noodb'], glove)

array([[-0.10900477,  0.39005339,  0.41426117,  0.73226437, -0.39847988,
        -1.1908525 ,  0.86290411, -0.26205337,  0.24860828, -0.2872339 ,
         0.06183626, -0.29048577,  0.0494523 ,  0.08885237, -0.01168055,
         0.09605561,  0.53305146, -0.73836863,  0.26562488, -0.18981482,
        -0.29566844, -0.29841539,  0.69372019,  0.15864261,  1.00231501,
        -0.62599312,  0.08614995, -0.12092209, -0.48454993,  1.04824008,
        -0.72058916,  0.58283893,  0.02703291, -0.20486233, -0.24587728,
        -1.06012552, -0.22821207,  0.19279423,  0.53912303, -0.18990544,
        -0.21820726,  0.41613238,  0.48022712, -0.77217027, -1.05255587,
        -0.02258505, -0.30678506, -0.57693466, -0.06212577, -0.73361708]])

In [101]:
def createEmbeddingLayer(wMatrix, trainable=True):
    """
    This function is to be called when initialising
    a new neural network class.
    """
    num, dim = wMatrix.size()
    layer = nn.Embedding(num, dim)
    layer.load_state_dict({'weight': wMatrix})
    layer.weight.requires_grad = True if trainable else False
    return layer, num, dim

class ToyNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers):
        super(self).__init__()
        self.embedding, num_embeddings, embedding_dim = createEmbeddingLayer(weights_matrix, True)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True)
        
    def forward(self, inp, hidden):
        return self.gru(self.embedding(inp), hidden)
    
    def init_hidden(self, batch_size):
#         torch.nn.init.xavier_uniform_(tensor, gain=1)
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))

In [132]:
class Encoder(nn.Module):
    """
    This'll be a bi-directional GRU.
    Utilises equation (1) in the paper.
    
    The hidden size is 512 as per the paper.
    """
    def __init__(self, embeddingMatrix, inputSize, hiddenSize=512):
        super(Encoder, self).__init__()
        self.hiddenSize = hiddenSize
        # this embedding is a simple lookup table that stores the embeddings of a 
        # fixed dictionary and size.
        
        # This module is often used to store word embeddings and retrieve them
        # using indices. 
        # The input to the module is a list of indices, and 
        # the output is the corresponding word embeddings.
#         self.embedding = nn.Embedding(inputSize, hiddenSize)
        self.embedding, numEmbeddings, embeddingDim = createEmbeddingLayer(embeddingMatrix)
        self.embedding.to(device)
        self.gru = nn.GRU(inputSize, embeddingDim, hiddenSize, bidirectional=True)
    
    def forward(self, x, hidden):
        # load the input into the embedding before doing GRU computation.
        output = self.embedding(x).view(1,1,-1)
        output, hidden = self.gru(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1,1, self.hiddenSize, device=device)

In [133]:
class Backwards(nn.Module):
    def __init__(self, hidden_size):
        super(Backwards, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        output, hidden = self.gru(input, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [121]:
class Attention(nn.Module):
    """
    TODO: Add layer normalisation?
    https://arxiv.org/abs/1607.06450
    
    We also set the hidden state size to 512.
    
    """
    def __init__(self, outputSize, hiddenSize=512, maxLength = max_length):
        """
        # dropout omitted
        """
        super(Attention, self).__init__()
        self.hiddenSize = hiddenSize
        self.maxLength = maxLength
        
        # self.attention is our tiny neural network that takes 
        # in the hidden weights and the previous hidden weights.
        self.attention = nn.Linear(self.hiddenSize * 2, self.maxLength, 
                                   device=device)
        self.attentionCombined = nn.Linear(self.hiddenSize * 2, self.hiddenSize, 
                                           device=device)
        torch.nn.init.xavier_uniform_(self.attention)
        torch.nn.init.xavier_uniform_(self.attentionCombined)
    
    def forward(self, prevHidden, encoderOutputs):

        # concatenate hidden layer inputs together.
        attentionInputs  = prevHidden
        attentionWeights = F.softmax(self.attention(attentionInputs), dim=1)
        
        # batch matrix multiplication
        attentionApplied = torch.bmm(attentionWeights.unsqueeze(0),
                                    encoderOutputs.unsqueeze(0))
        # reshape to produce context vector.
        context = self.attentionCombined(context).unsqueeze(0)
        context = F.relu(context)
        return context, attentionWeights

    def initHidden(self):
        return torch.zeros(1,1, self.hiddenSize, device=device) 

In [122]:
class Decoder(nn.Module):
    """
    TODO: Add layer normalisation?
    https://arxiv.org/abs/1607.06450
    
    We also set the hidden state size to 512.
    
    """
    def __init__(self, outputSize, hiddenSize=512, maxLength = max_length):
        """
        # dropout omitted
        """
        super(AttentionDecoder, self).__init__()
        self.hiddenSize = hiddenSize
        self.outputSize = outputSize
        self.maxLength = maxLength
        
        self.embedding, numEmbeddings, embeddingDim = createEmbeddingLayer(embeddingMatrix)
        self.embedding.to(device)
        
        self.gru = nn.GRU(self.hiddenSize, self.hiddenSize)
        self.out = nn.Linear(self.hiddenSize, self.outputSize, device=device)
        torch.nn.init.xavier_uniform_(self.out)
    
    def forward(self, previousY, previousHidden, context, z):
        # get the embedding value of our previous Y
        embedded = self.embedding(previousY).view(1,1,-1)
        # concatenate hidden layer inputs together.
        inputs =  torch.cat((embedded[0], context, z), 1)
        # do a forward GRU
        output, hidden = self.gru(context, previousHidden)
        # softmax the output
        output = self.out(torch.cat((output[0], context), 1))
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1,1, self.hiddenSize, device=device) 

- Calculate a set of attention weights.
- Dot product the attention weights with the encoder output vectors.
- This result should contain information about that specific part of the input sequence, which helps the decoder choose the right words. We'll store this into a variable called attentionApplied.

In [21]:
class Inference(nn.Module):
    """
    Note that the inference and prior networks
    are a simple 1 layer feed forward neural network.
    Therefore the size of the weights are entirely based on the size
    of the input and outputs.
    """
    def __init__(self, feature_size, class_size, latent_size=400):
        super(Inference, self).__init__()
        
        self.feature_size = feature_size
        self.class_size = class_size

        # encode
        self.fc1  = nn.Linear(feature_size + class_size, 400)
        self.mean = nn.Linear(400, latent_size)
        self.var = nn.Linear(400, latent_size)

        # decode
        self.fc3 = nn.Linear(latent_size + class_size, 400)
        self.fc4 = nn.Linear(400, feature_size)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def encode(self, x_forward, c, h_backward): # Q(z|x, c)
        '''
        x: (bs, feature_size)
        c: (bs, class_size)
        '''
        inputs = torch.cat([x_forward, c, h_backward], 1) # (bs, feature_size+class_size)
        h1 = self.relu(self.fc1(inputs))
        z_mu = self.mean(h1)
        z_var = self.var(h1)
        return z_mu, z_var

    def reparametrize(self, mu, logvar):
        # samples your mu, logvar to get z.
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = Variable(std.data.new(std.size()).normal_())
            return eps.mul(std) + mu
        else:
            return mu

    def decode(self, z, c): # P(x|z, c)
        '''
        z: (bs, latent_size)
        c: (bs, class_size)
        '''
        inputs = torch.cat([z, c], 1) # (bs, latent_size+class_size)
        h3 = self.relu(self.fc3(inputs))
        return self.sigmoid(self.fc4(h3))

    def forward(self, x, c):
        mu, logvar = self.encode(x, c)
        z = self.reparametrize(mu, logvar)
        return self.decode(z, c), mu, logvar

In [22]:
class Prior(nn.Module):
    def __init__(self, feature_size, class_size):
        super(Prior, self).__init__()
        
        self.feature_size = feature_size
        self.class_size = class_size

        # encode
        self.fc1  = nn.Linear(feature_size + class_size, 400)
        self.mean = nn.Linear(400, latent_size)
        self.var = nn.Linear(400, latent_size)

        # decode
        self.fc3 = nn.Linear(latent_size + class_size, 400)
        self.fc4 = nn.Linear(400, feature_size)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def encode(self, h, c): # Q(z|x, c)
        '''
        x: (bs, feature_size)
        c: (bs, class_size)
        '''
        inputs = torch.cat([h, c], 1) # (bs, feature_size+class_size)
        h1 = self.relu(self.fc1(inputs))
        z_mu = self.mean(h1)
        z_var = self.var(h1)
        return z_mu, z_var

    def reparametrize(self, mu, logvar):
        # samples your mu, logvar to get z.
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = Variable(std.data.new(std.size()).normal_())
            return eps.mul(std) + mu
        else:
            return mu

    def decode(self, z, c): # P(x|z, c)
        '''
        z: (bs, latent_size)
        c: (bs, class_size)
        '''
        inputs = torch.cat([z, c], 1) # (bs, latent_size+class_size)
        h3 = self.relu(self.fc3(inputs))
        return self.sigmoid(self.fc4(h3))

    def forward(self, x, c):
        mu, logvar = self.encode(x, c)
        z = self.reparametrize(mu, logvar)
        return self.decode(z, c), mu, logvar

In [114]:
class Auxillary(nn.Module):
    def __init__(self, latent_size):
        self.fc1  = nn.Linear(latent_size, 400)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
    
    def forward(self, z):
        """
        The motive here is to produce an auxillary loss for our 
        training objective.
        
        We do this by Sequential Bag of Words (SBOW) as the
        auxillary objective for the proposed VAD model. 
        
        We want to predict the bag of succeeding words
        in the response using the latent variable z at each
        time step.
        """
        return 0

In [126]:
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(y_predicted, y, z_inference, z_prior):
    LL = criterion(y_predicted, y)
    KL = F.kl_div(z_inference, z_prior)
    return LL - KL

def KLD(meanA, logvarA, meanB, logvarB):
    """
    Not used as PyTorch has its own KL-Divergence function
    which is most likely to be better optimised than whatever i have.
    """
    # Univariate
    # https://stats.stackexchange.com/questions/7440/kl-divergence-between-two-univariate-gaussians
    # return logvarB - logvarA + (2*logvarA + (meanA - meanB) ** 2)/(4 * logvarB) - 0.5
    # Multivariate
    # https://stats.stackexchange.com/questions/60680/kl-divergence-between-two-multivariate-gaussians
    d = meanA.shape
    divergence = logvarB - logvarA - torch.trace(torch.matmul(logvarB.inverse(), logvarA))
    cov = meanB - meanA
    divergence += torch.matmul(torch.matmul(cov.t(), logvarB.inverse()), cov)
    return 0.5 * divergence

In [127]:
"""
Helper Functions
"""

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [128]:
def trainVAD(x,y,
             encoder,
             attention,
             backwards,
             inference,
             prior,
             decoder,
             encoderOpt,
             attentionOpt,
             backwardsOpt,
             inferenceOpt,
             priorOpt,
             decoderOpt,
             criterion = nn.NLLLoss(),
             maxLength = max_length
            ):
    
    # initialise gradients
    encoderOpt.zero_grad()
    attentionOpt.zero_grad()
    backwardsOpt.zero_grad()
    inferenceOpt.zero_grad()
    priorOpt.zero_grad()
    decoderOpt.zero_grad()
    
    # initalise input and target lengths
    inputLength = x.size(0)
    targetLength = y.size(0)
    
    # set default loss
    loss = 0
    
    # set up encoder computation
    encoderOutputs  = torch.zeros(maxLength, encoder.hiddenSize, device=device)
    backwardOutputs = torch.zeros(maxLength, encoder.hiddenSize, device=device)
    decoderHiddens  = torch.zeros(maxLength, decoder.hiddenSize, device=device)
    
    # set up encoder outputs
    for ei in range(inputLength):
        encoderOutput, encoderHidden = encoder(x[ei], encoderHidden)
        encoderOutputs[ei] = encoderOutput[0,0]
    
    # set up backwards RNN
    for t in range(targetLength-1, 0, 1):
        # here we can also build the backwards RNN that takes in the y.
        # this backwards RNN conditions our latent variable.
        backwardOutput, backwardsHidden = backwards(y[t+1], backwardsHidden)
        # get the values of our backwards network.
        backwardOutputs[t] = backwardOutput[0,0]
        
    # set up the decoder computation
    decoderInput = torch.tensor([[SOS_token]], device=device)
    decoderHidden = encoderHidden
    
    
    for t in range(targetLength):
        # get the context vector c
        c, _ = attention(decoderH, encoderOutputs[t])
        # compute the inference layer
        z_infer, infMean, infLogvar = inference(decoderOutput, c, backwardOutputs[t])
        # compute the prior layer
        z_prior, priMean, priLogvar = prior(decoderOutput, c)
        # compute the output of each decoder state
        DecoderOut = decoder(decoderInput, c, z_infer, decoderHidden)
        decoderOutput, decoderHidden = DecoderOut
        
        # calculate the loss
        loss += loss_function(decoderOutput, y[t], z_infer, z_prior)
        # feed this output to the next input
        decoderInput = y[t]
    
    # possible because our loss_function uses gradient storing calculations
    loss.backward()
    
    encoderOpt.step()
    attentionOpt.step()
    backwardsOpt.step()
    inferenceOpt.step()
    priorOpt.step()
    decoderOpt.step()
    
    return loss.item()/targetLength

In [131]:
def trainIteration(encoder,
                attention,
                backwards,
                inference,
                prior,
                decoder,
                iterations,
                criterion = nn.NLLLoss(),
                learningRate = 0.0001,
                printEvery = 1000,
                plotEvery = 100):
    
    start = time.time()
    plotLosses = []
    printLossTotal = 0
    plotLossTotal = 0
    
    encoderOpt   = optim.Adam(encoder.parameters(),   lr=learningRate)
    attentionOpt = optim.Adam(attention.parameters(), lr=learningRate)
    backwardsOpt = optim.Adam(backwards.parameters(), lr=learningRate)
    inferenceOpt = optim.Adam(inference.parameters(), lr=learningRate)
    priorOpt     = optim.Adam(prior.parameters(),     lr=learningRate)
    decoderOpt   = optim.Adam(decoder.parameters(),   lr=learningRate)
    
    trainingPairs = [tensorsFromPair(random.choice(pairs)) for i in range(iterations)]
    
    for i in range(1, iterations + 1):
        # set up variables needed for training.
        trainingPair = trainingPairs[i-1]
        x, y = trainingPair[0], trainingPair[1]
        # calculate loss.
        loss = trainVAD(x, y, 
             encoder,
             attention,
             backwards,
             inference,
             prior,
             decoder,
             encoderOpt,
             attentionOpt,
             backwardsOpt,
             inferenceOpt,
             priorOpt,
             decoderOpt,
             criterion
            )
        # increment our print and plot.
        printLossTotal += loss
        plotLossTotal += loss
        
        # print mechanism
        if i % printEvery == 0:
            printLossAvg = printLossTotal / printEvery
            # reset the print loss.
            printLossTotal = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, i / iterations),
                                         i, i / iterations * 100, printLossAvg))
        # plot mechanism
        if i % plotEvery == 0:
            plotLossAvg = plotLossTotal / plotEvery
            plotLosses.append(plotLossAvg)
            plotLossTotal = 0
            
    showPlot(plotLosses)

In [134]:
modelEncoder   = Encoder(inputLang.n_words).to(device)
modelAttention = Attention().to(device)
modelBackwards = Backwards().to(device)
modelInference = Inference().to(device)
modelPrior     = Prior().to(device)
modelDecoder   = Decoder(outputLang.n_words, dropout=0.1).to(device)
trainIters(encoder,
           attention,
           backwards,
           inference,
           prior,
           decoder,
           iterations,
           75000, 
           printEvery=1000)

TypeError: __init__() missing 1 required positional argument: 'inputSize'