In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os
import bcolz
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math
import pickle
from tqdm import tqdm
from torch.autograd import Variable
# plotting
import matplotlib
matplotlib.use('Agg')
from vad import batchData
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import os
import json

In [2]:
model_path = "models/20190411 17-58-12"
param_path = os.path.join(model_path,"model_parameters.json")
with open(param_path) as json_file:  
    params = json.load(json_file)
print(params)

{'hiddenSize': 512, 'latentSize': 400, 'batchSize': 64, 'iterations': 200, 'learningRate': 0.0001, 'gradientClip': 3, 'useBOW': True, 'bidirectionalEncoder': True, 'reduction': 256, 'device': 'cuda', 'useLatent': True}


In [3]:

"""
Set seed #
"""
seed = 1337

torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

def loadDataset(path = '../Datasets/Reviews/dataset_ready.pkl'):
    return pickle.load(open(path, 'rb'))

In [18]:
print("Loading parameters..", end=" ")
batchSize  = 10
iterations = 1
bidirectionalEncoder = params['bidirectionalEncoder']
# device = "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Done.")

print("Loading dataset..", end=" ")
dataset = loadDataset()
# setup store parameters
id2word = dataset['id2word']
word2id = dataset['word2id']
weightMatrix = dataset['weights']
train = dataset['train']
validation = dataset['validation']
cutoff = dataset['cutoff']
paddingID = word2id['<pad>']
print("Done.")

print("Converting dataset weights into tensors..", end=" ")
# convert dataset into tensors
weightMatrix = torch.tensor(weightMatrix, dtype=torch.float)
print("Done.")

# batching data
print("Batching Data..",end=" ")

random.shuffle(validation)

trainx = [x[0] for x in train]
trainy = [x[1] for x in train]
valx = [x[0] for x in validation]
valy = [x[1] for x in validation]

trainx = batchData(trainx, paddingID, device, batchSize, cutoff)
trainy = batchData(trainy, paddingID, device, batchSize, cutoff)
valx = batchData(valx, paddingID, device, batchSize, cutoff)
valy = batchData(valy, paddingID, device, batchSize, cutoff)

traindata = (trainx, trainy)
valdata = (valx, valy)
print("Done.")

# setup variables for model components initialisation
maxReviewLength = cutoff
vocabularySize = len(id2word)
embeddingDim = weightMatrix.shape[1]

embedding_shape = weightMatrix.shape

Loading parameters.. Done.
Loading dataset.. Done.
Converting dataset weights into tensors.. Done.
Batching Data.. Done.


In [19]:
hiddenSize = params['hiddenSize']
latentSize = params['latentSize']

from vad import Encoder, Decoder, loss_function

print("Initialising model components..", end=" ")
embedding = nn.Embedding(
        num_embeddings=vocabularySize,
        embedding_dim=embeddingDim,
        padding_idx=paddingID,
    _weight=weightMatrix
    )

modelEncoder = Encoder(embedding, vocabularySize,
                       paddingID, hiddenSize, bidirectionalEncoder).to(device)

modelDecoder = Decoder(embedding, vocabularySize,
                       paddingID, batchSize, maxReviewLength, hiddenSize, latentSize, bidirectionalEncoder).to(device)

criterion = nn.NLLLoss(ignore_index=paddingID)


print("Done.")

# load models
print("Loading model weights..", end=" ")
modelEncoder.load_state_dict(torch.load(os.path.join(model_path,'encoder.pth')))
modelDecoder.load_state_dict(torch.load(os.path.join(model_path,'decoder.pth')))
print("Done.")

Initialising model components.. Done.
Loading model weights.. Done.


In [20]:
def evaluate(x,
             xLength,
             y,
             yLength,
             encoder,
             decoder,
             device,
             criterion,
             word2id
            ):
    
    loss = 0
    
    # initalise input and target lengths
    inputLength = x[0].size(0)
    targetLength = y[0].size(0)
    batchSize = x.shape[0]

    # set up encoder computation
    encoderHidden = encoder.initHidden(batchSize).to(device)
    
    x = x.to(device)
    y = y.to(device)
    
    # set up encoder outputs
    encoderOutputs, encoderHidden = encoder(x, encoderHidden, xLength)

    # set up the variables for decoder computation
    decoderInput = torch.tensor([word2id["<sos>"]] * batchSize, dtype=torch.long, device=device)
    
    decoderHidden = encoderHidden[-1]
    decoderOutput = None
    decoderOutputs = []
    
    # Run through the decoder one step at a time. This seems to be common practice across
    # all of the seq2seq based implementations I've come across on GitHub.
    for t in range(yLength[0]):
        # compute the output of each decoder state
        decoderOutput, decoderHidden = decoder(decoderInput, encoderOutputs, xLength, decoderHidden, device, back=None)

        decoderOutputs.append(decoderOutput)

        decoderInput = decoderOutput.argmax(1)
        decoderHidden = decoderHidden.squeeze(0)
        
    return decoderOutputs, loss

In [21]:
# from train_vad import evalVAD as evaluate
def evaluateModel(batched_data,
                   encoder,
                   decoder,
                   criterion,
                   id2word,
                  ):
    
    encoder.eval()
    decoder.eval()
    numbatches = len(batched_data)
    for batch in tqdm(range(0,3)):
        # each batch is composed of the 
        # reviews, and a sentence length.
        x, xLength = batched_data[0][batch][0], batched_data[0][batch][1]
        y, yLength = batched_data[1][batch][0], batched_data[1][batch][1]
        
        outputs, losses = evaluate(x,
                                   xLength,
                                   y,
                                   yLength,
                                   encoder,
                                   decoder,
                                   device,
                                   criterion,
                                   word2id)
        break
    return x, y, outputs

In [22]:
x, y, outputs = evaluateModel(traindata,
                   modelEncoder,
                   modelDecoder,
                   criterion,
                   id2word
                  )

  0%|          | 0/3 [00:00<?, ?it/s]


In [23]:
def convertRealID2Word(id2word, y):
    entries = [[id2word[x.item()] for x in y[entry].cpu()] for entry in range(len(y))]
    for i in range(len(entries)):
        entries[i] = " ".join(entries[i])
        entries[i] = entries[i].replace("<pad> ", "")
#         entries[i] = entries[i][43:]
    return entries
        
def convertDecoderID2Word(id2word, outputs):
    entries = []
    for batch_line in outputs:
        entry = [torch.argmax(batch_line[i]).cpu().item() for i in range(len(batch_line))]
        entries.append([id2word[i] for i in entry])
     
    words = []
    for i in range(len(outputs[0])):
        tokens = [entries[j][i] for j in range(len(entries))][1:]
#         print(tokens)
        words.append(" ".join(tokens))
    return words

In [24]:
entries = [torch.argmax(entry, dim=1) for entry in outputs]
textes  = np.array([[id2word[x.item()] for x in y] for y in entries]).transpose()
for row in textes:
    print(" ".join(row.tolist()))

<unk> <unk> <unk> the the , , , the . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , , . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , , , . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , , . . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , , . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , the . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
<unk> <unk> <unk> , , . . <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> 

In [17]:
before = convertRealID2Word(id2word,x)
reference = convertRealID2Word(id2word,y)
words = convertDecoderID2Word(id2word,outputs)

for i in range(len(reference)):
    print("INPUT:", before[i])
    print("REFERENCE:", reference[i])
    print("MODEL:    ",words[i])
    print()

INPUT: b 0 0 0 0 0 j 1 u b rating_5.0 polarity_0.0 kb at to ps/2 adapter <eos> <pad>
REFERENCE: the package arrived in a timely fashion and in good shape . <eos> <pad>
MODEL:     <unk> <unk> the . . . <eos> <eos> <eos> <eos> <eos> <eos>



In [None]:
import shutil
import os
for x in os.listdir('models'):
    subpath = os.path.join('models', x)
    if "encoder.pth" not in os.listdir(subpath):
        print("bad", subpath, os.listdir(subpath))
        shutil.rmtree(subpath)