# Load the dataset

In [1]:
from modules.texts import Vocab, GloVeLoader
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import modules.extractive as ext
import modules.abstractive as abs
from modules.data import Documents
from torch.utils.data import DataLoader

# Initialize the pretrained embeddings

In [None]:
import numpy as np

# Width of every embedding row; 200 matches the "200d" GloVe file used below.
d = 200

# Read the pretrained GloVe vectors from the user's home directory.
home_dir = os.path.expanduser('~')
path_glove = os.path.join(home_dir,
                          'data/NLP/word_embeddings/GloVe/glove.6B.200d.txt')
glove = GloVeLoader(path_glove)

# Read the pickled documents (vocab_size presumably caps the vocabulary --
# confirm against modules.data.Documents).
doc_file = './data/kaggle_news_rouge1.pkl'
docs = Documents(doc_file, vocab_size=30000)

# Placeholder labels: one random 0/1 class per document.
random_classes = np.random.randint(2, size=len(docs)).tolist()
docs.set_doc_classes(random_classes)
vocab = docs.vocab

# Embedding table with one d-dimensional row per vocabulary entry.
emb = nn.Embedding(vocab.V, d)

def init_emb(emb, vocab, pretrained=None):
    """Copy pretrained word vectors into the rows of an embedding layer.

    Args:
        emb: ``nn.Embedding`` whose weight rows are overwritten in place.
        vocab: Vocabulary object; ``vocab.word2id`` is iterated for the
            words and ``vocab[word]`` gives the row index of each word.
        pretrained: Mapping from word to a numpy vector that raises
            ``KeyError`` for words it does not contain. Defaults to the
            module-level ``glove`` loader.

    Returns:
        None. Words without a pretrained vector keep their current row.

    Note:
        The embeddings stay trainable. To suppress updates, set
        ``emb.weight.requires_grad = False`` after calling this function.
    """
    if pretrained is None:
        pretrained = glove

    for word in vocab.word2id:
        # Only the pretrained lookup is expected to miss, so only it is guarded.
        try:
            vector = pretrained[word]
        except KeyError:
            # No pretrained embedding for this word; leave its row untouched.
            continue
        emb.weight.data[vocab[word]] = torch.from_numpy(vector)

    print('Initialized the word embeddings.')

# Overwrite the rows of `emb` for every vocabulary word that has a pretrained vector.
init_emb(emb, vocab)

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.200d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 200


In [None]:
# Test
from copy import deepcopy
from torch.optim import Adam
import time

# Hyper-parameters.
batch_size = 1
num_layers = 1
hidden_size = 400

# Sizes taken from the vocabulary and the embedding table built earlier.
vocab_size = vocab.V
emb_size = emb.weight.data.size()[1]

# Encoder and attention decoder, both handed the same `emb` layer and moved
# to the GPU. The decoder is built with twice the encoder's hidden size.
enc = abs.EncoderRNN(vocab_size, emb_size, hidden_size, num_layers, emb)
enc = enc.cuda()
dec_hidden_size = hidden_size * 2
dec = abs.AttnDecoderRNN(vocab_size, emb_size, dec_hidden_size, num_layers, emb)
dec = dec.cuda()

loss_fn = nn.CrossEntropyLoss()

# One optimizer over the parameters of both networks.
# NOTE(review): if both modules register `emb`, its weight is listed twice
# here -- confirm against modules.abstractive.
params = list(enc.parameters())
params += list(dec.parameters())
optimizer = Adam(params, lr=.005)

def run_epoch(docs):
    """Run one training pass over ``docs`` and return the average loss.

    Uses the module-level ``enc``, ``dec``, ``loss_fn``, ``optimizer`` and
    ``batch_size``. For every sentence batch of every document, the sentence
    is encoded, the decoder is stepped once per entry of the document's
    ``summ`` sequence, and the summed step losses drive one optimizer update.

    Args:
        docs: Sized iterable of documents. Each document is wrapped in a
            ``DataLoader`` that yields ``(sentence, label)`` pairs and must
            expose a ``summ`` attribute holding the target token ids.

    Returns:
        float: Sum of all per-sentence losses divided by ``len(docs)``.
    """
    epoch_loss = 0.0
    for doc in docs:
        docloader = DataLoader(doc, batch_size = batch_size, shuffle = False)
        # The second item of each pair is not used by this training loop.
        for sent, _ in docloader:
            optimizer.zero_grad()

            # Run through the encoder
            sent = Variable(sent.t()).cuda()
            enc_hidden = enc.init_hidden(batch_size)
            enc_output, enc_hidden = enc(sent, enc_hidden)

            # Run through the decoder, one summary token at a time
            # NOTE(review): each step feeds the decoder the same token it is
            # scored against -- confirm whether the input should instead be
            # the previous token.
            dec_hidden = dec.init_hidden(batch_size)
            step_losses = []
            for target in docloader.dataset.summ:
                target = Variable(torch.LongTensor([target]).unsqueeze(1)).cuda()
                dec_output, dec_hidden, attn_weights = dec(target, dec_hidden, enc_output)
                step_losses.append(loss_fn(dec_output, target.squeeze(1)))

            if not step_losses:
                # Empty summary: there is no loss to back-propagate.
                continue

            loss = sum(step_losses)
            # reshape(-1) handles both 1-element and 0-dim loss tensors.
            epoch_loss += float(loss.data.cpu().numpy().reshape(-1)[0])
            loss.backward()
            optimizer.step()

    epoch_loss /= len(docs)

    return epoch_loss

def train(docs, n_epochs = 10, print_every = 1):
    """Train for ``n_epochs`` epochs, periodically printing loss and timing.

    Args:
        docs: Documents passed unchanged to ``run_epoch`` every epoch.
        n_epochs: Number of passes over ``docs``.
        print_every: Print a progress line on every epoch whose index is a
            multiple of this value. A falsy value (``0`` or ``None``)
            disables printing.

    Returns:
        None.
    """
    import time

    for epoch in range(n_epochs):
        start_time = time.time()
        epoch_loss = run_epoch(docs)
        wall_clock = (time.time() - start_time) / 60  # minutes
        # Report only the quantities this loop actually computes.
        if print_every and epoch % print_every == 0:
            print('Epoch:%2i / Loss:%.3f / TrainingTime:%.3f(min)' %
                  (epoch, epoch_loss, wall_clock))

# Run three training epochs over `docs`, reporting after every epoch.
train(docs, n_epochs = 3, print_every = 1)