# Load the dataset

In [1]:
from modules.texts import Vocab, GloVeLoader
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import modules.extractive as ext
import modules.abstractive as abs
import modules.beam_search as bs
from modules.data import Documents
from torch.utils.data import DataLoader

# Initialize the pretrained embeddings

In [2]:
import numpy as np

# Load the pretrained embedding into the memory
path_glove = os.path.join(os.path.expanduser('~'),
             'data/NLP/word_embeddings/GloVe/glove.6B.200d.txt')
glove = GloVeLoader(path_glove)

# Load the dataset
doc_file = './data/kaggle_news_rouge1.pkl'
docs = Documents(doc_file, n_samples=10, vocab_size = 30000)
vocab = docs.vocab

d = 200
emb = nn.Embedding(vocab.V, d)

def init_emb(emb, vocab):
    for word in vocab.word2id:
        try:
            emb.weight.data[vocab[word]] = torch.from_numpy(glove[word])
        except KeyError as e:
            # Case when pretrained embedding for a word does not exist
            pass
#     emb.weight.requires_grad = False # suppress updates
    print('Initialized the word embeddings.')

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.200d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 200


In [3]:
# Test
from copy import deepcopy
from torch import optim
import time
from itertools import chain

vocab_size = vocab.V
emb_size = emb.weight.data.size(1)
n_kernels = 50
kernel_sizes = [1,2,3,4,5]
pretrained = emb
sent_size = len(kernel_sizes) * n_kernels
hidden_size = 400
num_layers = 1
n_classes = len(docs.dclass2id)
batch_size = 1
torch.manual_seed(7)
torch.cuda.manual_seed(7)

init_emb(emb, vocab)
ext_s_enc = ext.SentenceEncoder(vocab_size, emb_size,
                                   n_kernels, kernel_sizes, pretrained)
ext_d_enc = ext.DocumentEncoder(sent_size, hidden_size)
ext_extc = ext.ExtractorCell(sent_size, hidden_size)
ext_d_classifier = ext.DocumentClassifier(sent_size, n_classes)
abs_enc = abs.EncoderRNN(emb, hidden_size, num_layers)
abs_dec = abs.AttnDecoderRNN(emb, hidden_size * 2, num_layers)

models = [ext_s_enc, ext_d_enc, ext_extc, ext_d_classifier,
         abs_enc, abs_dec]
params = list(chain(*[model.parameters() for model in models]))
optimizer = optim.SGD(params, lr = .005)

loss_fn_ext = nn.BCELoss()
loss_fn_dclass = nn.NLLLoss()
loss_fn_abs = nn.CrossEntropyLoss()

def get_accuracy(probs, targets, verbose = False):   
    '''
    Calculates the accuracy for the extractor

    Args:
        probs: extraction probability
        targets: ground truth labels for extraction
    '''
    import numpy as np
    preds = np.array([1 if p > 0.5 else 0 for p in probs])
    if verbose:
        print(preds)
    accuracy = np.mean(preds == targets)
    
    return accuracy

# class RougeScorer:
#     def __init__(self):
#         from rouge import Rouge
#         self.rouge = Rouge()
#     def score(self, reference, generated, type = 1):
#         score = self.rouge.get_scores(reference, generated, avg=True)
#         score = score['rouge-%s' % type]['f']
#         return score

# rouge = RougeScorer()
    
def run_epoch(docs):
    
    epoch_loss_abs = 0
    epoch_loss_ext = 0
    epoch_loss_dclass = 0
    epoch_accuracy_ext = 0

    for doc in docs:
        optimizer.zero_grad()
        docloader = DataLoader(doc, batch_size=1, shuffle=False)
        # Encode the sentences in a document
        sents_raw = []
        sents_encoded = []
        ext_labels = []
        doc_class = Variable(torch.LongTensor([doc.doc_class])).cuda()
        for sent, ext_label in docloader:
            # only accept sentences that conforms the maximum kernel sizes
            if sent.size(1) < max(kernel_sizes):
                continue
            sent = Variable(sent).cuda()
            sents_raw.append(sent)
            sents_encoded.append(ext_s_enc(sent))
            ext_labels.append(ext_label.cuda())
        # Ignore if the content is a single sentence(no need to train)
        if len(sents_raw) <= 1:
            continue

        # Build the document representation using encoded sentences
        d_encoded = torch.cat(sents_encoded, dim = 0).unsqueeze(1)
        ext_labels = Variable(torch.cat(ext_labels, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
        init_sent = ext_s_enc.init_sent(batch_size)
        d_ext = torch.cat([init_sent, d_encoded[:-1]], dim = 0)

        # Extractive Summarizer
        ## Initialize the d_encoder
        h, c = ext_d_enc.init_h0c0(batch_size)
        h0 = Variable(h.data)
        ## An input goes through the document encoder
        output, hn, cn = ext_d_enc(d_ext, h, c)
        ## Initialize the decoder
        ### calculate p0, h_bar0, c_bar0
        h_ = hn.squeeze(0)
        c_ = cn.squeeze(0)
        p = ext_extc.init_p(h0.squeeze(0), h_)
        ### calculate p_t, h_bar_t, c_bar_t
        d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
        extract_probs = Variable(torch.zeros(len(sents_encoded))).cuda()
        for i, (s, h) in enumerate(zip(sents_encoded, d_encoder_hiddens)):
            h_, c_, p = ext_extc(s, h, h_, c_, p)
            extract_probs[i] = p
        ## Document Classifier
        q = ext_d_classifier(extract_probs.view(-1,1), d_encoded.squeeze(1))
        
        ## Optimize over the extractive examples
        loss_ext = loss_fn_ext(extract_probs, ext_labels)
        loss_dclass = loss_fn_dclass(q.view(1,-1), doc_class)
        epoch_loss_ext += loss_ext.data.cpu().numpy()[0]
        epoch_loss_dclass += loss_dclass.data.cpu().numpy()[0]
        torch.autograd.backward([loss_ext, loss_dclass])
        optimizer.step()
        
        ## Measure the accuracy
        p_cpu = extract_probs.data.cpu().numpy()
        t_cpu = ext_labels.data.cpu().numpy()
        q_cpu = q.data.cpu().numpy()
        c_cpu = doc_class.data.cpu().numpy()
        epoch_accuracy_ext += get_accuracy(p_cpu, t_cpu)

        # Abstractive Summarizer
        optimizer.zero_grad()
        loss_abs = 0
        ## Run through the encoder
#         words = torch.cat(sents_ext, dim=1).t()
        sents_ext = [sent for i,sent in enumerate(sents_raw)
                     if extract_probs[i].data.cpu().numpy() > 0.5]
        
        # skip if no sentences are selected as summaries
        if len(sents_ext) == 0:
            continue
        words = torch.cat(sents_ext, dim=1).t()

        abs_enc_hidden = abs_enc.init_hidden(batch_size)
        abs_enc_output, abs_enc_hidden = abs_enc(words, abs_enc_hidden)
        ## Remove to too long documents to tackle memory overflow
        if len(abs_enc_output) > 6000:
            continue
        ## Run through the decoder
        abs_dec_hidden = abs_enc_hidden.view(1,1,-1)
#         abs_dec_hidden = abs_enc_hidden
        for i in range(len(doc.head)-1):
            input = doc.head[i]
            target = doc.head[i+1]
            input = Variable(torch.LongTensor([input]).unsqueeze(1)).cuda()
            target = Variable(torch.LongTensor([target]).unsqueeze(1)).cuda()
            abs_dec_output, abs_dec_hidden, _ = abs_dec(input, abs_dec_hidden, abs_enc_output)
            loss_abs += loss_fn_abs(abs_dec_output, target.squeeze(1))

        epoch_loss_abs += loss_abs.data.cpu().numpy()[0]
        loss_abs.backward()
        optimizer.step()

    acc_ext = epoch_accuracy_ext / len(docs)
    
    return epoch_loss_ext, epoch_loss_dclass, epoch_loss_abs, acc_ext

def train(docs, n_epochs = 10, print_every = 1):
    import time
    
    for epoch in range(n_epochs):
        start_time = time.time()
        ext_loss, dclass_loss, abs_loss, ext_acc = run_epoch(docs)
        end_time = time.time()
        wall_clock = (end_time - start_time) / 60
        if epoch % print_every == 0:
            print('Epoch:%2i / Loss:(%.3f/%.3f/%.3f) / Accuracy:(%.3f) / TrainingTime:%.3f(min)' %
                  (epoch, ext_loss, dclass_loss, abs_loss, ext_acc, wall_clock))

import os
from os.path import join            
# Training
train(docs, n_epochs = 50, print_every = 1)
# for n in range(5):
#     train(docs, n_epochs = 10, print_every = 1)
#     print('Epoch %2i finished.' % ((n+1)*10))
#     model_dict = dict()
#     model_dict['emb'] = emb
#     model_dict['ext_s_enc'] = ext_s_enc
#     model_dict['ext_d_enc'] = ext_d_enc
#     model_dict['ext_extc'] = ext_extc
#     model_dict['ext_d_classifier'] = ext_d_classifier
#     model_dict['abs_enc'] = abs_dec
#     model_dict['abs_dec'] = abs_dec

#     data_dir = join(os.path.expanduser('~'), 'cs671-large')
#     for name, model in model_dict.items():
#         torch.save(model.state_dict(), join(data_dir, name + '_epoch_%2i' % ((n+1)*10)) )

Initialized the word embeddings.
Epoch: 0 / Loss:(6.983/17.299/671.624) / Accuracy:(0.364) / TrainingTime:0.014(min)
Epoch: 1 / Loss:(6.974/15.424/647.219) / Accuracy:(0.362) / TrainingTime:0.013(min)
Epoch: 2 / Loss:(6.966/14.987/762.837) / Accuracy:(0.372) / TrainingTime:0.014(min)
Epoch: 3 / Loss:(6.959/14.739/773.425) / Accuracy:(0.373) / TrainingTime:0.016(min)
Epoch: 4 / Loss:(6.952/14.528/722.124) / Accuracy:(0.454) / TrainingTime:0.017(min)
Epoch: 5 / Loss:(6.945/14.330/671.109) / Accuracy:(0.525) / TrainingTime:0.017(min)
Epoch: 6 / Loss:(6.939/14.138/629.470) / Accuracy:(0.561) / TrainingTime:0.017(min)
Epoch: 7 / Loss:(6.933/13.948/599.839) / Accuracy:(0.561) / TrainingTime:0.018(min)
Epoch: 8 / Loss:(6.928/13.768/578.466) / Accuracy:(0.561) / TrainingTime:0.017(min)
Epoch: 9 / Loss:(6.922/13.592/561.193) / Accuracy:(0.561) / TrainingTime:0.017(min)
Epoch:10 / Loss:(6.917/13.417/545.695) / Accuracy:(0.561) / TrainingTime:0.018(min)
Epoch:11 / Loss:(6.912/13.246/531.061) / Ac

In [4]:
from rouge import Rouge

doc_i = 5
models = [ext_s_enc, ext_d_enc, ext_extc, ext_d_classifier, abs_enc, abs_dec]
test_input = [torch.LongTensor(sent).view(1,-1) for sent in docs[doc_i].sents]
ref_input = torch.LongTensor(docs[doc_i].head).view(1,-1)
top1_batch, seqs_batch = bs.generate_title(doc_sents = test_input,
                                           beam_size = 5,
                                           models = models,
                                           max_kernel_size = 5)

rouge = Rouge()
total_rouge = 0
for i in range(len(top1_batch)):
    generated = '_BEGIN_ ' + vocab.id2sents([top1_batch[i][0]])
    reference = vocab.id2sents([ref_input[i]])
    print(generated)
    print(reference)
    total_rouge += rouge.get_scores(generated, reference)[0]['rouge-1']['f']

print(total_rouge / len(top1_batch))
# print(seqs_batch)


 852  339  296   65  399
[torch.cuda.LongTensor of size 1x5 (GPU 0)]

_BEGIN_ delhi police kin allege foul play _END_
_BEGIN_ man found dead at delhi police station , kin allege foul play _END_
0.7272727226446282


In [46]:
# # Test
# from copy import deepcopy
# from torch import optim
# import time
# from itertools import chain

# vocab_size = vocab.V
# emb_size = emb.weight.data.size(1)
# n_kernels = 50
# kernel_sizes = [1,2,3,4,5]
# pretrained = emb
# sent_size = len(kernel_sizes) * n_kernels
# hidden_size = 400
# num_layers = 1
# n_classes = len(docs.dclass2id)
# batch_size = 1
# torch.manual_seed(7)
# torch.cuda.manual_seed(7)

# init_emb(emb, vocab)
# ext_s_enc = ext.SentenceEncoder(vocab_size, emb_size,
#                                    n_kernels, kernel_sizes, pretrained)
# ext_d_enc = ext.DocumentEncoder(sent_size, hidden_size)
# ext_extc = ext.ExtractorCell(sent_size, hid\den_size)
# ext_d_classifier = ext.DocumentClassifier(sent_size, n_classes)
# abs_enc = abs.EncoderRNN(emb, hidden_size, num_layers)
# abs_dec = abs.AttnDecoderRNN(emb, hidden_size * 2, num_layers)

# models = [ext_s_enc, ext_d_enc, ext_extc, ext_d_classifier,
#          abs_enc, abs_dec]
# params = list(chain(*[model.parameters() for model in models]))
# optimizer = optim.SGD(params, lr = .005)

# loss_fn_ext = nn.BCELoss()
# loss_fn_dclass = nn.NLLLoss()
# loss_fn_abs = nn.CrossEntropyLoss()

# def get_accuracy(probs, targets, verbose = False):   
#     '''
#     Calculates the accuracy for the extractor

#     Args:
#         probs: extraction probability
#         targets: ground truth labels for extraction
#     '''
#     import numpy as np
#     preds = probs > .5
#     if verbose:
#         print(preds)
#     matches = preds.type(torch.cuda.FloatTensor) == targets
#     accuracy = torch.mean(matches.type(torch.cuda.FloatTensor))
    
#     return accuracy

    
# def run_epoch(docs):
    
#     epoch_loss_abs = 0
#     epoch_loss_ext = 0
#     epoch_loss_dclass = 0
#     epoch_accuracy_ext = 0

#     for doc in docs:
#         optimizer.zero_grad()
#         docloader = DataLoader(doc, batch_size=1, shuffle=False)
#         # Encode the sentences in a document
#         sents_raw = []
#         sents_encoded = []
#         ext_labels = []
#         doc_class = Variable(torch.LongTensor([doc.doc_class])).cuda()
#         for sent, ext_label in docloader:
#             # only accept sentences that conforms the maximum kernel sizes
#             if sent.size(1) < max(kernel_sizes):
#                 continue
#             sent = Variable(sent).cuda()
#             sents_raw.append(sent)
#             sents_encoded.append(ext_s_enc(sent))
#             ext_labels.append(ext_label.cuda())
#         # Ignore if the content is a single sentence(no need to train)
#         if len(sents_raw) <= 1:
#             continue

#         # Build the document representation using encoded sentences
#         d_encoded = torch.cat(sents_encoded, dim = 0).unsqueeze(1)
#         ext_labels = Variable(torch.cat(ext_labels, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
#         init_sent = ext_s_enc.init_sent(batch_size)
#         d_ext = torch.cat([init_sent, d_encoded[:-1]], dim = 0)

#         # Extractive Summarizer
#         ## Initialize the d_encoder
#         h, c = ext_d_enc.init_h0c0(batch_size)
#         h0 = Variable(h.data)
#         ## An input goes through the document encoder
#         output, hn, cn = ext_d_enc(d_ext, h, c)
#         ## Initialize the decoder
#         ### calculate p0, h_bar0, c_bar0
#         h_ = hn.squeeze(0)
#         c_ = cn.squeeze(0)
#         p = ext_extc.init_p(h0.squeeze(0), h_)
#         ### calculate p_t, h_bar_t, c_bar_t
#         d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
#         extract_probs = Variable(torch.zeros(len(sents_encoded))).cuda()
#         for i, (s, h) in enumerate(zip(sents_encoded, d_encoder_hiddens)):
#             h_, c_, p = ext_extc(s, h, h_, c_, p)
#             extract_probs[i] = p
#         ## Document Classifier
#         q = ext_d_classifier(extract_probs.view(-1,1), d_encoded.squeeze(1))
        
#         ## Optimize over the extractive examples
#         loss_ext = loss_fn_ext(extract_probs, ext_labels)
#         loss_dclass = loss_fn_dclass(q.view(1,-1), doc_class)
#         epoch_loss_ext += loss_ext.data
#         epoch_loss_dclass += loss_dclass.data
#         torch.autograd.backward([loss_ext, loss_dclass])
#         optimizer.step()
        
#         ## Measure the accuracy
# #         q_cpu = q.data
# #         c_cpu = doc_class.data
#         epoch_accuracy_ext += get_accuracy(extract_probs.data, ext_labels.data)

#         # Abstractive Summarizer
#         optimizer.zero_grad()
#         loss_abs = 0
#         ## Run through the encoder
# #         words = torch.cat(sents_ext, dim=1).t()
#         sents_ext = [sent for i,sent in enumerate(sents_raw)
#                      if extract_probs[i].data[0] > 0.5]
        
#         # skip if no sentences are selected as summaries
#         if len(sents_ext) == 0:
#             continue
#         words = torch.cat(sents_ext, dim=1).t()

#         abs_enc_hidden = abs_enc.init_hidden(batch_size)
#         abs_enc_output, abs_enc_hidden = abs_enc(words, abs_enc_hidden)
#         ## Remove to too long documents to tackle memory overflow
#         if len(abs_enc_output) > 6000:
#             continue
#         ## Run through the decoder
#         abs_dec_hidden = abs_dec.init_hidden(batch_size)
#         for target in doc.head:
#             target = Variable(torch.LongTensor([target]).unsqueeze(1)).cuda()
#             abs_dec_output, abs_dec_hidden, attn_weights = abs_dec(target, abs_dec_hidden, abs_enc_output)
#             loss_abs += loss_fn_abs(abs_dec_output, target.squeeze(1))

#         epoch_loss_abs += loss_abs.data
#         loss_abs.backward()
#         optimizer.step()

#     acc_ext = epoch_accuracy_ext / len(docs)
    
#     return epoch_loss_ext, epoch_loss_dclass, epoch_loss_abs, acc_ext

# def train(docs, n_epochs = 10, print_every = 1):
#     import time
    
#     for epoch in range(n_epochs):
#         start_time = time.time()
#         ext_loss, dclass_loss, abs_loss, ext_acc = run_epoch(docs)
#         ext_loss = ext_loss.cpu().numpy()[0]
#         dclass_loss = dclass_loss.cpu().numpy()[0]
#         abs_loss = abs_loss.cpu().numpy()[0]
#         end_time = time.time()
#         wall_clock = (end_time - start_time) / 60
#         if epoch % print_every == 0:
#             print('Epoch: %i',epoch)
#             print('Extractive Loss: ',ext_loss)
#             print('Classification Loss: ',dclass_loss)
#             print('Abstractive Loss: ',abs_loss)
#             print('Extractive Accuracy: ', ext_acc)
#             print('Training Time: %.3f(min)' % wall_clock)

# import os
# from os.path import join            
# # Training
# for n in range(5):
#     train(docs, n_epochs = 1, print_every = 1)
#     print('Epoch %2i finished.' % ((n+1)*10))
# #     model_dict = dict()
# #     model_dict['emb'] = emb
# #     model_dict['ext_s_enc'] = ext_s_enc
# #     model_dict['ext_d_enc'] = ext_d_enc
# #     model_dict['ext_extc'] = ext_extc
# #     model_dict['ext_d_classifier'] = ext_d_classifier
# #     model_dict['abs_enc'] = abs_enc
# #     model_dict['abs_dec'] = abs_dec

# #     data_dir = join(os.path.expanduser('~'), 'cs671-large')
# #     for name, model in model_dict.items():
# #         torch.save(model.state_dict(), join(data_dir, name + '_epoch_%2i' % ((n+1)*10)) )

Initialized the word embeddings.


KeyboardInterrupt: 