# Load the dataset

In [1]:
from modules.texts import Vocab, GloVeLoader
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from modules.layers import SentenceEncoder, DocumentEncoder, ExtractorCell
from modules.data import DocumentsGroup
from torch.utils.data import DataLoader

# Table of Contents
1. Load multiple documents into memory
2. Train
3. Test

In [2]:
# Read the pretrained GloVe vectors from a file under the user's home directory.
path_glove = os.path.join(
    os.path.expanduser('~'),
    'data/NLP/word_embeddings/GloVe/glove.6B.300d.txt',
)
glove = GloVeLoader(path_glove)

# Read the document collection (.pkl file) and take its vocabulary.
doc_file = './data/kaggle_news_rouge1.pkl'
docs = DocumentsGroup(doc_file)
vocab = docs.vocab

# Embedding table: one d-dimensional row per vocabulary entry.
d = 300
emb = nn.Embedding(vocab.V, d)

# Overwrite each row with the pretrained vector of its word where one exists.
for word in vocab.word2id:
    try:
        emb.weight.data[vocab[word]] = torch.from_numpy(glove[word])
    except KeyError:
        # No pretrained vector for this word: leave the row as nn.Embedding
        # initialised it.
        continue
# To keep the embedding fixed during training, uncomment:
# emb.weight.requires_grad = False

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.300d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 300


In [3]:
# Wrap the whole document collection in a DataLoader: one item per batch, in
# the collection's own order (shuffle disabled).
# NOTE(review): no active code visible in this notebook iterates this loader;
# the only call that takes `docloader` as an argument is commented out.
docloader = DataLoader(docs, batch_size=1, shuffle=False)

In [4]:
# Print every sentence of every document together with its extraction label.
for i, doc in enumerate(docs):
    # Each document is wrapped in its own loader that yields one
    # (sentence, target) pair per step, in the document's own order.
    docloader = DataLoader(doc, batch_size=1, shuffle=False)
    print("Doc %i" % i)
    # The loop variable is deliberately NOT called `input`: at notebook scope
    # that name would shadow the builtin input() for every later cell.
    for sentence, target in docloader:
        print(sentence, target)

Doc 0


Columns 0 to 12 
 1544  1362   377   142   439    95   979  1491  1518    74   296  1361   169

Columns 13 to 25 
 1522  1292  1385  1380  1112   979   848   312   102  1362   991  1415    74

Columns 26 to 38 
  190   563   476   142  1482  1176   149   979  1266   874    24  1362  1432

Columns 39 to 51 
 1355    66  1198    95  1482   551  1385  1166  1519    51   654   967   727

Columns 52 to 64 
 1362   296  1361   834   728   328   548   729  1292  1385   268  1113   171

Columns 65 to 67 
 1528    24  1545
[torch.LongTensor of size 1x68]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 12 
 1544    66   728   620   206   389  1385   268  1362   529   967  1113   979

Columns 13 to 16 
  179    63    24  1545
[torch.LongTensor of size 1x17]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 12 
 1544   682  1371   338    20   121   972    33   402  1239  1144   984   142

Columns 13 to 25 
  268  1362   529   313   171    74  1323  1382  1502   121  1362   785  1292


# Train

In [5]:
# ---- Sizes taken from the data ----
vocab_size = vocab.V
emb_size = emb.weight.data.size(1)   # second dimension of the embedding matrix
pretrained = emb

# ---- Hyper-parameters ----
n_kernels = 50
kernel_sizes = [1, 2, 3]
hidden_size = 100
# WARNING: no mini-batch processing -- one document is handled at a time.
batch_size = 1
# Sentence-vector size passed to the document encoder and the extractor cell
# (presumably n_kernels features per kernel size -- confirm in SentenceEncoder).
sent_size = n_kernels * len(kernel_sizes)

# ---- Model components ----
s_encoder = SentenceEncoder(vocab_size, emb_size, n_kernels, kernel_sizes, pretrained)
d_encoder = DocumentEncoder(sent_size, hidden_size)
ext_cell = ExtractorCell(sent_size, hidden_size)

# ---- Objective and optimiser ----
# Binary cross-entropy between extraction probabilities and 0/1 labels.
loss_fn = nn.BCELoss()
# One flat parameter list across the three components, in the order
# sentence encoder -> document encoder -> extractor cell.
params = [
    param
    for module in (s_encoder, d_encoder, ext_cell)
    for param in module.parameters()
]
optimizer = optim.Adam(params, lr=0.005)

def get_accuracy(probs, targets, verbose = False, threshold = 0.5):
    '''
    Calculates the accuracy for the predictions

    A probability strictly greater than `threshold` counts as a positive
    (extract) prediction; the accuracy is the fraction of predictions that
    equal the corresponding target.

    Args:
        probs: extraction probabilities (array-like); flattened to 1-D
        targets: ground truth 0/1 labels for extraction (array-like);
            flattened to 1-D
        verbose: when True, print the thresholded 0/1 predictions
        threshold: decision threshold, 0.5 by default

    Returns:
        Mean of (prediction == target) as a float; nan for empty input
        (numpy's mean of an empty array).

    Raises:
        ValueError: if probs and targets hold a different number of elements
    '''
    # Function-scope import: numpy is not among the notebook's top-level
    # imports that run before this function is defined.
    import numpy as np

    # Flatten both sides so that e.g. an (n,) / (n, 1) pair is compared
    # element by element instead of being broadcast to an (n, n) grid.
    probs = np.asarray(probs).reshape(-1)
    targets = np.asarray(targets).reshape(-1)
    if probs.shape != targets.shape:
        raise ValueError(
            'probs and targets must have the same number of elements, got %d and %d'
            % (probs.size, targets.size)
        )

    preds = (probs > threshold).astype(int)
    if verbose:
        print(preds)
    accuracy = np.mean(preds == targets)

    return accuracy

def run_epoch(docs):
    '''
    Runs one training pass over `docs`, taking one optimizer step per document.

    Relies on the module-level `s_encoder`, `d_encoder`, `ext_cell`, `loss_fn`,
    `optimizer`, `batch_size` and `get_accuracy`. Tensors are moved to the GPU
    with `.cuda()`, so a CUDA device is required.

    Args:
        docs: iterable of documents that also supports len(); each document is
            wrapped in a DataLoader that yields (input, target) pairs, one
            sentence per step

    Returns:
        (epoch_loss, mean_accuracy): the per-document losses summed over the
        epoch, and the per-document accuracies averaged over len(docs).

    Note:
        The final division raises ZeroDivisionError when len(docs) is 0.
    '''
    
    epoch_loss = 0
    epoch_accuracy = 0
    
    for doc in docs:
        docloader = DataLoader(doc, batch_size=1, shuffle=False)
        # Encode the sentences in a document, one sentence per loader step
        inputs = []
        targets = []
        for input_raw, target_raw in docloader:
            input_raw = Variable(input_raw).cuda()
            inputs.append(s_encoder(input_raw))
            targets.append(target_raw.cuda())
        # Build the document representation using encoded sentences
        d_encoded = torch.cat(inputs, dim = 0)
        # Labels are cast to float (BCELoss takes float targets), flattened to
        # 1-D and moved to the GPU
        targets = Variable(torch.cat(targets, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
        #### WARNING: the placeholder that precedes the first sentence is an all-zero vector ####
        init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
        # Shift by one: the zero row goes first and the last sentence is
        # dropped, so position t of d_final holds sentence t-1
        d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
        # Insert a middle axis of size 1
        # (presumably (seq_len, batch=1, features) -- TODO confirm in DocumentEncoder)
        d_final = d_final.view(d_final.size(0),1,d_final.size(1))

        # Initialize the d_encoder
        h, c = d_encoder.init_h0c0(batch_size)
        # Re-wrap the initial hidden state's data in a fresh Variable
        h0 = Variable(h.data)

        # An input goes through the d_encoder
        output, hn, cn = d_encoder(d_final, h, c)

        # Initialize the decoder
        ## calculate p0, h_bar0, c_bar0
        h_ = hn.squeeze(0)
        c_ = cn.squeeze(0)
        p = ext_cell.init_p(h0.squeeze(0), h_)

        ## calculate p_t, h_bar_t, c_bar_t
        d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
        extract_probs = Variable(torch.zeros(len(inputs))).cuda()
        # Step the extractor over (encoded sentence, encoder hidden) pairs; its
        # state (h_, c_) and previous probability p are fed back in each step.
        # Note that the loop rebinds `h`.
        for i, (s, h) in enumerate(zip(inputs, d_encoder_hiddens)):
            h_, c_, p = ext_cell(s, h, h_, c_, p)
            extract_probs[i] = p

        # Gradients are cleared after the forward pass but before backward()
        optimizer.zero_grad() # flush the gradients
        loss = loss_fn(extract_probs, targets)
        # Accumulates the loss as a numpy value (sum over documents, not a mean)
        epoch_loss += loss.data.cpu().numpy()
        p_cpu = extract_probs.data.cpu().numpy()
        t_cpu = targets.data.cpu().numpy()
        epoch_accuracy += get_accuracy(p_cpu, t_cpu)
        loss.backward()
        optimizer.step()
    
    return epoch_loss, epoch_accuracy/len(docs)



def train(docs, n_epochs = 100, print_every = 10):
    '''
    Runs run_epoch repeatedly and prints progress at a fixed interval.

    Args:
        docs: document collection, forwarded unchanged to run_epoch
        n_epochs: number of times run_epoch is called
        print_every: a progress line is printed for every epoch index that is
            divisible by this value (epoch 0 included)
    '''
    report = 'Epoch: %2i / Loss: %.7f / Accuracy: %.7f'
    for epoch in range(n_epochs):
        epoch_loss, epoch_acc = run_epoch(docs)
        if epoch % print_every != 0:
            # Not a reporting epoch: go straight on to the next pass.
            continue
        print(report % (epoch, epoch_loss, epoch_acc))
        
# Initial training: 10 epochs over all documents, reporting after every epoch.
# Disabled alternative (100 epochs on `docloader`, reporting every 10 epochs):
# train(docloader, n_epochs = 100, print_every = 10)
train(docs, n_epochs = 10, print_every = 1)

Epoch:  0 / Loss: 7.5550203 / Accuracy: 0.4950596
Epoch:  1 / Loss: 6.7682199 / Accuracy: 0.5621973
Epoch:  2 / Loss: 6.1075773 / Accuracy: 0.6845216
Epoch:  3 / Loss: 4.0982604 / Accuracy: 0.9053865
Epoch:  4 / Loss: 2.0957580 / Accuracy: 0.9649855
Epoch:  5 / Loss: 1.4692974 / Accuracy: 0.9693043
Epoch:  6 / Loss: 1.4155220 / Accuracy: 0.9645652
Epoch:  7 / Loss: 1.0345395 / Accuracy: 0.9794022
Epoch:  8 / Loss: 0.6849986 / Accuracy: 0.9944444
Epoch:  9 / Loss: 0.4549244 / Accuracy: 0.9956522


# Test

In [6]:
import numpy as np

def test(docloader):
    '''
    Runs the model forward over one document and returns its accuracy.

    Performs the same forward pass as run_epoch for a single document, without
    a loss or an optimizer step. Relies on the module-level `s_encoder`,
    `d_encoder`, `ext_cell`, `batch_size` and `get_accuracy`. Tensors are moved
    to the GPU with `.cuda()`, so a CUDA device is required.

    Args:
        docloader: iterable yielding (input, target) pairs for the sentences of
            ONE document, one sentence per step

    Returns:
        Fraction of sentences whose thresholded extraction probability equals
        the target label. The 0/1 predictions are printed as a side effect.
    '''
    # Encode the sentences in a document
    inputs = []
    targets = []
    for input_raw, target_raw in docloader:
        input_raw = Variable(input_raw).cuda()
        inputs.append(s_encoder(input_raw))
        targets.append(target_raw.cuda())
    # Build the document representation using encoded sentences
    d_encoded = torch.cat(inputs, dim = 0)
    targets = Variable(torch.cat(targets, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
    #### WARNING: the placeholder that precedes the first sentence is an all-zero vector ####
    init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
    # Shift by one: zero row first, last sentence dropped
    d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
    d_final = d_final.view(d_final.size(0),1,d_final.size(1))

    # Initialize the d_encoder
    h, c = d_encoder.init_h0c0(batch_size)
    h0 = Variable(h.data)

    # An input goes through the d_encoder
    output, hn, cn = d_encoder(d_final, h, c)

    # Initialize the decoder
    ## calculate p0, h_bar0, c_bar0
    h_ = hn.squeeze(0)
    c_ = cn.squeeze(0)
    p = ext_cell.init_p(h0.squeeze(0), h_)

    ## calculate p_t, h_bar_t, c_bar_t
    d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
    extract_probs = Variable(torch.zeros(len(inputs))).cuda()
    for i, (s, h) in enumerate(zip(inputs, d_encoder_hiddens)):
        h_, c_, p = ext_cell(s, h, h_, c_, p)
        extract_probs[i] = p

    extract_probs = extract_probs.data.cpu().numpy()
    targets = targets.data.cpu().numpy()
    # Delegate to the shared helper instead of repeating its logic here:
    # get_accuracy thresholds at 0.5, prints the predictions when verbose is
    # set, and returns the mean of (prediction == target).
    return get_accuracy(extract_probs, targets, verbose = True)

# Evaluate on the first document only.
# NOTE(review): docs[0] appears to be part of the collection passed to
# train(docs) above, so this is a check on training data rather than a
# held-out evaluation.
accuracy = test(DataLoader(docs[0], batch_size=1, shuffle=False))
print(accuracy)

[1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
1.0


In [7]:
# vocab_size = vocab.V
# emb_size = emb.weight.data.size(1)
# n_kernels = 50
# kernel_sizes = [1,2,3,4,5]
# input_size = vocab.V
# hidden_size = 100
# batch_size = 1

# ####WARNING: No mini-batch processing#########
# s_encoder = SentenceEncoder(vocab_size,
#                             emb_size,
#                             n_kernels,
#                             kernel_sizes)
# d_encoder = DocumentEncoder(emb_size, hidden_size)
# ext_cell = ExtractorCell(input_size, hidden_size)

# # Binary Cross-Entropy loss
# loss_fn = nn.BCELoss()
# params = list(s_encoder.parameters()) + list(d_encoder.parameters()) + list(ext_cell.parameters())
# optimizer = optim.Adam(params, lr = .005)

# def run_epoch(input_docs, target_docs):
    
#     epoch_loss = 0
    
#     # Train over the whole document
#     for input, target in zip(input_docs, target_docs):
#         # flush the gradients
#         optimizer.zero_grad()

#         input = Variable(input).view(input.size(0),1,input.size(1)).cuda()
#         target = Variable(torch.FloatTensor(target)).cuda()

#         # Initialize the d_encoder
#         h, c = d_encoder.init_h0c0(batch_size)
#         h0 = Variable(h.data)

#         # An input goes through the d_encoder
#         output, hn, cn = d_encoder(input, h, c)

#         # Initialize the decoder
#         ## calculate p0, h_bar0, c_bar0
#         h_ = hn.squeeze(0)
#         c_ = cn.squeeze(0)
#         p = ext_cell.init_p(h0.squeeze(0), h_)

#         ## calculate p_t, h_bar_t, c_bar_t
#         d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
#         extract_probs = Variable(torch.zeros(input.size(0))).cuda()
#         for i, (s, h) in enumerate(zip(input, d_encoder_hiddens)):
#             h_, c_, p = ext_cell(s, h, h_, c_, p)
#             extract_probs[i] = p
#         loss = loss_fn(extract_probs, target)
#         epoch_loss += loss.data.cpu().numpy()
#         loss.backward()
#         optimizer.step()
    
#     return epoch_loss

# def train(input_docs, target_docs, n_epochs = 100, print_every = 10):
#     total_loss = 0.0
#     for epoch in range(n_epochs):
#         epoch_loss = run_epoch(input_docs, target_docs)
#         if epoch % print_every == 0:
#             print('Epoch: %2i / Loss: %.7f' % (epoch, epoch_loss))
        
# # Initial Training
# train(input_docs, target_docs, n_epochs = 100, print_every = 10)