# Load the dataset

In [1]:
from modules.texts import Vocab, GloVeLoader
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from modules.layers import SentenceEncoder, DocumentEncoder, ExtractorCell
from modules.data import DocumentsGroup
from torch.utils.data import DataLoader

In [2]:
# Load the pretrained GloVe vectors from the user's home directory
path_glove = os.path.join(
    os.path.expanduser('~'),
    'data/NLP/word_embeddings/GloVe/glove.6B.300d.txt',
)
glove = GloVeLoader(path_glove)

# Load the documents; the vocabulary comes from the loaded document group
doc_file = './data/kaggle_news_rouge1.pkl'
docs = DocumentsGroup(doc_file)
vocab = docs.vocab

# Embedding table: one d-dimensional row per vocabulary entry
d = 300
emb = nn.Embedding(vocab.V, d)

# Copy the pretrained vector into the row of every vocabulary word.
# When the lookup raises KeyError the row keeps the values that
# nn.Embedding initialised it with.
for word in vocab.word2id:
    try:
        emb.weight.data[vocab[word]] = torch.from_numpy(glove[word])
    except KeyError:
        # No pretrained embedding exists for this word
        continue
# emb.weight.requires_grad = False # suppress updates

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.300d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 300


In [10]:
# Preview the first six documents (indices 0-5).
# Each document gets its own DataLoader; every step of that loader yields
# one (input tensor, target tensor) pair, which is printed as-is.
for i, doc in enumerate(docs):
    # Stop before building a loader for a document that is never printed
    if i > 5:
        break
    docloader = DataLoader(doc, batch_size=1, shuffle=False)
    print("Doc %i" % i)
    # Named sent_ids/label rather than input/target so the builtin `input`
    # is not shadowed at notebook scope
    for sent_ids, label in docloader:
        print(sent_ids, label)

Doc 0


Columns 0 to 10 
 52373  47069  13388   4758  15414   3583  33465  51004  51393   2866  10942

Columns 11 to 21 
 47056   5635  51459  44706  47588  47397  38095  33465  28971  11406   3816

Columns 22 to 32 
 47069  33649  48181   2866   6384  19265  16703   4758  50838  40082   5080

Columns 33 to 43 
 33465  43960  29880    198  47069  49073  46903   2732  40748   3583  50838

Columns 44 to 54 
 18922  47588  39765  51399   1265  22455  33300  24511  47069  10942  47056

Columns 55 to 65 
 28618  24521  11782  18910  24560  44706  47588   9983  38103   5795  51520

Columns 66 to 67 
   198  52374
[torch.LongTensor of size 1x68]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 10 
 52373   2732  24521  21514   7208  13738  47588   9983  47069  18302  33300

Columns 11 to 16 
 38103  33465   5980   2289    198  52374
[torch.LongTensor of size 1x17]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 10 
 52373  23280  47238  12042    180   4307  33329    293  14234  42545  

# Train

In [26]:
# Hyper-parameters
vocab_size = vocab.V                        # number of rows in the embedding table
emb_size = emb.weight.data.size(1)          # width of one embedding row
n_kernels = 50
kernel_sizes = [1, 2, 3]
pretrained = emb
# Passed to DocumentEncoder/ExtractorCell as their input size
# (presumably the width of an encoded sentence -- confirm in modules.layers)
sent_size = n_kernels * len(kernel_sizes)
hidden_size = 100
batch_size = 1

# Seed torch before the modules are built, for reproducibility
torch.manual_seed(7)

#### WARNING: No mini-batch processing ####
s_encoder = SentenceEncoder(
    vocab_size, emb_size, n_kernels, kernel_sizes, pretrained
)
d_encoder = DocumentEncoder(sent_size, hidden_size)
ext_cell = ExtractorCell(sent_size, hidden_size)

# Binary Cross-Entropy loss, and a single Adam optimizer over the
# parameters of all three modules (sentence encoder first, extractor last)
loss_fn = nn.BCELoss()
params = [
    weight
    for module in (s_encoder, d_encoder, ext_cell)
    for weight in module.parameters()
]
optimizer = optim.Adam(params, lr=.005)

def get_accuracy(probs, targets, verbose = False):
    '''
    Computes the fraction of predictions that agree with the labels

    A probability strictly greater than 0.5 counts as a prediction of 1,
    anything else as 0.

    Args:
        probs: iterable of extraction probabilities
        targets: ground truth labels, compared element-wise to the predictions
        verbose: when True, print the thresholded predictions
    Returns:
        the mean of the element-wise comparison (predictions == targets)
    '''
    import numpy as np

    # Threshold each probability into a hard 0/1 decision
    hard_labels = []
    for prob in probs:
        hard_labels.append(1 if prob > 0.5 else 0)
    preds = np.array(hard_labels)

    if verbose:
        print(preds)

    return np.mean(preds == targets)

def run_epoch(docs):
    '''
    Runs one training pass over the documents, taking one optimizer step
    per document

    Documents that yield at most one sentence are skipped; they contribute
    neither to the loss nor to the accuracy average.

    Args:
        docs: iterable of documents; each one is wrapped in a DataLoader
              that must yield (sentence input, target label) pairs
    Returns:
        (epoch_loss, epoch_accuracy): the sum of the per-document losses and
        the mean per-document accuracy over the documents that were actually
        trained on (0.0 when no document was trained on)
    '''
    epoch_loss = 0
    accuracy_sum = 0
    n_trained = 0  # documents that were not skipped

    for doc in docs:
        docloader = DataLoader(doc, batch_size=1, shuffle=False)
        # Encode the sentences in a document
        inputs = []
        targets = []
        for input_raw, target_raw in docloader:
            input_raw = Variable(input_raw).cuda()
            inputs.append(s_encoder(input_raw))
            # Labels stay on the CPU here; they are converted to float and
            # moved to the GPU once, after concatenation
            targets.append(target_raw)

        # Ignore if the content is a single sentence(no need to train)
        if len(inputs) <= 1:
            continue

        # Build the document representation using encoded sentences
        d_encoded = torch.cat(inputs, dim = 0)
        targets = Variable(torch.cat(targets, dim = 0).float().view(-1)).cuda()
        #### WARNING: "BEGINNING OF THE SENTENCE" embedding was initialized to zero ####
        # The encoder input is the sentence sequence shifted right by one:
        # a zero row first, then every encoded sentence except the last
        init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
        d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
        d_final = d_final.view(d_final.size(0), 1, d_final.size(1))

        # Initialize the d_encoder
        h, c = d_encoder.init_h0c0(batch_size)
        # Re-wrap the initial hidden state's data in a fresh Variable
        h0 = Variable(h.data)

        # An input goes through the d_encoder
        output, hn, cn = d_encoder(d_final, h, c)

        # Initialize the decoder
        ## calculate p0, h_bar0, c_bar0
        h_ = hn.squeeze(0)
        c_ = cn.squeeze(0)
        p = ext_cell.init_p(h0.squeeze(0), h_)

        ## calculate p_t, h_bar_t, c_bar_t
        d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
        extract_probs = Variable(torch.zeros(len(inputs))).cuda()
        for i, (s, h_enc) in enumerate(zip(inputs, d_encoder_hiddens)):
            h_, c_, p = ext_cell(s, h_enc, h_, c_, p)
            extract_probs[i] = p

        optimizer.zero_grad() # flush the gradients
        loss = loss_fn(extract_probs, targets)
        epoch_loss += loss.data.cpu().numpy()
        p_cpu = extract_probs.data.cpu().numpy()
        t_cpu = targets.data.cpu().numpy()
        accuracy_sum += get_accuracy(p_cpu, t_cpu)
        n_trained += 1
        loss.backward()
        optimizer.step()

    # Average only over the documents that produced an accuracy value;
    # dividing by the total document count would count every skipped
    # document as 0% accurate
    if n_trained == 0:
        return epoch_loss, 0.0
    return epoch_loss, accuracy_sum / n_trained


def train(docs, n_epochs = 10, print_every = 1):
    '''
    Calls run_epoch(docs) n_epochs times and reports progress

    Args:
        docs: the documents handed to run_epoch on every epoch
        n_epochs: how many epochs to run
        print_every: a report line is printed for each epoch whose index
                     is a multiple of this value
    '''
    import time

    report = 'Epoch:%2i / Loss:%.7f / Accuracy:%.7f / TrainingTime:%s(sec)'

    for epoch in range(n_epochs):
        # Wall-clock time of this single pass
        started = time.time()
        loss, accuracy = run_epoch(docs)
        elapsed = time.time() - started

        if not epoch % print_every:
            print(report % (epoch, loss, accuracy, elapsed))
        
# Initial training run: ten epochs, with a report line after every epoch
train(docs, print_every = 1, n_epochs = 10)

Epoch: 0 / Loss:2756.5773926 / Accuracy:0.5888386 / TrainingTime:181.99645042419434(sec)
Epoch: 1 / Loss:2606.7377930 / Accuracy:0.6322453 / TrainingTime:182.00997233390808(sec)
Epoch: 2 / Loss:2551.6467285 / Accuracy:0.6526475 / TrainingTime:180.93056678771973(sec)
Epoch: 3 / Loss:2564.9841309 / Accuracy:0.6527101 / TrainingTime:179.66590070724487(sec)
Epoch: 4 / Loss:2571.7143555 / Accuracy:0.6491518 / TrainingTime:179.64153933525085(sec)
Epoch: 5 / Loss:2495.8183594 / Accuracy:0.6672701 / TrainingTime:179.95737838745117(sec)
Epoch: 6 / Loss:2347.5646973 / Accuracy:0.6958365 / TrainingTime:180.11845922470093(sec)
Epoch: 7 / Loss:2370.2644043 / Accuracy:0.6910322 / TrainingTime:180.05119013786316(sec)
Epoch: 8 / Loss:2353.4150391 / Accuracy:0.6923257 / TrainingTime:180.23937892913818(sec)
Epoch: 9 / Loss:2342.6794434 / Accuracy:0.6908230 / TrainingTime:180.04811549186707(sec)


# Test

In [6]:
import numpy as np

def test(docloader):
    # Encode the sentences in a document
    inputs = []
    targets = []
    for input_raw, target_raw in docloader:
        input_raw = Variable(input_raw).cuda()
        inputs.append(s_encoder(input_raw))
        targets.append(target_raw.cuda())
        
    assert len(input) > 1 # content should be composed of more than one sentences
    
    # Build the document representation using encoded sentences
    d_encoded = torch.cat(inputs, dim = 0)
    targets = Variable(torch.cat(targets, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
    #### WARNING: "BEGINNING OF THE SENTENCE" embedding was initialized to zero ####
    init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
    d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
    d_final = d_final.view(d_final.size(0),1,d_final.size(1))

    # Initialize the d_encoder
    h, c = d_encoder.init_h0c0(batch_size)
    h0 = Variable(h.data)

    # An input goes through the d_encoder
    output, hn, cn = d_encoder(d_final, h, c)

    # Initialize the decoder
    ## calculate p0, h_bar0, c_bar0
    h_ = hn.squeeze(0)
    c_ = cn.squeeze(0)
    p = ext_cell.init_p(h0.squeeze(0), h_)

    ## calculate p_t, h_bar_t, c_bar_t
    d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
    extract_probs = Variable(torch.zeros(len(inputs))).cuda()
    for i, (s, h) in enumerate(zip(inputs, d_encoder_hiddens)):
        h_, c_, p = ext_cell(s, h, h_, c_, p)
        extract_probs[i] = p
    
    extract_probs = extract_probs.data.cpu().numpy()
    targets = targets.data.cpu().numpy()
    preds = np.array([1 if p > 0.5 else 0 for p in extract_probs])
    print(preds)
    accuracy = np.mean(preds == targets)
    
    return accuracy

# Evaluate on the first document only and show the resulting accuracy
accuracy = test(DataLoader(docs[0], shuffle=False, batch_size=1))
print(accuracy)

[1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
1.0


In [7]:
# vocab_size = vocab.V
# emb_size = emb.weight.data.size(1)
# n_kernels = 50
# kernel_sizes = [1,2,3,4,5]
# input_size = vocab.V
# hidden_size = 100
# batch_size = 1

# ####WARNING: No mini-batch processing#########
# s_encoder = SentenceEncoder(vocab_size,
#                             emb_size,
#                             n_kernels,
#                             kernel_sizes)
# d_encoder = DocumentEncoder(emb_size, hidden_size)
# ext_cell = ExtractorCell(input_size, hidden_size)

# # Binary Cross-Entropy loss
# loss_fn = nn.BCELoss()
# params = list(s_encoder.parameters()) + list(d_encoder.parameters()) + list(ext_cell.parameters())
# optimizer = optim.Adam(params, lr = .005)

# def run_epoch(input_docs, target_docs):
    
#     epoch_loss = 0
    
#     # Train over the whole document
#     for input, target in zip(input_docs, target_docs):
#         # flush the gradients
#         optimizer.zero_grad()

#         input = Variable(input).view(input.size(0),1,input.size(1)).cuda()
#         target = Variable(torch.FloatTensor(target)).cuda()

#         # Initialize the d_encoder
#         h, c = d_encoder.init_h0c0(batch_size)
#         h0 = Variable(h.data)

#         # An input goes through the d_encoder
#         output, hn, cn = d_encoder(input, h, c)

#         # Initialize the decoder
#         ## calculate p0, h_bar0, c_bar0
#         h_ = hn.squeeze(0)
#         c_ = cn.squeeze(0)
#         p = ext_cell.init_p(h0.squeeze(0), h_)

#         ## calculate p_t, h_bar_t, c_bar_t
#         d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
#         extract_probs = Variable(torch.zeros(input.size(0))).cuda()
#         for i, (s, h) in enumerate(zip(input, d_encoder_hiddens)):
#             h_, c_, p = ext_cell(s, h, h_, c_, p)
#             extract_probs[i] = p
#         loss = loss_fn(extract_probs, target)
#         epoch_loss += loss.data.cpu().numpy()
#         loss.backward()
#         optimizer.step()
    
#     return epoch_loss

# def train(input_docs, target_docs, n_epochs = 100, print_every = 10):
#     total_loss = 0.0
#     for epoch in range(n_epochs):
#         epoch_loss = run_epoch(input_docs, target_docs)
#         if epoch % print_every == 0:
#             print('Epoch: %2i / Loss: %.7f' % (epoch, epoch_loss))
        
# # Initial Training
# train(input_docs, target_docs, n_epochs = 100, print_every = 10)