# Load the dataset

In [1]:
from modules.texts import Vocab, GloVeLoader
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from modules.layers import SentenceEncoder, DocumentEncoder, ExtractorCell
from modules.data import DocumentDataset
from torch.utils.data import DataLoader

# Table of Contents
1. Load the Multiple Documents into the memory
2. 

In [2]:
# Load the pretrained embedding into the memory
path_glove = os.path.join(os.path.expanduser('~'),
             'data/NLP/word_embeddings/GloVe/glove.6B.300d.txt')
glove = GloVeLoader(path_glove)

# Load the dataset
file = './data/Trump.txt'
with open(file) as f:
#     vocab = Vocab(f.read(), top_k = 50)
    vocab = Vocab(f.read())

d = 300
emb = nn.Embedding(vocab.V, d)
for word in vocab.word2id:
    try:
        emb.weight.data[vocab[word]] = torch.from_numpy(glove[word])
    except KeyError as e:
        # Case when pretrained embedding for a word does not exist
        pass
# emb.weight.requires_grad = False # suppress updates

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.300d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 300


In [3]:
doc = DocumentDataset(file, vocab)
docloader = DataLoader(doc, batch_size=1, shuffle=False)

In [4]:
for input, target in docloader:
    print(input, target)



Columns 0 to 12 
  272    88   142   248     2    46   144     8     4     9     3   139   241

Columns 13 to 25 
   15    28    76   193   166   241   250   233     4   134   167   224   141

Columns 26 to 38 
   11     4    13     6    42    95   187     4   128   257    20    51    28

Columns 39 to 42 
  238   183     6   273
[torch.LongTensor of size 1x43]
 
 0
[torch.LongTensor of size 1x1]



Columns 0 to 12 
  272   248   257    46   134   241   163   270    60    47   166   203     6

Columns 13 to 13 
  273
[torch.LongTensor of size 1x14]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 12 
  272   128    90    27    91    79   118   241   260   217   166   241   251

Columns 13 to 16 
  166   181     6   273
[torch.LongTensor of size 1x17]
 
 1
[torch.LongTensor of size 1x1]



Columns 0 to 12 
  272    20   243     5   120    51     4   248   112   134   241   113   166

Columns 13 to 25 
  130   124    93    28   107   116   134   214   241   105   205    97    66

Co

# Train

In [14]:
vocab_size = vocab.V
emb_size = emb.weight.data.size(1)
n_kernels = 50
kernel_sizes = [1,2,3,4,5]
pretrained = emb
sent_size = len(kernel_sizes) * n_kernels
hidden_size = 100
batch_size = 1

####WARNING: No mini-batch processing#########
s_encoder = SentenceEncoder(vocab_size,
                            emb_size,
                            n_kernels,
                            kernel_sizes,
                            pretrained)
d_encoder = DocumentEncoder(sent_size, hidden_size)
ext_cell = ExtractorCell(sent_size, hidden_size)

# Binary Cross-Entropy loss
loss_fn = nn.BCELoss()
params = list(s_encoder.parameters()) + list(d_encoder.parameters()) + list(ext_cell.parameters())
optimizer = optim.Adam(params, lr = .005)

def run_epoch(docloader):
    
    epoch_loss = 0

    # Encode the sentences in a document
    inputs = []
    targets = []
    for input_raw, target_raw in docloader:
        input_raw = Variable(input_raw).cuda()
        inputs.append(s_encoder(input_raw))
        targets.append(target.cuda())
    
    # Build the document representation using encoded sentences
    d_encoded = torch.cat(inputs, dim = 0)
    targets = Variable(torch.cat(targets, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
    #### WARNING: "BEGINNING OF THE SENTENCE" embedding was initialized to zero ####
    init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
    d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
    d_final = d_final.view(d_final.size(0),1,d_final.size(1))

    # Initialize the d_encoder
    h, c = d_encoder.init_h0c0(batch_size)
    h0 = Variable(h.data)

    # An input goes through the d_encoder
    output, hn, cn = d_encoder(d_final, h, c)

    # Initialize the decoder
    ## calculate p0, h_bar0, c_bar0
    h_ = hn.squeeze(0)
    c_ = cn.squeeze(0)
    p = ext_cell.init_p(h0.squeeze(0), h_)

    ## calculate p_t, h_bar_t, c_bar_t
    d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
    extract_probs = Variable(torch.zeros(len(inputs))).cuda()
    for i, (s, h) in enumerate(zip(inputs, d_encoder_hiddens)):
        h_, c_, p = ext_cell(s, h, h_, c_, p)
        extract_probs[i] = p

    optimizer.zero_grad() # flush the gradients
    loss = loss_fn(extract_probs, targets)
    epoch_loss += loss.data.cpu().numpy()
    loss.backward()
    optimizer.step()
    
    return epoch_loss

def train(docloader, n_epochs = 100, print_every = 10):
    total_loss = 0.0
    for epoch in range(n_epochs):
        epoch_loss = run_epoch(docloader)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, epoch_loss))
        
# Initial Training
train(docloader, n_epochs = 100, print_every = 10)

Epoch:  0 / Loss: 0.6908168
Epoch: 10 / Loss: 0.0762508
Epoch: 20 / Loss: 0.0151254
Epoch: 30 / Loss: 0.0054811
Epoch: 40 / Loss: 0.0026618
Epoch: 50 / Loss: 0.0014242
Epoch: 60 / Loss: 0.0009217
Epoch: 70 / Loss: 0.0006863
Epoch: 80 / Loss: 0.0005508
Epoch: 90 / Loss: 0.0004623


# Test

In [45]:
import numpy as np

def test(docloader):
    # Encode the sentences in a document
    inputs = []
    targets = []
    for input_raw, target_raw in docloader:
        input_raw = Variable(input_raw).cuda()
        inputs.append(s_encoder(input_raw))
        targets.append(target.cuda())
    
    # Build the document representation using encoded sentences
    d_encoded = torch.cat(inputs, dim = 0)
    targets = Variable(torch.cat(targets, dim = 0).type(torch.FloatTensor).view(-1)).cuda()
    #### WARNING: "BEGINNING OF THE SENTENCE" embedding was initialized to zero ####
    init_sent = Variable(torch.zeros(1, d_encoded.size(1))).cuda()
    d_final = torch.cat([init_sent, d_encoded[:-1]], dim = 0)
    d_final = d_final.view(d_final.size(0),1,d_final.size(1))

    # Initialize the d_encoder
    h, c = d_encoder.init_h0c0(batch_size)
    h0 = Variable(h.data)

    # An input goes through the d_encoder
    output, hn, cn = d_encoder(d_final, h, c)

    # Initialize the decoder
    ## calculate p0, h_bar0, c_bar0
    h_ = hn.squeeze(0)
    c_ = cn.squeeze(0)
    p = ext_cell.init_p(h0.squeeze(0), h_)

    ## calculate p_t, h_bar_t, c_bar_t
    d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
    extract_probs = Variable(torch.zeros(len(inputs))).cuda()
    for i, (s, h) in enumerate(zip(inputs, d_encoder_hiddens)):
        h_, c_, p = ext_cell(s, h, h_, c_, p)
        extract_probs[i] = p
    
    extract_probs = extract_probs
    extract_probs = extract_probs.data.cpu().numpy()
    targets = targets.data.cpu().numpy()
    preds = np.array([1 if p > 0.5 else 0 for p in extract_probs])
    accuracy = np.mean(preds == targets)
    
    return accuracy

preds = test(docloader)
print(preds)

0.47619047619


In [5]:
# vocab_size = vocab.V
# emb_size = emb.weight.data.size(1)
# n_kernels = 50
# kernel_sizes = [1,2,3,4,5]
# input_size = vocab.V
# hidden_size = 100
# batch_size = 1

# ####WARNING: No mini-batch processing#########
# s_encoder = SentenceEncoder(vocab_size,
#                             emb_size,
#                             n_kernels,
#                             kernel_sizes)
# d_encoder = DocumentEncoder(emb_size, hidden_size)
# ext_cell = ExtractorCell(input_size, hidden_size)

# # Binary Cross-Entropy loss
# loss_fn = nn.BCELoss()
# params = list(s_encoder.parameters()) + list(d_encoder.parameters()) + list(ext_cell.parameters())
# optimizer = optim.Adam(params, lr = .005)

# def run_epoch(input_docs, target_docs):
    
#     epoch_loss = 0
    
#     # Train over the whole document
#     for input, target in zip(input_docs, target_docs):
#         # flush the gradients
#         optimizer.zero_grad()

#         input = Variable(input).view(input.size(0),1,input.size(1)).cuda()
#         target = Variable(torch.FloatTensor(target)).cuda()

#         # Initialize the d_encoder
#         h, c = d_encoder.init_h0c0(batch_size)
#         h0 = Variable(h.data)

#         # An input goes through the d_encoder
#         output, hn, cn = d_encoder(input, h, c)

#         # Initialize the decoder
#         ## calculate p0, h_bar0, c_bar0
#         h_ = hn.squeeze(0)
#         c_ = cn.squeeze(0)
#         p = ext_cell.init_p(h0.squeeze(0), h_)

#         ## calculate p_t, h_bar_t, c_bar_t
#         d_encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
#         extract_probs = Variable(torch.zeros(input.size(0))).cuda()
#         for i, (s, h) in enumerate(zip(input, d_encoder_hiddens)):
#             h_, c_, p = ext_cell(s, h, h_, c_, p)
#             extract_probs[i] = p
#         loss = loss_fn(extract_probs, target)
#         epoch_loss += loss.data.cpu().numpy()
#         loss.backward()
#         optimizer.step()
    
#     return epoch_loss

# def train(input_docs, target_docs, n_epochs = 100, print_every = 10):
#     total_loss = 0.0
#     for epoch in range(n_epochs):
#         epoch_loss = run_epoch(input_docs, target_docs)
#         if epoch % print_every == 0:
#             print('Epoch: %2i / Loss: %.7f' % (epoch, epoch_loss))
        
# # Initial Training
# train(input_docs, target_docs, n_epochs = 100, print_every = 10)