# Load the data

In [1]:
from custom_utils.preprocess import Vocab
# Read the whole corpus into memory as a single string.
with open('./data/dl_history.txt') as f:
    text = f.read()

# Build the vocabulary from the raw text. `Vocab` is a project helper whose
# internals are not visible here; presumably `top_k` caps the vocabulary size
# (the cell prints vocab.V == 50) -- TODO confirm against custom_utils.preprocess.
vocab = Vocab(text, top_k = 50)

# Vocabulary size; later cells reuse vocab.V as the model's input_size.
print(vocab.V)

# Convert the text with the project helper. Each element of `sents` is later
# passed to vocab.sent2onehot.
sents = vocab.sents2id(text)
# print(sents)
# print(vocab.id2sents(sents))
# print(vocab[0])
# print(vocab[vocab[0]])

# One-hot encode the first element as a quick check. `onehot` is only used by
# the commented-out debug prints below.
onehot = vocab.sent2onehot(sents[0])
# print(onehot.size())
# print(vocab.onehot2sent(onehot))

import numpy as np

# Build the training inputs: one one-hot tensor per element of `sents`
# (the cell output shows FloatTensors of size <rows> x 50).
input_docs = [vocab.sent2onehot(sent) for sent in sents]
# Build placeholder targets: for each document, a plain Python list holding one
# random 0/1 label per row of its input tensor. Seeded so the labels are
# reproducible across runs.
np.random.seed(0)
target_docs = [np.random.randint(2, size=len(sent)).tolist()
               for sent in input_docs]

def generate_batch(inputs, targets, batch_size = 100):
    '''Yield consecutive, non-overlapping mini-batches of (inputs, targets).

    Args:
        inputs: sliceable sequence of samples (e.g. a list of tensors).
        targets: sliceable sequence aligned with `inputs`.
        batch_size: number of samples per batch.

    Yields:
        Tuples (inputs[start:stop], targets[start:stop]) where `start` advances
        by `batch_size` each step.

    Note:
        Only full batches are produced: a trailing remainder shorter than
        `batch_size` is dropped, so every yielded batch has the same length.
    '''
    n_batches = len(inputs) // batch_size
    for batch_idx in range(n_batches):
        # Convert the batch index into a sample offset. Slicing with the raw
        # index would make consecutive batches overlap by batch_size - 1 items.
        start = batch_idx * batch_size
        stop = start + batch_size
        yield inputs[start:stop], targets[start:stop]

# Peek at the first mini-batch (3 documents with their label lists).
next(generate_batch(input_docs, target_docs, batch_size = 3))

50


([
      0     0     0  ...      0     1     0
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0     0
         ...          ⋱          ...       
      0     0     0  ...      1     0     0
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0     1
  [torch.FloatTensor of size 43x50], 
      0     0     0  ...      0     1     0
      0     0     0  ...      0     0     0
      0     0     0  ...      1     0     0
         ...          ⋱          ...       
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0     1
  [torch.FloatTensor of size 30x50], 
      0     0     0  ...      0     1     0
      0     0     0  ...      0     0     0
      0     0     0  ...      1     0     0
         ...          ⋱          ...       
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0     0
      0     0     0  ...      0     0    

For a reference on general mini-batch processing in RNNs, see [here](https://www.quora.com/How-are-inputs-fed-into-the-LSTM-RNN-network-in-mini-batch-method).

# Document Encoder

In [3]:
from modules.layers import DocumentEncoder

# Sentence Extractor

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable

[What operations are allowed within `forward`? (See fmassa's answers.)](https://discuss.pytorch.org/t/nn-module-with-multiple-inputs/237/3)

In [167]:
class ExtractorCell(nn.Module):
    '''One step of the sentence extractor: an LSTM cell plus a scoring layer.

    The cell weights the previous sentence representation by the previous
    extraction probability, advances its own LSTM state, and scores the
    concatenation of an encoder hidden state with the new extractor hidden
    state to produce the next extraction probability.
    '''

    def __init__(self, input_size, hidden_size):
        '''
        input_size: feature size of a sentence representation `s`
        hidden_size: size of the encoder / extractor hidden states
        '''
        super().__init__()

        # arguments
        self.input_size = input_size
        self.hidden_size = hidden_size

        # layers and operations
        self.lstmc = nn.LSTMCell(input_size, hidden_size)
        # Scores the concatenation [h ; h_bar], hence hidden_size * 2 inputs.
        self.h2p = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()

        # Parameters are moved to the GPU whenever one is available, so callers
        # must pass tensors that live on the same device.
        if torch.cuda.is_available():
            self.cuda()

    def _extract_prob(self, h_enc, h_ext):
        '''Shared scoring step: sigmoid(h2p([h_enc ; h_ext])).

        h_enc: (batch, hidden_size)
        h_ext: (batch, hidden_size)
        returns: (batch, 1)
        '''
        # nn.Linear is applied over the last dimension, so the whole batch is
        # scored in one call. No per-row loop or in-place writes into a
        # preallocated buffer are needed, and the result stays on the input's
        # device.
        h_cat = torch.cat([h_enc, h_ext], dim = 1)
        return self.sigmoid(self.h2p(h_cat))

    def forward(self, s, h, h_, c_, p):
        '''
        s: s_{t-1} (batch, input_size)
        h: h_t (batch, hidden_size)
        h_: hbar_{t-1} (batch, hidden_size)
        c_: cbar_{t-1} (batch, hidden_size)
        p: p_{t-1}. (batch, 1)

        returns: (hbar_t, cbar_t, p_t) with shapes
                 (batch, hidden_size), (batch, hidden_size), (batch, 1)
        '''
        # Scale the previous sentence by its extraction probability before
        # feeding it to the extractor LSTM.
        s_weighted = p.expand_as(s) * s
        h_, c_ = self.lstmc(s_weighted, (h_, c_))
        p = self._extract_prob(h, h_)
        return h_, c_, p

    def init_p(self, h0, hn):
        '''Initial extraction probability p_0 computed from two hidden states.

        h0: (batch, hidden_size)
        hn: (batch, hidden_size)
        returns: p0 (batch, 1)
        '''
        return self._extract_prob(h0, hn)

In [168]:
# Instantiate the cell with input_size=50 and hidden_size=100,
# then display the module summary.
extc = ExtractorCell(50, 100)
extc

ExtractorCell (
  (lstmc): LSTMCell(50, 100)
  (h2p): Linear (200 -> 1)
  (sigmoid): Sigmoid ()
)

In [169]:
# Test
s = Variable(input_docs[0][0:1]).cuda()
h = Variable(torch.zeros(1, 100)).cuda()
h_ = Variable(torch.randn(1, 100)).cuda()
c_ = Variable(torch.randn(1, 100)).cuda()
p = extc.init_p(h,h_)

extc(s, h, h_, c_, p)

(Variable containing:
 
 Columns 0 to 9 
 -0.0784  0.2611  0.0797  0.5101 -0.1945 -0.1016  0.2735  0.1839  0.3074  0.0828
 
 Columns 10 to 19 
  0.1248  0.1799 -0.3553  0.1724  0.0806 -0.0822 -0.1091 -0.0355 -0.1199  0.0218
 
 Columns 20 to 29 
  0.0328  0.0212  0.5700 -0.0746 -0.0072  0.0555 -0.1197 -0.4433 -0.2899  0.1244
 
 Columns 30 to 39 
 -0.1667 -0.1599 -0.2223 -0.0585  0.1371  0.0014 -0.2806  0.2987 -0.4335 -0.0183
 
 Columns 40 to 49 
  0.0488  0.4721 -0.1027  0.0398 -0.0853 -0.6915 -0.5837 -0.1448 -0.1069 -0.0568
 
 Columns 50 to 59 
 -0.2179  0.2284 -0.0984 -0.0027  0.3182  0.3011 -0.1427  0.0650 -0.0041  0.1462
 
 Columns 60 to 69 
  0.6825  0.0298 -0.2918 -0.1235  0.3215  0.2051  0.3010  0.2994  0.2406  0.2883
 
 Columns 70 to 79 
  0.0498  0.0456  0.1877 -0.1237  0.1274 -0.4455  0.1771  0.0085  0.1104  0.0432
 
 Columns 80 to 89 
  0.1121  0.0277 -0.1032  0.1760  0.0424  0.1090  0.0785  0.1958  0.4948  0.1290
 
 Columns 90 to 99 
 -0.1289  0.0440 -0.2039 -0.2040 -0.0912 

# Test

In [170]:
from modules.layers import DocumentEncoder
from torch.nn.utils.rnn import pad_packed_sequence
from custom_utils.packing import pack, unpack
# Model dimensions; documents are processed one at a time (batch_size = 1).
input_size = vocab.V
hidden_size = 100
batch_size = 1

####WARNING: No mini-batch processing#########

encoder = DocumentEncoder(input_size, hidden_size)
extc = ExtractorCell(input_size, hidden_size)
        
# Forward pass only (no loss, no optimizer step) over all the documents.
# NOTE(review): `input` shadows the Python builtin for the rest of the kernel
# session, and `target` is unpacked but not used in this cell.
for input, target in zip(input_docs, target_docs):
    # Reshape (n_rows, n_features) -> (n_rows, 1, n_features); requires a GPU.
    input = Variable(input).view(input.size(0),1,input.size(1)).cuda()
    
    # Initialize the encoder
    h, c = encoder.init_h0c0(batch_size)
    # Re-wrap the raw data so h0 is a separate Variable from `h`.
    h0 = Variable(h.data)
    
    # An input goes through the encoder
    output, hn, cn = encoder(input, h, c)
    
    # Initialize the decoder
    ## calculate p0, h_bar0, c_bar0
    # squeeze(0) drops the leading dimension of the encoder's final states
    # (presumably the num_layers axis -- TODO confirm against DocumentEncoder).
    h_ = hn.squeeze(0)
    c_ = cn.squeeze(0)
    p = extc.init_p(h0.squeeze(0), h_)
    
    ## calculate p_t, h_bar_t, c_bar_t
    # `i` counts the extractor steps; the loop variable `h` rebinds the
    # encoder state name `h` assigned above.
    i = 0
    for s, h in zip(input, output):
        h_, c_, p = extc(s, h, h_, c_, p)
        i += 1
    # Number of steps taken for this document (one per row of `input`).
    print(i)

43
30
61
15
18
5


# Train

In [171]:
?nn.NLLLoss

In [172]:
?nn.BCELoss

In [177]:
from modules.layers import DocumentEncoder
import torch.optim as optim

# Model dimensions; documents are processed one at a time (batch_size = 1).
input_size = vocab.V
hidden_size = 100
batch_size = 1

####WARNING: No mini-batch processing#########

# Fresh, untrained modules; these rebind the `encoder` / `extc` globals.
encoder = DocumentEncoder(input_size, hidden_size)
extc = ExtractorCell(input_size, hidden_size)

# Binary Cross-Entropy loss
loss_fn = nn.BCELoss()
# A single optimizer updates the encoder and the extractor jointly.
params = list(encoder.parameters()) + list(extc.parameters())
optimizer = optim.Adam(params, lr = .005)

def run_epoch(input_docs, target_docs):
    '''Run one training pass over all documents and return the summed loss.

    Each document is processed on its own and triggers one optimizer step.
    Relies on the notebook-level globals `encoder`, `extc`, `loss_fn`,
    `optimizer` and `batch_size`.

    Args:
        input_docs: iterable of per-document 2-D input tensors; each one is
            reshaped below to (n_rows, 1, n_features).
        target_docs: iterable of per-document label sequences, each converted
            with torch.FloatTensor.

    Returns:
        The sum over documents of the per-document loss values.
    '''
    
    epoch_loss = 0
    
    # Train over the whole document
    # NOTE(review): `input` shadows the Python builtin inside this function.
    for input, target in zip(input_docs, target_docs):
        # flush the gradients
        optimizer.zero_grad()

        # NOTE(review): .cuda() is unconditional, so training requires a GPU.
        input = Variable(input).view(input.size(0),1,input.size(1)).cuda()
        target = Variable(torch.FloatTensor(target)).cuda()

        # Initialize the encoder
        h, c = encoder.init_h0c0(batch_size)
        # Re-wrap the raw data so h0 is a separate Variable from `h`.
        h0 = Variable(h.data)

        # An input goes through the encoder
        output, hn, cn = encoder(input, h, c)

        # Initialize the decoder
        ## calculate p0, h_bar0, c_bar0
        h_ = hn.squeeze(0)
        c_ = cn.squeeze(0)
        p = extc.init_p(h0.squeeze(0), h_)

        ## calculate p_t, h_bar_t, c_bar_t
        # Prepend h0 and drop the last encoder output, so the i-th input row is
        # paired with the i-th entry of this sequence.
        encoder_hiddens = torch.cat((h0, output[:-1]), 0) #h0 ~ h_{n-1}
        # One probability slot per row of the document, filled in step by step.
        extract_probs = Variable(torch.zeros(input.size(0))).cuda()
        for i, (s, h) in enumerate(zip(input, encoder_hiddens)):
            h_, c_, p = extc(s, h, h_, c_, p)
            extract_probs[i] = p
        loss = loss_fn(extract_probs, target)
        # The loss is accumulated as a NumPy value, so the returned total is a
        # NumPy object rather than a plain Python float.
        epoch_loss += loss.data.cpu().numpy()
        loss.backward()
        optimizer.step()
    
    return epoch_loss

def train(input_docs, target_docs, n_epochs = 100, print_every = 10):
    '''Train for `n_epochs` passes, reporting the epoch loss periodically.

    Args:
        input_docs: per-document inputs, forwarded unchanged to run_epoch.
        target_docs: per-document targets, forwarded unchanged to run_epoch.
        n_epochs: number of passes over the data.
        print_every: a progress line is printed for every epoch whose
            zero-based index is a multiple of this value.

    Returns:
        None. Progress is reported on stdout only.
    '''
    for epoch_idx in range(n_epochs):
        loss_value = run_epoch(input_docs, target_docs)
        # Skip the report unless this epoch falls on the reporting interval.
        if epoch_idx % print_every != 0:
            continue
        print('Epoch: %2i / Loss: %.7f' % (epoch_idx, loss_value))
        
# Initial training run: 100 epochs, printing the summed epoch loss every 10th epoch.
train(input_docs, target_docs, n_epochs = 100, print_every = 10)

Epoch:  0 / Loss: 4.1813889
Epoch: 10 / Loss: 2.5383389
Epoch: 20 / Loss: 3.0810349
Epoch: 30 / Loss: 2.1359465
Epoch: 40 / Loss: 1.5895779
Epoch: 50 / Loss: 0.8240650
Epoch: 60 / Loss: 0.5068389
Epoch: 70 / Loss: 0.3069730
Epoch: 80 / Loss: 0.1650973
Epoch: 90 / Loss: 0.3713593


[Good reference on PackedSequence](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
# Pad the packed batch back into a dense tensor. pad_packed_sequence returns
# the padded tensor together with the true length of every sequence.
# NOTE(review): `input_packed` is not defined in any earlier executed cell of
# this notebook; it must be created (e.g. with the project's `pack` helper)
# before this cell can run.
unpacked, lengths = pad_packed_sequence(input_packed)
# Strip the padding: for each batch entry keep only its first lengths[batch]
# time steps (indexing is time-major: [time, batch, feature]).
unpacked_masked = [unpacked[:lengths[batch], batch, :]
                   for batch in range(len(lengths))]

In [None]:
# Inspect the shape of the padded tensor produced by pad_packed_sequence.
unpacked.size()

In [None]:
# Inspect the stored encoder hidden states.
# NOTE(review): in the executed cells above `encoder_hiddens` only exists as a
# local inside run_epoch, so this relies on a global left over from another cell.
encoder_hiddens

In [None]:
# Draft of a mini-batch encoder -> extractor pipeline (this cell was never executed).
# NOTE(review): as written it cannot run on a fresh kernel:
#   - `SentenceExtractor` has no live definition in this notebook. The only
#     version is commented out, has no `init_s`, and its forward returns four
#     values rather than the three unpacked below.
#   - `sentences` is only assigned in a later cell.
#   - `generate_batch(sentences, batch_size)` passes batch_size as the `targets`
#     argument, and the generator yields (inputs, targets) tuples, not tensors.
input_size = vocab.V
hidden_size = 100
num_layers = 1
batch_size = 2

encoder = DocumentEncoder(input_size, hidden_size, num_layers)
decoder = SentenceExtractor(input_size, hidden_size)

h, c = encoder.init_h0c0(batch_size)
# Keep a separate Variable holding the initial hidden state.
h0 = Variable(h.data)
output = None
encoder_hiddens = [] # the list that stores the encoder hidden states(except for h0)

# Encoder pass: carry (h, c) across mini-batches and record each hidden state.
for minibatch in generate_batch(sentences, batch_size):
    minibatch = minibatch.cuda()
    _, h, c = encoder(minibatch, h, c)
    encoder_hiddens.append(Variable(h.data))

p0 = decoder.init_p(h0, h)
s0 = decoder.init_s(batch_size)
# initial step for the decoder
p, h_bar, c_bar = decoder(s0, h, c, h, p0)

# Decoder pass: replay the mini-batches alongside the recorded encoder states
# and collect the first probability of each step as a NumPy scalar.
extract_probs = []
for minibatch, h in zip(generate_batch(sentences, batch_size), encoder_hiddens):
    minibatch = minibatch.cuda()
    p, h_bar, c_bar = decoder(minibatch, h_bar, c_bar, h, p)
    extract_probs.append(p.view(-1).data.cpu().numpy()[0])

print(extract_probs, len(extract_probs))

# Train

In [None]:
?nn.CrossEntropyLoss

In [None]:
# Draft training loop on random data (this cell was never executed).
# NOTE(review): incomplete as written:
#   - `n_epochs` is not defined at notebook level (it is only a parameter of train()).
#   - `SentenceExtractor` has no live definition in this notebook.
#   - `loss = loss_fn` binds the loss module itself; no loss value is computed
#     and there is no backward() / optimizer step, so nothing is trained.
#   - `generate_batch(sentences, batch_size)` passes batch_size as the `targets`
#     argument, and the generator yields (inputs, targets) tuples.
import torch
# Random stand-in data: 300 "sentences" with 50 features each.
n_sentences = 300
embedding_dim = 50
sentences = torch.rand(n_sentences, embedding_dim)

input_size = 50
hidden_size = 100
num_layers = 1
batch_size = 1

encoder = DocumentEncoder(input_size, hidden_size, num_layers)
decoder = SentenceExtractor(input_size, hidden_size)

loss_fn = nn.CrossEntropyLoss()

for epoch in range(n_epochs):
    # Initialised each epoch but never updated in this draft.
    epoch_loss = 0
    for minibatch in generate_batch(sentences, batch_size):
        # Initialize the Encoder
        h, c = encoder.init_h0c0(batch_size)
        h0 = Variable(h.data)
        encoder_hiddens = [] # the list that stores the encoder hidden states(except for h0)

        minibatch = minibatch.cuda()
        _, h, c = encoder(minibatch, h, c)
        encoder_hiddens.append(Variable(h.data))

        # Initialize the Decoder
        loss = loss_fn
        p0 = decoder.init_p(h0, h)
        s0 = decoder.init_s(batch_size)
        p, h_bar, c_bar = decoder(s0, h, c, h, p0) # initial step for the decoder
        extract_probs = []

        # Collect the extraction probability produced at each decoder step.
        for h in encoder_hiddens:
            p, h_bar, c_bar = decoder(minibatch, h_bar, c_bar, h, p)
            extract_probs.append(p)


# Document Classifier

In [None]:
# # Train the network(requires a target sequence)

# input_size = vocab.V
# hidden_size = vocab.V
# output_size = vocab.V
# num_layers = 1
# batch_size = 1

# encoder = Encoder(input_size, hidden_size).cuda()
# decoder = Decoder(hidden_size, output_size).cuda()

# import torch.optim as optim

# loss_fn = nn.MSELoss()
# params = list(encoder.parameters()) + list(decoder.parameters())
# optimizer = optim.Adam(params, lr = .005)

# def run_epoch(inputs, targets):
#     # flush the gradients
#     optimizer.zero_grad()
    
#     # initial hidden state(h0)
#     h0,c0 = encoder.init_h0c0(batch_size = 6)
#     # training loss
#     loss = 0
    
#     # Feed the training data
#     targets = [Variable(tensor).cuda() for tensor in targets]
#     inputs_packed, orders = pack(inputs)
    
#     # Run a RNN encoder-decoder through the training samples  
#     _, h_encoder, c_encoder = encoder(inputs_packed, h0, c0)
#     y0 = decoder.init_pred(h_encoder)
#     y, h_decoder, c_decoder = decoder(y0, h_encoder, c_encoder)
    
#     for out, target in zip(outputs, targets):
#         loss += loss_fn(out, target)
        
#     loss.backward()
#     optimizer.step()
    
#     return outputs, loss.data[0]

# def train(inputs, targets, n_epochs = 100, print_every = 10):
#     total_loss = 0.0
#     for epoch in range(1, n_epochs + 1):
#         output, loss = run_epoch(inputs, targets)
#         if epoch % print_every == 0:
#             print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
            
# def test(input_sent):
#     h, c = rnn.init_h0c0()
#     seq_len = input_sent.size()[0]
#     input_sent = Variable(input_sent.view(seq_len, batch_size, -1))
    
#     output, h, c = rnn(input_sent, h, c)
#     _, argmaxs = torch.max(output, dim = 0)
    
#     # flatten the sorted indices
#     sent = argmaxs.view(-1).data.cpu().numpy().tolist()
#     for i in sent:
#         print(vocab[i],end=' ')
        
# # run_epoch(inputs, targets)
# train(inputs, targets, n_epochs = 1000, print_every = 100)
# torch.manual_seed(7)
# test(inputs[0].cuda())

# CNN Sentence Encoder

In [None]:
class CNNSentenceEncoder(nn.Module):
    '''Placeholder for a CNN-based sentence encoder (not implemented yet).'''

    def __init__(self):
        # nn.Module sets up its parameter / submodule registries in its own
        # __init__. Without this call, assigning a layer or calling
        # .parameters() / .cuda() on the instance raises an error.
        super().__init__()

    def init_s0(self):
        '''Produces a start-of-document character'''
        # TODO: not implemented yet; currently returns None.
        return None

In [None]:
# class SentenceExtractor(nn.Module):
    
#     def __init__(self, input_size, hidden_size, num_layers = 1):
#         super().__init__()
#         # arguments
#         self.input_size = input_size 
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
        
#         # parameters
#         self.w_y = nn.Parameter(torch.randn(1, hidden_size * 2))
        
#         # layers and operations
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
#         self.sigmoid = nn.Sigmoid()
        
#         if torch.cuda.is_available():
#             self.cuda()
            
#     def forward(self, s, h_bar, c_bar, h, p):
#         '''
#         Args:
#             # Raw Inputs
#             s: (seq_len) x (batch_size) x (input_size). s_t.
#             h: (seq_len) x (num_layers) x (batch_size) x (hidden_size). h_t from the encoder.
            
#             # Previous hidden states
#             h_bar: (num_layers=1) x (batch_size) x (hidden_size)
#             c_bar: (num_layers=1) x (batch_size) x (hidden_size)
            
#             p: (batch_size) x (1). torch FloatTensor.
            
#         Returns:
#             output
#         '''
        
#         s_weighted = p.unsqueeze(0).expand(s.size()) * s
#         output, (h_bar,c_bar) = self.lstm(s_weighted, (h_bar, c_bar))
        
#         # h_cat: batch_size x hidden_size*2 x 1
#         h_cat = torch.cat((h, h_bar), dim = -1).squeeze(0).unsqueeze(-1)
        
#         # batch_size x 1 x hidden_size*2
#         w_y = self.w_y
#         W_y = w_y.expand(h_cat.size(0), w_y.size(0), w_y.size(1))
        
#         # batch_size x 1. torch.bmm is a batch matrix multiplication
#         logit = torch.bmm(W_y, h_cat).view(-1, 1)
        
#         # batch_size x 1
#         p = self.sigmoid(logit)
        
#         return p, output, h_bar, c_bar
    
#     def init_p(self, h0, hn):
#         '''
#         Args:
#             h0: initial hidden state from the encoder.
#             (num_layers=1) x (batch_size) x (hidden_size)
#             hn: final hidden state from the encoder. 
#             (num_layers=1) x (batch_size) x (hidden_size)
            
#         Returns:
#             p0: initial probability. (batch_size) x 1
#         '''
#         # h_cat: batch_size x hidden_size*2 x 1
#         h_cat = torch.cat((h0, hn), dim = -1).squeeze(0).unsqueeze(-1)
        
#         # batch_size x 1 x hidden_size*2
#         w_y = self.w_y
#         W_y = w_y.expand(batch_size, w_y.size(0), w_y.size(1))
        
#         # batch_size x 1. torch.bmm is a batch matrix multiplication
#         logit = torch.bmm(W_y, h_cat).view(-1, 1)
        
#         # batch_size x 1
#         p0 = self.sigmoid(logit)
        
#         if torch.cuda.is_available():
#             p0 = p0.cuda()
        
#         return p0

In [42]:
# from torch.nn.utils.rnn import pad_packed_sequence
# from custom_utils.packing import pack, unpack
# input_size = vocab.V
# hidden_size = 100
# num_layers = 1
# batch_size = 2

# encoder = DocumentEncoder(input_size, hidden_size, num_layers)
# decoder = SentenceExtractor(input_size, hidden_size)
    
# for minibatch in generate_batch(input_docs, target_docs, batch_size):
#     input, target = minibatch
    
#     # Initialize the encoder
#     h_e, c_e = encoder.init_h0c0(batch_size)
#     h_e0 = Variable(h_e.data)
    
#     # A minibatch goes through the encoder
#     input_packed, orders = pack(input)
#     output_packed, hn, cn = encoder(input_packed, h_e, c_e)
#     output_unpacked, length = pad_packed_sequence(output_packed)
#     # h1, h2, ....hn for each documents
# #     encoder_hiddens = unpack(output_packed, orders)
# #     print(encoder_hiddens)
#     print(output_unpacked.size())
# #     print(output_packed.data)
    
    
#     # Initialize the decoder
#     ## calculate p0, h_bar0, c_bar0
#     h_bar = h_e
#     c_bar = c_e
# #     print(h_bar.size())
#     p = decoder.init_p(h_e0, h_bar)
#     ## calculate p1, h_bar1, c_bar1
# #     h = encoder_hiddens[0]
# #     print(h.size())
# #     for b, enc_hidden in enumerate(encoder_hiddens):
#     input_unpacked, length = pad_packed_sequence(input_packed)
#     p, output_unmasked, h_bar, c_bar = decoder(input_unpacked, h_bar, c_bar, h_e, p)
#     output_masked = output_unmasked
    
#     print(output.size())
    
#     # A minibatch goes through the decoder
#     ## WARNING: input should include the representation for s0
# #     input_packed, orders = pack(input)
# #     p, h_bar, c_bar = decoder(input_packed, h_bar, c_bar, h, p)
    

NameError: name 'SentenceExtractor' is not defined