# Load the data

In [1]:
from custom_utils.preprocess import Vocab
# Read the whole corpus into memory as a single string.
with open('./data/dl_history.txt') as corpus_file:
    text = corpus_file.read()

# Build the vocabulary from the corpus; with top_k = 50 the size printed
# below (vocab.V) is 50.
vocab = Vocab(text, top_k = 50)
print(vocab.V)

# Encode the corpus as one list of token ids per sentence, then decode the
# ids back to text to check the round trip.
sents = vocab.sents2id(text)
print(sents)
print(vocab.id2sents(sents))

# Index the vocabulary with an id, then with the result of that lookup
# (presumably id -> token and token -> id; confirm against Vocab.__getitem__).
print(vocab[0])
print(vocab[vocab[0]])

# One-hot encode the first sentence and decode it again.
onehot = vocab.sent2onehot(sents[0])
print(onehot.size())
print(vocab.onehot2sent(onehot))

50
[[48, 43, 42, 18, 26, 46, 47, 45, 43, 27, 26, 47, 15, 47, 47, 22, 47, 47, 47, 9, 47, 8, 12, 47, 31, 30, 15, 21, 47, 12, 47, 22, 4, 0, 22, 43, 16, 33, 47, 47, 32, 2, 49], [48, 7, 47, 8, 22, 5, 0, 19, 47, 12, 23, 38, 47, 10, 35, 47, 26, 18, 47, 7, 47, 8, 47, 31, 30, 20, 47, 26, 2, 49], [48, 22, 47, 0, 10, 37, 15, 47, 47, 0, 34, 12, 47, 7, 47, 9, 6, 8, 47, 47, 10, 47, 1, 25, 47, 31, 29, 17, 47, 47, 36, 1, 47, 47, 24, 47, 10, 44, 0, 47, 47, 24, 22, 47, 13, 11, 47, 47, 14, 27, 0, 47, 47, 1, 47, 47, 47, 41, 47, 2, 49], [48, 7, 47, 8, 43, 35, 47, 45, 26, 20, 18, 47, 28, 2, 49], [48, 10, 47, 47, 47, 39, 47, 43, 47, 33, 43, 42, 47, 47, 40, 4, 2, 49], [48, 7, 47, 8, 49]]
_BEGIN_ the term deep learning was UNK to the machine learning UNK by UNK UNK in UNK UNK UNK ][ UNK ] and UNK neural networks by igor UNK and UNK in 2000 , in the context of UNK UNK neurons . _END_ _BEGIN_ [ UNK ] in 2005 , faustino UNK and jürgen schmidhuber UNK a paper UNK learning deep UNK [ UNK ] UNK neural networks for U

In [2]:
# Training data: one one-hot tensor (seq_len x vocab size) per sentence.
onehots = list(map(vocab.sent2onehot, sents))

# Next-token pairs, kept as lists of tensors:
# the input drops the last token, the target drops the first one.
inputs = [onehot[:-1, :] for onehot in onehots]
targets = [onehot[1:, :] for onehot in onehots]

For a reference on general mini-batch processing in RNNs, see [this Quora answer](https://www.quora.com/How-are-inputs-fed-into-the-LSTM-RNN-network-in-mini-batch-method).

In [3]:
# Display the list of per-sentence input tensors (one tensor per sentence).
inputs

[
     0     0     0  ...      0     1     0
     0     0     0  ...      0     0     0
     0     0     0  ...      0     0     0
        ...          ⋱          ...       
     0     0     0  ...      1     0     0
     0     0     0  ...      0     0     0
     0     0     1  ...      0     0     0
 [torch.FloatTensor of size 42x50], 
     0     0     0  ...      0     1     0
     0     0     0  ...      0     0     0
     0     0     0  ...      1     0     0
        ...          ⋱          ...       
     0     0     0  ...      1     0     0
     0     0     0  ...      0     0     0
     0     0     1  ...      0     0     0
 [torch.FloatTensor of size 29x50], 
     0     0     0  ...      0     1     0
     0     0     0  ...      0     0     0
     0     0     0  ...      1     0     0
        ...          ⋱          ...       
     0     0     0  ...      0     0     0
     0     0     0  ...      1     0     0
     0     0     1  ...      0     0     0
 [torch.FloatTensor o

# Define the Encoder and Decoder

In [10]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class Encoder(nn.Module):
    '''
    Thin wrapper around nn.LSTM that also builds zero initial states.

    Args:
        input_size: feature size of each input step
        hidden_size: size of the LSTM hidden / cell state
        num_layers: number of stacked LSTM layers
    '''

    def __init__(self, input_size, hidden_size, num_layers = 1):
        super().__init__()
        self.input_size, self.hidden_size, self.num_layers = (
            input_size, hidden_size, num_layers)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)

        # Move the parameters to the GPU whenever one is present.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input, h, c):
        '''
        Feed `input` through the LSTM starting from the state (h, c).

        Returns:
            the LSTM output followed by the final hidden and cell states
        '''
        output, final_state = self.lstm(input, (h, c))
        h_n, c_n = final_state
        return output, h_n, c_n

    def init_h0c0(self, batch_size = 1):
        '''
        Create all-zero initial hidden and cell states.

        Each state has size (num_layers) x (batch_size) x (hidden_size) and is
        moved to the GPU when CUDA is available.
        '''
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        on_gpu = torch.cuda.is_available()

        states = []
        for _ in range(2):
            zeros = Variable(torch.zeros(*state_shape))
            states.append(zeros.cuda() if on_gpu else zeros)

        return states[0], states[1]
    
class Decoder(nn.Module):
    '''
    LSTM decoder that maps a hidden state to a one-hot (argmax) prediction.

    Args:
        hidden_size: size of the LSTM input and of its hidden / cell state
        output_size: number of logits produced by the hidden-to-logit layer
        num_layers: number of stacked LSTM layers
    '''
    
    def __init__(self, hidden_size, output_size, num_layers = 1):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers)
        
        # hidden-to-logit
        self.h2l = nn.Linear(hidden_size, output_size)
        
        # Move the parameters to the GPU whenever one is present.
        if torch.cuda.is_available():
            self.cuda()
        
    def forward(self, input, h, c):
        '''
        Run the LSTM over `input` and predict from the resulting hidden state.

        Args:
            input: (seq_len) x (batch_size) x (hidden_size)
            h: (num_layers) x (batch_size) x (hidden_size)
            c: (num_layers) x (batch_size) x (hidden_size)
            
        Returns:
            y: one-hot prediction with the same size as h
            h: updated hidden state
            c: updated cell state
        '''
        _, (h, c) = self.lstm(input, (h, c))
        y = self.predict(h)
        
        return y, h, c
    
    def predict(self, h):
        '''
        Predict the next state, given the previous hidden state.

        Args:
            h: (num_layers) x (batch_size) x (hidden_size)

        Returns:
            A tensor of the same size as h holding, per row, a single 1 at the
            argmax of the logits and 0 elsewhere.
        '''
        # flatten the hidden state to (num_layers * batch_size) x (hidden_size)
        h_flat = h.view(-1, h.size(2))
        logit = self.h2l(h_flat)
        
        _, argmaxs = torch.max(logit, dim = 1)
        
        # The one-hot buffer has hidden_size columns, so every argmax index
        # (< output_size) must fit in it: this needs output_size <= hidden_size.
        # It is built from fresh zeros, so no gradient flows back through it.
        pred = Variable(torch.zeros(h_flat.size()))
        # Follow the device of the logits instead of assuming a GPU exists.
        if logit.is_cuda:
            pred = pred.cuda()
        
        argmaxs = argmaxs.view(-1).data.cpu().numpy().tolist()
        for row, i_max in enumerate(argmaxs):
            pred[row, i_max] = 1
            
        return pred.view(h.size())
    
    def init_pred(self, h_encoder):
        '''
        Build the first decoder input from the encoder's final hidden state.

        Args:
            h_encoder: final hidden state from the encoder. 
            (num_layers) x (batch_size) x (hidden_size)

        Returns:
            y0: one-hot prediction with the same size as h_encoder
        '''
        return self.predict(h_encoder)

In [5]:
from custom_utils.packing import pack, unpack

# Training data: one one-hot tensor per sentence.
onehots = list(map(vocab.sent2onehot, sents))
# Inputs drop the last token and targets drop the first, both kept as lists of tensors.
inputs = [onehot[:-1, :] for onehot in onehots]
targets = [onehot[1:, :] for onehot in onehots]

def generate_batch(inputs, targets, batch_size = 100):
    '''
    Yield consecutive, non-overlapping mini-batches of (inputs, targets).

    Args:
        inputs: sequence of input samples
        targets: sequence of target samples, aligned with `inputs`
        batch_size: number of samples per batch

    Yields:
        (inputs[start:start+batch_size], targets[start:start+batch_size]) for
        each complete batch. Trailing samples that do not fill a whole batch
        are dropped, so nothing is yielded when len(inputs) < batch_size.
    '''
    n_batches = len(inputs) // batch_size
    for batch_idx in range(n_batches):
        # Step by whole batches; slicing from the batch index itself would
        # produce overlapping windows that advance by a single sample.
        start = batch_idx * batch_size
        stop = start + batch_size
        yield inputs[start:stop], targets[start:stop]

Note that we can use the hidden states and cell states from the `lstm` directly, since the `lstm` module handles `PackedSequence` automatically.

In [12]:
# Smoke test: push one mini-batch through the encoder and a single decoder step.
input_size = vocab.V
hidden_size = vocab.V
output_size = vocab.V
num_layers = 1
batch_size  = 2

# Encoder / Decoder already move themselves to the GPU in __init__ when CUDA is
# available, so no unconditional .cuda() here (it would fail on CPU-only machines).
encoder = Encoder(input_size, hidden_size, num_layers)
decoder = Decoder(hidden_size, output_size, num_layers)

# Take the first mini-batch only.
i, t = next(generate_batch(inputs, targets, batch_size))
inputs_packed, orders = pack(i)
h0, c0 = encoder.init_h0c0(batch_size)
outputs_packed, h_e, c_e = encoder(inputs_packed, h0, c0)
# Activate the line below, if you need to use the output 
# outputs_unpacked = unpack(outputs_packed, orders)
# Seed the decoder with a prediction made from the encoder's final hidden state.
y0 = decoder.init_pred(h_e)
y, h_decoder, c_decoder = decoder(y0, h_e, c_e)

In [None]:
# Train the network(requires a target sequence)

input_size = vocab.V
hidden_size = vocab.V
output_size = vocab.V
num_layers = 1
batch_size = 1

# Encoder / Decoder already move themselves to the GPU in __init__ when CUDA is
# available, so no unconditional .cuda() here (it would fail on CPU-only machines).
encoder = Encoder(input_size, hidden_size, num_layers)
decoder = Decoder(hidden_size, output_size, num_layers)

import torch.optim as optim

# A single optimizer updates the encoder and decoder parameters together.
loss_fn = nn.MSELoss()
params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = optim.Adam(params, lr = .005)

def run_epoch(inputs, targets):
    '''
    Run one optimisation step of the encoder-decoder over the given sentences.

    NOTE(review): this function cannot run as written. `outputs` (used in the
    loss loop and in the return statement) is never assigned here or in any
    visible cell. It was presumably meant to hold the decoder predictions;
    confirm the intended decoding loop before using it.

    Args:
        inputs: list of per-sentence input tensors, handed to `pack`
        targets: list of per-sentence target tensors

    Returns:
        (outputs, loss value read from loss.data[0])
    '''
    # flush the gradients
    optimizer.zero_grad()
    
    # initial hidden state(h0)
    # NOTE(review): batch size is hard-coded to 6 rather than derived from
    # `inputs` or the `batch_size` global — presumably the number of sentences
    # in the sample corpus; confirm.
    h0,c0 = encoder.init_h0c0(batch_size = 6)
    # training loss
    loss = 0
    
    # Feed the training data
    # Targets are moved to the GPU unconditionally (requires CUDA).
    targets = [Variable(tensor).cuda() for tensor in targets]
    inputs_packed, orders = pack(inputs)
    
    # Run a RNN encoder-decoder through the training samples  
    _, h_encoder, c_encoder = encoder(inputs_packed, h0, c0)
    # Only a single decoder step is taken, seeded from the encoder's final hidden state.
    y0 = decoder.init_pred(h_encoder)
    y, h_decoder, c_decoder = decoder(y0, h_encoder, c_encoder)
    
    # NOTE(review): `outputs` is undefined at this point (NameError); `y` above is unused.
    for out, target in zip(outputs, targets):
        loss += loss_fn(out, target)
        
    loss.backward()
    optimizer.step()
    
    return outputs, loss.data[0]

def train(inputs, targets, n_epochs = 100, print_every = 10):
    '''
    Call run_epoch repeatedly and report the loss at a fixed interval.

    Args:
        inputs: passed through unchanged to run_epoch
        targets: passed through unchanged to run_epoch
        n_epochs: number of times run_epoch is called
        print_every: print the epoch number and loss every this many epochs
    '''
    for epoch in range(1, n_epochs + 1):
        # run_epoch also returns the model outputs; only the loss is needed here.
        _, loss = run_epoch(inputs, targets)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
            
def test(input_sent):
    '''
    Run a single sentence through the model and print the predicted tokens.

    NOTE(review): `rnn` is not defined in any visible cell (only `encoder` and
    `decoder` are), so this function raises NameError as written; it looks like
    a leftover from a single-RNN version of the notebook — confirm.

    Args:
        input_sent: 2-D tensor whose first dimension is the sequence length
    '''
    h, c = rnn.init_h0c0()
    seq_len = input_sent.size()[0]
    # Reshape to (seq_len, batch_size, features) using the `batch_size` global.
    input_sent = Variable(input_sent.view(seq_len, batch_size, -1))
    
    output, h, c = rnn(input_sent, h, c)
    # NOTE(review): dim = 0 takes the argmax along the first axis of `output`;
    # confirm whether the vocabulary axis was intended instead.
    _, argmaxs = torch.max(output, dim = 0)
    
    # flatten the sorted indices
    sent = argmaxs.view(-1).data.cpu().numpy().tolist()
    # Look up each predicted id in the vocabulary and print on one line.
    for i in sent:
        print(vocab[i],end=' ')
        
# run_epoch(inputs, targets)
# Train on the full sentence list, reporting the loss every 100 epochs.
train(inputs, targets, n_epochs = 1000, print_every = 100)
# NOTE(review): the seed is set only after training has finished, so it cannot
# influence weight initialisation or the training run above.
torch.manual_seed(7)
# Decode the first training sentence (moved to the GPU unconditionally).
test(inputs[0].cuda())