# Load the data and preprocessing modules

In [1]:
# Read the raw corpus and build a vocabulary over it.
from modules.preprocess import Vocab
with open('./data/dl_history.txt') as f:
    text = f.read()
# Vocab is a project helper; max_size / lower / one_hot are passed through as-is
# (presumably a cap on the vocabulary size, lowercasing and one-hot embeddings --
# TODO confirm in modules.preprocess).
vocab = Vocab(text, max_size = 100, lower = True, one_hot = True)
print(len(vocab)) # size of the vocabulary
# Convert the corpus to token ids; the printed result is a list of LongTensors.
sents = vocab.sents2id(text)
print(sents)
# Map the ids back to text to check the round trip.
print(vocab.id2sents(sents))

95
[
 93
 77
 75
 26
 47
 90
 42
 82
 77
 48
 47
 22
 19
 68
 25
 40
  3
  0
 10
 11
 53
 52
 19
 39
  8
 10
 21
 40
  4
  0
 40
 77
 23
 56
 18
 79
 54
  2
 94
[torch.LongTensor of size 39]
, 
 93
 40
  5
  0
 29
 34
 10
 44
 69
 64
  7
 60
 57
 47
 26
 61
 80
 53
 52
 32
 66
 47
  2
 94
[torch.LongTensor of size 24]
, 
 93
 40
  6
  0
  7
 63
 19
 33
 37
  0
 59
 10
 74
 70
 38
  7
 49
  1
 46
 30
 53
 51
 24
 15
 28
 62
  1
 83
 58
 45
 13
  7
 81
  0
 84
 27
 45
 40
 86
 12
  9
 87
 67
 17
 48
  0
 78
 31
  1
 85
 43
 89
 73
 14
  2
 94
[torch.LongTensor of size 56]
, 
 93
 77
 60
 65
 82
 47
 32
 26
 16
 50
  2
 94
[torch.LongTensor of size 12]
, 
 93
  7
 35
 55
 20
 71
 76
 77
 88
 56
 77
 75
 36
 41
 72
  4
  2
 94
[torch.LongTensor of size 18]
]
<sos> the term deep learning was introduced to the machine learning community by rina dechter in 1986 , and artificial neural networks by igor aizenberg and colleagues in 2000 , in the context of boolean threshold neurons . <eos> <sos>

# No GPU

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class LSTM(nn.Module):
    '''
    Thin wrapper around nn.LSTM that takes and returns the hidden state and
    the cell state as separate values instead of a tuple.

    Args:
        input_size: size of each input vector
        hidden_size: size of the hidden state / cell state
        num_layers: number of stacked LSTM layers
    '''
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)

    def forward(self, input, h, c):
        '''
        Runs the wrapped nn.LSTM.

        Args:
            input: input passed straight to nn.LSTM
            h: initial hidden state
            c: initial cell state

        Returns:
            output, h, c: the LSTM output and the updated hidden / cell states
        '''
        output,(h,c) = self.lstm(input,(h,c))

        return output,h,c

    def init_h0c0(self, batch = 1):
        '''
        Builds zero-filled initial hidden and cell states.

        Args:
            batch: size of the batch axis of the returned states

        Returns:
            h0, c0: zero tensors of size (num_layers, batch, hidden_size)
        '''
        # dimension: num_layers*num_directions, batch_size, hidden_size
        # Use the `batch` argument: the previous code read a global `batch_size`,
        # which ignored the argument and failed when that global was not defined.
        h0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))
        c0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))

        return h0,c0

# Both the input width and the hidden width are set to the vocabulary size, so
# the LSTM output has the same width as its input vectors.
input_size = len(vocab)
hidden_size = len(vocab)
num_layers = 1
rnn = LSTM(input_size, hidden_size, num_layers)

In [3]:
# Little note about the view function
# A (seq_len, dim) tensor is reshaped to (seq_len, batch_size, dim); with
# batch_size = 1 this only inserts a singleton batch axis, and -1 lets view()
# infer the last dimension.
seq_len = 43
batch_size = 1
dim = 50
x = torch.arange(0,seq_len*dim).view(seq_len,dim)
print(x)
print(x.view(seq_len, batch_size, -1))


    0     1     2  ...     47    48    49
   50    51    52  ...     97    98    99
  100   101   102  ...    147   148   149
       ...          ⋱          ...       
 2000  2001  2002  ...   2047  2048  2049
 2050  2051  2052  ...   2097  2098  2099
 2100  2101  2102  ...   2147  2148  2149
[torch.FloatTensor of size 43x50]


(0 ,.,.) = 
     0     1     2  ...     47    48    49

(1 ,.,.) = 
    50    51    52  ...     97    98    99

(2 ,.,.) = 
   100   101   102  ...    147   148   149
...

(40,.,.) = 
  2000  2001  2002  ...   2047  2048  2049

(41,.,.) = 
  2050  2051  2052  ...   2097  2098  2099

(42,.,.) = 
  2100  2101  2102  ...   2147  2148  2149
[torch.FloatTensor of size 43x1x50]



In [4]:
# Process one string with a zero-vector initial hidden state / cell state
# vocab.id2emb presumably returns one embedding row per token -- TODO confirm in modules.preprocess
inputs = vocab.id2emb(sents[0])
seq_len = inputs.size()[0]
batch_size = 1
# add the batch axis: (seq_len, dim) -> (seq_len, batch_size, dim)
inputs = Variable(inputs.view(seq_len, batch_size, -1))
h0,c0 = rnn.init_h0c0()

# output holds one vector per time step; h and c are the states after the last step
output,h,c = rnn(inputs, h0, c0)
print(output,h,c)

Variable containing:
(0 ,.,.) = 
 -0.0109  0.0016 -0.0046  ...   0.0170  0.0443 -0.0024

(1 ,.,.) = 
 -0.0349  0.0147 -0.0097  ...   0.0061  0.0721 -0.0135

(2 ,.,.) = 
 -0.0060  0.0321 -0.0458  ...   0.0028  0.0796 -0.0228
...

(36,.,.) = 
 -0.0309  0.0299 -0.0641  ...  -0.0082  0.0660  0.0324

(37,.,.) = 
 -0.0293  0.0369 -0.0775  ...  -0.0083  0.0665  0.0051

(38,.,.) = 
 -0.0247  0.0331 -0.0893  ...  -0.0110  0.0446 -0.0181
[torch.FloatTensor of size 39x1x95]
 Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
  -0.0247  0.0331 -0.0893 -0.0057  0.0413 -0.0887  0.0023  0.0023 -0.0090

Columns 9 to 17 
   0.0039  0.0270 -0.0131 -0.0102 -0.0174 -0.0651 -0.0715 -0.0036  0.0009

Columns 18 to 26 
  -0.0004  0.0570  0.0807 -0.0421 -0.0495 -0.0908  0.0251  0.0197 -0.0697

Columns 27 to 35 
  -0.0460 -0.0654 -0.0115 -0.0731 -0.0160 -0.0313 -0.0555 -0.0118  0.0041

Columns 36 to 44 
   0.0023  0.0323  0.0253 -0.0347 -0.0227  0.0306  0.0009  0.0001  0.0201

Columns 45 to 53 
  -0.0261  0.0700

In [5]:
# Build the Training dataset
# one embedded tensor per sentence
onehots = [vocab.id2emb(sent) for sent in sents]

# Build inputs / targets as lists of tensors
# inputs drop the last row and targets drop the first row of each sentence, so
# targets[k][t] is the row that follows inputs[k][t] (next-token prediction)
inputs = [sent[:-1,:] for sent in onehots]
targets = [sent[1:,:] for sent in onehots]

In [6]:
# Re-create the model so that training starts from freshly initialised weights.
input_size = len(vocab)
hidden_size = len(vocab)
num_layers = 1
# batch_size is read as a global by run_epoch() and test() below
batch_size = 1
rnn = LSTM(input_size, hidden_size, num_layers)

import torch.optim as optim

# Mean squared error between the LSTM output and the target rows
loss_fn = nn.MSELoss()
optimizer = optim.Adam(rnn.parameters(), lr = .005)

def run_epoch(inputs, targets):
    '''
    Runs one optimisation step over all training sentences.

    The sentences are fed to the module-level `rnn` one after another; the
    hidden and cell states produced by one sentence are used as the initial
    states of the next one. The per-sentence losses are summed, back-propagated
    once, and followed by a single `optimizer` step.

    Args:
        inputs: list of 2 dim tensors (seq_len x dim), one per sentence
        targets: list of 2 dim tensors with the same sizes as `inputs`

    Returns:
        output: rnn output for the last sentence
        loss: summed training loss as a Python number
    '''
    # flush the gradients
    optimizer.zero_grad()
    # initial hidden state(h0)
    h,c = rnn.init_h0c0()
    # training loss
    loss = 0
    # Run a RNN through the training samples
    for i in range(len(inputs)):
        # `sample` rather than `input`, to avoid shadowing the builtin
        sample = inputs[i]
        target = targets[i]

        seq_len = sample.size()[0]

        # add the batch axis: (seq_len, dim) -> (seq_len, batch_size, dim)
        sample = Variable(sample.view(seq_len, batch_size, -1))
        target = Variable(target.view(seq_len, batch_size, -1))
        # h and c are carried over from the previous sentence
        output, h, c = rnn(sample, h, c)
        loss += loss_fn(output, target)
    loss.backward()
    optimizer.step()

    # loss.data[0] raises on 0-dim tensors (PyTorch >= 0.4); use .item() when it
    # exists and fall back to the legacy indexing otherwise
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]

    return output, loss_value

def train(inputs, targets, n_epochs = 100, print_every = 10):
    '''
    Calls run_epoch() n_epochs times and reports the loss periodically.

    Args:
        inputs: passed unchanged to run_epoch()
        targets: passed unchanged to run_epoch()
        n_epochs: number of calls to run_epoch()
        print_every: the loss is printed whenever the epoch number is a
        multiple of this value
    '''
    for epoch in range(1, n_epochs + 1):
        # only the loss is reported; the outputs returned by run_epoch() are not used
        _, loss = run_epoch(inputs, targets)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
            
def test(input_sent):
    h, c = rnn.init_h0c0()
    seq_len = input_sent.size()[0]
    input_sent = Variable(input_sent.view(seq_len, batch_size, -1))
    
    output, h, c = rnn(input_sent, h, c)
    _, argmaxs = torch.max(output, dim = 0)
    sent = argmaxs.view(-1).data.numpy().tolist()
    for i in sent:
        print(vocab[i],end=' ')

In [7]:
# Train for 1000 epochs, printing the loss every 100 epochs
train(inputs, targets, n_epochs = 1000, print_every = 100)

Epoch: 100 / Loss: 0.0120398
Epoch: 200 / Loss: 0.0044662
Epoch: 300 / Loss: 0.0022816
Epoch: 400 / Loss: 0.0014084
Epoch: 500 / Loss: 0.0009812
Epoch: 600 / Loss: 0.0007276
Epoch: 700 / Loss: 0.0005505
Epoch: 800 / Loss: 0.0004282
Epoch: 900 / Loss: 0.0003496
Epoch: 1000 / Loss: 0.0002941


In [8]:
# Test
# Decode every training sentence; a failure on one sentence is printed
# instead of aborting the loop.
torch.manual_seed(7)
for sentence in inputs:
    try:
        test(sentence)
    except Exception as err:
        print(err)
    # line break after each decoded sentence (or after its error message)
    print()

belief and has be each has , , context , boltzmann boolean fine hinton google , aizenberg backpropagation geoff colleagues 1986 dechter and fine . at . deep a has , , , has feedforward , geoff community , community deep . 2005 a boolean effectively , 1986 aizenberg 2006 1986 belief chart by google , for , , as 2000 hinton deep feedforward , - colleagues hinton as boolean a effectively . , 1986 - faustino a and gomez 2000 has 2006 belief 1986 effectively , fine fine aizenberg 2000 . effectively deep hinton 
. 2005 colleagues - - - - an an . 2005 chart - - and . . as 2000 2005 - backpropagation colleagues - 1986 . at 2000 . 1986 - 2000 boolean colleagues 2000 and 1986 - , as be 1986 2005 . 2006 a . as . 2000 1986 - boltzmann belief 2005 as an artificial as as and backpropagation an 2000 aizenberg artificial by 1986 as a . . 2000 . be . 1986 - 1986 1986 be . 1986 2000 1986 2006 as as 2000 aizenberg , chart an 1986 community 
for dechter neural - 2005 - - feedforward aizenberg igor and 200

# GPU version

### Dealing with Variable-length sequences for cuDNN in PyTorch
- References
    - [Simple working example how to use packing for variable-length sequence inputs for rnn](https://discuss.pytorch.org/t/simple-working-example-how-to-use-packing-for-variable-length-sequence-inputs-for-rnn/2120)
    - [Feeding Data to PyTorch RNNs](https://djosix.github.io/2017/09/05/Feeding-Data-to-Pytorch-RNNs/)
    - [How to use pad_packed_sequence in PyTorch](https://www.snip2code.com/Snippet/1950100/How-to-use-pad_packed_sequence-in-pytorc)
    - [RNN sequence padding with batch_first](https://github.com/pytorch/pytorch/issues/1176)
    - [padded_rnn.py](https://gist.github.com/MaximumEntropy/918d4ad7c931bc14b475008c00aa09f1)
    - [About the variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/7)
    - [How can i compute seq2seq loss using mask?](https://discuss.pytorch.org/t/how-can-i-compute-seq2seq-loss-using-mask/861/7)
- Steps
    1. Pad the input sequences to the same length
    2. Sort them by their lengths (descending order, as `pack_padded_sequence` expects)
    3. Use torch.nn.utils.rnn.pack_padded_sequence()
    4. RNN
    5. Use torch.nn.utils.rnn.pad_packed_sequence()
    6. Unsort output sequences
    7. Unpad output sequences

In [9]:
# Same loading steps as the first cell of the notebook, repeated here
# (presumably so that the GPU section can be run on its own).
from modules.preprocess import Vocab
with open('./data/dl_history.txt') as f:
    text = f.read()
# Vocab is a project helper; see modules.preprocess for the meaning of its options.
vocab = Vocab(text, max_size = 100, lower = True, one_hot = True)
print(len(vocab)) # size of the vocabulary
# Convert the corpus to token ids; the printed result is a list of LongTensors.
sents = vocab.sents2id(text)
print(sents)
# Map the ids back to text to check the round trip.
print(vocab.id2sents(sents))

95
[
 93
 77
 75
 26
 47
 90
 42
 82
 77
 48
 47
 22
 19
 68
 25
 40
  3
  0
 10
 11
 53
 52
 19
 39
  8
 10
 21
 40
  4
  0
 40
 77
 23
 56
 18
 79
 54
  2
 94
[torch.LongTensor of size 39]
, 
 93
 40
  5
  0
 29
 34
 10
 44
 69
 64
  7
 60
 57
 47
 26
 61
 80
 53
 52
 32
 66
 47
  2
 94
[torch.LongTensor of size 24]
, 
 93
 40
  6
  0
  7
 63
 19
 33
 37
  0
 59
 10
 74
 70
 38
  7
 49
  1
 46
 30
 53
 51
 24
 15
 28
 62
  1
 83
 58
 45
 13
  7
 81
  0
 84
 27
 45
 40
 86
 12
  9
 87
 67
 17
 48
  0
 78
 31
  1
 85
 43
 89
 73
 14
  2
 94
[torch.LongTensor of size 56]
, 
 93
 77
 60
 65
 82
 47
 32
 26
 16
 50
  2
 94
[torch.LongTensor of size 12]
, 
 93
  7
 35
 55
 20
 71
 76
 77
 88
 56
 77
 75
 36
 41
 72
  4
  2
 94
[torch.LongTensor of size 18]
]
<sos> the term deep learning was introduced to the machine learning community by rina dechter in 1986 , and artificial neural networks by igor aizenberg and colleagues in 2000 , in the context of boolean threshold neurons . <eos> <sos>

In [10]:
import torch
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

### A little exercise on `pack_padded_sequence`, and `pad_packed_sequence`

In [11]:
# Random padded batch of size (max_seq_len, num_batches, 30); note that
# num_batches is used as the size of the batch axis, i.e. the number of sequences.
max_seq_len = 20
num_batches = 10
x = Variable(torch.randn(max_seq_len, num_batches, 30))
lens = list(range(max_seq_len, max_seq_len - num_batches, -1)) # length of each sequence in the batch, in decreasing order
# Pack the padded batch together with the per-sequence lengths
x_packed = pack_padded_sequence(x, lens)

In [12]:
lens

[20, 19, 18, 17, 16, 15, 14, 13, 12, 11]

In [13]:
x

Variable containing:
(0 ,.,.) = 
 -1.0381  0.4584 -0.2096  ...  -0.6112  0.8608  0.8062
 -1.4102 -0.3467 -1.4297  ...  -0.2382 -0.1597  0.4433
 -0.0803 -0.0278 -0.2597  ...   0.5534  1.3503  2.4055
           ...             ⋱             ...          
 -1.9267  0.5453  0.7812  ...   0.2765 -0.1865  0.6421
 -0.1019 -0.7288  1.4008  ...  -0.0032 -0.6151  1.9332
 -1.5708  1.7737  0.3035  ...  -1.3612 -0.3341 -1.0302

(1 ,.,.) = 
  0.7460  0.2938  0.0664  ...   0.1086  0.7538  0.3079
  0.1405  1.1716  0.3962  ...  -1.1237 -1.2468  2.7688
 -0.2353  0.2731  0.0772  ...   0.6473  0.6643 -0.8425
           ...             ⋱             ...          
 -0.1701 -0.8283 -0.3167  ...  -0.7884 -1.6525  0.8426
  2.1250 -0.7835  1.3479  ...  -1.7399  0.3085 -0.7343
 -0.2546  0.2770 -1.0888  ...   0.1493 -1.5022 -1.6231

(2 ,.,.) = 
  0.5846  0.9017 -0.8200  ...   0.4783  0.2394  1.0083
 -0.0760  0.7519  0.7903  ...   1.6909  0.2236 -1.0112
  0.1779 -0.3156  0.3472  ...  -1.3540  0.1746  0.3665
      

In [14]:
x_packed

PackedSequence(data=Variable containing:
-1.0381  0.4584 -0.2096  ...  -0.6112  0.8608  0.8062
-1.4102 -0.3467 -1.4297  ...  -0.2382 -0.1597  0.4433
-0.0803 -0.0278 -0.2597  ...   0.5534  1.3503  2.4055
          ...             ⋱             ...          
-0.0565 -1.1523 -1.0487  ...  -1.0301  0.1426 -0.8216
 0.1549  0.0952  0.3328  ...   1.9159  1.2383 -1.9206
 0.6701  0.5796  0.4989  ...   0.4067 -0.4743  2.4214
[torch.FloatTensor of size 155x30]
, batch_sizes=[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

### Define the packing/unpacking functions

To utilize cuDNN for variable-length input in PyTorch, we need to use `torch.nn.utils.rnn.pack_padded_sequence` and `torch.nn.utils.rnn.pad_packed_sequence`.
- `pack_padded_sequence` packs a **padded** tensor into a `PackedSequence` object, which is handled internally by `nn.LSTM`.
- `pad_packed_sequence` unpacks a `PackedSequence` object back into a padded tensor.

In [15]:
# Build the Training dataset
# one embedded tensor per sentence
onehots = [vocab.id2emb(sent) for sent in sents]

# Build inputs / targets as lists of tensors
# inputs drop the last row and targets drop the first row of each sentence, so
# targets[k][t] is the row that follows inputs[k][t] (next-token prediction)
inputs = [sent[:-1,:] for sent in onehots]
targets = [sent[1:,:] for sent in onehots]

In [16]:
# Helper functions
def pack(seq):
    '''
    Pads a list of variable-length tensors and packs them into a PackedSequence.

    Args:
        seq: list of 2 dim tensors (seq_len x dim); seq_len may differ between
        the elements, dim is read from the longest one.

    Returns:
        packed: PackedSequence built from the tensors sorted by decreasing length
        orders: orders[k] is the position in `seq` of the k-th tensor after
        sorting; pass it to unpack() to restore the original ordering.

    '''
    # Longest first; sorted() is stable, so ties keep their original relative order
    by_length = sorted(enumerate(seq), key = lambda pair: -pair[1].size()[0])
    orders = [index for index, _ in by_length]
    seq_sorted = [tensor for _, tensor in by_length]
    lengths = [tensor.size()[0] for tensor in seq_sorted]

    longest = lengths[0]
    n_seqs = len(seq_sorted)
    width = seq_sorted[0].size()[1]

    # Zero-filled buffer of size (max_seq_len, batch_size, dim)
    padded_sequence = Variable(torch.zeros(longest, n_seqs, width))
    if torch.cuda.is_available():
        padded_sequence = padded_sequence.cuda()

    # Copy every sequence into its own column; the remaining rows stay zero
    for column, (tensor, length) in enumerate(zip(seq_sorted, lengths)):
        padded_sequence[:length, column, :] = tensor

    # pack the padded sequence
    return pack_padded_sequence(padded_sequence, lengths), orders

def unpack(packed, orders):
    '''
    Turns a PackedSequence back into a list of unpadded tensors.

    Args:
        packed: PackedSequence
        orders: for each sequence in `packed`, the position it should take in
        the returned list (as produced by pack())

    Returns:
        unpacked_masked: list of 2 dim tensors with the padding rows removed,
        sorted by the corresponding entries of `orders`
    '''
    padded, lengths = pad_packed_sequence(packed)

    # Masking: keep only the valid time steps of every column
    trimmed = []
    for column in range(len(lengths)):
        trimmed.append(padded[:lengths[column], column, :])

    # Unsort
    restored = sorted(zip(orders, trimmed))
    unpacked_masked = [tensor for _, tensor in restored]

    return unpacked_masked
    
# Round trip: pack the training inputs, then unpack them again
packed, orders = pack(inputs)
print(packed)
# unpack() returns one tensor per sentence, without padding rows
unpacked = unpack(packed, orders)
print(unpacked)

PackedSequence(data=Variable containing:
    0     0     0  ...      0     1     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     1     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
[torch.cuda.FloatTensor of size 144x95 (GPU 0)]
, batch_sizes=[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
[Variable containing:
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
[torch.cuda.FloatTensor of size 38x95 (GPU 0)]
, Variable containing:
    0     0     0  ...      0     1     0
    0   

In [17]:
# Define the LSTM Cell

import torch
import torch.nn as nn
from torch.autograd import Variable

class LSTM(nn.Module):
    '''
    Wrapper around nn.LSTM that passes the hidden state and the cell state as
    separate values and moves itself to the GPU when one is available.

    Args:
        input_size: size of each input vector
        hidden_size: size of the hidden state / cell state
        num_layers: number of stacked LSTM layers
    '''
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        self.input_size, self.hidden_size, self.num_layers = input_size, hidden_size, num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)

        # move the parameters to the GPU when one is available
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input, h, c):
        '''
        Runs the wrapped nn.LSTM and returns (output, h, c).
        '''
        output, state = self.lstm(input, (h, c))
        new_h, new_c = state
        return output, new_h, new_c

    def init_h0c0(self, batch_size = 1):
        '''
        Returns zero-filled initial hidden and cell states of size
        (num_layers, batch_size, hidden_size), on the GPU when one is available.
        '''
        # dimension: num_layers*num_directions, batch_size, hidden_size
        state_size = (self.num_layers, batch_size, self.hidden_size)
        use_cuda = torch.cuda.is_available()

        states = []
        for _ in range(2):
            zeros = Variable(torch.zeros(*state_size))
            states.append(zeros.cuda() if use_cuda else zeros)
        h0, c0 = states

        return h0, c0

In [18]:
# Forward pass over all training inputs as a single packed batch
input_size = len(vocab)
hidden_size = len(vocab)
num_layers = 1
# LSTM.__init__ already moves the module to the GPU when one is available, so no
# unconditional .cuda() is needed here (it would fail on a CPU-only machine)
rnn = LSTM(input_size, hidden_size, num_layers)

inputs_packed, orders = pack(inputs)
# one initial state per sequence in the packed batch (previously hard-coded to 5)
h0,c0 = rnn.init_h0c0(batch_size = len(inputs))
outputs_packed, h, c = rnn(inputs_packed, h0, c0)
outputs = unpack(outputs_packed, orders)

# one output tensor per input sentence, back in the original order
print([t.size() for t in outputs])

[torch.Size([38, 95]), torch.Size([23, 95]), torch.Size([55, 95]), torch.Size([11, 95]), torch.Size([17, 95])]


In [23]:
# Train the network

input_size = len(vocab)
hidden_size = len(vocab)
num_layers = 1
# batch_size is read as a global by test(), which feeds one sentence at a time
batch_size = 1
# LSTM.__init__ already moves the module to the GPU when one is available, so no
# unconditional .cuda() is needed here (it would fail on a CPU-only machine)
rnn = LSTM(input_size, hidden_size, num_layers)

import torch.optim as optim

# Mean squared error between the LSTM output and the target rows
loss_fn = nn.MSELoss()
optimizer = optim.Adam(rnn.parameters(), lr = .005)

def run_epoch(inputs, targets):
    '''
    Runs one optimisation step over all training sentences as one packed batch.

    The sentences are padded and packed with pack(), run through the
    module-level `rnn` in a single call, and unpacked again with unpack(). The
    per-sentence losses are summed, back-propagated once, and followed by a
    single `optimizer` step.

    Args:
        inputs: list of 2 dim tensors (seq_len x dim), one per sentence
        targets: list of 2 dim tensors with the same sizes as `inputs`

    Returns:
        outputs: list with one rnn output tensor per sentence, in the order of `inputs`
        loss: summed training loss as a Python number
    '''
    # flush the gradients
    optimizer.zero_grad()
    # initial hidden state(h0): one state per sequence in the packed batch
    # (previously hard-coded to 5, which only fits a 5-sentence dataset)
    h,c = rnn.init_h0c0(batch_size = len(inputs))
    # training loss
    loss = 0

    # Move the targets to the GPU only when one is available, matching the
    # guards used by pack() and by the LSTM module
    use_cuda = torch.cuda.is_available()
    targets = [Variable(tensor).cuda() if use_cuda else Variable(tensor) for tensor in targets]
    # Run a RNN through the training samples
    inputs_packed, orders = pack(inputs)
    outputs_packed, h, c = rnn(inputs_packed, h, c)
    outputs = unpack(outputs_packed, orders)

    for out, target in zip(outputs, targets):
        loss += loss_fn(out, target)

    loss.backward()
    optimizer.step()

    # loss.data[0] raises on 0-dim tensors (PyTorch >= 0.4); use .item() when it
    # exists and fall back to the legacy indexing otherwise
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]

    return outputs, loss_value

def train(inputs, targets, n_epochs = 100, print_every = 10):
    '''
    Calls run_epoch() n_epochs times and reports the loss periodically.

    Args:
        inputs: passed unchanged to run_epoch()
        targets: passed unchanged to run_epoch()
        n_epochs: number of calls to run_epoch()
        print_every: the loss is printed whenever the epoch number is a
        multiple of this value
    '''
    for epoch in range(1, n_epochs + 1):
        # only the loss is reported; the outputs returned by run_epoch() are not used
        _, loss = run_epoch(inputs, targets)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
            
def test(input_sent):
    h, c = rnn.init_h0c0()
    seq_len = input_sent.size()[0]
    input_sent = Variable(input_sent.view(seq_len, batch_size, -1))
    
    output, h, c = rnn(input_sent, h, c)
    _, argmaxs = torch.max(output, dim = 0)
    
    # flatten the sorted indices
    sent = argmaxs.view(-1).data.cpu().numpy().tolist()
    for i in sent:
        print(vocab[i],end=' ')
        
# Train, then decode every training sentence
train(inputs, targets, n_epochs = 1000, print_every = 100)
torch.manual_seed(7)
# The model is on the GPU only when one is available, so move the inputs the
# same way instead of calling .cuda() unconditionally
use_cuda = torch.cuda.is_available()
for i in range(len(inputs)):
    try:
        test(inputs[i].cuda() if use_cuda else inputs[i])
        print()
    except Exception as e:
        # report the failure for this sentence and carry on with the next one
        print(e)
        print()

Epoch: 100 / Loss: 0.0133031
Epoch: 200 / Loss: 0.0062557
Epoch: 300 / Loss: 0.0042884
Epoch: 400 / Loss: 0.0035189
Epoch: 500 / Loss: 0.0031153
Epoch: 600 / Loss: 0.0029421
Epoch: 700 / Loss: 0.0027505
Epoch: 800 / Loss: 0.0026356
Epoch: 900 / Loss: 0.0025679
Epoch: 1000 / Loss: 0.0025200
effectively and has be each , . , context . could boolean belief backpropagation google chart aizenberg deep geoff colleagues . dechter and fine chart at . deep 2006 fine an and colleagues . backpropagation , geoff as boolean community deep . 2005 feedforward . an belief 1986 aizenberg a 2000 boltzmann chart by google , for . deep belief - belief faustino faustino boolean - . 1986 as dechter boolean effectively 1986 effectively a - faustino a an gomez 2000 backpropagation 2006 feedforward an a . belief effectively . 2000 belief belief chart hinton 
. - colleagues chart chart - - an boolean at 2005 be as as as boltzmann by as - at artificial artificial 2000 - as as at as as 1986 be 1986 boolean as 200