# Contents
- String Preprocessing
- Word-level Language Model with vanilla RNN from scratch

This tutorial combines [this code](https://github.com/GunhoChoi/PyTorch-FastCampus/blob/master/05_RNN/0_Basic/Simple_Char_RNNcell.ipynb) with material from the official PyTorch documentation.

## String Preprocessing

In [1]:
from modules.preprocess import Vocab
# Read the whole corpus into memory as a single string.
with open('./data/dl_history.txt') as corpus_file:
    text = corpus_file.read()

# Build the vocabulary from the corpus. The keyword options are handed
# unchanged to the project-local Vocab class (modules.preprocess).
vocab = Vocab(text, max_size=512, one_hot=True, lower=True)
print(len(vocab))  # size of the vocabulary

# Convert the text into per-sentence id sequences, then map the ids back to
# text as a round-trip check.
sents = vocab.sents2id(text)
print(sents)
print(vocab.id2sents(sents))

95
[
 93
 77
 75
 26
 47
 90
 42
 82
 77
 48
 47
 22
 19
 68
 25
 40
  3
  0
 10
 11
 53
 52
 19
 39
  8
 10
 21
 40
  4
  0
 40
 77
 23
 56
 18
 79
 54
  2
 94
[torch.LongTensor of size 39]
, 
 93
 40
  5
  0
 29
 34
 10
 44
 69
 64
  7
 60
 57
 47
 26
 61
 80
 53
 52
 32
 66
 47
  2
 94
[torch.LongTensor of size 24]
, 
 93
 40
  6
  0
  7
 63
 19
 33
 37
  0
 59
 10
 74
 70
 38
  7
 49
  1
 46
 30
 53
 51
 24
 15
 28
 62
  1
 83
 58
 45
 13
  7
 81
  0
 84
 27
 45
 40
 86
 12
  9
 87
 67
 17
 48
  0
 78
 31
  1
 85
 43
 89
 73
 14
  2
 94
[torch.LongTensor of size 56]
, 
 93
 77
 60
 65
 82
 47
 32
 26
 16
 50
  2
 94
[torch.LongTensor of size 12]
, 
 93
  7
 35
 55
 20
 71
 76
 77
 88
 56
 77
 75
 36
 41
 72
  4
  2
 94
[torch.LongTensor of size 18]
]
<sos> the term deep learning was introduced to the machine learning community by rina dechter in 1986 , and artificial neural networks by igor aizenberg and colleagues in 2000 , in the context of boolean threshold neurons . <eos> <sos>

In [2]:
# Indexing the vocabulary is used in both directions here: with an integer it
# yields a token, and with that token it yields the integer back.
print(vocab[0]) # token stored at index 0 (first element in the vocabulary)
print(vocab[vocab[0]]) # index of that first element, looked up from the token
# Embed a two-word string; presumably one one-hot row per word, since the
# vocabulary was built with one_hot = True -- TODO confirm against Vocab.
print(vocab.text2emb('deep learning'))

,
0


Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 26 to 38 
    1     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 39 to 51 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     1     0     0     0     0

Columns 52 to 64 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 65 to 77 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0 

## Word-level Language Model with vanilla RNN from scratch

The vanilla RNN I use here follows the architecture from [this post](https://r2rt.com/written-memories-understanding-deriving-and-extending-the-lstm.html)

#### Define the vanilla RNN cell (without CUDA)

In [3]:
# MSE Loss version

import torch
import torch.nn as nn
from torch.autograd import Variable

class RNN(nn.Module):
    """Single-step vanilla RNN cell assembled from two linear layers.

    Each call to ``forward`` advances the recurrence by exactly one time
    step: the current input and the previous hidden state are concatenated,
    mapped through a linear layer followed by tanh to give the new hidden
    state, and that hidden state is projected by a second linear layer to
    give the output. The output is returned as raw scores; no softmax is
    applied.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the tanh non-linearity.

        Args:
            input_size: width of one input row.
            hidden_size: width of the hidden state.
            output_size: width of one output row.
        """
        super().__init__()

        self.hidden_size = hidden_size

        # The first layer consumes the concatenation [input, hidden].
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: 2-D tensor holding the current input row(s).
            hidden: 2-D tensor holding the previous hidden state; it is
                concatenated with ``input`` along dimension 1.

        Returns:
            A pair ``(output, hidden)`` with the output scores and the new
            hidden state.
        """
        joined = torch.cat([input, hidden], dim=1)
        next_hidden = self.tanh(self.i2h(joined))
        scores = self.h2o(next_hidden)
        return scores, next_hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        zero_state = torch.zeros(1, self.hidden_size)
        return Variable(zero_state)

# The network reads and emits vocabulary-sized rows, so the input and output
# widths are both the vocabulary size.
hidden_size = 512
input_size = output_size = len(vocab)
rnn = RNN(input_size, hidden_size, output_size)

In [4]:
# Run a single time step on the first sentence, starting from a zero-vector
# initial hidden state.

hidden = rnn.init_hidden()

inputs = vocab.id2emb(sents[0])
print(inputs.size())

# .view(1, -1) reshapes the first row into a single-row 2-D tensor, which is
# the shape the cell concatenates with the (1, hidden_size) hidden state.
input = Variable(inputs[0]).view(1, -1)
print(input.size())

output, hidden = rnn(input, hidden)
print(output)

torch.Size([39, 95])
torch.Size([1, 95])
Variable containing:

Columns 0 to 9 
1.00000e-02 *
  2.7261 -2.3328 -4.5138  3.6472  1.2867  3.7989 -3.5443 -1.5485 -2.4366  5.3291

Columns 10 to 19 
1.00000e-02 *
  0.3459  0.9965  1.3540 -0.3399 -3.2519 -2.1269  1.9542  1.0897  3.6371 -1.1031

Columns 20 to 29 
1.00000e-02 *
 -1.2107 -1.2748  3.4532  2.9712  6.0002  0.5507 -5.0433  1.0191 -0.9572 -1.4670

Columns 30 to 39 
1.00000e-02 *
  2.0256 -2.1627  2.0323  0.3216 -4.3429  1.0173 -2.1241  3.9655 -0.1047 -4.5488

Columns 40 to 49 
1.00000e-02 *
  3.0713  2.6672  3.0768  2.9332 -5.2887 -2.4649  1.3456  3.7306  1.8000  1.6995

Columns 50 to 59 
1.00000e-02 *
 -1.3142 -1.4045 -1.6318 -3.0951  3.0573  4.4069 -1.3072  5.4081 -5.1226 -1.7581

Columns 60 to 69 
1.00000e-02 *
 -0.4983  3.5541 -3.5229  3.0062 -3.3063  0.8933  2.2045 -3.5248  2.3407  2.1591

Columns 70 to 79 
1.00000e-02 *
  0.6806  1.4828  3.9668 -3.7632  2.7826 -4.6924  1.3895 -1.8344  0.7586  4.7505

Columns 80 to 89 
1.00000e-

#### Train the network (without CUDA)

In [5]:
# Build the training dataset for next-word prediction.
onehots = [vocab.id2emb(sent) for sent in sents]

# Within each sentence the input at step t is row t and the target is row
# t + 1, so inputs drop the last row and targets drop the first. The
# per-sentence pieces are then stacked along dimension 0.
inputs = torch.cat([emb[:-1, :] for emb in onehots], dim=0)
targets = torch.cat([emb[1:, :] for emb in onehots], dim=0)

print(inputs)
print(targets)


    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
[torch.FloatTensor of size 144x95]


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
    0     0     0  ...      0     0     1
[torch.FloatTensor of size 144x95]



In [6]:
import torch.optim as optim

# Fresh network for training, with a hidden state of width 128.
hidden_size = 128
input_size = output_size = len(vocab)
rnn = RNN(input_size, hidden_size, output_size)

# Mean-squared error between the output row and the target row, optimised
# with Adam.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.005)

def run_epoch(inputs, targets):
    """Run one pass over the whole training sequence and take one optimizer step.

    Uses the module-level ``rnn``, ``loss_fn`` and ``optimizer``. The hidden
    state starts at ``rnn.init_hidden()`` and is carried from each step to
    the next. The per-step losses are summed and back-propagated once, after
    the last step.

    Args:
        inputs: indexable sequence of per-step input rows.
        targets: per-step target rows, indexed in lockstep with ``inputs``.

    Returns:
        A pair ``(output, loss)``: the network output of the final step and
        the summed loss, read out with ``loss.data[0]``.
    """
    # Clear gradients accumulated by the previous call.
    optimizer.zero_grad()

    state = rnn.init_hidden()  # h0
    loss = 0

    for step, row in enumerate(inputs):
        # Each row is reshaped to a single-row 2-D tensor before use.
        step_input = Variable(row).view(1, -1)
        step_target = Variable(targets[step]).view(1, -1)
        # The hidden state returned here is fed straight into the next step.
        output, state = rnn(step_input, state)
        loss += loss_fn(output, step_target)

    loss.backward()
    optimizer.step()

    # NOTE(review): `loss.data[0]` relies on the loss tensor being indexable;
    # on PyTorch releases where a scalar loss is 0-dimensional this would need
    # `loss.item()` instead -- confirm against the installed version.
    return output, loss.data[0]

def train(inputs, targets, n_epochs = 100, print_every = 10):
    """Call ``run_epoch`` repeatedly and report the loss at a fixed interval.

    Args:
        inputs: training inputs, passed unchanged to ``run_epoch``.
        targets: training targets, passed unchanged to ``run_epoch``.
        n_epochs: number of times ``run_epoch`` is called.
        print_every: the loss is printed whenever the 1-based epoch number
            is a multiple of this value.
    """
    for epoch in range(1, n_epochs + 1):
        # run_epoch also returns the last network output; only the loss is
        # needed here.
        _, loss = run_epoch(inputs, targets)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
            
def test(inputs):
    """Print a word sequence generated by the module-level ``rnn``.

    Generation is seeded with the first row of ``inputs`` and a fresh hidden
    state from ``rnn.init_hidden()``. At every step the vocabulary entry at
    the position of the largest output score is printed, and the raw output
    vector itself (not a one-hot of the chosen word) is fed back as the next
    input. One word is printed per row of ``inputs``.

    Args:
        inputs: tensor whose first row seeds the generation and whose length
            sets the number of generated words.
    """
    state = rnn.init_hidden()
    current = Variable(inputs[0].view(1,-1))

    for _ in range(len(inputs)):
        scores, state = rnn(current, state)
        # torch.max over dim 1 returns (values, indices); only the index of
        # the best-scoring vocabulary entry is needed.
        best = torch.max(scores, dim = 1)[1]
        token_id = int(best.data.numpy()[0])
        print(vocab[token_id], end=' ')
        current = scores
                
# Train for 1000 epochs, printing the loss every 100, then print the word
# sequence the trained network generates.
train(inputs, targets, n_epochs=1000, print_every=100)
test(inputs)

Epoch: 100 / Loss: 0.0285679
Epoch: 200 / Loss: 0.0095582
Epoch: 300 / Loss: 0.0045619
Epoch: 400 / Loss: 0.0033293
Epoch: 500 / Loss: 0.0025119
Epoch: 600 / Loss: 0.0018986
Epoch: 700 / Loss: 0.0019633
Epoch: 800 / Loss: 0.0014020
Epoch: 900 / Loss: 0.0014298
Epoch: 1000 / Loss: 0.0014845
the term deep learning since 2000 . the machine learning , by rina dechter in 1986 , and artificial neural networks by igor aizenberg the colleagues in 2000 , in . in in , , neural neural neural networks learning , <eos> in layer in a , the the networks learning - <eos> . <eos> in in , , neural networks , hinton , . in in learning , a , the networks networks , the learning . in in , , neural networks networks by time . the in in in <eos> a through neural neural , , the paper learning in in . <eos> in networks , , term networks the in learning . <eos> in learning , , , the term paper learning <eos> in . in in , , the neural networks by term . . in in in , 

The same model, trained with CUDA. There is not much speed gain, however, since no mini-batches are used.

In [7]:
import torch.nn as nn
from torch.autograd import Variable

class RNN(nn.Module):
    """Single-step vanilla RNN cell assembled from two linear layers.

    Each call to ``forward`` advances the recurrence by exactly one time
    step: the current input and the previous hidden state are concatenated,
    mapped through a linear layer followed by tanh to give the new hidden
    state, and that hidden state is projected by a second linear layer to
    give the output. The output is returned as raw scores; no softmax is
    applied.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the tanh non-linearity.

        Args:
            input_size: width of one input row.
            hidden_size: width of the hidden state.
            output_size: width of one output row.
        """
        super().__init__()

        self.hidden_size = hidden_size

        # The first layer consumes the concatenation [input, hidden].
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: 2-D tensor holding the current input row(s).
            hidden: 2-D tensor holding the previous hidden state; it is
                concatenated with ``input`` along dimension 1.

        Returns:
            A pair ``(output, hidden)`` with the output scores and the new
            hidden state.
        """
        joined = torch.cat([input, hidden], dim=1)
        next_hidden = self.tanh(self.i2h(joined))
        scores = self.h2o(next_hidden)
        return scores, next_hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size).

        The tensor is created with ``torch.zeros`` and no device argument;
        callers that run the model elsewhere must move it themselves.
        """
        zero_state = torch.zeros(1, self.hidden_size)
        return Variable(zero_state)

import torch.optim as optim

hidden_size = 128
input_size = output_size = len(vocab)
rnn = RNN(input_size, hidden_size, output_size)

# Move the model to the GPU. This happens before the optimizer is created
# from rnn.parameters().
rnn.cuda()

loss_fn = nn.MSELoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.005)

def run_epoch(inputs, targets):
    """Run one pass over the training sequence on the GPU and take one optimizer step.

    Uses the module-level ``rnn`` (already moved to the GPU with
    ``rnn.cuda()``), ``loss_fn`` and ``optimizer``. The hidden state is
    carried from each step to the next; the per-step losses are summed and
    back-propagated once, after the last step.

    Args:
        inputs: indexable sequence of per-step input rows (CPU tensors).
        targets: per-step target rows, indexed in lockstep with ``inputs``.

    Returns:
        A pair ``(output, loss)``: the network output of the final step and
        the summed loss, read out with ``loss.data[0]``.
    """
    # Clear gradients accumulated by the previous call.
    optimizer.zero_grad()

    # init_hidden() builds h0 on the CPU, so it is moved to the GPU once here.
    # Every later hidden state is produced by the GPU-resident model and is
    # therefore already on the GPU; no per-step transfer is needed.
    hidden = rnn.init_hidden().cuda()

    loss = 0
    for step, row in enumerate(inputs):
        # The training rows live on the CPU, so each one is transferred as it
        # is consumed.
        step_input = Variable(row.view(1, -1)).cuda()
        step_target = Variable(targets[step].view(1, -1)).cuda()
        output, hidden = rnn(step_input, hidden)
        loss += loss_fn(output, step_target)

    loss.backward()
    optimizer.step()

    # NOTE(review): `loss.data[0]` relies on the loss tensor being indexable;
    # on PyTorch releases where a scalar loss is 0-dimensional this would need
    # `loss.item()` instead -- confirm against the installed version.
    return output, loss.data[0]

def train(inputs, targets, n_epochs = 100, print_every = 10):
    """Call ``run_epoch`` repeatedly and report the loss at a fixed interval.

    Args:
        inputs: training inputs, passed unchanged to ``run_epoch``.
        targets: training targets, passed unchanged to ``run_epoch``.
        n_epochs: number of times ``run_epoch`` is called.
        print_every: the loss is printed whenever the 1-based epoch number
            is a multiple of this value.
    """
    for epoch in range(1, n_epochs + 1):
        # run_epoch also returns the last network output; only the loss is
        # needed here.
        _, loss = run_epoch(inputs, targets)
        if epoch % print_every == 0:
            print('Epoch: %2i / Loss: %.7f' % (epoch, loss))
        
# Train the GPU model for 1000 epochs, printing the loss every 100.
train(inputs, targets, n_epochs=1000, print_every=100)

Epoch: 100 / Loss: 0.0355654
Epoch: 200 / Loss: 0.0138856
Epoch: 300 / Loss: 0.0056430
Epoch: 400 / Loss: 0.0046159
Epoch: 500 / Loss: 0.0027214
Epoch: 600 / Loss: 0.0032206
Epoch: 700 / Loss: 0.0024738
Epoch: 800 / Loss: 0.0020541
Epoch: 900 / Loss: 0.0014194
Epoch: 1000 / Loss: 0.0012618
