In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

# Data

In [2]:
PATH='data/nietzsche/'

In [3]:
# Fetch the corpus to disk (get_data comes from the fastai star-imports above;
# presumably it downloads url -> path - confirm), then read it into memory.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# A context manager closes the file handle deterministically instead of leaking it.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

###### Helps to have a 0 character to represent something like `padding`

In [6]:
# Rebuild from `text` instead of inserting in place, so re-running this cell
# cannot stack up extra '\0' entries at the front of the vocabulary.
chars = ['\0'] + sorted(set(text))

In [7]:
''.join(chars[:-1])

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæé'

In [8]:
vocab_size = len(chars)

In [9]:
vocab_size

85

##### Map chars - indices and indices - chars

In [10]:
# Two lookup tables over the same ordering of `chars`:
# index -> character, and its inverse character -> index.
indices_char = dict(enumerate(chars))
char_indices = {ch: idx for idx, ch in indices_char.items()}

In [11]:
input_idxs = [char_indices[c] for c in text]

In [12]:
input_idxs[:20]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]

In [13]:
''.join(indices_char[i] for i in input_idxs[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

# Char 3 model

#### Create lists of every 3rd character (stride `cs = 3`), starting at the 0th, 1st, 2nd and 3rd characters; the first three lists are the inputs and the fourth is the target

### Data

In [14]:
cs = 3
# One list per offset 0..cs: walk the corpus in strides of `cs` and take the
# character `offset` positions after each window start. Offsets 0-2 are the
# three inputs; offset 3 is the character to predict.
c1_dat, c2_dat, c3_dat, c4_dat = [
    [input_idxs[start + offset] for start in range(0, len(input_idxs) - cs, cs)]
    for offset in range(cs + 1)
]

In [15]:
print(' '.join(str(c) for c in c1_dat[:10]))
print(' '.join(str(c) for c in c2_dat[:10]))
print(' '.join(str(c) for c in c3_dat[:10]))
print(' '.join(str(c) for c in c4_dat[:10]))

40 30 29 1 40 43 31 61 2 74
42 25 1 43 40 33 2 54 44 73
29 27 1 45 39 38 73 73 71 61
30 29 1 40 43 31 61 2 74 2


#### Inputs

In [16]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

#### Output

In [17]:
y = np.stack(c4_dat)

In [18]:
y

array([30, 29,  1, ..., 59, 67, 72])

In [19]:
x1.shape, x2.shape, x3.shape, y.shape

((200297,), (200297,), (200297,), (200297,))

### Model

###### Number of hidden activations. Arbitrary number: experiment!

In [20]:
n_hidden = 256 

###### The number of latent factors to create (i.e. the width of the `embedding matrix`). Arbitrary number: experiment!

In [21]:
n_factors = 42

In [22]:
class Char3Model(nn.Module):
    """Predict the 4th character from the previous three with a hand-unrolled RNN.

    The same three layers are reused at every step: `l_in` (embedding -> hidden),
    `l_hidden` (hidden -> hidden) and `l_out` (hidden -> vocabulary scores).
    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct characters (embedding rows / output scores).
        n_factors: width of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # Embedding for all our chars
        self.emb = nn.Embedding(vocab_size, n_factors)

        # input to hidden layers (char 0, 1 and 2) - Green arrows in the lesson 6 slides
        self.l_in = nn.Linear(n_factors, n_hidden)

        # hidden to hidden - Orange arrows
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # hidden to output - Blue arrow
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, char1, char2, char3):
        """Return log-probabilities (batch x vocab_size) for the character following char3."""
        in1 = F.relu(self.l_in(self.emb(char1)))
        in2 = F.relu(self.l_in(self.emb(char2)))
        in3 = F.relu(self.l_in(self.emb(char3)))

        # The initial hidden state is all zeros and 0 + in1 == in1, so the first
        # step consumes in1 directly: no zero tensor and no hard-coded .cuda().
        h = F.tanh(self.l_hidden(in1))       # char1 input layer
        h = F.tanh(self.l_hidden(h + in2))   # hidden activation + char2 input layer
        h = F.tanh(self.l_hidden(h + in3))   # hidden activation + char3 input layer

        # Explicit dim, like the other models in this notebook; relying on the
        # implicit dimension is deprecated in PyTorch.
        return F.log_softmax(self.l_out(h), dim=-1)

###### `np.stack([x1, x2, x3], axis=1)` puts one input per column; the `from_arrays` method passes those columns to the `forward` function as char1, char2, char3.

In [23]:
# NOTE(review): the second argument appears to be the validation row indices
# (the later from_arrays calls in this notebook pass `val_idx` there). With [-1]
# only the last row is held out, so the val_loss printed by fit() below is
# presumably measured on a single sample - confirm before reading much into it.
model_data = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [24]:
model = Char3Model(vocab_size, n_factors).cuda()

In [25]:
itr = iter(model_data.trn_dl)
*xt, yt = next(itr) # gives the first batch of x and y values represented as tensors

In [26]:
len(xt) # 3 inputs

3

In [27]:
xt[0].size() # bs = 512

torch.Size([512])

In [28]:
len(yt)

512

In [29]:
t = model(*V(xt)) # Feed the inputs to the model as a Pytorch variable

In [30]:
t # model output (log-probabilities) - batch size x vocab size

Variable containing:
-4.5913 -4.3566 -4.6661  ...  -4.2598 -4.2236 -4.2551
-4.7305 -4.3931 -4.8056  ...  -4.4504 -4.3482 -4.2300
-4.5454 -4.4885 -4.4548  ...  -4.4871 -4.3387 -4.2716
          ...             ⋱             ...          
-4.4278 -4.5744 -4.5279  ...  -4.3445 -4.4496 -4.2653
-4.6582 -4.3198 -4.8140  ...  -4.4846 -4.4026 -4.1842
-4.2736 -4.4791 -4.6715  ...  -4.2848 -4.3814 -4.2055
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [31]:
optimizer = optim.Adam(model.parameters(), 1e-2)

In [32]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.098745   0.367612  



[0.36761236]

In [33]:
set_lrs(optimizer, 1e-3)

In [34]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.850784   0.300332  



[0.3003316]

## Test model

In [35]:
def get_next(chars):
    """Predict the character most likely to follow the string `chars`.

    Each character is mapped to its vocabulary index, the indices are passed to
    the global `model` as separate positional inputs, and the character whose
    position holds the largest value in the model output is returned.
    """
    encoded = np.array([char_indices[ch] for ch in chars])
    scores = to_np(model(*VV(T(encoded))))
    return indices_char[np.argmax(scores)]

In [36]:
get_next('y. ')

'T'

In [37]:
get_next('ppl')

'e'

In [38]:
get_next(' th')

'e'

In [39]:
get_next('The')

' '

In [40]:
get_next('. S')

'o'

# RNN

## Inputs

In [41]:
cs = 8

For each starting position in the text, create a list of the 8 consecutive characters beginning there (a sliding window that moves one character at a time). These will be the 8 inputs to our model.

In [42]:
# One row per start position j: the `cs` consecutive character indices beginning at j.
c_in_dat = [input_idxs[j:j + cs] for j in range(len(input_idxs) - cs)]

In [43]:
len(c_in_dat), len(c_in_dat[0])

(600885, 8)

In [44]:
c_out_dat = [input_idxs[j + cs] for j in range(len(input_idxs) - cs)]

In [45]:
len(c_out_dat)

600885

In [46]:
xs = np.stack(c_in_dat, axis=0)

In [47]:
xs.shape

(600885, 8)

In [48]:
y = np.stack(c_out_dat)

In [49]:
y.shape

(600885,)

In [50]:
xs[:cs][:8]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

In [51]:
y[:8]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

## Model

In [52]:
class CharLoopModel(nn.Module):
    """Character-level RNN written as an explicit Python loop over the inputs.

    Every input character goes through the same embedding and input layer and
    is *added* to the running hidden activations before the hidden-to-hidden
    layer. The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct characters.
        n_factors: width of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, n_factors)

        self.l_in = nn.Linear(n_factors, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities (batch x vocab_size) for the character after the last input.

        `cs` holds one tensor of character indices per time step, each of length batch size.
        """
        bs = cs[0].size(0)
        # Zero initial hidden state. No explicit .cuda(): V() alone is how the
        # stateful models in this notebook build their hidden state, so the
        # hard-coded device call was redundant and tied the model to a GPU.
        h = V(torch.zeros(bs, n_hidden))

        for c in cs:
            inp = F.relu(self.l_in(self.emb(c)))
            h = F.tanh(self.l_hidden(h + inp))

        return F.log_softmax(self.l_out(h), dim=-1)

In [53]:
val_idx = get_cv_idxs(len(input_idxs) - cs - 1)

assert val_idx is not None

val_idx

array([480310, 419017, 232803, ..., 134355, 389158, 330599])

In [54]:
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

assert model_data is not None

In [55]:
model = CharLoopModel(vocab_size, n_factors).cuda()
assert model is not None
optimizer = optim.Adam(model.parameters(), 1e-2)
assert optimizer is not None

In [56]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.989065   1.990273  



[1.9902734]

In [57]:
set_lrs(optimizer, 1e-3)

In [58]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.693568   1.701358  



[1.7013581]

### Concatenate factors and hidden activation 

We are currently adding them, which is probably not the best thing to do, as one represents the `input` and the other is the `activations learnt over a sequence of characters (each hidden layer)`.

In [59]:
class CharLoopConcatModel(nn.Module):
    """Loop-style character RNN that concatenates hidden state and input instead of adding them.

    At each step the running hidden activations and the current character's
    embedding are concatenated and fed through `l_in`, then `l_hidden`.
    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct characters.
        n_factors: width of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, n_factors)

        # the input layer now takes n_factors + n_hidden features because we concatenate them
        self.l_in = nn.Linear(n_factors + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities (batch x vocab_size) for the character after the last input."""
        bs = cs[0].size(0)
        # Zero initial hidden state. No explicit .cuda(): V() alone is how the
        # stateful models in this notebook build their hidden state, so the
        # hard-coded device call was redundant and tied the model to a GPU.
        h = V(torch.zeros(bs, n_hidden))

        for c in cs:
            # join along dim 1 (features): [hidden activations | embedding]
            inp = F.relu(self.l_in(torch.cat((h, self.emb(c)), 1)))
            h = F.tanh(self.l_hidden(inp))

        return F.log_softmax(self.l_out(h), dim=-1)

In [60]:
model = CharLoopConcatModel(vocab_size, n_factors).cuda()
assert model is not None

In [61]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

In [62]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.832812   1.809611  



[1.8096114]

In [63]:
set_lrs(optimizer, 1e-4)

In [64]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.71754    1.720564  



[1.7205644]

#### Test model

In [65]:
def get_next(inp):
    """Return the character the global `model` rates most likely to follow `inp`.

    The characters of `inp` are converted to vocabulary indices and passed to
    the model as separate positional inputs; the position of the largest value
    in the output is looked up in `chars`.
    """
    encoded = np.array([char_indices[ch] for ch in inp])
    scores = to_np(model(*VV(T(encoded))))
    return chars[np.argmax(scores)]

def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the whole string.

    After each prediction the context window drops its first character and
    gains the predicted one, so every call to `get_next` sees the most recent
    characters only.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [66]:
get_next('y. ')

'T'

In [67]:
get_next('for thos')

'e'

In [68]:
get_next('a. ')

'I'

In [69]:
get_next('eir name')

' '

### RNN using Pytorch

In [70]:
class CharRNN(nn.Module):
    """Character-level model that replaces the hand-written loop with `nn.RNN`.

    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct characters.
        n_factors: width of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, n_factors)

        # use PyTorch RNN model
        self.rnn = nn.RNN(n_factors, n_hidden)

        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities (batch x vocab_size) for the character after the last input."""
        bs = cs[0].size(0)
        # nn.RNN expects a rank 3 hidden state whose leading axis has size 1 here.
        # Not needed for this model, but required for bi-directional and stacked RNNs.
        # No explicit .cuda(): V() alone is how the stateful models in this notebook
        # build their hidden state, so the hard-coded device call was redundant.
        h = V(torch.zeros(1, bs, n_hidden))

        # stack the per-step index tensors into (time steps, batch) before embedding
        inp = self.emb(torch.stack(cs))
        outp, h = self.rnn(inp, h)

        # outp holds the hidden activations of every time step; only the last
        # step (index -1) is used to predict the next character
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [71]:
model = CharRNN(vocab_size, n_factors).cuda()
assert model is not None

In [72]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

#### Manually...

In [73]:
itr = iter(model_data.trn_dl)
*xs, yt = next(itr) # get next batch

In [74]:
t = model.emb(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [75]:
hiddent = V(torch.zeros(1, 512, n_hidden))
outp, hn = model.rnn(t, hiddent)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [76]:
t = model(*V(xs)); t.size()

torch.Size([512, 85])

#### Using the model to fit

In [77]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.862324   1.840125  



[1.8401252]

In [78]:
set_lrs(optimizer, 1e-4)

In [79]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.769927   1.779424  
    1      1.744763   1.752078                              
    2      1.734504   1.728695                              
    3      1.699613   1.708632                              



[1.7086325]

#### Test model

In [80]:
get_next('for thos')

'e'

In [81]:
get_next_n('for thos', 50)

'for those the same the same the same the same the same the'

# Multi output model

In the previous model, the exact same sequence minus 1 char was repeated in each row. This is not very efficient.

Instead, we can break the text into non-overlapping chunks, each `cs` characters long (8 here), and use one chunk per row as the input sequence.

Also, in the previous model we were using only the final hidden output `(outp[-1])`. Now, we will use all the outputs from every hidden layer.

So, the output sequence will be next char in the input text. TEXT = "And, so it begins.". The output sequence would be `"nd, so it begins."`

In [82]:
c_in_dat = [[input_idxs[i + j] for i in range(cs)] for j in range(0, len(input_idxs)-cs-1, cs)]

###### Create output offset by 1. See the first param of range for var j

In [83]:
c_out_dat = [[input_idxs[i + j] for i in range(cs)] for j in range(1, len(input_idxs)-cs-1, cs)]

In [84]:
xs = np.stack(c_in_dat); xs.shape

(75111, 8)

In [85]:
ys = np.stack(c_out_dat); ys.shape

(75111, 8)

In [86]:
xs[:cs, :cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [87]:
ys[:cs, :cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

As you can see, it's off by 1 char

### Model

As you can see, the code is pretty much similar, except we return the entire outp list

In [88]:
class CharSeqRNN(nn.Module):
    """`nn.RNN` character model that predicts the next character at every time step.

    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct characters.
        n_factors: width of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, n_factors)

        # use PyTorch RNN model
        self.rnn = nn.RNN(n_factors, n_hidden)

        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities of shape (time steps, batch, vocab_size)."""
        bs = cs[0].size(0)
        # nn.RNN expects a rank 3 hidden state whose leading axis has size 1 here.
        # Not needed for this model, but required for bi-directional and stacked RNNs.
        # No explicit .cuda(): V() alone is how the stateful models in this notebook
        # build their hidden state, so the hard-coded device call was redundant.
        h = V(torch.zeros(1, bs, n_hidden))

        # stack the per-step index tensors into (time steps, batch) before embedding
        inp = self.emb(torch.stack(cs))
        outp, h = self.rnn(inp, h)

        # outp holds the hidden activations of every time step and all of them
        # are scored, giving one prediction per input character
        return F.log_softmax(self.l_out(outp), dim=-1)

In [89]:
val_idx = get_cv_idxs(len(xs) - cs - 1)

In [90]:
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [91]:
model = CharSeqRNN(vocab_size, n_factors).cuda()
assert model is not None

In [92]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

In [93]:
itr = iter(model_data.trn_dl)
assert itr is not None
*xst, yt = next(itr)
assert xst is not None
assert yt is not None

In [94]:
len(xst), xst[0].size()

(8, torch.Size([512]))

In [95]:
yt.size()

torch.Size([512, 8])

###### The default PyTorch negative log likelihood loss (`nll_loss`) function supports only a rank 2 tensor, but we have a rank 3 `(time steps (aka sequence length)=8, bs=512, vocab=85)`, so we flatten it to rank 2.

The targets arrive as `(bs=512, time steps=8)`, so they are transposed and flattened into a single vector that lines up with the flattened predictions.

In [96]:
def nll_loss_seq(inp, targs):
    """Negative log likelihood loss for per-time-step predictions.

    Args:
        inp: rank 3 log-probabilities, unpacked here as
            (sequence length, batch size, number of classes).
        targs: target indices laid out batch-first; they are transposed so that
            flattening them follows the same order as flattening `inp`.

    Returns:
        `F.nll_loss` evaluated on the flattened predictions and targets.
    """
    seq_len, batch_size, n_classes = inp.size()
    # transpose produces a non-contiguous tensor, hence .contiguous() before .view();
    # -1 lets view pick the length: one entry per (time step, sample) pair
    flat_targets = targs.transpose(0, 1).contiguous().view(-1)
    flat_preds = inp.view(-1, n_classes)
    return F.nll_loss(flat_preds, flat_targets)

In [97]:
fit(model, model_data, 1, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.593358   2.402694  



[2.4026935]

In [98]:
set_lrs(optimizer, 1e-4)

In [99]:
fit(model, model_data, 1, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.379065   2.365463  



[2.365463]

# Identity init

The default PyTorch initialization of the hidden-to-hidden weight matrix is random. This can cause `gradient explosion`, as this matrix is multiplied repeatedly in the loop inside self.rnn.

So a cool trick is to init the hidden-to-hidden weights to an identity matrix, which when multiplied by another matrix, results in the same matrix. i.e.: `matrix_a * identity_matrix = matrix_a`

In [100]:
model = CharSeqRNN(vocab_size, n_factors).cuda()
assert model is not None
optimizer = optim.Adam(model.parameters(), 1e-2)
assert optimizer is not None

###### Init the hidden layer

In [101]:
model.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [102]:
fit(model, model_data, 4, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.361371   2.187065  
    1      2.102242   2.044799                              
    2      2.001742   1.970715                              
    3      1.949674   1.942515                              



[1.9425148]

In [103]:
set_lrs(optimizer, 1e-3)

In [104]:
fit(model, model_data, 4, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      1.863767   1.872093  
    1      1.849467   1.865412                              
    2      1.844203   1.860011                              
    3      1.834789   1.854303                              



[1.8543025]

# Stateful model

## Setup

In [105]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

In [106]:
PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

[0m[01;34mmodels[0m/  nietzsche.txt  [01;34mtrn[0m/  [01;34mval[0m/


TODO: split trn to 80%

In [107]:
%ls {PATH}trn
%ls {PATH}val

nietzsche.txt
nietzsche.txt


In [108]:
# Character-level field: text is lower-cased and tokenized with `list`, i.e. one token per character.
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_factors=42; n_hidden=256 # bptt plays the role the `cs` variable played above; NOTE(review): the loader may vary the actual sequence length slightly per batch - confirm

# The validation folder is also used as the test set.
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
model_data = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

# no. of mini-batches, no. of unique tokens (vocab size), no. of training documents, no. of tokens in the first training document
len(model_data.trn_dl), model_data.nt, len(model_data.trn_ds), len(model_data.trn_ds[0].text)

(1153, 55, 1, 590960)

In [109]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', ' ', 'e', 't', 'i', 'a', 'o', 'n', 's']

## RNN

###### Note

Earlier we were throwing away the learnt hidden activations after each loop, which is wasteful.

So now, we want to **store them and reuse them**. However, this creates another problem. If we have a million inputs, this needs to compute and **remember a million layers' worth of computations**.

So, after every sequence, while we still **keep the activations**, we **drop most of the history** of how we got there by creating a new PyTorch variable in the `repackage_var()` function

In [110]:
class CharSeqStatefulRNN(nn.Module):
    """`nn.RNN` character model that carries its hidden state from one batch to the next.

    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct tokens.
        n_factors: width of each token embedding.
        bs: batch size used to size the initial hidden state.
    """
    def __init__(self, vocab_size, n_factors, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb = nn.Embedding(vocab_size, n_factors)
        self.rnn = nn.RNN(n_factors, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        # keep the initial state on the module, since it is reused across batches
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (number of predictions, vocab_size)."""
        batch_size = cs[0].size(0)
        # The last mini-batch of an epoch may be smaller than the others; when
        # the stored state no longer matches the batch, start again from zeros.
        if batch_size != self.h.size(1):
            self.init_hidden(batch_size)

        activations, last_hidden = self.rnn(self.emb(cs), self.h)
        # keep the values of the state for the next batch; repackage_var is what
        # this notebook uses to drop the history of how they were computed
        self.h = repackage_var(last_hidden)

        # softmax over the last axis (the per-token scores), then flatten so
        # each row is one prediction with vocab_size columns
        log_probs = F.log_softmax(self.l_out(activations), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for batch size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [111]:
# Size the stored hidden state for the loader's actual batch size (`bs`, set
# alongside bptt above) instead of an unrelated 512, which forward() would
# discard and re-create on the first batch. The former `is not None` asserts
# are dropped: a constructor call can never return None.
model = CharSeqStatefulRNN(model_data.nt, n_factors, bs).cuda()
optimizer = optim.Adam(model.parameters(), 1e-2)

In [112]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                                 
    0      1.948119   1.986244  
    1      1.945059   1.964081                                 
    2      1.934376   2.019065                                 
    3      1.960284   2.001886                                 



[2.001886]

In [113]:
set_lrs(optimizer, 1e-3)

In [114]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                                 
    0      1.716658   1.741921  
    1      1.684743   1.700374                                 
    2      1.661317   1.679244                                 
    3      1.642028   1.661543                                 



[1.6615433]

## RNN cell

See [lesson6-rnn](file:///lesson6-rnn.ipynb)

## GRU - better than RNN

###### Overview 

GRU has two gates. Each gate is a linear-regression-like equation (kind of a mini NN; there can be several of these mini NNs making these kinds of decisions).
1. **Reset (some of) the hidden state** E.g.: When encountering something like a full stop, we would probably want to forget part of what has been accumulated before computing the new candidate state (h_tilda).
2. **Update the hidden state** - This gate decides if we should _update our hidden state with the newly created candidate (h_tilda)_ or _leave it as it is._


In [115]:
class CharSeqStatefulGRU(nn.Module):
    """`nn.GRU` character model that carries its hidden state from one batch to the next.

    Identical in structure to the stateful RNN model in this notebook, with
    `nn.GRU` as the recurrent layer. The hidden width is read from the
    notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct tokens.
        n_factors: width of each token embedding.
        bs: batch size used to size the initial hidden state.
    """
    def __init__(self, vocab_size, n_factors, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb = nn.Embedding(vocab_size, n_factors)
        # use PyTorch GRU model (the attribute keeps the name `rnn`)
        self.rnn = nn.GRU(n_factors, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        # keep the initial state on the module, since it is reused across batches
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (number of predictions, vocab_size)."""
        batch_size = cs[0].size(0)
        # The last mini-batch of an epoch may be smaller than the others; when
        # the stored state no longer matches the batch, start again from zeros.
        if batch_size != self.h.size(1):
            self.init_hidden(batch_size)

        activations, last_hidden = self.rnn(self.emb(cs), self.h)
        # keep the values of the state for the next batch; repackage_var is what
        # this notebook uses to drop the history of how they were computed
        self.h = repackage_var(last_hidden)

        # softmax over the last axis (the per-token scores), then flatten so
        # each row is one prediction with vocab_size columns
        log_probs = F.log_softmax(self.l_out(activations), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for batch size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [116]:
# Size the stored hidden state for the loader's actual batch size (`bs`, set
# alongside bptt above) instead of an unrelated 512, which forward() would
# discard and re-create on the first batch. The former `is not None` asserts
# are dropped: a constructor call can never return None.
model = CharSeqStatefulGRU(model_data.nt, n_factors, bs).cuda()
optimizer = optim.Adam(model.parameters(), 1e-2)

In [117]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                                 
    0      1.865196   1.891044  
    1      2.137428   2.156178                                 
    2      2.296512   2.332376                                 
    3      2.275554   2.316704                                 



[2.3167038]

In [118]:
set_lrs(optimizer, 1e-3)

In [119]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                                 
    0      2.101469   2.128114  
    1      2.089605   2.107544                                 
    2      2.06972    2.085692                                 
    3      2.050724   2.072692                                 



[2.0726917]

# Resources

[Understanding LSTM](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)

[Intro to RNN](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-1-introduction-to-rnns/)

## Putting it together - LSTM

In [120]:
from fastai import sgdr

n_hidden=512

In [121]:
class CharSeqStatefulLSTM(nn.Module):
    """Stacked `nn.LSTM` character model that carries its state from one batch to the next.

    The hidden width is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of distinct tokens.
        n_factors: width of each token embedding.
        bs: batch size used to size the initial state.
        nl: number of stacked LSTM layers.
    """
    def __init__(self, vocab_size, n_factors, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.emb = nn.Embedding(vocab_size, n_factors)
        # dropout is added here, which is why n_hidden was raised to 512 for this model
        self.rnn = nn.LSTM(n_factors, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        # keep the initial state on the module, since it is reused across batches
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to (number of predictions, vocab_size)."""
        batch_size = cs[0].size(0)
        # The last mini-batch of an epoch may be smaller than the others; when
        # the stored state no longer matches the batch, start again from zeros.
        if batch_size != self.h[0].size(1):
            self.init_hidden(batch_size)

        activations, state = self.rnn(self.emb(cs), self.h)
        # keep the values of the state for the next batch; repackage_var is what
        # this notebook uses to drop the history of how they were computed
        self.h = repackage_var(state)

        # softmax over the last axis (the per-token scores), then flatten so
        # each row is one prediction with vocab_size columns
        log_probs = F.log_softmax(self.l_out(activations), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored state to zeros for batch size `bs`."""
        # nn.LSTM takes a pair of state tensors (hidden state and cell state),
        # each sized (number of layers, batch size, n_hidden)
        self.h = tuple(V(torch.zeros(self.nl, bs, n_hidden)) for _ in range(2))

In [122]:
# Size the stored state for the loader's actual batch size (`bs`, set alongside
# bptt above) instead of an unrelated 512, which forward() would discard and
# re-create on the first batch. The former `is not None` asserts are dropped:
# a constructor call can never return None.
model = CharSeqStatefulLSTM(model_data.nt, n_factors, bs=bs, nl=2).cuda()
layer_optim = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5) # 1e-5 is weight decay

In [123]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [124]:
fit(model, model_data, 4, layer_optim.opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.783368   1.721933  
    1      1.670954   1.614005                                
    2      1.627069   1.564956                                
    3      1.6022     1.530213                                



[1.5302126]

In [125]:
def cb_on_end(sched, cycle):
    """Save the global `model` under `{PATH}models/cycle_{cycle}`; `sched` is accepted but not used."""
    return save_model(model, f'{PATH}models/cycle_{cycle}')

In [126]:
# The cosine annealing callback will change the learning rate in layer_optim.
# len(model_data.trn_dl) is the number of batches in an epoch.
# on_cycle_end = callback to execute at the end of every cycle; in our case it saves the model.

cb_diff_rate = [CosAnneal(layer_optim, len(model_data.trn_dl), cycle_mult=2, on_cycle_end=cb_on_end)]

In [127]:
fit(model, model_data, 2**4-1, layer_optim.opt, F.nll_loss, callbacks=cb_diff_rate)

epoch      trn_loss   val_loss                                
    0      1.460061   1.402394  
    1      1.518817   1.45111                                 
    2      1.406631   1.357684                                
    3      1.542031   1.482027                                
    4      1.482199   1.417088                                
    5      1.411333   1.350648                                
    6      1.353592   1.311607                                
    7      1.537986   1.473367                                
    8      1.516637   1.447743                                
    9      1.48308    1.424815                                
    10     1.45499    1.390888                                
    11     1.412474   1.350913                                
    12     1.374482   1.3171                                  
    13     1.332594   1.281977                                
    14     1.30131    1.264177                                



[1.2641774]

In [None]:
fit(model, model_data, 2**6-1, layer_optim.opt, F.nll_loss, callbacks=cb_diff_rate)

epoch      trn_loss   val_loss                                
    0      1.506091   1.443144  
    1      1.47467    1.415195                                
    2      1.453964   1.396274                                
    3      1.419088   1.359969                                
    4      1.38591    1.326395                                
    5      1.346146   1.292296                                
    6      1.307357   1.259894                                
    7      1.287982   1.248361                                
    8      1.49369    1.421023                                
    9      1.479992   1.418226                                
    10     1.486799   1.416987                                
    11     1.476497   1.410142                                
    12     1.451897   1.390113                                
    13     1.433927   1.380422                                
    14     1.43018    1.367612                                
    15     1.408999   

### Test

In [None]:
def get_next(inp):
    """Sample the next token after `inp` from the global `model`'s predicted distribution.

    `inp` is numericalized with the `TEXT` field and transposed before being
    fed to the model. The last row of the output holds log-probabilities, so it
    is exponentiated and one index is drawn from it with `torch.multinomial`
    (a random draw rather than the arg-max); the index is mapped back to its
    token through `TEXT.vocab.itos`.
    """
    token_ids = TEXT.numericalize(inp).transpose(0, 1)
    log_probs = model(VV(token_ids))
    sampled = torch.multinomial(log_probs[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [None]:
get_next('for thos')

In [None]:
def get_next_n(inp, n):
    """Extend `inp` by `n` generated tokens and return the whole string.

    After each step the context window drops its first character and gains the
    newly generated token, which is then passed to the current `get_next`.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        generated = get_next(window)
        pieces.append(generated)
        window = window[1:] + generated
    return ''.join(pieces)

In [None]:
print(get_next_n('for thos', 400))