In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

# Data

In [2]:
PATH='data/nietzsche/'

In [3]:
# Fetch the corpus with fastai's get_data helper, then load it into memory.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read through a context manager so the file handle is closed deterministically, and
# decode explicitly: the corpus contains non-ASCII characters (e.g. Æ, ä, æ, é), so the
# platform default encoding must not be relied on.
with open(f'{PATH}nietzsche.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))

###### Helps to have a 0 character to represent something like `padding`

In [6]:
chars.insert(0, '\0')

In [7]:
''.join(chars[:-1])

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæé'

In [8]:
vocab_size = len(chars)

In [9]:
vocab_size

85

##### Map chars - indices and indices - chars

In [10]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [11]:
input_idxs = [char_indices[c] for c in text]

In [12]:
input_idxs[:20]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]

In [13]:
''.join(indices_char[i] for i in input_idxs[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

# Char 3 model

#### Create lists of every 3rd character (stride `cs = 3`), starting at the 0th, 1st, 2nd, then 3rd characters

### Data

In [14]:
cs = 3
c1_dat = [input_idxs[i]     for i in range(0, len(input_idxs) - cs, cs)]
c2_dat = [input_idxs[i + 1] for i in range(0, len(input_idxs) - cs, cs)]
c3_dat = [input_idxs[i + 2] for i in range(0, len(input_idxs) - cs, cs)]
c4_dat = [input_idxs[i + 3] for i in range(0, len(input_idxs) - cs, cs)]

In [15]:
print(' '.join(str(c) for c in c1_dat[:10]))
print(' '.join(str(c) for c in c2_dat[:10]))
print(' '.join(str(c) for c in c3_dat[:10]))
print(' '.join(str(c) for c in c4_dat[:10]))

40 30 29 1 40 43 31 61 2 74
42 25 1 43 40 33 2 54 44 73
29 27 1 45 39 38 73 73 71 61
30 29 1 40 43 31 61 2 74 2


#### Inputs

In [16]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

#### Output

In [17]:
y = np.stack(c4_dat)

In [18]:
y

array([30, 29,  1, ..., 59, 67, 72])

In [19]:
x1.shape, x2.shape, x3.shape, y.shape

((200297,), (200297,), (200297,), (200297,))

### Model

###### Hidden activations. Arbitrary number. Experiment!

In [20]:
n_hidden = 256 

###### The number of latent factors to create (i.e. the size of the `embedding matrix`). Arbitrary number. Experiment!

In [21]:
n_factors = 42

In [22]:
class Char3Model(nn.Module):
    """Predict the next character from the three characters that precede it.

    One embedding, one input layer and one hidden layer are shared by all three
    input characters; the layers are applied by hand, once per character.
    The hidden width is read from the module-level ``n_hidden``.

    Args:
        vocab_size: number of distinct characters (rows of the embedding and
            width of the output layer).
        n_factors: size of each character embedding.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # Embedding for all our chars
        self.emb = nn.Embedding(vocab_size, n_factors)

        # input to hidden layers (char 0, 1 and 2) - Green arrows in the lesson 6 slides
        self.l_in = nn.Linear(n_factors, n_hidden)

        # hidden to hidden - Orange arrows
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # hidden to output - Blue arrow
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, char1, char2, char3):
        """Return log-probabilities over the vocabulary for the character after char3.

        Each argument holds one character index per sample in the batch.
        """
        in1 = F.relu(self.l_in(self.emb(char1)))
        in2 = F.relu(self.l_in(self.emb(char2)))
        in3 = F.relu(self.l_in(self.emb(char3)))

        h = V(torch.zeros(in1.size()).cuda()) # init to ZEROS. Can do without this.
        h = F.tanh(self.l_hidden(h + in1)) # input to the hidden layer
        h = F.tanh(self.l_hidden(h + in2)) # Input hidden activation + char2 input layer
        h = F.tanh(self.l_hidden(h + in3)) # Input hidden activation + char3 input layer

        # Name the softmax axis explicitly: the implicit-dim form is deprecated, and
        # every other model in this notebook already passes dim=-1.
        return F.log_softmax(self.l_out(h), dim=-1)

###### The three columns of `np.stack([x1, x2, x3], axis=1)` will be mapped to `char1`, `char2`, `char3` in the `forward` function by the `from_arrays` method.

In [23]:
# NOTE(review): [-1] is passed in the position where later cells pass val_idx, so
# presumably only the last row is held out for validation; the val_loss reported
# for this model would then be computed on a single sample — confirm.
model_data = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [24]:
model = Char3Model(vocab_size, n_factors).cuda()

In [25]:
itr = iter(model_data.trn_dl)
*xt, yt = next(itr) # gives the first batch of x and y values represented as tensors

In [26]:
len(xt) # 3 inputs

3

In [27]:
xt[0].size() # bs = 512

torch.Size([512])

In [28]:
len(yt)

512

In [29]:
t = model(*V(xt)) # Feed the inputs to the model as a Pytorch variable

In [30]:
t # model output (log-probabilities) - batch size x vocab size

Variable containing:
-4.2339 -4.4919 -4.2477  ...  -4.4742 -4.4225 -4.4848
-4.5732 -4.7988 -4.4424  ...  -4.8834 -4.3811 -4.1374
-4.1714 -4.6891 -4.2716  ...  -4.5991 -4.5162 -4.5677
          ...             ⋱             ...          
-4.3672 -4.3019 -4.1932  ...  -4.5346 -4.5168 -4.5924
-4.3141 -4.5946 -4.3406  ...  -4.5322 -4.5009 -4.5595
-4.2684 -4.6779 -4.3547  ...  -4.5684 -4.4691 -4.3004
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [31]:
optimizer = optim.Adam(model.parameters(), 1e-2)

In [32]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.070191   0.500434  



[0.5004344]

In [33]:
set_lrs(optimizer, 1e-3)

In [34]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.8062     0.213173  



[0.21317291]

## Test model

In [35]:
def get_next(chars):
    """Return the character the global `model` scores highest after `chars`.

    Each character of `chars` is looked up in `char_indices` and fed to the
    model as a separate input; the argmax of the model output is mapped back
    to a character through `indices_char`.
    """
    encoded = np.array([char_indices[ch] for ch in chars])
    log_probs = model(*VV(T(encoded)))
    best_idx = np.argmax(to_np(log_probs))
    return indices_char[best_idx]

In [36]:
get_next('y. ')

'T'

In [37]:
get_next('ppl')

'e'

In [38]:
get_next(' th')

'e'

In [39]:
get_next('The')

' '

In [40]:
get_next('. S')

'o'

# RNN

## Inputs

In [41]:
cs = 8

For every starting position in the text, create a list of the 8 consecutive characters that begin there (so neighbouring rows overlap by 7 characters). These will be the 8 inputs to our model.

In [42]:
c_in_dat = [[input_idxs[i + j] for i in range(cs)] for j in range(len(input_idxs) - cs)]

In [43]:
len(c_in_dat), len(c_in_dat[0])

(600885, 8)

In [44]:
c_out_dat = [input_idxs[j + cs] for j in range(len(input_idxs) - cs)]

In [45]:
len(c_out_dat)

600885

In [46]:
xs = np.stack(c_in_dat, axis=0)

In [47]:
xs.shape

(600885, 8)

In [48]:
y = np.stack(c_out_dat)

In [49]:
y.shape

(600885,)

In [50]:
xs[:cs][:8]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

In [51]:
y[:8]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

## Model

In [52]:
class CharLoopModel(nn.Module):
    """Character model that applies the same layers to each input in a loop.

    Accepts any number of character inputs; the projected embedding of each
    one is added to the running hidden state. The hidden width is read from
    the module-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, n_factors)
        self.l_in = nn.Linear(n_factors, n_hidden)          # embedding -> hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)       # hidden -> hidden
        self.l_out = nn.Linear(n_hidden, vocab_size)        # hidden -> output

    def forward(self, *cs):
        """Return log-probabilities for the character that follows the inputs in `cs`."""
        batch_size = cs[0].size(0)
        # The hidden state starts as zeros, created on the GPU.
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())

        for char_idxs in cs:
            embedded = self.emb(char_idxs)
            projected = F.relu(self.l_in(embedded))
            hidden = F.tanh(self.l_hidden(hidden + projected))

        scores = self.l_out(hidden)
        return F.log_softmax(scores, dim=-1)

In [53]:
val_idx = get_cv_idxs(len(input_idxs) - cs - 1)

assert val_idx is not None

val_idx

array([480310, 419017, 232803, ..., 134355, 389158, 330599])

In [54]:
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

assert model_data is not None

In [55]:
model = CharLoopModel(vocab_size, n_factors).cuda()
assert model is not None
optimizer = optim.Adam(model.parameters(), 1e-2)
assert optimizer is not None

In [56]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.992196   1.977481  



[1.9774805]

In [57]:
set_lrs(optimizer, 1e-3)

In [58]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.693561   1.708419  



[1.7084187]

### Concatenate factors and hidden activation 

We are currently adding them, which is probably not the best thing to do, as one represents the `input` and the other is `activations "learnt" over a sequence of characters (each hidden layer)`.

In [59]:
class CharLoopConcatModel(nn.Module):
    """Loop model that concatenates the hidden state with each character embedding.

    Instead of adding the two, the hidden state and the embedding are joined
    along dimension 1 before the input layer, so that layer takes
    ``n_factors + n_hidden`` features. The hidden width is read from the
    module-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, n_factors)
        # The input layer sees the hidden state and the embedding side by side.
        self.l_in = nn.Linear(n_factors + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character that follows the inputs in `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())

        for char_idxs in cs:
            joined = torch.cat((hidden, self.emb(char_idxs)), 1)
            projected = F.relu(self.l_in(joined))
            hidden = F.tanh(self.l_hidden(projected))

        scores = self.l_out(hidden)
        return F.log_softmax(scores, dim=-1)

In [60]:
model = CharLoopConcatModel(vocab_size, n_factors).cuda()
assert model is not None

In [61]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

In [62]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.825872   1.800771  



[1.8007709]

In [63]:
set_lrs(optimizer, 1e-4)

In [64]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.718473   1.718774  



[1.7187737]

#### Test model

In [65]:
def get_next(inp):
    """Return the character the global `model` scores highest after the string `inp`.

    Characters are encoded with `char_indices`; the argmax of the model output
    is used as an index into `chars`.
    """
    encoded = np.array([char_indices[ch] for ch in inp])
    log_probs = model(*VV(T(encoded)))
    best_idx = np.argmax(to_np(log_probs))
    return chars[best_idx]

def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the resulting string.

    Each prediction comes from `get_next` on a sliding window: after every
    step the window drops its first character and gains the predicted one,
    so its length stays constant.
    """
    generated = inp
    window = inp
    for _ in range(n):
        predicted = get_next(window)
        generated += predicted
        window = window[1:] + predicted
    return generated

In [66]:
get_next('y. ')

'T'

In [67]:
get_next('for thos')

'e'

In [68]:
get_next('a. ')

'T'

In [69]:
get_next('eir name')

' '

### RNN using Pytorch

In [70]:
class CharRNN(nn.Module):
    """Next-character model that uses PyTorch's `nn.RNN` instead of a hand-written loop.

    Only the activation of the final time step is passed to the output layer.
    The hidden width is read from the module-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, n_factors)
        # PyTorch's built-in RNN layer takes the place of the manual loop
        self.rnn = nn.RNN(n_factors, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character that follows the inputs in `cs`."""
        batch_size = cs[0].size(0)
        # nn.RNN takes a rank 3 initial hidden state whose leading axis has size 1 here.
        # Although not needed for this model, that axis matters for bi-directional
        # and stacked (RNN to RNN) models.
        h0 = V(torch.zeros(1, batch_size, n_hidden).cuda())

        # Stack the per-position inputs along a new leading axis before embedding them.
        embedded = self.emb(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)

        # `outputs` has one entry per time step; keep only the last one.
        final_step = outputs[-1]
        return F.log_softmax(self.l_out(final_step), dim=-1)

In [71]:
model = CharRNN(vocab_size, n_factors).cuda()
assert model is not None

In [72]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

#### Manually...

In [73]:
itr = iter(model_data.trn_dl)
*xs, yt = next(itr) # get next batch

In [74]:
t = model.emb(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [75]:
hiddent = V(torch.zeros(1, 512, n_hidden))
outp, hn = model.rnn(t, hiddent)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [76]:
t = model(*V(xs)); t.size()

torch.Size([512, 85])

#### Using the model to fit

In [77]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.868345   1.844111  



[1.8441107]

In [78]:
set_lrs(optimizer, 1e-4)

In [79]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.766119   1.783243  
    1      1.747371   1.756444                              
    2      1.730701   1.732489                              
    3      1.708599   1.713269                              



[1.7132685]

#### Test model

In [80]:
get_next('for thos')

'e'

In [81]:
get_next_n('for thos', 50)

'for those the sense of the stranged the stranged the stran'

# Multi output model

In the previous model, the exact same sequence minus 1 char was repeated in each row. This is not very efficient.

Instead, we can create the input sequences by breaking the text into non-overlapping rows, each with as many columns as the sequence length, e.g. 8 characters per row when `cs = 8`.

Also, in the previous model we were using only the final hidden output `(outp[-1])`. Now, we will use the outputs from every time step.

So, the output sequence will be next char in the input text. TEXT = "And, so it begins.". The output sequence would be `"nd, so it begins."`

In [89]:
c_in_dat = [[input_idxs[i + j] for i in range(cs)] for j in range(0, len(input_idxs)-cs-1, cs)]

###### Create output offset by 1. See the first param of range for var j

In [91]:
# Targets are the input windows shifted right by one character, so both the start
# and the stop of the range move by 1 relative to the c_in_dat range. Shifting only
# the start would produce one row fewer than c_in_dat whenever
# (len(input_idxs) - cs - 2) is a multiple of cs.
c_out_dat = [[input_idxs[i + j] for i in range(cs)] for j in range(1, len(input_idxs)-cs, cs)]

In [92]:
xs = np.stack(c_in_dat); xs.shape

(75111, 8)

In [93]:
ys = np.stack(c_out_dat); ys.shape

(75111, 8)

In [95]:
xs[:cs, :cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [98]:
ys[:cs, :cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

As you can see, it's off by 1 char

### Model

As you can see, the code is pretty much similar, except we return the entire outp list

In [99]:
class CharSeqRNN(nn.Module):
    """RNN that predicts a next character at every time step of the input sequence.

    Same layers as the single-output RNN, but the output layer is applied to
    the activations of all time steps rather than just the last one.
    The hidden width is read from the module-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_factors):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, n_factors)
        # PyTorch's built-in RNN layer
        self.rnn = nn.RNN(n_factors, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities over the vocabulary for every time step."""
        batch_size = cs[0].size(0)
        # nn.RNN takes a rank 3 initial hidden state whose leading axis has size 1 here.
        # Although not needed for this model, that axis matters for bi-directional
        # and stacked (RNN to RNN) models.
        h0 = V(torch.zeros(1, batch_size, n_hidden).cuda())

        embedded = self.emb(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)

        # Unlike the single-output model, nothing is sliced off here: the output
        # layer and the softmax are applied to the activations of every time step.
        scores = self.l_out(outputs)
        return F.log_softmax(scores, dim=-1)

In [101]:
# NOTE(review): xs already holds one row per cs-character window at this point; the
# "- cs - 1" looks carried over from the earlier cell that sized the split from
# len(input_idxs) — confirm whether the last rows are meant to be excluded.
val_idx = get_cv_idxs(len(xs) - cs - 1)

In [102]:
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [118]:
model = CharSeqRNN(vocab_size, n_factors).cuda()
assert model is not None

In [119]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

In [120]:
itr = iter(model_data.trn_dl)
assert itr is not None
*xst, yt = next(itr)
assert xst is not None
assert yt is not None

In [121]:
len(xst), xst[0].size()

(8, torch.Size([512]))

In [122]:
yt.size()

torch.Size([512, 8])

###### The default PyTorch Negative Log Likelihood loss (`nll_loss`) function supports only a rank 2 tensor, but we have a rank 3 `(time steps (aka sequence length)=8, bs=512, vocab=85)`

In [125]:
def nll_loss_seq(inp, targs):
    """Negative log-likelihood loss for a sequence of predictions.

    `inp` must be rank 3 (it is unpacked into three sizes); its first two axes
    are flattened together and the last axis is kept as the class axis.
    `targs` has its first two axes swapped before flattening so that its
    element order matches the flattened predictions.
    """
    _, _, n_classes = inp.size()
    # The transpose makes the targets non-contiguous, so .contiguous() is
    # needed before .view(-1) can flatten them into a single vector.
    flat_targs = targs.transpose(0, 1).contiguous().view(-1)
    flat_preds = inp.view(-1, n_classes)
    return F.nll_loss(flat_preds, flat_targs)

In [126]:
fit(model, model_data, 1, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.616673   2.413175  



[2.4131746]

In [127]:
set_lrs(optimizer, 1e-4)

In [128]:
fit(model, model_data, 1, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.385795   2.374755  



[2.3747554]

# Identity init

The default PyTorch initialization of the hidden-to-hidden weight matrix is random. This can cause `gradient explosion` as this matrix is multiplied repeatedly in the loop inside `self.rnn`.

So a cool trick is to init the hidden-to-hidden weight matrix with an identity matrix, which, when multiplied by another matrix, results in the same matrix, i.e.: `matrix_a * identity_matrix = matrix_a`

In [138]:
model = CharSeqRNN(vocab_size, n_factors).cuda()
assert model is not None
optimizer = optim.Adam(model.parameters(), 1e-2)
assert optimizer is not None

###### Init the hidden layer

In [139]:
model.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [140]:
fit(model, model_data, 4, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.415068   2.264249  
    1      2.15286    2.088422                              
    2      2.045299   2.030455                              
    3      1.988121   2.004508                              



[2.004508]

In [141]:
set_lrs(optimizer, 1e-3)

In [142]:
fit(model, model_data, 4, optimizer, nll_loss_seq)

epoch      trn_loss   val_loss                             
    0      1.897527   1.911362  
    1      1.892329   1.904617                              
    2      1.883241   1.896475                              
    3      1.875589   1.893283                              



[1.893283]

# Stateful model