In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

# Data

In [2]:
PATH='data/nietzsche/'

In [3]:
# Fetch the corpus through fastai's `get_data` helper (presumably a no-op when the
# file is already present -- confirm against the fastai version in use).
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read the whole file inside a context manager so the handle is closed right away
# instead of lingering until garbage collection.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))

###### Helps to have a 0 character to represent something like `padding`

In [6]:
# Reserve index 0 for the padding character. The guard keeps this cell idempotent:
# re-running it must not insert a second '\0' and shift every character index.
if '\0' not in chars:
    chars.insert(0, '\0')

In [7]:
''.join(chars[:-1])

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæé'

In [8]:
vocab_size = len(chars)

In [9]:
vocab_size

85

##### Map chars - indices and indices - chars

In [10]:
# Two lookup tables over the vocabulary: position -> character, and its inverse.
indices_char = dict(enumerate(chars))
char_indices = {ch: idx for idx, ch in indices_char.items()}

In [11]:
input_idxs = [char_indices[c] for c in text]

In [12]:
input_idxs[:20]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]

In [13]:
''.join(indices_char[i] for i in input_idxs[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

# Char 3 model

#### Create a list of every 3rd character (stride `cs = 3`), starting at the 0th, 1st, 2nd, and then 3rd characters

### Data

In [14]:
cs = 3
# One list per offset 0..cs: walk the corpus in steps of `cs` and pick the
# character at `start + offset`. Offsets 0-2 become the inputs, offset 3 the target.
c1_dat, c2_dat, c3_dat, c4_dat = (
    [input_idxs[start + offset] for start in range(0, len(input_idxs) - cs, cs)]
    for offset in range(cs + 1)
)

In [15]:
print(' '.join(str(c) for c in c1_dat[:10]))
print(' '.join(str(c) for c in c2_dat[:10]))
print(' '.join(str(c) for c in c3_dat[:10]))
print(' '.join(str(c) for c in c4_dat[:10]))

40 30 29 1 40 43 31 61 2 74
42 25 1 43 40 33 2 54 44 73
29 27 1 45 39 38 73 73 71 61
30 29 1 40 43 31 61 2 74 2


#### Inputs

In [16]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

#### Output

In [17]:
y = np.stack(c4_dat)

In [18]:
y

array([30, 29,  1, ..., 59, 67, 72])

In [19]:
x1.shape, x2.shape, x3.shape, y.shape

((200297,), (200297,), (200297,), (200297,))

### Model

###### Hidden activations. Arbitrary number. Experiment!

In [20]:
n_hidden = 256 

###### The number of latent factors to create (i.e. the size of the `embedding matrix`). Arbitrary number. Experiment!

In [21]:
n_factors = 42

In [22]:
class Char3Model(nn.Module):
    """Predict the next character from the three preceding ones.

    The recurrence is unrolled by hand: the same embedding, input layer and
    hidden layer are applied to each of the three characters, so the steps
    share their weights.

    Args:
        vocab_size: number of distinct characters; sets the embedding row count
            and the width of the output layer.
        n_factors: size of each character embedding.

    The hidden width is read from the notebook-level `n_hidden`.
    """

    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # Embedding for all our chars
        self.emb = nn.Embedding(vocab_size, n_factors)

        # input to hidden layers (char 0, 1 and 2) - Green arrows in the lesson 6 slides
        self.l_in = nn.Linear(n_factors, n_hidden)

        # hidden to hidden - Orange arrows
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # hidden to output - Blue arrow
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, char1, char2, char3):
        """Return log-probabilities over the vocabulary for the next character.

        Args:
            char1, char2, char3: character-index tensors, one per input position.

        Returns:
            `log_softmax` of the output layer, normalised over the last axis.
        """
        in1 = F.relu(self.l_in(self.emb(char1)))
        in2 = F.relu(self.l_in(self.emb(char2)))
        in3 = F.relu(self.l_in(self.emb(char3)))

        # Start from an all-zero hidden state shaped like one input activation.
        h = V(torch.zeros(in1.size()).cuda())
        # Fold the three inputs into the hidden state, one step per character.
        for inp in (in1, in2, in3):
            h = F.tanh(self.l_hidden(h + inp))

        # Pass `dim` explicitly: relying on the implicit dimension is deprecated,
        # and the other models in this notebook already use dim=-1.
        return F.log_softmax(self.l_out(h), dim=-1)

###### The columns of `np.stack([x1, x2, x3], axis=1)` will be mapped to char1, char2, char3 in the forward function by the from_arrays method.

In [23]:
model_data = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [24]:
model = Char3Model(vocab_size, n_factors).cuda()

In [25]:
itr = iter(model_data.trn_dl)
*xt, yt = next(itr) # gives the first batch of x and y values represented as tensors

In [26]:
len(xt) # 3 inputs

3

In [27]:
xt[0].size() # bs = 512

torch.Size([512])

In [28]:
len(yt)

512

In [29]:
t = model(*V(xt)) # Feed the inputs to the model as a Pytorch variable

In [30]:
t # model output (log-probabilities) - batch size x vocab size

Variable containing:
-4.4240 -4.5106 -4.3911  ...  -4.3289 -4.4115 -4.4957
-4.3286 -4.3445 -4.2067  ...  -4.3855 -4.4347 -4.4061
-4.3850 -4.4178 -4.5417  ...  -4.4138 -4.2761 -4.5093
          ...             ⋱             ...          
-4.4336 -4.4008 -4.4418  ...  -4.5267 -4.6925 -4.4282
-4.3222 -4.3023 -4.2174  ...  -4.2617 -4.5135 -4.4200
-4.5313 -4.6107 -4.2877  ...  -4.5209 -4.6137 -4.2524
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [31]:
optimizer = optim.Adam(model.parameters(), 1e-2)

In [32]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.085744   1.00944   



[1.0094404]

In [33]:
set_lrs(optimizer, 1e-3)

In [34]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.838826   0.409908  



[0.4099083]

## Test model

In [35]:
def get_next(chars):
    """Return the character the current `model` scores highest after `chars`.

    Each character of `chars` is encoded with `char_indices` and passed to the
    model as a separate positional input. Note that the parameter shadows the
    notebook-level `chars` list inside this function.
    """
    encoded = np.array([char_indices[ch] for ch in chars])
    log_probs = model(*VV(T(encoded)))
    # argmax over the flattened prediction gives the winning vocabulary index.
    best_idx = np.argmax(to_np(log_probs))
    return indices_char[best_idx]

In [36]:
get_next('y. ')

'T'

In [37]:
get_next('ppl')

'e'

In [38]:
get_next(' th')

'e'

In [39]:
get_next('The')

' '

In [40]:
get_next('. S')

'o'

# RNN

## Inputs

In [41]:
cs = 8

For each starting position in the text, create a list of the next 8 characters (overlapping windows with stride 1). These will be the 8 inputs to our model.

In [42]:
# Each row is the window of `cs` consecutive character indices starting at position j.
c_in_dat = [input_idxs[j:j + cs] for j in range(len(input_idxs) - cs)]

In [43]:
len(c_in_dat), len(c_in_dat[0])

(600885, 8)

In [44]:
c_out_dat = [input_idxs[j + cs] for j in range(len(input_idxs) - cs)]

In [45]:
len(c_out_dat)

600885

In [46]:
xs = np.stack(c_in_dat, axis=0)

In [47]:
xs.shape

(600885, 8)

In [48]:
y = np.stack(c_out_dat)

In [49]:
y.shape

(600885,)

In [50]:
xs[:cs][:8]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

In [51]:
y[:8]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

## Model

In [52]:
class CharLoopModel(nn.Module):
    """Character model whose recurrence is an explicit Python loop.

    Every input character goes through the same embedding, input layer and
    hidden layer; the input activation is *added* to the running hidden state.
    The hidden width comes from the notebook-level `n_hidden`.
    """

    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # character index -> embedding vector
        self.emb = nn.Embedding(vocab_size, n_factors)

        # embedding -> hidden, hidden -> hidden, hidden -> vocabulary scores
        self.l_in = nn.Linear(n_factors, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        # The hidden state starts at zero for every sequence in the batch.
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())

        for char_idx in cs:
            embedded = self.emb(char_idx)
            hidden = F.tanh(self.l_hidden(hidden + F.relu(self.l_in(embedded))))

        return F.log_softmax(self.l_out(hidden), dim=-1)

In [53]:
val_idx = get_cv_idxs(len(input_idxs) - cs - 1)

assert val_idx is not None

val_idx

array([480310, 419017, 232803, ..., 134355, 389158, 330599])

In [54]:
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

assert model_data is not None

In [55]:
model = CharLoopModel(vocab_size, n_factors).cuda()
assert model is not None
optimizer = optim.Adam(model.parameters(), 1e-2)
assert optimizer is not None

In [56]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.001985   1.992708  



[1.9927082]

In [57]:
set_lrs(optimizer, 1e-3)

In [58]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.703461   1.703753  



[1.703753]

### Concatenate factors and hidden activation 

We are currently adding them, which probably is not the best thing to do, as one represents the `input` and the other is the `activations "learnt" over a sequence of characters (each hidden layer)`.

In [59]:
class CharLoopConcatModel(nn.Module):
    """Loop-based character model that concatenates hidden state and embedding.

    Instead of adding the input activation to the hidden state, each step feeds
    `[hidden, embedding]` through the input layer. The hidden width comes from
    the notebook-level `n_hidden`.
    """

    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # character index -> embedding vector
        self.emb = nn.Embedding(vocab_size, n_factors)

        # The input layer sees the concatenation, hence n_factors + n_hidden features.
        self.l_in = nn.Linear(n_factors + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())

        for char_idx in cs:
            # Join along the feature axis: hidden state first, then the embedding.
            joined = torch.cat((hidden, self.emb(char_idx)), 1)
            hidden = F.tanh(self.l_hidden(F.relu(self.l_in(joined))))

        return F.log_softmax(self.l_out(hidden), dim=-1)

In [60]:
model = CharLoopConcatModel(vocab_size, n_factors).cuda()
assert model is not None

In [61]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

In [62]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.790996   1.774132  



[1.7741321]

In [63]:
set_lrs(optimizer, 1e-4)

In [64]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.67848    1.688973  



[1.6889728]

#### Test model

In [65]:
def get_next(inp):
    """Return the character the current `model` scores highest after `inp`.

    Each character of `inp` is encoded with `char_indices` and handed to the
    model as a separate positional input.
    """
    encoded = np.array([char_indices[ch] for ch in inp])
    log_probs = model(*VV(T(encoded)))
    # argmax over the flattened prediction gives the winning vocabulary index.
    best_idx = np.argmax(to_np(log_probs))
    return chars[best_idx]

def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the full string.

    After each prediction the context window slides one character to the
    right, so `get_next` always sees the same number of characters.
    """
    window = inp
    generated = inp
    for _ in range(n):
        nxt = get_next(window)
        generated += nxt
        window = window[1:] + nxt
    return generated

In [66]:
get_next('y. ')

'T'

In [67]:
get_next('for thos')

'e'

In [68]:
get_next('a. ')

'T'

In [71]:
get_next('eir name')

' '

### RNN using Pytorch

In [97]:
class CharRNN(nn.Module):
    """Character model that delegates the recurrence to `nn.RNN`.

    The hidden width comes from the notebook-level `n_hidden`.
    """

    def __init__(self, vocab_size, n_factors):
        super().__init__()

        # character index -> embedding vector
        self.emb = nn.Embedding(vocab_size, n_factors)

        # PyTorch's built-in RNN replaces the hand-written loop
        self.rnn = nn.RNN(n_factors, n_hidden)

        # hidden state -> vocabulary scores
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        # nn.RNN takes a rank-3 initial hidden state; the leading axis has size 1
        # here (it only grows for stacked or bidirectional RNNs, per the nn.RNN docs).
        h0 = V(torch.zeros(1, batch_size, n_hidden).cuda())

        # Stack the per-position inputs along a new leading (sequence) axis, then embed.
        embedded = self.emb(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)

        # `outputs` holds one entry per input position; predict from the last one.
        return F.log_softmax(self.l_out(outputs[-1]), dim=-1)

In [98]:
model = CharRNN(vocab_size, n_factors).cuda()
assert model is not None

In [99]:
optimizer = optim.Adam(model.parameters(), 1e-3)
assert optimizer is not None

#### Manually...

In [100]:
# Pull a single batch from the training loader to step through the model by hand.
itr = iter(model_data.trn_dl)
# NOTE(review): this rebinds the notebook-level `xs` (the NumPy input array passed to
# `ColumnarModelData.from_arrays` earlier) to the list of per-position batch tensors.
# Re-running that earlier cell after this one would pick up the wrong `xs`.
*xs, yt = next(itr) # get next batch

In [101]:
t = model.emb(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [102]:
hiddent = V(torch.zeros(1, 512, n_hidden))
outp, hn = model.rnn(t, hiddent)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [103]:
t = model(*V(xs)); t.size()

torch.Size([512, 85])

#### Using the model to fit

In [104]:
fit(model, model_data, 1, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.847615   1.838715  



[1.8387152]

In [105]:
set_lrs(optimizer, 1e-4)

In [107]:
fit(model, model_data, 4, optimizer, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.749281   1.75014   
    1      1.720167   1.728232                              
    2      1.702862   1.708622                              
    3      1.69039    1.690233                              



[1.6902328]

#### Test model

In [108]:
get_next('for thos')

'e'

In [109]:
get_next_n('for thos', 50)

'for those and the sense of the sense of the sense of the s'