In [12]:
from fastai.text.all import *


# Data

A small dataset: numbers written out in English.

In [13]:
# Fetch the HUMAN_NUMBERS dataset through fastai's `untar_data`; `path` is the
# dataset folder that train.txt / valid.txt are read from in the next cell.
path = untar_data(URLs.HUMAN_NUMBERS)
# NOTE(review): presumably makes fastai display paths relative to the dataset
# root -- confirm against the fastai `Path.BASE_PATH` convention.
Path.BASE_PATH = path

In [14]:
# Pool the raw lines of the training and validation files into one list;
# the train/valid split is redone later on the numericalized sequences.
lines = L()
for fname in ('train.txt', 'valid.txt'):
    with open(path/fname) as f:
        lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

concatenate

In [15]:
# Strip the surrounding whitespace/newline of every line and glue the numbers
# into one long string, with a stand-alone "." between consecutive numbers.
text = ' . '.join(line.strip() for line in lines)
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

tokenize

In [16]:
# Split on single spaces: every word and every "." separator becomes a token.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

numericalize

In [17]:
# Vocabulary = the distinct tokens (fastai `L.unique`); the recorded output
# lists them in order of first appearance.
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [18]:
# Map every vocabulary entry to its position, then encode the whole corpus
# as a list of integer token ids.
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

# Recurrent Neural Network

## Simple language model with 3 words as input

predict next word based on the previous three words

In [19]:
# Non-overlapping windows: three consecutive token ids form the input and the
# token right after them is the target; the window advances 3 tokens at a time.
seqs = L(
    (tensor(nums[start:start+3]), nums[start+3])
    for start in range(0, len(nums)-4, 3)
)
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [20]:
# Batch size shared by every DataLoaders / model definition in this notebook.
bs = 64
# First 80% of the (ordered) samples for training, the rest for validation.
cut = int(len(seqs) * 0.8)
# Pass the `bs` variable rather than repeating the literal 64, so changing the
# batch size above actually takes effect here as well.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [21]:
class LMModel2(Module):
    """Predict the next token from three input tokens with a hand-written recurrent loop.

    The same embedding (`i_h`) and hidden-to-hidden layer (`h_h`) are reused at
    each of the three positions; `h_o` turns the final hidden activations into
    one score per vocabulary entry.
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)   # token id -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)      # hidden   -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)      # hidden   -> vocabulary scores

    def forward(self, x):
        # x is indexed as x[:, position]; only positions 0, 1 and 2 are read.
        hidden = 0  # hidden state starts from scratch on every call
        for pos in range(3):
            embedded = self.i_h(x[:, pos])
            hidden = F.relu(self.h_h(hidden + embedded))
        return self.h_o(hidden)

# same as:
#    def forward(self, x):
#        h = F.relu(self.h_h(self.i_h(x[:,0])))
#        h = h + self.i_h(x[:,1])
#        h = F.relu(self.h_h(h))
#        h = h + self.i_h(x[:,2])
#        h = F.relu(self.h_h(h))
#        return self.h_o(h)

In [22]:
# Train the 3-token model (hidden size 64) with plain cross-entropy:
# 4 epochs, learning-rate argument 1e-3.
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.875787,2.024178,0.466366,00:02
1,1.401659,1.831923,0.467316,00:02
2,1.406861,1.723105,0.491799,00:02
3,1.371849,1.659739,0.48966,00:02


## stateful RNN

Remember the hidden activations across batches for the entire epoch.

In [23]:
class LMModel3(Module):
    """Stateful version of the 3-token model.

    The hidden activations are stored in `self.h` and carried over from one
    call of `forward` to the next, until `reset` sets them back to 0.
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        hidden = self.h  # continue from the state left by the previous batch
        for pos in range(3):
            hidden = F.relu(self.h_h(hidden + self.i_h(x[:, pos])))
        logits = self.h_o(hidden)
        # Truncated backpropagation through time: keep the values of the hidden
        # state but drop its gradient history, so gradients only flow through
        # the steps of the current batch instead of the whole epoch.
        self.h = hidden.detach()
        return logits

    def reset(self):
        self.h = 0

In [24]:
def group_chunks(ds, bs):
    """Reorder `ds` so that an unshuffled loader with batch size `bs` reads it as `bs` parallel streams.

    With m = len(ds) // bs, the returned list holds ds[i + m*j] at position
    i*bs + j. Row j of batch i is therefore ds[i + m*j], i.e. row j walks
    through ds[m*j : m*(j+1)] in order over consecutive batches. Items beyond
    the first m*bs are dropped.
    """
    chunk_len = len(ds) // bs
    return L(ds[step + chunk_len*stream]
             for step in range(chunk_len)
             for stream in range(bs))

In [25]:
# 80/20 split in corpus order, then regroup each part so that every batch row
# continues the text of the same row in the previous batch.
cut = int(len(seqs) * 0.8)
train_ds = group_chunks(seqs[:cut], bs)
valid_ds = group_chunks(seqs[cut:], bs)
# No shuffling (order matters for the carried hidden state); incomplete final
# batches are dropped.
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=True, shuffle=False)

In [27]:
# Train the stateful model for 10 epochs.
# NOTE(review): `ModelResetter` is presumably the fastai callback that calls the
# model's `reset()` around training/validation -- confirm in the fastai docs.
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.687411,1.877732,0.386779,00:02
1,1.277564,1.683533,0.459615,00:02
2,1.096524,1.599628,0.52524,00:02
3,1.014883,1.528335,0.530048,00:02
4,0.952005,1.49778,0.53726,00:02
5,0.913662,1.68124,0.569471,00:03
6,0.859758,1.675047,0.558413,00:02
7,0.814109,1.834198,0.586538,00:03
8,0.782421,1.808951,0.582212,00:02
9,0.763415,1.828485,0.585577,00:03


## more signal

predict every single next word, instead of every fourth

In [28]:
# Sequence length: every sample is `sl` input tokens plus, as the target, the
# same window shifted by one token (one prediction per input position).
sl = 16
seqs = L(
    (tensor(nums[start:start+sl]), tensor(nums[start+1:start+sl+1]))
    for start in range(0, len(nums)-sl-1, sl)
)
cut = int(len(seqs) * 0.8)
train_ds = group_chunks(seqs[:cut], bs)
valid_ds = group_chunks(seqs[cut:], bs)
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=True, shuffle=False)

In [29]:
# Decode the first (input, target) pair back to words; as the output shows,
# the target is the input shifted by one token.
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [30]:
class LMModel4(Module):
    """Stateful recurrent model that emits a prediction after every input token.

    vocab_sz: number of tokens (embedding rows and output scores)
    n_hidden: size of the embeddings and of the hidden state
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        outs = []
        # Take the number of steps from the batch itself (x is indexed as
        # x[:, position]) instead of the notebook-level `sl`, so the model does
        # not silently depend on a global defined in another cell.
        for i in range(x.shape[1]):
            self.h = self.h + self.i_h(x[:, i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        # Truncated backpropagation through time: keep the state's values for
        # the next batch but cut its gradient history.
        self.h = self.h.detach()
        # One set of vocabulary scores per position, stacked along dim 1.
        return torch.stack(outs, dim=1)

    def reset(self): self.h = 0

In [31]:
def loss_func(inp, targ):
    """Cross-entropy over every position of every sequence in the batch.

    inp:  scores whose last dimension is the number of classes; all leading
          dimensions are flattened into one.
    targ: integer class ids with one entry per flattened row of `inp`.

    The class count is read from `inp` itself rather than from the notebook
    global `vocab`, and `reshape` is used so non-contiguous inputs also work.
    """
    return F.cross_entropy(inp.reshape(-1, inp.size(-1)), targ.reshape(-1))

In [33]:
# Train the per-token model with the flattening `loss_func` defined above.
# NOTE(review): this call requests 15 epochs, but the recorded output table
# below stops at epoch 9 (10 rows) -- the saved output looks stale; re-run.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.249174,3.068954,0.22819,00:01
1,2.394985,2.041021,0.454508,00:01
2,1.755099,1.838777,0.463298,00:01
3,1.435703,1.811959,0.511963,00:01
4,1.220498,1.737558,0.513265,00:01
5,1.075709,1.654432,0.565023,00:01
6,0.939703,1.748627,0.561605,00:01
7,0.847741,1.782233,0.574056,00:01
8,0.770066,1.833525,0.595052,00:01
9,0.716484,1.945457,0.611735,00:01


## Multilayer RNNs

stack multiple recurrent neural networks

In [34]:
class LMModel5(Module):
    """Multi-layer RNN language model built on `nn.RNN`, with the hidden state carried across batches.

    vocab_sz: number of tokens (embedding rows and output scores)
    n_hidden: size of the embeddings and of every RNN layer
    n_layers: number of stacked RNN layers
    """
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Initial hidden state, sized with the notebook-level batch size `bs`.
        # It is a plain attribute (not a parameter or buffer), so moving the
        # model to another device does not move it.
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        emb = self.i_h(x)
        # Bring the carried state onto the device of the embedded batch; this
        # is a no-op when both already live on the same device.
        res, h = self.rnn(emb, self.h.to(emb.device))
        # Keep the values for the next batch, drop the gradient history.
        self.h = h.detach()
        return self.h_o(res)

    def reset(self): self.h.zero_()

In [35]:
# Train the 2-layer RNN model (hidden size 64) with fastai's CrossEntropyLossFlat.
# NOTE(review): this call requests 15 epochs, but the recorded output table
# below stops at epoch 9 (10 rows) -- the saved output looks stale; re-run.
learn = Learner(dls, LMModel5(len(vocab), 64, 2), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.086636,2.633717,0.402507,00:01
1,2.176643,1.821965,0.471436,00:01
2,1.714541,1.986462,0.281169,00:01
3,1.476746,1.849811,0.472738,00:01
4,1.298071,2.022116,0.507812,00:01
5,1.142964,2.208432,0.503825,00:01
6,1.022992,2.299367,0.526123,00:01
7,0.930766,2.517967,0.536784,00:01
8,0.85606,2.68123,0.543376,00:01
9,0.792491,2.73723,0.552246,00:01


# long short-term memory

In addition to the hidden state, which focuses on predicting the next token, we have a cell state, which memorizes everything that came earlier in the sentence before that token.

In [36]:
class LMModel6(Module):
    """Multi-layer LSTM language model built on `nn.LSTM`, with its state carried across batches.

    vocab_sz: number of tokens (embedding rows and output scores)
    n_hidden: size of the embeddings and of every LSTM layer
    n_layers: number of stacked LSTM layers
    """
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Two state tensors (passed to nn.LSTM as its initial state), sized
        # with the notebook-level batch size `bs`. They are plain attributes,
        # so moving the model to another device does not move them.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        emb = self.i_h(x)
        # Bring the carried state onto the device of the embedded batch; this
        # is a no-op when everything already lives on the same device.
        state = [h_.to(emb.device) for h_ in self.h]
        res, h = self.rnn(emb, state)
        # Keep the values for the next batch, drop the gradient history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        for h in self.h: h.zero_()

In [38]:
# Train the 2-layer LSTM model (hidden size 64), learning-rate argument 1e-2.
# NOTE(review): this call requests 15 epochs, but the recorded output table
# below stops at epoch 9 (10 rows) -- the saved output looks stale; re-run.
learn = Learner(dls, LMModel6(len(vocab), 64, 2), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.026617,2.753782,0.164144,00:02
1,2.215143,2.069492,0.250081,00:02
2,1.646841,1.849758,0.473307,00:02
3,1.396861,1.996956,0.493408,00:02
4,1.167894,1.965482,0.525309,00:02
5,0.928181,1.939916,0.550944,00:02
6,0.690009,1.466943,0.631266,00:02
7,0.515068,1.607659,0.679199,00:02
8,0.371089,1.266821,0.719971,00:02
9,0.277229,1.403179,0.725667,00:02


## Regularizing an LSTM

regularization to reduce overfitting

### Dropout

Randomly set some activations to zero. This prevents extreme specialisation of individual neurons: all neurons have to learn "everything".

In [55]:
class Dropout(Module):
    """Inverted dropout: zero each activation with probability `p` during training.

    Surviving activations are divided by (1 - p) so their expected value is
    unchanged; outside training the input is returned untouched.

    Raises ValueError if `p` is not within [0, 1].
    """
    def __init__(self, p):
        if not 0 <= p <= 1:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, x):
        # `self.training` is toggled by train()/eval(); the Learner switches it
        # during its training and validation phases.
        if not self.training: return x
        keep = 1 - self.p
        # p == 1 drops everything; handle it explicitly, because the rescaling
        # below would otherwise compute 0/0 and fill the output with NaN.
        if keep == 0: return x * 0
        # Mask with x's shape, dtype and device: 1 with probability `keep`.
        mask = torch.empty_like(x).bernoulli_(keep)
        return x * mask.div_(keep)

In [60]:
class LMModel7(Module):
    """LSTM language model with dropout applied to the LSTM output before the final linear layer.

    vocab_sz: number of tokens (embedding rows and output scores)
    n_hidden: size of the embeddings and of every LSTM layer
    n_layers: number of stacked LSTM layers
    p:        dropout probability handed to `Dropout`
    """
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.dropout = Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Two state tensors (passed to nn.LSTM as its initial state), sized
        # with the notebook-level batch size `bs`. They are plain attributes,
        # so moving the model to another device does not move them.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        emb = self.i_h(x)
        # Bring the carried state onto the device of the embedded batch; this
        # is a no-op when everything already lives on the same device.
        state = [h_.to(emb.device) for h_ in self.h]
        res, h = self.rnn(emb, state)
        # Dropout on the LSTM output, before it reaches the final layer.
        drop = self.dropout(res)
        # Keep the state's values for the next batch, drop the gradient history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(drop)

    def reset(self):
        for h in self.h: h.zero_()

In [61]:
# Train the LSTM + dropout model (hidden size 64, 2 layers, dropout p=0.5).
# NOTE(review): this call requests 15 epochs, but the recorded output table
# below stops at epoch 9 (10 rows) -- the saved output looks stale; re-run.
learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.054723,2.699915,0.302734,00:02
1,2.226563,1.839455,0.404297,00:02
2,1.640824,1.765492,0.492188,00:02
3,1.3483,2.108709,0.490641,00:02
4,1.188088,2.258959,0.532552,00:02
5,1.054639,2.262479,0.586507,00:02
6,0.92447,2.068452,0.627848,00:02
7,0.787528,2.284938,0.664469,00:02
8,0.638388,2.193993,0.686849,00:02
9,0.523764,2.311509,0.69401,00:02
