In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

  from numpy.core.umath_tests import inner1d


## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='/data/nietzsche/'

In [3]:
ls $PATH

nietzsche.txt


In [4]:
# get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

corpus length: 600893


In [5]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [7]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [8]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [9]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

In [11]:
text[:70]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [100]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

In [101]:
[i  for i in range(0, len(idx)-cs, cs)][:10]

[0, 3, 6, 9, 12, 15, 18, 21, 24, 27]

In [102]:
[i+1  for i in range(0, len(idx)-cs, cs)][:10]

[1, 4, 7, 10, 13, 16, 19, 22, 25, 28]

In [103]:
[i+2  for i in range(0, len(idx)-cs, cs)][:10]

[2, 5, 8, 11, 14, 17, 20, 23, 26, 29]

In [104]:
[i+3  for i in range(0, len(idx)-cs, cs)][:10]

[3, 6, 9, 12, 15, 18, 21, 24, 27, 30]

In [105]:
[i+4  for i in range(0, len(idx)-cs, cs)][:10]

[4, 7, 10, 13, 16, 19, 22, 25, 28, 31]

Our inputs

In [106]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

In [107]:
c1_dat[:10]

[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]

In [108]:
x1[:10]

array([40, 30, 29,  1, 40, 43, 31, 61,  2, 74])

In [109]:
x1.shape

(200297,)

Our output

In [110]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [111]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [112]:
y[:4]

array([30, 29,  1, 40])

In [113]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [114]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [115]:
n_fac = 42

There is no relu in pytorch rnn or keras, only tanh.

In [116]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [117]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [118]:
np.stack([x1,x2,x3], axis=1).shape

(200297, 3)

In [119]:
m = Char3Model(vocab_size, n_fac).cuda()

In [120]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [121]:
type(xs), len(xs), [o.shape for o in xs]

(list, 3, [torch.Size([512]), torch.Size([512]), torch.Size([512])])

In [125]:
[x[100] for x in xs]

[2, 68, 71]

In [128]:
for i in range(0, 100):
    print("".join(indices_char[x[i]] for x in xs) + "---" + indices_char[yt[i]])

 in---t
n w---o
ct ---i
 mu---c
at ---t
r c---o
 af---f
bad--- 
ego---i
ly ---o
ccu---r
t t---o
leu---r
 co---n
era---l
 re---a
sci---e
usi---c
 wo---r
eac---h
old--- 
avy--- 
 be---l
nt ---o
 fa---n
, o---r
 pl---e
o a---r
he ---q
PRE----
an ---o
nts--- 

Ag---a
ain---t
adi---e
par---t
e f---i
f p---r
uch--- 
 in---f
gia---n
res---e
 Pe---t
c m---a
rse---l
ble--- 
hom--- 
ess---"
ami---l
rly---

se ---e
ave--- 
e
s---y
the--- 
tua---l
rka---b
ng ---h
one--- 
oth---e
s I---?
o p---r
tha---t
, h---o
rta---n
s f---o
lly--- 
ize--- 
wor---t
 an---d
eek---i
 th---e
one--- 
t y---e
s a---r
eas---i
, a---n
wor---l
est---,
nsw---e
big---,
the--- 
he ---b

ge---t
eca---u
hic---h
igh---t
ers---t
wou---l
thi---s
lon---g
f n---e
he ---h
at ---t
ien---c
a m---a
 in--- 
 th---e
ain--- 
wak---e
omm---u


In [41]:
opt = optim.Adam(m.parameters(), 1e-2)

In [42]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.089328   1.550326  



[array([1.55033])]

In [43]:
set_lrs(opt, 0.001)

In [44]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.823994   0.450279  



[array([0.45028])]

### Test model

In [45]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [46]:
get_next('y. ')

'T'

In [49]:
inp='y. '
idxs = T(np.array([char_indices[c] for c in inp]))
idxs


 78
 10
  2
[torch.cuda.LongTensor of size 3 (GPU 0)]

In [52]:
VV(idxs)

Variable containing:
 78
 10
  2
[torch.cuda.LongTensor of size 3 (GPU 0)]

In [50]:
p = m(*VV(idxs))

In [54]:
p.shape

torch.Size([1, 85])

In [55]:
i = np.argmax(to_np(p))
chars[i]

'T'

In [56]:
get_next('ppl')

'e'

In [57]:
get_next(' th')

'e'

In [58]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [409]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [410]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [411]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [412]:
xs = np.stack(c_in_dat, axis=0)

cs is bptt

In [413]:
xs.shape

(600885, 8)

In [414]:
y = np.stack(c_out_dat)

In [415]:
y.shape

(600885,)

So each column below is one series of 8 characters from the text.

In [416]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [417]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

In [418]:
xs[100:110]

array([[62, 65, 68, 72, 68, 69, 61, 58],
       [65, 68, 72, 68, 69, 61, 58, 71],
       [68, 72, 68, 69, 61, 58, 71, 72],
       [72, 68, 69, 61, 58, 71, 72,  8],
       [68, 69, 61, 58, 71, 72,  8,  2],
       [69, 61, 58, 71, 72,  8,  2, 62],
       [61, 58, 71, 72,  8,  2, 62, 67],
       [58, 71, 72,  8,  2, 62, 67,  2],
       [71, 72,  8,  2, 62, 67,  2, 72],
       [72,  8,  2, 62, 67,  2, 72, 68]])

In [419]:
y[100:110]

array([71, 72,  8,  2, 62, 67,  2, 72, 68,  2])

In [420]:
for i in range(100, 110):
    print("".join(indices_char[j] for j in xs[i]) + "---" + indices_char[y[i]])

ilosophe---r
losopher---s
osophers---,
sophers,--- 
ophers, ---i
phers, i---n
hers, in--- 
ers, in ---s
rs, in s---o
s, in so--- 


### Create and train model

In [421]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [422]:
len(idx)-cs-1, xs.shape

(600884, (600885, 8))

In [423]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [424]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [425]:
len(xs), yt.shape

(8, torch.Size([512]))

In [426]:
[o.shape for o in xs]

[torch.Size([512]),
 torch.Size([512]),
 torch.Size([512]),
 torch.Size([512]),
 torch.Size([512]),
 torch.Size([512]),
 torch.Size([512]),
 torch.Size([512])]

In [427]:
for i in range(20):
    print("".join(indices_char[o[i]] for o in xs) + "-" + indices_char[yt[i]])

-god
Epi-c
s out
of- 
ditional-n
 have to- 
pias of -g
 the pro-b
mortal d-o
absolve -G
 called -g
1. The s-e
seback e-x
learly p-e
 it has -b
y
makes -o
peoples,-

sequent -u
nture th-e
 best, a-r
 certain- 
nized th-e


In [428]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        print("len(cs) is bptt:", len(cs))
        bs = cs[0].size(0)
        print("bs:", bs)
        h = V(torch.zeros(bs, n_hidden).cuda())
        c = cs[0]
        
        print("c.shape:", c.shape)
        print("self.e(c).shape:", self.e(c).shape)
        inp = F.relu(self.l_in(self.e(c)))
        print("inp.shape:", inp.shape)
        h = F.tanh(self.l_hidden(h+inp))
        print("h.shape:", h.shape)

        
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        print("self.l_out(h).shape:", self.l_out(h).shape)
        print("F.log_softmax(self.l_out(h), dim=-1).shape:", F.log_softmax(self.l_out(h), dim=-1).shape)
        
        return F.log_softmax(self.l_out(h), dim=-1)

m = CharLoopModel(vocab_size, n_fac).cuda()    

it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

len(cs) is bptt: 8
bs: 512
c.shape: torch.Size([512])
self.e(c).shape: torch.Size([512, 42])
inp.shape: torch.Size([512, 256])
h.shape: torch.Size([512, 256])
self.l_out(h).shape: torch.Size([512, 85])
F.log_softmax(self.l_out(h), dim=-1).shape: torch.Size([512, 85])


In [429]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [430]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [431]:
m

CharLoopModel(
  (e): Embedding(85, 42)
  (l_in): Linear(in_features=42, out_features=256, bias=True)
  (l_hidden): Linear(in_features=256, out_features=256, bias=True)
  (l_out): Linear(in_features=256, out_features=85, bias=True)
)

In [237]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      2.01374    2.040762  


[array([2.04076])]

In [238]:
set_lrs(opt, 0.001)

In [239]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.729271   1.736789  


[array([1.73679])]

In [243]:
# demo code
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        print("len(cs) is bptt:", len(cs))
        bs = cs[0].size(0)
        print("bs:", bs)
        h = V(torch.zeros(bs, n_hidden).cuda())
        
        c = cs[0]
        print("c.shape:", c.shape)
        print("self.e(c).shape:", self.e(c).shape)
        print("torch.cat((h, self.e(c)), 1).shape:", torch.cat((h, self.e(c)), 1).shape)
        inp = torch.cat((h, self.e(c)), 1)
        inp = F.relu(self.l_in(inp))
        h = F.tanh(self.l_hidden(inp))
        print("h.shape:", h.shape)

        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

m = CharLoopConcatModel(vocab_size, n_fac).cuda()

it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

len(cs) is bptt: 8
bs: 512
c.shape: torch.Size([512])
self.e(c).shape: torch.Size([512, 42])
torch.cat((h, self.e(c)), 1).shape: torch.Size([512, 298])
h.shape: torch.Size([512, 256])


In [244]:
256+42

298

In [245]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [246]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [247]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [248]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.808648   1.787715  


[array([1.78772])]

In [249]:
set_lrs(opt, 1e-4)

In [250]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.696378   1.703465  


[array([1.70346])]

### Test model

In [251]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [252]:
get_next('for thos')

'e'

In [253]:
get_next('part of ')

't'

In [254]:
get_next('queens a')

'n'

## RNN with pytorch

In [444]:
# demo code
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        print(np.allclose(outp[-1].cpu().data.numpy(), h.cpu().data.numpy()))
        print("outp.shape, h.shape:", outp.shape, h.shape)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

    
m = CharRnn(vocab_size, n_fac).cuda()
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

True
outp.shape, h.shape: torch.Size([8, 512, 256]) torch.Size([1, 512, 256])


In [445]:
m

CharRnn(
  (e): Embedding(85, 42)
  (rnn): RNN(42, 256)
  (l_out): Linear(in_features=256, out_features=85, bias=True)
)

In [432]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [433]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [434]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [435]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [436]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [260]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [261]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.879015   1.840871  
    1      1.662778   1.66416                                
    2      1.58984    1.588653                               
    3      1.534141   1.54491                                


[array([1.54491])]

In [262]:
set_lrs(opt, 1e-4)

In [263]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.462728   1.505766  
    1      1.450971   1.500196                               


[array([1.5002])]

### Test model

In [269]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [270]:
get_next('for thos')

'e'

In [271]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [272]:
get_next_n('for thos', 40)

'for those in the same to the same to the same to'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [273]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [274]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [275]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [276]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [277]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [278]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

In [290]:
for i in range(10):
    print("".join(indices_char[j] for j in xs[i]))
    print("".join(indices_char[j] for j in ys[i]))
    print("*"*10)

PREFACE

REFACE


**********


SUPPOS

SUPPOSI
**********
ING that
NG that 
**********
 Truth i
Truth is
**********
s a woma
 a woman
**********
n--what 
--what t
**********
then? Is
hen? Is 
**********
 there n
there no
**********
ot groun
t ground
**********
d
for su

for sus
**********


### Create and train model

In [291]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [292]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [332]:
# demo code
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        print("bs:", bs)
        print("len(cs), cs[0].shap:", len(cs), cs[0].shape)
        h = V(torch.zeros(1, bs, n_hidden))
        print("h.shape:", h.shape)
        inp = self.e(torch.stack(cs))
        print("inp.shape:", inp.shape)
        outp,h = self.rnn(inp, h)
        print("outp.shape:", outp.shape)
        print("self.l_out(outp).shape:", self.l_out(outp).shape)
        print("F.log_softmax(self.l_out(outp), dim=-1).shape:", F.log_softmax(self.l_out(outp), dim=-1).shape)
        return F.log_softmax(self.l_out(outp), dim=-1)

m = CharSeqRnn(vocab_size, n_fac).cuda()

it = iter(md.trn_dl)
*xst,yt = next(it)
t = m(*V(xst))

bs: 512
len(cs), cs[0].shap: 8 torch.Size([512])
h.shape: torch.Size([1, 512, 256])
inp.shape: torch.Size([8, 512, 42])
outp.shape: torch.Size([8, 512, 256])
self.l_out(outp).shape: torch.Size([8, 512, 85])
F.log_softmax(self.l_out(outp), dim=-1).shape: torch.Size([8, 512, 85])


In [329]:
# demo code
def nll_loss_seq(inp, targ):
    print("inp.size():", inp.size())
    sl,bs,nh = inp.size()
    print("targ.size():", targ.size())
    targ = targ.transpose(0,1).contiguous().view(-1)
    print("targ.size() after:", targ.size())
    print("inp.view(-1,nh).size():", inp.view(-1,nh).size())
    return F.nll_loss(inp.view(-1,nh), targ)

nll_loss_seq(t, V(yt))

inp.size(): torch.Size([8, 512, 85])
targ.size(): torch.Size([512, 8])
targ.size() after: torch.Size([4096])
inp.view(-1,nh).size(): torch.Size([4096, 85])


Variable containing:
 4.4672
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [310]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [311]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [312]:
m

CharSeqRnn(
  (e): Embedding(85, 42)
  (rnn): RNN(42, 256)
  (l_out): Linear(in_features=256, out_features=85, bias=True)
)

In [313]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [314]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [315]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.61235    2.418312  
    1      2.298683   2.204691                              
    2      2.14505    2.08883                               
    3      2.046877   2.016112                              


[array([2.01611])]

In [316]:
set_lrs(opt, 1e-4)

In [317]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.994879   1.999039  


[array([1.99904])]

### Identity init!

In [318]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [319]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [320]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.348198   2.183635  
    1      2.090558   2.03315                               
    2      1.989041   1.969637                             
    3      1.9366     1.934515                              


[array([1.93451])]

In [321]:
set_lrs(opt, 1e-3)

In [322]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.84497    1.860609  
    1      1.82984    1.853267                              
    2      1.823908   1.846842                              
    3      1.816839   1.841488                              


[array([1.84149])]

## Stateful model

### Setup

In [452]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='/data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# Note: The student needs to practice her shell skills and prepare her own dataset before proceeding:
# - trn/trn.txt (first 80% of nietzsche.txt)
# - val/val.txt (last 20% of nietzsche.txt)

%ls {PATH}

nietzsche.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


In [453]:
!wc -l {PATH}/nietzsche.txt

9934 /data/nietzsche//nietzsche.txt


In [454]:
!mkdir $TRN

!mkdir $VAL

mkdir: cannot create directory ‘/data/nietzsche/trn/’: File exists
mkdir: cannot create directory ‘/data/nietzsche/val/’: File exists


In [456]:
!(head  -n 8000 {PATH}/nietzsche.txt) > {TRN}/trn.txt

In [457]:
!(tail  -n 2000 {PATH}/nietzsche.txt) > {VAL}/val.txt

In [458]:
!wc -l {TRN}/trn.txt

8000 /data/nietzsche/trn//trn.txt


In [459]:
!wc -l {VAL}/val.txt

1999 /data/nietzsche/val//val.txt


In [460]:
!head {TRN}/trn.txt

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,


In [461]:
!tail {VAL}/val.txt

whole of antiquity swarmed with sons of god--he attained the same goal,
the sense of complete sinlessness, complete irresponsibility, that can
now be attained by every individual through science.--In the same manner
I have viewed the saints of India who occupy an intermediate station
between the christian saints and the Greek philosophers and hence are
not to be regarded as a pure type. Knowledge and science--as far as they
existed--and superiority to the rest of mankind by logical discipline
and training of the intellectual powers were insisted upon by the
Buddhists as essential to sanctity, just as they were denounced by the
christian world as the indications of sinfulness.

In [462]:
%ls {PATH}trn

trn.txt


In [463]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(947, 55, 1, 485751)

In [464]:
bptt*bs*len(md.trn_dl)

484864

In [465]:
TEXT.vocab.itos[:3]

['<unk>', '<pad>', ' ']

### RNN

In [466]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [467]:
len(xst), xst[0].shape, yt.shape

(1, torch.Size([12, 64]), torch.Size([768]))

In [494]:
# demo code
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)

        print("bs:", bs)
        print("cs[0].shape:", cs[0].shape)
        print("self.h.size():", self.h.size())
        if self.h.size(1) != bs: self.init_hidden(bs)
        print("self.h.size():", self.h.size())
        
        outp,h = self.rnn(self.e(cs), self.h)
        print("outp.shape, h.shape:", outp.shape, h.shape)
        self.h = repackage_var(h)
        print("repackage_var(h).shape:", repackage_var(h).shape)
        print("self.l_out(outp).shape:", self.l_out(outp).shape)
        print("F.log_softmax(self.l_out(outp), dim=-1).shape:", F.log_softmax(self.l_out(outp), dim=-1).shape)
        print(F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size).shape)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()        
it = iter(md.trn_dl)

In [495]:
*xst,yt = next(it)

t = m(*V(xst))

bs: 64
cs[0].shape: torch.Size([64])
self.h.size(): torch.Size([1, 512, 256])
self.h.size(): torch.Size([1, 64, 256])
outp.shape, h.shape: torch.Size([6, 64, 256]) torch.Size([1, 64, 256])
repackage_var(h).shape: torch.Size([1, 64, 256])
self.l_out(outp).shape: torch.Size([6, 64, 55])
F.log_softmax(self.l_out(outp), dim=-1).shape: torch.Size([6, 64, 55])
torch.Size([384, 55])


In [496]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [497]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [498]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.870669   1.85842   
    1      1.691095   1.711502                                
    2      1.603655   1.638105                                
    3      1.548151   1.59676                                 


[array([1.59676])]

In [499]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.475907   1.555194  
    1      1.477998   1.550018                                
    2      1.47277    1.545215                                
    3      1.471363   1.541167                                


[array([1.54117])]

### RNN loop

In [500]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [501]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [502]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [503]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.866231   1.857579  
    1      1.681677   1.703645                                
    2      1.598947   1.638115                                
    3      1.548415   1.597791                                


[array([1.59779])]

### GRU

In [504]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [505]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [506]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [507]:
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.736293   1.735454  
    1      1.556567   1.583899                                
    2      1.465984   1.516054                                
    3      1.407965   1.487188                                
    4      1.373922   1.465377                                
    5      1.346191   1.453536                                


[array([1.45354])]

In [508]:
set_lrs(opt, 1e-4)

In [509]:
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.266002   1.421961  
    1      1.264239   1.418461                                
    2      1.261855   1.417242                                


[array([1.41724])]

### Putting it all together: LSTM

In [510]:
from fastai import sgdr

n_hidden=512

In [511]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [512]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [513]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [514]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.790431   1.722828  
    1      1.685614   1.637427                                


[array([1.63743])]

In [515]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.508704   1.475089  
    1      1.563436   1.516339                                
    2      1.426324   1.42031                                 
    3      1.579777   1.539666                                
    4      1.499894   1.470937                                
    5      1.412511   1.405306                                
    6      1.342619   1.368411                                
    7      1.572835   1.52915                                 
    8      1.522518   1.488928                                
    9      1.493546   1.473597                                
    10     1.443223   1.444669                                
    11     1.403843   1.400079                                
    12     1.350915   1.371567                                
    13     1.307978   1.343778                                
    14     1.27355    1.331145                                


[array([1.33114])]

In [516]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.269403   1.328722  
    1      1.264966   1.326942                                
    2      1.260983   1.325817                                
    3      1.25931    1.324691                                
    4      1.253125   1.322111                                
    5      1.250396   1.320514                                
    6      1.241133   1.32024                                 
    7      1.241134   1.320437                                
    8      1.242356   1.318128                                
    9      1.234068   1.316596                                
    10     1.232381   1.314807                                
    11     1.223086   1.314373                                
    12     1.21831    1.313571                                
    13     1.21211    1.313018                                
    14     1.210101   1.31283                                 
    15     1.216055   

[array([1.34261])]

### Test

In [517]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [518]:
get_next('for thos')

'e'

In [519]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [520]:
print(get_next_n('for thos', 400))

for those supposed up and diefunderstowadays andone.--"moral angel," or in being brought of their own p-in fancies,healthy--as thow, my point as we tensitude and intercourse, deceptives it--why?--do in question, this _mean of metaphysical instincture for opinion; we formerlyle, critical souch being outside of the peoplize thehold may have too "thought, missingulation, would make afterwards to me. it is, a
