In [36]:
from fastai.learner import *
from fastai.column_data import *

# 1. Setup

In [11]:
PATH = 'data/nietzsche/nietzsche.txt'
# Decode explicitly as UTF-8. Without `encoding`, open() uses the platform's
# locale encoding, so any non-ASCII characters in the corpus could be
# mis-decoded (or raise) on machines whose default is not UTF-8.
with open(PATH, encoding='utf-8') as f:
    text = f.read()
# Quick sanity check: total character count and the first 400 characters.
print(f'Corpus length: {len(text)}')
print(text[:400])

Corpus length: 600893
PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself 


In [15]:
# Vocabulary: every distinct character of the corpus in sorted order, with the
# NUL character '\0' placed at index 0 ahead of them.
chars = ['\0'] + sorted(set(text))
vocab_size = len(chars)
print(f'Vocab size: {len(chars)}')
# Show the vocabulary, leaving out the NUL entry and the last five characters.
''.join(chars[1:-5])

Vocab size: 85


'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [16]:
# Two-way lookup between characters and their integer position in `chars`.
chars_idx = dict(zip(chars, range(len(chars))))   # character -> index
idx_chars = dict(enumerate(chars))                # index -> character

In [24]:
# Encode the whole corpus as a list of vocabulary indices.
text_idx = list(map(chars_idx.__getitem__, text))
head = text_idx[:10]
print(head)
# Decode the same ten indices back to text as a round-trip check.
''.join([idx_chars[k] for k in head])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]


'PREFACE\n\n\n'

# 2. 3 char model 

In [125]:
cs = 3
# Walk the corpus in non-overlapping steps of `cs`. For every start position,
# offsets 0-2 give the three input characters and offset 3 gives the target.
starts = range(0, len(text_idx)-cs, cs)
c1_dat, c2_dat, c3_dat, c4_dat = (
    [text_idx[start + offset] for start in starts] for offset in range(4)
)

In [126]:
# Turn the three input-character lists into 1-D NumPy arrays.
x1, x2, x3 = (np.stack(col) for col in (c1_dat, c2_dat, c3_dat))

In [127]:
# Target array: the character index that follows each 3-character input.
y = np.stack(c4_dat, axis=0)

In [128]:
x1[:4],x2[:4],x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [129]:
y[:4]

array([30, 29,  1, 40])

In [130]:
x1.shape,y.shape

((200297,), (200297,))

In [131]:
n_hidden = 256   # width of the hidden state
n_factor = 42    # size of each character embedding

class Char3Module(nn.Module):
    """Predict the next character from exactly three preceding characters.

    Each input character is embedded and passed through a shared input layer;
    the hidden state is then updated once per character with a shared
    hidden-to-hidden layer before being projected onto the vocabulary.

    Note the constructor's argument order: hidden width first, then embedding
    size. The vocabulary size is read from the notebook-level `vocab_size`.
    """
    def __init__(self,n_hidden,n_factor):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_factor)
        self.l_in = nn.Linear(n_factor, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the next character.

        c1, c2, c3 are tensors of character indices (one per position).
        """
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Start from an all-zero hidden state on the same device as the
        # activations, instead of unconditionally calling .cuda() (which
        # fails on a CPU-only machine).
        h = torch.zeros(in1.size())
        if in1.is_cuda:
            h = h.cuda()
        h = V(h)
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))

        # Normalise over the vocabulary axis explicitly; relying on the
        # implicit dimension is deprecated.
        return F.log_softmax(self.l_out(h), dim=-1)

# Build the 3-character model and move it to the GPU (requires CUDA).
m = Char3Module(n_hidden, n_factor).cuda()

In [133]:
x1.shape

(200297,)

In [134]:
# NOTE(review): validation indices are drawn for one row fewer than the data
# holds, so the final row presumably can never be picked for validation -
# confirm whether the "-1" is intentional.
n_rows = x1.shape[0]
cv_idx = get_cv_idxs(n_rows-1)
# One row per sample, one column per input character.
xs_3char = np.stack([x1, x2, x3], axis=1)
md = ColumnarModelData.from_arrays('.', cv_idx, xs_3char, y, bs=512)

In [135]:
# Smoke test: pull a single training batch and run it through the model.
batch = next(iter(md.trn_dl))
*xt, yt = batch   # every element but the last is an input; the last is the target
t = m(*V(xt))
t

Variable containing:
-4.3381 -4.4526 -4.4978  ...  -4.3454 -4.5140 -4.2106
-4.2977 -4.5310 -4.6003  ...  -4.4774 -4.6172 -4.5308
-4.4218 -4.6433 -4.4595  ...  -4.3908 -4.6186 -4.4428
          ...             ⋱             ...          
-4.6660 -4.6238 -4.2899  ...  -4.3191 -4.7133 -4.4596
-4.2288 -4.5025 -4.5492  ...  -4.3752 -4.3550 -4.4148
-4.4222 -4.7158 -4.5393  ...  -4.6131 -4.3682 -4.5438
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [136]:
# Adam over all model parameters with a learning rate of 1e-2.
opt = optim.Adam(m.parameters(), lr=1e-2)

In [137]:
# Train the 3-character model (the `1` is presumably the epoch count; the
# output below shows a single epoch) with negative log-likelihood loss.
fit(m,md,1,opt,F.nll_loss)

epoch      trn_loss   val_loss                                                                                         
    0      2.138028   2.096862  



[2.0968616]

## Test 3 char model

In [138]:
def get_next(inp):
    """Return the character the current global model `m` predicts after `inp`.

    Every character of `inp` is looked up in `chars_idx` (a character that is
    not in the vocabulary raises KeyError) and the resulting indices are passed
    to `m` as separate positional arguments, so `inp` must contain as many
    characters as the model's forward() accepts. The prediction is the
    vocabulary entry at the argmax of the model output.
    """
    codes = np.array([chars_idx[ch] for ch in inp])
    log_probs = m(*VV(T(codes)))
    best = np.argmax(to_np(log_probs))
    return idx_chars[best]

In [139]:
get_next('ppl')

'e'

In [140]:
get_next('and')

' '

In [141]:
get_next('tha')

't'

# 3. Char loop model

In [110]:
cs = 8

# Overlapping windows: row j holds the `cs` consecutive character indices
# starting at position j, and its target is the character right after them.
n_windows = len(text_idx) - cs
inp_dat = [text_idx[start:start+cs] for start in range(n_windows)]
out_dat = text_idx[cs:]

X = np.stack(inp_dat)
# Note: this rebinds `y`, which the 3-character section also uses.
y = np.stack(out_dat)

X,y

(array([[40, 42, 29, ..., 27, 29,  1],
        [42, 29, 30, ..., 29,  1,  1],
        [29, 30, 25, ...,  1,  1,  1],
        ...,
        [72, 62, 67, ..., 65, 67, 58],
        [62, 67, 59, ..., 67, 58, 72],
        [67, 59, 74, ..., 58, 72, 72]]), array([ 1,  1, 43, ..., 72, 72, 10]))

In [111]:
X.shape,y.shape

((600885, 8), (600885,))

In [113]:
# NOTE(review): X has len(text_idx)-cs rows but indices are drawn for one
# fewer, so the final row presumably never lands in validation - confirm
# whether the extra "-1" is intentional.
n_windows = len(text_idx) - cs
cv_idx = get_cv_idxs(n_windows-1)
md = ColumnarModelData.from_arrays('.', cv_idx, X, y, bs=512)

In [115]:
n_hidden = 256   # width of the hidden state
n_factor = 42    # size of each character embedding

class CharLoopModule(nn.Module):
    """Predict the next character from any number of preceding characters.

    The same embedding, input layer and hidden-to-hidden layer are applied to
    every input character in a loop; the final hidden state is projected onto
    the vocabulary. The vocabulary size is read from the notebook-level
    `vocab_size`.
    """
    def __init__(self, n_factor, n_hidden):
        super().__init__()
        # Keep the hidden width on the instance so forward() sizes its state
        # from the constructor argument rather than from a notebook global.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size,n_factor)
        self.l_in = nn.Linear(n_factor,n_hidden)
        self.l_h = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities over the vocabulary for the next character.

        cs holds one tensor of character indices per time step; the first
        dimension of each tensor is the batch.
        """
        bs = cs[0].size(0)

        # Zero initial hidden state, placed on the same device as the inputs
        # instead of unconditionally calling .cuda() (which fails on CPU).
        h = torch.zeros(bs, self.n_hidden)
        if cs[0].is_cuda:
            h = h.cuda()
        h = V(h)

        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_h(h+inp))

        # Normalise over the vocabulary axis explicitly; relying on the
        # implicit dimension is deprecated.
        return F.log_softmax(self.l_out(h), dim=-1)

# Build the loop model and move it to the GPU (requires CUDA). This rebinds
# `m`, so get_next() uses this model from here on.
m = CharLoopModule(n_factor, n_hidden).cuda()

In [116]:
# Fresh Adam optimizer for the loop model, learning rate 1e-2.
opt = optim.Adam(m.parameters(), lr=1e-2)

In [117]:
# Train the loop model (the `1` is presumably the epoch count; the output
# below shows a single epoch) with negative log-likelihood loss.
fit(m,md,1,opt,F.nll_loss)

epoch      trn_loss   val_loss                                                                                         
    0      2.004841   1.986168  



[1.9861676]

In [118]:
# Continue training with a smaller step: set_lrs presumably sets the learning
# rate of the existing optimizer to 1e-3 (it was created with 1e-2).
set_lrs(opt, 1e-3)
fit(m,md,1,opt,F.nll_loss)

epoch      trn_loss   val_loss                                                                                         
    0      1.706155   1.70856   



[1.70856]

## Test char loop model 

In [122]:
def get_next(inp):
    """Return the character the current global model `m` predicts after `inp`.

    This replaces the identically named helper defined earlier in the
    notebook for the 3-character model. Each character of `inp` is mapped to
    its index via `chars_idx` (unknown characters raise KeyError), the indices
    are passed to `m` as separate positional arguments, and the vocabulary
    entry at the argmax of the model output is returned.
    """
    idx = T(np.array(list(map(chars_idx.__getitem__, inp))))
    out = to_np(m(*VV(idx)))
    return idx_chars[np.argmax(out)]

In [123]:
get_next('for thos')

'e'

In [124]:
def get_next_n(inp, n):
    """Extend `inp` by `n` characters, generated one at a time.

    Each step asks get_next() for the character following the current window,
    then slides the window forward by dropping its first character and
    appending the prediction, so the window length stays constant.
    Returns the original `inp` followed by the generated characters.
    """
    window = inp
    generated = []
    for _ in range(n):
        nxt = get_next(window)
        generated.append(nxt)
        window = window[1:] + nxt
    return inp + ''.join(generated)
get_next_n('for thos',40)

'for those of the sense of the sense of the sense'