### Import Section

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

  (fname, cnt))
  (fname, cnt))


###  Data setup

In [7]:
# Root directory of the corpus; the paths below are relative to it.
PATH='data/'

# Directories of .txt files for each split, relative to PATH.
# Both end in '/' so a bare filename can be appended directly
# (f'{TRN}{name}' / f'{VAL}{name}').
TRN_PATH = 'wiki_train/train_files/'
VAL_PATH = 'wiki_valid/validation_files/'
# Full paths to the training and validation directories.
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# List the contents of the data root directory.
%ls {PATH}

wiki_en.txt  wiki_train/  wiki_valid/


In [10]:
# Capture the first 10 entries of the training directory (unsorted, `-U`)
# as a list of file names.
trn_files = !ls -U {TRN} | head -10
# Display the captured names.
trn_files

['train_128374.txt',
 'train_1424683.txt',
 'train_1256497.txt',
 'train_962445.txt',
 'train_2127595.txt',
 'train_1783703.txt',
 'train_1908863.txt',
 'train_1790997.txt',
 'train_561716.txt',
 'train_1421562.txt']

In [11]:
# Read one training file (entry 6 of `trn_files`); the shell output is
# captured as a list of lines.
article = !cat {TRN}{trn_files[6]}
# Show the first line of that file.
article[0]

'ignacio martín esperanza tejada born june is spanish retired footballer who played as forward and is former manager playing career born in verín ourense galicia martín esperanza finished his formation at real madrid and made his senior debuts while on loan at cp la felguera in he scored goals for the club which narrowly avoided relegation and was assigned to the former reserves in the following year martín esperanza was also loaned to ca ceuta and in signed for la liga real betis with luis del sol moving in the opposite direction he made his debut in the competition on september playing the full minutes in home win against rcd mallorca and scored his first goal on december netting the last in success at granada cf martín esperanza struggled with injuries during his spell with the andalusians and joined pontevedra cf in the summer he was regularly used by the club and eventually retired in at the age of he was also called up to the national team in december being only an unused substit

In [12]:
# Word count (`wc -w`) over the concatenation of every .txt file under TRN.
# NOTE(review): the recorded output is `^C`, i.e. this run was interrupted
# before it produced a count.
!find {TRN} -name '*.txt' | xargs cat | wc -w

^C


In [13]:
# Word count (`wc -w`) over the concatenation of every .txt file under VAL.
# NOTE(review): the recorded output is `^C`, i.e. this run was interrupted
# before it produced a count.
!find {VAL} -name '*.txt' | xargs cat | wc -w

^C


### Creating the model

In [14]:
# Load spaCy's 'en' pipeline and bind it to `spacy_tok`.
# NOTE(review): if the star imports above also provide a `spacy_tok`, this
# assignment shadows it, and calling `spacy_tok(text)` then returns spaCy
# token objects rather than plain strings — confirm this is intended.
spacy_tok = spacy.load('en')

In [16]:
# Text field describing how raw text is turned into tokens:
# torchtext's built-in "spacy" tokenizer, with lowercasing enabled.
TEXT = data.Field(
    tokenize="spacy",
    lower=True,
)

In [17]:
# Batch size, and `bptt` (presumably the back-prop-through-time sequence
# length — both are passed by name to the data loader below).
bs = 64
bptt = 70

In [None]:
# Directory of each split, relative to PATH. The test split reuses the
# validation directory.
FILES = {
    'train': TRN_PATH,
    'validation': VAL_PATH,
    'test': VAL_PATH,
}
# Build the language-model data object from the text files of each split.
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    bs=bs,
    bptt=bptt,
    min_freq=10,
    **FILES,
)

In [None]:
# Persist the TEXT field object so it can be reloaded later. The context
# manager guarantees the file is flushed and closed once the dump finishes
# (the bare open() call left the handle open).
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

Number of batches; number of unique tokens in the vocab; number of items in the training dataset; number of tokens in the first training item.

In [None]:
# Summary of the training data, displayed as a single tuple.
(
    len(md.trn_dl),          # number of batches in the training loader
    md.nt,                   # number of unique tokens in the vocab
    len(md.trn_ds),          # number of items in the training dataset
    len(md.trn_ds[0].text),  # number of tokens in the first item's text
)

### Training

In [None]:
# Model dimensions, assigned together.
em_sz, nh, nl = (
    200,  # size of each embedding vector
    500,  # number of hidden activations per layer
    3,    # number of layers
)

In [None]:
# Optimizer factory: Adam with its betas fixed to (0.7, 0.99); every other
# argument is supplied by whoever calls `opt_fn`.
opt_fn = partial(
    optim.Adam,
    betas=(0.7, 0.99),
)

In [None]:
# Build the learner from the data object, using the optimizer factory and
# model sizes chosen above; one dropout setting per line.
learner = md.get_model(
    opt_fn, em_sz, nh, nl,
    dropouti=0.05,
    dropout=0.05,
    wdrop=0.1,
    dropoute=0.02,
    dropouth=0.05,
)
# Regularisation hook: seq2seq_reg with alpha=2 and beta=1 pre-bound.
learner.reg_fn = partial(
    seq2seq_reg,
    alpha=2,
    beta=1,
)
# Clipping threshold (presumably applied to gradients — confirm in fastai).
learner.clip = 0.3

In [None]:
# Train at learning rate 3e-3 for 4 cycles with cycle_len=1 and cycle_mult=2.
# The Archive run with the same cycle settings logged 15 epochs (1+2+4+8).
learner.fit(
    3e-3,
    4,
    wds=1e-6,
    cycle_len=1,
    cycle_mult=2,
)

In [None]:
# Save the encoder part of the model under the name 'adam1_enc'.
learner.save_encoder('adam1_enc')

In [None]:
# Reload the encoder saved as 'adam1_enc'.
learner.load_encoder('adam1_enc')

In [None]:
# Load the checkpoint named 'adam3_10' for cycle 2.
# NOTE(review): no visible cell saves per-cycle checkpoints under this name,
# so this presumably relies on files from an earlier session — confirm.
learner.load_cycle('adam3_10',2)

In [None]:
# One further cycle of length 10 at learning rate 3e-3.
learner.fit(
    3e-3,
    1,
    wds=1e-6,
    cycle_len=10,
)

In [None]:
# Save the encoder part of the model under the name 'adam3_10_enc'.
learner.save_encoder('adam3_10_enc')

In [None]:
# Save the encoder part of the model under the name 'adam3_20_enc'.
# NOTE(review): no training cell is visible between the 'adam3_10_enc' save
# and this one, so run top to bottom both files hold the same weights —
# a training cell may have been removed; confirm.
learner.save_encoder('adam3_20_enc')

In [None]:
# Reload the encoder saved as 'adam3_20_enc'.
learner.load_encoder('adam3_20_enc')

In [None]:
# Persist the TEXT field object again after training. The context manager
# guarantees the file is flushed and closed once the dump finishes
# (the bare open() call left the handle open).
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

### Testing

In [None]:
# Model under test and the prompt it should continue.
m=learner.model
ss=""". So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
# Run the prompt through the TEXT field's own preprocessing so it is
# tokenized and lowercased exactly as the field was configured.
# `spacy_tok` is bound to the loaded spaCy pipeline in this notebook, so
# calling it would yield spaCy token objects instead of strings and the
# ' '.join below would raise a TypeError.
s = [TEXT.preprocess(ss)]
# Convert the token strings to vocabulary indices.
t=TEXT.numericalize(s)
# Show the tokens the model will actually see.
' '.join(s[0])

In [None]:
# Run the prompt through the model as a single sequence.
# Set batch size to 1
m[0].bs=1
try:
    # Turn off dropout
    m.eval()
    # Reset hidden state
    m.reset()
    # Get predictions from model
    res,*_ = m(t)
finally:
    # Put the batch size back to what it was, even if the forward pass
    # raised, so later training cells do not inherit bs=1
    m[0].bs=bs

In [None]:
# Indices of the 10 highest-scoring entries in the last row of `res`.
nexts = res[-1].topk(10)[1]
# Map those indices back to vocabulary strings for display.
[TEXT.vocab.itos[idx] for idx in to_np(nexts)]

In [None]:
# Greedy continuation: echo the prompt, then emit 50 tokens one at a time.
print(ss,"\n")
for i in range(50):
    # Indices of the two best candidates for the next token.
    n=res[-1].topk(2)[1]
    # If the best candidate is vocabulary index 0 (presumably the unknown
    # token — confirm), fall back to the runner-up.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back into the model to get the next prediction.
    res,*_ = m(n[0].unsqueeze(0))
print('...')

# Archive

In [21]:
# Archived run: build the learner with the same dropout settings as the main
# section, but using long descriptive variable names.
# NOTE(review): `model_data`, `optimization_function`,
# `embedding_matrix_vector_size`, `hidden_activations_per_layer` and
# `number_of_layers` are not defined in any visible cell (the main section
# uses `md`, `opt_fn`, `em_sz`, `nh`, `nl`), so this cell presumably depends
# on cells that were removed — confirm before re-running.
learner = model_data.get_model(optimization_function, 
                               embedding_matrix_vector_size,
                               hidden_activations_per_layer,
                               number_of_layers,
                               dropouti=0.05, 
                               dropout=0.05, 
                               wdrop=0.1, 
                               dropoute=0.02, 
                               dropouth=0.05)

In [22]:
# Regularisation hook: seq2seq_reg with alpha=2 and beta=1 pre-bound
# (same values as in the main Training section).
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [23]:
# Clipping threshold used during training. In fastai this clips the
# gradients, not the learning rate. (The main Training section uses 0.3.)
learner.clip=0.2

In [24]:
# 4 cycles with cycle_len=1 and cycle_mult=2 at learning rate 3e-4;
# the recorded log below has 15 epochs (1+2+4+8).
learner.fit(
    3e-4,
    4,
    wds=1e-6,
    cycle_len=1,
    cycle_mult=2,
)

epoch      trn_loss   val_loss                              
    0      2.820423   2.662502  
    1      2.104295   1.877877                              
    2      1.782074   1.751191                              
    3      1.624566   1.567082                              
    4      1.495123   1.4979                                
    5      1.425235   1.461224                              
    6      1.404931   1.433978                              
    7      1.391117   1.431324                              
    8      1.340597   1.385715                              
    9      1.29383    1.347896                              
    10     1.264139   1.322773                              
    11     1.229856   1.28737                               
    12     1.189815   1.266234                              
    13     1.198576   1.269404                              
    14     1.187142   1.279893                              



[1.2798933]

In [25]:
# One cycle of length 10 at learning rate 3e-3. In the recorded log the
# validation loss bottoms out around epoch 4 and rises afterwards while the
# training loss keeps falling.
learner.fit(
    3e-3,
    1,
    wds=1e-6,
    cycle_len=10,
)

epoch      trn_loss   val_loss                              
    0      1.326785   1.422977  
    1      1.2476     1.270615                              
    2      1.079133   1.213807                              
    3      0.960059   1.187449                               
    4      0.808017   1.185401                               
    5      0.675018   1.195614                               
    6      0.585171   1.211999                               
    7      0.477929   1.238316                               
    8      0.41318    1.260869                               
    9      0.455005   1.259753                               



[1.2597526]

In [26]:
# Keep a direct handle on the learner's underlying model for the cells below.
model = learner.model

In [42]:
# Seed text for generation (a single character in this archived run).
ss = "m"

In [43]:
# Tokenize the seed with the TEXT field's tokenizer (a batch of one sequence).
s = [TEXT.tokenize(ss)]
# Convert the tokens to vocabulary indices.
t = TEXT.numericalize(s)
# Show the tokens the model will see.
' '.join(s[0])

'm'

In [44]:
# Run the seed through the model as a single sequence.
# Temporarily switch the first module to batch size 1.
model[0].bs = 1
try:
    # Evaluation mode, fresh hidden state, then one forward pass.
    model.eval()
    model.reset()
    res, *_ = model(t)
finally:
    # Restore the training batch size even if the forward pass raised,
    # so later cells do not inherit bs=1.
    model[0].bs = bs

In [45]:
# Indices of the 10 highest-scoring entries in the last row of `res`.
next_words = res[-1].topk(10)[1]
# Map those indices back to vocabulary strings for display.
[TEXT.vocab.itos[idx] for idx in to_np(next_words)]

['e', 'a', 'i', 'y', 'o', ' ', '!', 'u', 's', 'b']

In [49]:
# Greedy continuation: echo the seed, then emit 300 tokens one at a time.
print(ss,"\n")
for i in range(300):
    # Indices of the two best candidates for the next token.
    n=res[-1].topk(2)[1]
    # If the best candidate is vocabulary index 0 (presumably the unknown
    # token — confirm), fall back to the runner-up.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back into the model to get the next prediction.
    res,*_ = model(n[0].unsqueeze(0))
print('...')

Variable containing:
 26
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
 

k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w ...
