### Import Section

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

###  Data setup

In [3]:
# Root directory for every dataset and model artefact used in this notebook.
PATH = 'data/'

# Training / validation sub-directories, relative to PATH.
TRN_PATH = 'wiki_train/train_90_10/'
VAL_PATH = 'wiki_valid/valid_90_10/'

# Same directories with the PATH prefix, as used by the shell cells.
TRN = PATH + TRN_PATH
VAL = PATH + VAL_PATH

# List the contents of the data directory.
%ls {PATH}

[0m[01;34maclImdb[0m/                    [01;34mkk_train[0m/  [01;34mmodels[0m/  wiki_en.txt  [01;34mwiki_valid[0m/
cleaned-extra-kk-jokes.txt  [01;34mkk_valid[0m/  [01;34mtmp[0m/     [01;34mwiki_train[0m/


In [4]:
# Capture the listing of the training directory as a Python list
# (`-U` asks GNU ls not to sort the entries), then display it.
trn_files = !ls -U {TRN}
trn_files

['wiki_train_90pct.txt']

In [5]:
# Capture the output of `head` on the first training file and
# display its first line as a quick sanity check of the format.
review = !head {TRN}{trn_files[0]}
review[0]

'<Article>'

In [None]:
!find {TRN} -name '*.txt' | xargs cat | wc -w

In [None]:
!find {VAL} -name '*.txt' | xargs cat | wc -w

### Creating the model

In [6]:
# Load the spaCy pipeline registered under the 'en' shortcut name.
# NOTE(review): this rebinds `spacy_tok` at notebook level; if one of the star
# imports above already provides a `spacy_tok`, it is shadowed from here on — confirm.
spacy_tok = spacy.load('en')

In [7]:
# torchtext field describing how raw text is processed:
# lower-cased and tokenized with the 'spacy' tokenizer.
TEXT = data.Field(lower=True, tokenize='spacy')

In [8]:
bs = 64    # batch size
bptt = 70  # value handed to the data loader as `bptt` (back-prop-through-time length)

In [None]:
# Locations of the three splits relative to PATH; the validation
# directory is reused as the test split.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

# Build the language-model data object from the text files.
md = LanguageModelData.from_text_files(
    PATH, TEXT,
    bs=bs, bptt=bptt, min_freq=1,
    **FILES
)

In [None]:
# Persist the TEXT field with dill (imported as `pickle`) so it can be reloaded later.
# The context manager guarantees the file is flushed and closed, even if pickling fails.
with open(f'{PATH}models/TEXT_wiki.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

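The next cell reports, in order: the number of batches in the training data loader, the number of unique tokens in the vocabulary, the number of examples in the training dataset, and the number of tokens in the first training example.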

In [None]:
# Summary statistics of the language-model data.
# Each label names exactly what the expression next to it measures.
# NOTE(review): the dataset presumably stores the whole corpus as one long example,
# in which case the last figure is the total training-token count — confirm.
print(f'no. of batches: {len(md.trn_dl)}\n'
      f'unique tokens: {md.nt}\n'
      f'examples in training set: {len(md.trn_ds)}\n'
      f'tokens in first training example: {len(md.trn_ds[0].text)}')

### Training

In [None]:
# Model size hyper-parameters:
#   em_sz - size of each embedding vector
#   nh    - number of hidden activations per layer
#   nl    - number of layers
em_sz, nh, nl = 200, 500, 3

In [None]:
# Optimizer factory: Adam with non-default beta coefficients.
adam_betas = (0.7, 0.99)
opt_fn = partial(optim.Adam, betas=adam_betas)

In [None]:
# Dropout settings forwarded to the model constructor.
dropouts = dict(dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)

# Build the learner from the data object and the size hyper-parameters.
learner = md.get_model(opt_fn, em_sz, nh, nl, **dropouts)

# Regularisation function and clip value used during training.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip = 0.3

In [None]:
# Resume from the previously saved encoder 'adam3_20_enc'.
# NOTE(review): this checkpoint is only written by a later cell of this notebook,
# so on a fresh run this line needs the file from an earlier session.
learner.load_encoder('adam3_20_enc')

In [None]:
# Train with learning rate 3e-3 for 4 cycles.
# NOTE(review): in fastai 0.7, cycle_len=1 with cycle_mult=2 presumably doubles the
# length of each successive cycle (1, 2, 4, 8 epochs) and wds is the weight decay — confirm.
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

In [None]:
# Save the encoder under the name 'adam1_enc'.
learner.save_encoder('adam1_enc')

In [None]:
# Reload the encoder saved as 'adam1_enc' (entry point when resuming from this stage).
learner.load_encoder('adam1_enc')

In [None]:
# Load the weights stored for cycle 2 under the cycle name 'adam3_10'.
# NOTE(review): no visible cell passes a cycle-save name to fit(), so this file
# presumably comes from an earlier session — confirm it exists before running.
learner.load_cycle('adam3_10',2)

In [None]:
# Train with learning rate 3e-3 for a single cycle with cycle_len=10.
# NOTE(review): cycle_len is presumably the cycle length in epochs — confirm.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

In [None]:
# Save the encoder under the name 'adam3_10_enc'.
learner.save_encoder('adam3_10_enc')

In [None]:
# Train with learning rate 3e-3 for a single cycle with cycle_len=20.
# NOTE(review): cycle_len is presumably the cycle length in epochs — confirm.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=20)

In [None]:
# Save the encoder under the name 'adam3_20_enc'
# (the checkpoint that the first load_encoder cell of the Training section reads).
learner.save_encoder('adam3_20_enc')

In [None]:
# Reload the encoder saved as 'adam3_20_enc' (entry point when resuming from this stage).
learner.load_encoder('adam3_20_enc')

In [None]:
# Persist the TEXT field with dill (imported as `pickle`) next to the saved models.
# The context manager guarantees the file is flushed and closed, even if pickling fails.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

### Testing

In [None]:
m = learner.model

# Seed text the model will be asked to continue.
ss = """. So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""

# Run the seed through TEXT's own preprocessing so it is tokenized and lower-cased
# the same way as the field is configured, and so the tokens are plain strings.
# Calling the spaCy pipeline object directly yields Token objects, which
# ' '.join() rejects and which do not match the string keys of the vocabulary.
s = [TEXT.preprocess(ss)]

# Map the tokens to vocabulary indices.
t = TEXT.numericalize(s)

# Display the tokenized seed.
' '.join(s[0])

In [None]:
# Set batch size to 1 for this single-sequence forward pass
m[0].bs = 1
try:
    # Turn off dropout
    m.eval()
    # Reset hidden state
    m.reset()
    # Get predictions from model
    res, *_ = m(t)
finally:
    # Put the batch size back to what it was, even if the forward pass failed,
    # so a later training cell is not left running with bs=1
    m[0].bs = bs

In [None]:
# Indices of the 10 highest-scoring entries in the last row of the output,
# displayed as their vocabulary strings.
nexts = res[-1].topk(10)[1]
[TEXT.vocab.itos[idx] for idx in to_np(nexts)]

In [None]:
# Greedy generation: print the seed, then 50 predicted tokens.
print(ss,"\n")
for step in range(50):
    # Two best candidate indices for the next token.
    top2 = res[-1].topk(2)[1]
    # Fall back to the runner-up when the best candidate is index 0
    # (presumably the unknown-token slot of the vocabulary — confirm).
    if top2.data[0]==0:
        choice = top2[1]
    else:
        choice = top2[0]
    print(TEXT.vocab.itos[choice.data[0]], end=' ')
    # Feed the chosen token back in to obtain the next prediction.
    res,*_ = m(choice[0].unsqueeze(0))
print('...')

# Archive

In [None]:
# Build the learner from the objects this notebook actually defines
# (`md`, `opt_fn`, `em_sz`, `nh`, `nl`). The long-form names used here before
# (`model_data`, `optimization_function`, ...) are not bound by any visible cell,
# so the cell failed with NameError on a fresh kernel.
learner = md.get_model(opt_fn,  # optimizer factory
                       em_sz,   # size of each embedding vector
                       nh,      # hidden activations per layer
                       nl,      # number of layers
                       dropouti=0.05,
                       dropout=0.05,
                       wdrop=0.1,
                       dropoute=0.02,
                       dropouth=0.05)

In [None]:
# Attach the regularisation function: seq2seq_reg with alpha=2 and beta=1 pre-bound.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [None]:
# Clipping threshold applied during training.
# NOTE(review): in fastai 0.7 `clip` bounds the gradient norm, not the learning rate — confirm.
learner.clip=0.2

In [None]:
# Train with learning rate 3e-4 for 4 cycles.
# NOTE(review): cycle_len=1 with cycle_mult=2 presumably doubles the length of each
# successive cycle, and wds is the weight decay — confirm.
learner.fit(3e-4, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

In [None]:
# Train with learning rate 3e-3 for a single cycle with cycle_len=10.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

In [None]:
# Shorthand for the model object held by the learner.
model = learner.model

In [None]:
# Seed text handed to the model in the cells that follow.
ss = "m"

In [None]:
# Tokenize the seed through the field's full preprocessing rather than its bare
# tokenizer, so the field's lower-casing is applied as well and seeds containing
# upper-case letters are treated the same way as the field is configured.
s = [TEXT.preprocess(ss)]

# Map the tokens to vocabulary indices.
t = TEXT.numericalize(s)

# Display the tokenized seed.
' '.join(s[0])

In [None]:
# Single-sequence forward pass: temporarily run the first module with batch size 1.
model[0].bs = 1
try:
    # Switch to evaluation mode and clear the recurrent state before predicting.
    model.eval()
    model.reset()
    res, *_ = model(t)
finally:
    # Restore the training batch size even if the forward pass failed,
    # so later cells are not left running with bs=1.
    model[0].bs = bs

In [None]:
# Indices of the 10 highest-scoring entries in the last row of the output,
# displayed as their vocabulary strings.
next_words = res[-1].topk(10)[1]
[TEXT.vocab.itos[idx] for idx in to_np(next_words)]

In [None]:
# Greedy generation: print the seed, then 300 predicted tokens.
print(ss,"\n")
for step in range(300):
    # Two best candidate indices for the next token.
    top2 = res[-1].topk(2)[1]
    # Fall back to the runner-up when the best candidate is index 0
    # (presumably the unknown-token slot of the vocabulary — confirm).
    if top2.data[0]==0:
        choice = top2[1]
    else:
        choice = top2[0]
    print(TEXT.vocab.itos[choice.data[0]], end=' ')
    # Feed the chosen token back in to obtain the next prediction.
    res,*_ = model(choice[0].unsqueeze(0))
print('...')