### Import Section

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

  (fname, cnt))
  (fname, cnt))


###  Data setup

In [2]:
PATH='data/aclImdb/'

TRN_PATH = 'train/all/'
VAL_PATH = 'test/all/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

imdbEr.txt  imdb.vocab  [0m[01;34mmodels[0m/  README  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


In [3]:
trn_files = !ls -U {TRN} | head -10
trn_files

['15905_0.txt',
 '43567_0.txt',
 '38960_0.txt',
 '270_10.txt',
 '41328_0.txt',
 '31443_0.txt',
 '1626_0.txt',
 '49868_0.txt',
 '9939_1.txt',
 '6442_1.txt']

In [4]:
review = !cat {TRN}{trn_files[6]}
review[0]

'A simple but gripping story about a robbery gone haywire and the people who get caught up in the good vs. evil mess, Crime Wave is a stone-cold cool cops and robbers film noir from André De Toth. Set in a grim and menacing depiction of Los Angeles, Crime Wave is a dark, brutal, stylish, and ultimately unique film within the genre.<br /><br />The entire cast is solid, Gene Nelson gives a very good performance as ex con Steve Lacey and Charles Bronson (credited as Charles Buchinsky) gives a new meaning to the word "sleazebag" with his performance as baddie Ben Hastings - but Sterling Hayden grabs the most attention throughout the film. As L.A. Detective Sims, Hayden does just as much stealing as the bad guys he\'s trying to track down do - stealing every scene he\'s in that is. Bringing a high-speed delivery, certain likability, and an unyielding toughness (it looks like he could leave a guy in a roughed up clutter by just looking at him) to this edgy and no-nonsense cop ("once a crook 

In [5]:
!find {TRN} -name '*.txt' | xargs cat | wc -w

17486581


In [6]:
!find {VAL} -name '*.txt' | xargs cat | wc -w

5686719


### Creating the model

In [7]:
spacy_tok = spacy.load('en')

In [8]:
TEXT = data.Field(lower=True, tokenize="spacy")

In [9]:
bs=64; bptt=70

In [10]:
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)

In [11]:
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))

batches; unique tokens in the vocab; tokens in the training set; sentences

In [12]:
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(4583, 37392, 1, 20540756)

### Training

In [13]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [14]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [15]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [16]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

epoch      trn_loss   val_loss                                
    0      4.887469   4.785148  
    1      4.655075   4.550943                                
    2      4.550405   4.467352                                
    3      4.595144   4.485721                                
    4      4.511596   4.409963                                
    5      4.437769   4.354779                                
    6      4.406035   4.337164                                
    7      4.533539   4.425644                                
    8      4.487822   4.395535                                
    9      4.458847   4.366721                                
    10     4.422964   4.335066                                
    11     4.381823   4.306395                                
    12     4.343195   4.281677                                
    13     4.313741   4.268547                                
    14     4.333699   4.265585                                



[4.2655854]

In [17]:
learner.save_encoder('adam1_enc')

In [18]:
learner.load_encoder('adam1_enc')

In [None]:
learner.load_cycle('adam3_10',2)

In [19]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

epoch      trn_loss   val_loss                                
    0      4.469402   4.377753  
    1      4.462979   4.366992                                
    2      4.451687   4.347831                                
    3      4.407734   4.324759                                
    4      4.381709   4.300958                                
    5      4.34871    4.277249                                
    6      4.346461   4.260454                                
    7      4.312004   4.243355                                
    8      4.266161   4.235327                                
    9      4.26993    4.233541                                



[4.2335405]

In [20]:
learner.save_encoder('adam3_10_enc')

In [21]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=20)

epoch      trn_loss   val_loss                                
    0      4.30121    4.235869  
    1      4.455959   4.347098                                
    2      4.434215   4.342544                                
    3      4.424483   4.331704                                
    4      4.417456   4.324742                                
    5      4.417522   4.317687                                
    6      4.401334   4.307899                                
 51%|█████     | 2329/4583 [10:44<10:24,  3.61it/s, loss=4.4] 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    7      4.368776   4.293603                                
    8      4.361104   4.28061                                 
    9      4.347784   4.269474                                
    10     4.339543   4.26056                                 
    11     4.31276    4.249115                                
    12     4.301177   4.237602                                
    13     4.277916   4.228312                                
    14     4.284327   4.221485                                
    15     4.250942   4.213447                                
    16     4.274412   4.210318                                
    17     4.245044   4.205905                                
    18     4.227175   4.204106                                
    19     4.234197   4.203295                                



[4.2032948]

In [22]:
learner.save_encoder('adam3_20_enc')

In [23]:
learner.load_encoder('adam3_20_enc')

In [24]:
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))

### Testing

In [25]:
m=learner.model
ss=""". So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
s = [spacy_tok(ss)]
t=TEXT.numericalize(s)
' '.join(s[0])

TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found

In [None]:
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs

In [None]:
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

In [None]:
print(ss,"\n")
for i in range(50):
    n=res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res,*_ = m(n[0].unsqueeze(0))
print('...')

# Archive

In [21]:
learner = model_data.get_model(optimization_function, 
                               embedding_matrix_vector_size,
                               hidden_activations_per_layer,
                               number_of_layers,
                               dropouti=0.05, 
                               dropout=0.05, 
                               wdrop=0.1, 
                               dropoute=0.02, 
                               dropouth=0.05)

In [22]:
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [23]:
# This clips the learning rate
learner.clip=0.2

In [24]:
learner.fit(3e-4, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

epoch      trn_loss   val_loss                              
    0      2.820423   2.662502  
    1      2.104295   1.877877                              
    2      1.782074   1.751191                              
    3      1.624566   1.567082                              
    4      1.495123   1.4979                                
    5      1.425235   1.461224                              
    6      1.404931   1.433978                              
    7      1.391117   1.431324                              
    8      1.340597   1.385715                              
    9      1.29383    1.347896                              
    10     1.264139   1.322773                              
    11     1.229856   1.28737                               
    12     1.189815   1.266234                              
    13     1.198576   1.269404                              
    14     1.187142   1.279893                              



[1.2798933]

In [25]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

epoch      trn_loss   val_loss                              
    0      1.326785   1.422977  
    1      1.2476     1.270615                              
    2      1.079133   1.213807                              
    3      0.960059   1.187449                               
    4      0.808017   1.185401                               
    5      0.675018   1.195614                               
    6      0.585171   1.211999                               
    7      0.477929   1.238316                               
    8      0.41318    1.260869                               
    9      0.455005   1.259753                               



[1.2597526]

In [26]:
model = learner.model

In [42]:
ss = "m"

In [43]:
s = [TEXT.tokenize(ss)]
t = TEXT.numericalize(s)
' '.join(s[0])

'm'

In [44]:
model[0].bs = 1
model.eval()
model.reset()
res, *_ = model(t)
model[0].bs = bs

In [45]:
next_words = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(next_words)]

['e', 'a', 'i', 'y', 'o', ' ', '!', 'u', 's', 'b']

In [49]:
print(ss,"\n")
for i in range(300):
    n=res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res,*_ = model(n[0].unsqueeze(0))
print('...')

Variable containing:
 26
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
 

k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w h o ' s   t h e r e !   b e n   h u r !   b e n   h u r   w h o ?   b e n   h u r   y o u   a   p o o l i n e !   <eos> k n o c k ,   k n o c k .   w ...
