In [22]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *
from fastai.text import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

In [23]:
# Root of the dataset; the split sub-directories below are relative to it.
PATH = 'data/BS/'

# Training split: relative sub-directory and its full path.
TRN_PATH = 'train/zero/'
TRN = f'{PATH}{TRN_PATH}'

# Validation split: relative sub-directory and its full path.
VAL_PATH = 'test/zero/'
VAL = f'{PATH}{VAL_PATH}'

# List the contents of the dataset root directory.
%ls {PATH}

[0m[01;34mmodels[0m/  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


In [24]:
# Capture the names of the entries in the training directory (IPython shell capture)
# and display at most the first ten.
trn_files = !ls {TRN}
trn_files[:10]

['cleanedespntrain.txt']

In [25]:
# Read the first training file through the shell; `review` holds the captured output
# line by line, and the first line is displayed as a sample.
review = !cat {TRN}{trn_files[0]}
review[0]

'If the rumors are true and Doc Rivers gets a contract extension soon, I just hope the Celtics tell the truth in the ensuing news conference. Don\'t feed us lines like "Thanks to Doc, our young players made major strides this season," or "We would have made the playoffs if we didn\'t have so many injuries." That\'s a load of crap. Maybe those aren\'t lies, but they seem like fibs along the lines of "I don\'t think those jeans make you look fat at all, they\'re just the wrong size" and "I wasn\'t checking her out, I thought I recognized her."'

In [26]:
# Total whitespace-separated word count over every .txt file under the training directory.
!find {TRN} -name '*.txt' | xargs cat | wc -w

907856


In [27]:
# Total whitespace-separated word count over every .txt file under the validation directory.
!find {VAL} -name '*.txt' | xargs cat | wc -w

143059


In [28]:
# Load spaCy's English pipeline; the next cell uses it to preview tokenization.
spacy_tok = spacy.load('en')

In [29]:
# Tokenize the first line with spaCy and re-join the tokens with single spaces
# so the token boundaries are visible.
' '.join(tok.string.strip() for tok in spacy_tok(review[0]))

'If the rumors are true and Doc Rivers gets a contract extension soon , I just hope the Celtics tell the truth in the ensuing news conference . Do n\'t feed us lines like " Thanks to Doc , our young players made major strides this season , " or " We would have made the playoffs if we did n\'t have so many injuries . " That \'s a load of crap . Maybe those are n\'t lies , but they seem like fibs along the lines of " I do n\'t think those jeans make you look fat at all , they \'re just the wrong size " and " I was n\'t checking her out , I thought I recognized her . "'

In [30]:
# torchtext field describing how raw text is processed: spaCy tokenization, lower-cased.
TEXT = data.Field(tokenize="spacy", lower=True)

In [31]:
bs = 64     # sequences per mini-batch
bptt = 70   # tokens per sequence chunk (back-prop-through-time window)

In [32]:
# Sub-directory (relative to PATH) for each split.
# NOTE(review): 'test' points at the same directory as 'validation'.
FILES = {
    'train': TRN_PATH,
    'validation': VAL_PATH,
    'test': VAL_PATH,
}

# Build the language-model data object from the text files.
# min_freq=10 is forwarded as-is (presumably the vocabulary frequency cut-off — confirm).
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    **FILES,
    bs=bs,
    bptt=bptt,
    min_freq=10,
)

In [33]:
# Persist the TEXT field so it can be reloaded later without re-reading the corpus.
# The context manager guarantees the file is flushed and closed; the original left the
# handle returned by open() dangling.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [34]:
# Quick size summary of the training data.
(
    len(md.trn_dl),           # number of batches in the training loader
    md.nt,                    # presumably the vocabulary size — TODO confirm
    len(md.trn_ds),           # number of examples in the training dataset
    len(md.trn_ds[0].text),   # token count of the first example
)

(249, 6341, 1, 1121144)

In [35]:
# 'itos' ('int-to-string'): sequence mapping a token id to its token; show the first 12 entries.
TEXT.vocab.itos[:12]

['<unk>', '<pad>', ',', 'the', '.', 'a', '"', 'and', 'to', 'i', "'s", 'of']

In [36]:
# 'stoi' ('string-to-int'): the reverse lookup, from a token to its integer id.
TEXT.vocab.stoi['the']

3

In [37]:
# First 12 tokens of the first example in the training dataset.
md.trn_ds[0].text[:12]

['if',
 'the',
 'rumors',
 'are',
 'true',
 'and',
 'doc',
 'rivers',
 'gets',
 'a',
 'contract',
 'extension']

In [38]:
# Map those same 12 tokens to their integer ids (passed as a batch holding one sequence).
TEXT.numericalize([md.trn_ds[0].text[:12]])

Variable containing:
   47
    3
 4320
   68
  657
    7
  754
 1231
  258
    5
  482
 2327
[torch.cuda.LongTensor of size 12x1 (GPU 0)]

In [39]:
# Pull a single batch from the training loader to inspect what it yields.
next(iter(md.trn_dl))

(Variable containing:
    47   206   134  ...    423     0  2121
     3    11   423  ...   1038     0   296
  4320   414    37  ...     18     2     4
        ...          ⋱          ...       
    14    56  3135  ...    235  1836  5284
    10   453   286  ...   2439  4107    12
     5    27    26  ...   1067     4     3
 [torch.cuda.LongTensor of size 70x64 (GPU 0)], Variable containing:
     3
    11
   423
   ⋮  
  1116
     0
  1869
 [torch.cuda.LongTensor of size 4480 (GPU 0)])

In [40]:
# Model dimensions
em_sz = 200   # embedding vector length
nh = 500      # hidden activations in each layer
nl = 3        # layer count

# Optimizer factory: Adam with non-default betas
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

# Dropout settings, forwarded to get_model as keyword arguments
dropouts = dict(dropouti=0.10, dropout=0.10, wdrop=0.2, dropoute=0.04, dropouth=0.1)
learner = md.get_model(opt_fn, em_sz, nh, nl, **dropouts)

# Regularization hook and clip value (presumably gradient clipping — confirm)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip = 0.3

In [41]:
# Train with restarts: 4 cycles, the first 1 epoch long and each subsequent one twice as
# long (1 + 2 + 4 + 8 = 15 epochs in total).
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      6.009518   5.899045  
    1      5.53205    5.350553                              
    2      5.310353   5.227189                              
    3      5.05761    4.907541                              
    4      4.8122     4.718965                              
    5      4.694429   4.639041                              
    6      4.640957   4.619134                              
    7      4.63852    4.558                                 
    8      4.517836   4.471914                              
    9      4.420044   4.41234                               
    10     4.355363   4.361744                              
    11     4.282019   4.330043                              
    12     4.240814   4.311236                              
    13     4.215526   4.302768                              
    14     4.203316   4.30257                               


[array([4.30257])]

In [53]:
# Continue training with a single long cycle of up to 100 epochs.
# NOTE(review): the recorded run was stopped by hand (KeyboardInterrupt) after epoch 15.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=100)

HBox(children=(IntProgress(value=0, description='Epoch'), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      3.789214   4.156009  
    1      3.777508   4.15216                               
    2      3.771651   4.149001                              
    3      3.742007   4.149719                              
    4      3.738685   4.143847                              
    5      3.829049   4.153617                              
    6      3.830148   4.146432                              
    7      3.787044   4.151213                              
    8      3.780343   4.150245                              
    9      3.768592   4.153553                              
    10     3.747787   4.155258                              
    11     3.756009   4.13811                               
    12     3.733103   4.147625                              
    13     3.722461   4.152918                              
    14     3.724051   4.147812                              
    15     3.720592   4.141964                      

KeyboardInterrupt: 

In [43]:
# Checkpoint the learner under the name 'notransferBS'.
# NOTE(review): this cell's execution count (43) is lower than that of the long fit
# above (53), so the saved file may predate that training — re-run to keep the later weights.
learner.save('notransferBS')

In [54]:
# Underlying model of the learner, called directly in the cells that follow.
m = learner.model

# Seed text for generation.
ss="""we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way"""

# Tokenize the seed with the same field used for the corpus, then map tokens to ids.
tokens = TEXT.preprocess(ss)
s = [tokens]
t = TEXT.numericalize(s)

# Show the tokenized seed.
' '.join(tokens)

'we had everything before us , we had nothing before us , we were all going direct to heaven , we were all going direct the other way'

In [55]:
# Run the seed text through the model as a single sequence to obtain next-token scores.
# Set batch size to 1 (`t` holds exactly one sequence)
m[0].bs=1
try:
    # Turn off dropout
    m.eval()
    # Reset hidden state
    m.reset()
    # Get predictions from model
    res,*_ = m(t)
finally:
    # Put the batch size back to what it was — in a `finally` so that a failed
    # forward pass cannot leave the model stuck at batch size 1.
    m[0].bs=bs

In [56]:
# Ids of the 10 highest-scoring entries in the last row of the model output,
# translated back to tokens.
nexts = res[-1].topk(10)[1]
[TEXT.vocab.itos[idx] for idx in to_np(nexts)]

['.', 'to', '--', ',', 'and', ':', 'around', 'i', 'as', 'we']

In [57]:
# Echo the seed text, then sample 100 further tokens from the model, one at a time.
print(ss,"\n")
for i in range(100):
    # Draw two candidate ids, weighted by the exponentiated scores of the last output row.
    n = torch.multinomial(res[-1].exp(), 2)
    # Id 0 is '<unk>' in this vocabulary: if it was drawn first, take the second draw instead.
    if n.data[0] == 0:
        n = n[1]
    else:
        n = n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back into the model to get scores for the next position.
    res,*_ = m(n[0].unsqueeze(0))
print('...')

we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way 

but emerging by 25 cents on the dollar of acting ? of course . let 's hope he has a lead into the water ... not even my father . i do n't care , i 'm the walter smith tape . the this guy is version of jennifer expletive , that she does n't know the college goodbye treatment , but i shoe him with her fantasy team . i ca n't have a piece in his camera cbs or espn the magazine , and 7 ) what was the fake mtv scene , which includes a. lewis , ...
