# Importing libs & loading data

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

In [23]:
# List the contents of the dataset directory (the same location PATH points to in the next cell).
import os
os.listdir('data/1MDB')

['.ipynb_checkpoints',
 'test',
 'README',
 'imdbEr.txt',
 'imdb.vocab',
 'aclImdb',
 'aclImdb.tgz',
 'models',
 'train',
 'tmp']

In [24]:
# Dataset root, plus the sub-folders (relative to the root) holding the review files.
PATH = 'data/1MDB/'
TRAIN_PATH, VAL_PATH = 'train/all/', 'test/all/'

# Root-prefixed versions of the two folders, used by the shell commands in later cells.
TRAIN = PATH + TRAIN_PATH
VAL = PATH + VAL_PATH

# Shell out to list the dataset root; {PATH} is interpolated from the Python variable.
!ls {PATH}

README	aclImdb  aclImdb.tgz  imdb.vocab  imdbEr.txt  models  test  tmp  train


In [25]:
#See training files that the reviews live in
# `!ls` output is captured as a list of lines (one file name per line).
train_files = !ls {TRAIN}
# Show only the first ten names.
train_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt',
 '10002_0.txt']

In [26]:
#See example review
# `cat` the fourth listed training file; the output is captured as a list of lines.
review = !cat {TRAIN}{train_files[3]}
# First line of that file.
review[0]

'Take a low budget, inexperienced actors doubling as production staff�� as well as limited facilities��and you can\'t expect much more than "Time Chasers" gives you, but you can absolutely expect a lot less. This film represents a bunch of good natured friends and neighbors coming together to collaborate on an interesting project. If your cousin had been one of those involved, you would probably think to yourself, "ok, this movie is terrible... but a really good effort." For all the poorly delivered dialog and ham-fisted editing, "Time Chasers" has great scope and ambition... and one can imagine it was necessary to shoot every scene in only one or two takes. So, I\'m suggesting people cut "Time Chasers" some slack before they cut in the jugular. That said, I\'m not sure I can ever forgive the pseudo-old lady from the grocery store for the worst delivery every wrenched from the jaws of a problematic script.'

In [27]:
#Check how many words are in the training set:
#concatenate every .txt file under TRAIN and count whitespace-separated words (wc -w)
!find {TRAIN} -name '*.txt' | xargs cat | wc -w

17486270


In [None]:
# Same word count, this time over the .txt files under VAL.
!find {VAL} -name '*.txt' | xargs cat | wc -w

# Data pre-processing

In [None]:
#Tokenize the text - i.e. split the review into individual tokens (words, punctuation, ...)
spacy_token = spacy.load('en')
# Iterating the object returned by spacy_token(...) yields tokens, not sentences.
# `.text` is used instead of the deprecated `.string` attribute; after .strip()
# both give the token's own text without surrounding whitespace.
' '.join([token.text.strip() for token in spacy_token(review[0])])

In [None]:
# Batch size, and the `bptt` value handed to the language-model data loader below.
bs, bptt = 32, 70

# torchtext field describing the preprocessing: lower-case the text and tokenize it with spaCy.
TEXT = data.Field(tokenize='spacy', lower=True)

In [43]:
#Build our Model_Data object
# Note: the validation and the test split both read from VAL_PATH.
FILES = {'train': TRAIN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

#after building our ModelData object, TEXT is automatically filled with vocab
model_data = LanguageModelData.from_text_files(
    PATH, TEXT, bs=bs, bptt=bptt, min_freq=10, **FILES)

RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/THCTensorCopy.cu:204

In [None]:
# Persist the TEXT field (and the vocab built into it) so it can be re-loaded later.
# `pickle` here is `dill` (see the imports cell). The `with` block guarantees the
# file is flushed and closed even if serialisation fails.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [None]:
#Checking the ModelData object lengths, in order:
#  len(model_data.trn_dl)         - no. of batches in the training data loader
#  model_data.nt                  - presumably the no. of unique tokens in the vocab (confirm)
#  len(model_data.trn_ds)         - no. of examples in the training dataset
#  len(model_data.trn_ds[0].text) - length of the first example's token list
len(model_data.trn_dl), model_data.nt, len(model_data.trn_ds), len(model_data.trn_ds[0].text)

In [None]:
#see the vocab's int-to-string mapping: first 12 entries of the index -> token list
TEXT.vocab.itos[:12]

In [None]:
#see the string-to-int mapping: look up the index assigned to the token 'the'
TEXT.vocab.stoi['the']

In [None]:
#see that a training dataset example is just a list of words - show the first 12 of example 0
model_data.trn_ds[0].text[:12]

In [None]:
#use torchtext to convert these same 12 words to ints
#(numericalize is given a list of token lists, hence the extra brackets)
TEXT.numericalize([model_data.trn_ds[0].text[:12]])

In [None]:
#fetch the first batch from the training data loader
#notice that we create batches of `bs` columns (bs = 32 above) & varying sequence lengths (around bptt)
#each batch also contains same data but one word later - that is what we're trying to predict
next(iter(model_data.trn_dl))

# Begin Training The Model

In [None]:
# Language-model sizes: embedding width, activations per layer, and number of layers.
embed_size, activations_per_layer, number_of_layers = 200, 500, 3

In [None]:
#Create our own version of the Adam optimizer: Adam with betas fixed to (0.7, 0.99)
optim_fn = partial(optim.Adam, betas=(0.7, 0.99))

#Get a learner wrapping the language model, built from the sizes defined above
#and the dropout settings passed as keywords
learner = model_data.get_model(optim_fn, 
                         embed_size, 
                         activations_per_layer, 
                         number_of_layers,
                        dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)

#partial() takes a function and pre-sets some of its parameters;
#here the regularisation function is seq2seq_reg with alpha=2, beta=1
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

# NOTE(review): in fastai 0.7 `clip` looks like the gradient-clipping threshold,
# not a cap on the learning rate - confirm against the installed version.
learner.clip = 0.3


In [None]:
#Begin training the model
#learning rate 3e-3, 4 cycles, weight decay (wds) 1e-6;
#cycle_len=1 with cycle_mult=2 - presumably each cycle is twice as long as the previous one (confirm)
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

In [None]:
#Save the trained model's encoder under the name 'adam1_enc', then load it straight back
learner.save_encoder('adam1_enc')
learner.load_encoder('adam1_enc')

In [None]:
# Load the weights saved for cycle 2 under the name 'adam3_10'.
# NOTE(review): no fit() call visible in this notebook passes cycle_save_name='adam3_10',
# so this presumably relies on a checkpoint from an earlier session - confirm it exists.
learner.load_cycle('adam3_10', 2)

In [None]:
# Continue training: one more cycle with cycle_len=10, same learning rate and weight decay as before.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

In [None]:
# Save the encoder as 'adam3_20_enc' (the sentiment model below loads this name), then reload it.
learner.save_encoder('adam3_20_enc')
learner.load_encoder('adam3_20_enc')

In [None]:
#Language models are measured by perplexity - which is the exp of the loss function
# `math.exp()` with no argument raises TypeError, so the loss is a named placeholder
# and the cell evaluates to None until it is filled in.
val_loss = None  # TODO: insert the final validation loss reported by learner.fit()
math.exp(val_loss) if val_loss is not None else None

In [None]:
# Save the TEXT field again after training.
# Fixes: the `f` prefix was inside the quotes (so the literal path 'f{PATH}models/...'
# was used) and the extension was '.pk1' instead of the '.pkl' that the other cells use.
# The `with` block closes the file handle once the dump finishes.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

# Test Version

In [None]:
# Grab the underlying model and prepare a short prompt for it to continue.
model = learner.model
ss=""". So, it wasn't quite the best but I enjoyed it! The best"""
# TEXT.preprocess runs the field's own pipeline (spaCy tokenisation + lower-casing) and
# returns plain strings. Calling spacy_token(ss) directly yields Token objects, which
# cannot be joined as strings and do not match the string keys of the vocab.
s = [TEXT.preprocess(ss)] #one "batch" containing a single list of tokens
t = TEXT.numericalize(s) #convert the tokens into ints according to TEXT.vocab
' '.join(s[0])

In [None]:
# Run the model once on the numericalized prompt `t` and keep the first returned value in `res`.
#Set batch size to 1
model[0].bs = 1
#Turn off dropout
model.eval()
#Reset hidden state
model.reset()
#Get predictions (only the first returned value is kept; the rest are discarded)
res,*_ = model(t)
#Put the batch size back
model[0].bs = bs

In [None]:
#Check top 10 predictions
# topk() returns (values, indices); [1] keeps the indices of the 10 largest entries of res[-1]
next_words = torch.topk(res[-1], 10)[1]
# itos is a list (index -> token), so it must be indexed, not called
[TEXT.vocab.itos[o] for o in to_np(next_words)]

In [None]:
#See the model generate text!
# Greedy generation: 20 times, pick a top-scoring token, print it and feed it back into the model.
# NOTE(review): the indexing below (n.data[0], n[0].unsqueeze(0)) looks tied to the
# tensor semantics of the PyTorch version this was written for - confirm before upgrading.
print(ss, "\n")
for i in range(20):
    # indices of the two highest-scoring entries of the last output row
    n = res[-1].topk(2)[1]
    # if the best index is 0, fall back to the runner-up
    # (index 0 is presumably the unknown-word token - check TEXT.vocab.itos[0])
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # feed the chosen token back in to get the next set of scores
    res,*_ = model(n[0].unsqueeze(0))
print("......")

# Sentiment Analysis

In [None]:
# Reload the TEXT field saved by the language-model section.
# Fix: the `f` prefix was inside the quotes, so the literal path 'f{PATH}models/TEXT.pkl'
# was opened instead of the file under PATH.
# NOTE: unpickling executes arbitrary code - only load files this notebook produced.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [None]:
# Label field: sequential=False, i.e. a label is a single value and is not tokenized.
IMDB_LABEL = data.Field(sequential=False)

#splits is a torchtext method that builds the dataset splits from the two fields
#(presumably train and test for IMDB - confirm).
#torchtext ships an IMDB dataset class; 'data/' is the third positional argument
#(presumably the root directory - confirm)
splits = torchtext.datasets.IMDB.splits(TEXT, IMDB_LABEL, 'data/')  

In [None]:
# Take the first example of the first split.
# NOTE: this re-binds `t`, which held the numericalized prompt in the test section above.
t = splits[0].examples[0]
#See the label, then the first 16 tokens of the text that resulted in it (ie. positive or negative)
t.label, ' '.join(t.text[:16])

In [None]:
#Create ModelData 
# Same sizes as the language model, so the saved encoder weights fit.
embed_size = 200
activations_per_layer = 500
number_of_layers = 3

model_data2 = TextData.from_splits(PATH, splits, bs)

# Fix: the optimizer factory defined earlier is called `optim_fn`; `opt_fn` does not
# exist in this notebook and raised NameError.
# 1500 and bptt are passed positionally (presumably max sequence length and bptt - confirm).
model2 = model_data2.get_model(optim_fn, 1500, bptt, 
                               emb_sz = embed_size, 
                               n_hid = activations_per_layer, 
                               n_layers = number_of_layers,
                              dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)

model2.reg_fn = partial(seq2seq_reg, alpha=2, beta=1) #regularisation function
# Start from the encoder saved at the end of language-model training.
model2.load_encoder('adam3_20_enc')

In [None]:
# `clip` for the classifier (presumably the gradient-clipping threshold - confirm).
model2.clip = 25
# Five learning rates, smallest first (presumably one per layer group - confirm).
lrs = np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2])

In [None]:
# Stage 1: freeze_to(-1) (presumably leaves only the last layer group trainable - confirm),
# then fit once at half the learning rates.
model2.freeze_to(-1)
model2.fit(lrs/2, 1, metrics=[accuracy])
# Stage 2: unfreeze and fit once more at the full learning rates.
model2.unfreeze()
model2.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

In [None]:
# Longer run: 7 cycles with cycle_len=2; cycle_save_name='1mdb2' is the name load_cycle uses in the next cell.
model2.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='1mdb2')

In [None]:
# Load the weights saved for cycle 4 under the name '1mdb2' (the name given to fit() above).
model2.load_cycle('1mdb2', 4)

In [None]:
# Accuracy of the sentiment classifier: predict_with_targs() supplies the arguments
# for accuracy_np (presumably predictions and targets - confirm).
# Fix: the learner is named `model2`; `m3` is not defined anywhere in this notebook.
accuracy_np(*model2.predict_with_targs())