Movie polarity classification

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import sys
sys.path.append('/home/ubuntu/fastai/')

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

In [2]:
# Root directory of the movie-review data; the train/validation folders
# below are given relative to it.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
PATH = '/home/ubuntu/fastai/projects/part_1_material/RNNs/data/movie_review_2class/'
trn_path, val_path = 'trn/all', 'val/all'

In [3]:
train_pos_files = !ls {PATH}{trn_path}/pos
trin_neg_files = !ls {trn_path}/neg
val_pos_files = !ls {val_path}/pos
val_neg_files = !ls {val_path}/neg

Ok let's look at a positive movie review

In [11]:
' '.join([part for part in test_review])

NameError: name 'test_review' is not defined

In [4]:
# Load spaCy's 'en' model.
# NOTE(review): `tokenizer` is not referenced again in the visible cells (the
# Field defined next is configured with tokenize="spacy" instead) - confirm
# whether this cell is still needed.
tokenizer = spacy.load('en')

In [17]:
# Text field: lower-cases its input and tokenizes it with spaCy.
TEXT = data.Field(
    tokenize="spacy",
    lower=True,
)

In [18]:
# bs: batch size; bptt: target sequence length of each batch.
bs, bptt = 64, 70

In [19]:
# Folder (relative to PATH) used for each split.
# Note: the validation folder doubles as the test split.
FILES = {'train': trn_path, 'validation': val_path, 'test': val_path}
# Build the language-model data object with the batch size and bptt set above.
md = LanguageModelData.from_text_files(
    PATH, TEXT, bs=bs, bptt=bptt, min_freq=10, **FILES
)

Now we can see that the `TEXT` object that we instantiated above has a `vocab` attribute with two mappings:
`TEXT.vocab.itos` (int-to-string), which is the vocabulary for the corpus (i.e. all the words), and
`TEXT.vocab.stoi` (string-to-int), the dictionary that maps each word to its token. Together they let us convert between tokens and strings, as follows:

In [20]:
# First 12 vocabulary entries (index -> string) ...
print(TEXT.vocab.itos[:12])
# ... and the reverse lookup (string -> index) for the word 'the'.
print(TEXT.vocab.stoi['the'])

['<unk>', '<pad>', ',', 'the', '.', 'a', 'and', 'of', 'to', 'is', 'in', "'s"]
3


What we can do now is dump the `TEXT` object (which holds the vocabulary) to a `.pickle` file so that we can load it later instead of having to do the tokenization again.


In [61]:
#pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl', 'wb'))

Now we can look at some information about the movie reviews, such as the number of batches in the data set, the number of tokens in the vocab, the number of items in the training dataset (`len(md.trn_ds)`), and the number of tokens in the training text (`len(md.trn_ds[0].text)`).

In [17]:
# The training dataset holds the whole corpus as a single item, so
# len(md.trn_ds) counts items (not tokens); the token count is the length
# of that item's text.
print('Number of batches: {}'.format(len(md.trn_dl)))
print('Number of tokens in vocab: {}'.format(md.nt))
print('Number of items in training dataset: {}'.format(len(md.trn_ds)))
print('Number of tokens in training set: {}'.format(len(md.trn_ds[0].text)))

Number of batches: 277
Number of tokens in vocab: 8162
Number of tokens in training set: 1
Number of sentences: 1245853


Some things to note about the LanguageModelData object: 1) There is only one item in each dataset, whose `text` holds all of the tokens as one long sequence:

In [18]:
# Peek at the first 12 tokens of the single training item.
md.trn_ds[0].text[:12]

['plot',
 ':',
 'jet',
 'li',
 'is',
 'a',
 'chinese',
 'cop',
 'asked',
 'to',
 'help',
 'some']

2) The LanguageModelData object will create batches with 64 columns (the batch size) and a varying sequence length of around 70 (the bptt we set above). Furthermore, each batch contains exactly as many labels as inputs, but offset by one token, because we are trying to predict the next word in the sequence. The labels are returned as a flattened 1D array:

In [19]:
# Fetch one batch from the training loader: a pair of (inputs, flattened labels).
next(iter(md.trn_dl))

(Variable containing:
   114    26  7663  ...   1910  2647    24
    58    14   139  ...      4  1390  2649
  2700     9    96  ...    398    10   721
        ...          ⋱          ...       
     4     8     9  ...    255     6     0
     4    30  3609  ...     43  1692   589
     4    33   151  ...     28   163   152
 [torch.cuda.LongTensor of size 68x64 (GPU 0)], Variable containing:
    58
    14
   139
   ⋮  
    31
  1012
   116
 [torch.cuda.LongTensor of size 4352 (GPU 0)])

Let's set the other parameters needed for fast.ai LanguageModels, mainly the embedding dimensions, the number of hidden units per layer,  and the number of hidden layers.

In [40]:
emb_sz = 200  # embedding size
nh = 250      # number of hidden units
nl = 3        # number of hidden layers

Next we define the optimizer we want to use. We use Adam, with its momentum term (the first beta) lowered to 0.7 from the default of 0.9.

In [37]:
# Optimizer factory: Adam with betas fixed to (0.7, 0.99).
opt = partial(
    optim.Adam,
    betas=(0.7, 0.99),
)

In [22]:
# Build the language-model learner from the data object, using the
# optimizer factory and the size hyper-parameters defined above.
learner = md.get_model(
    opt_fn=opt,
    emb_sz=emb_sz,
    n_hid=nh,
    n_layers=nl,
    dropouti=0.05,
    dropout=0.05,
    wdrop=0.1,
    dropoute=0.02,
    dropouth=0.05,
)

We can also set a regularization function for this type of learner, as well as a threshold for gradient clipping.

In [23]:
# Regularization function: seq2seq_reg with alpha=2 and beta=1 bound in.
learner.reg_fn = partial(
    seq2seq_reg,
    alpha=2,
    beta=1,
)
# Gradient-clipping setting for the learner.
learner.clip = 0.3

In [25]:
# Train for 4 cycles of 10 epochs each (40 epochs in total, matching the
# progress bar in the output).
learner.fit(3e-3, 4, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=40), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      5.54098    5.337223  
    1      5.101896   4.975279                              
    2      4.888684   4.800024                              
    3      4.751794   4.698601                              
    4      4.654961   4.635167                              
    5      4.584552   4.591514                              
    6      4.522377   4.564551                              
    7      4.481195   4.550163                              
    8      4.48174    4.54351                               
    9      4.465264   4.541524                              
    10     4.540777   4.546882                              
    11     4.467847   4.51085                               
    12     4.404246   4.483813                              
    13     4.338234   4.466834                              
    14     4.280584   4.449007                              
    15     4.234407   4.437104                      

[array([4.4272])]

The model used here is a seq2seq model, so we want to save the first half of the model, which is the encoder; we will load it again later when building the classifier.

In [30]:
# Save the encoder under the name 'adam1_enc'; the commented-out line below
# is the matching call to load it back.
learner.save_encoder('adam1_enc')
# learner.load_encoder('adam1_enc')

In [31]:
# Continue training for one more cycle of 10 epochs with the same settings.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      4.006791   4.440831  
    1      4.018357   4.436441                              
    2      4.033728   4.415796                              
    3      3.96038    4.432343                              
    4      3.925398   4.433996                              
    5      3.889835   4.435346                              
    6      3.895788   4.420619                              
    7      3.898626   4.425046                              
    8      3.817986   4.433572                              
    9      3.812217   4.440577                              



[array([4.44058])]

In [33]:
# Save the encoder under the name 'adam3_10_enc'; the classifier built later
# in this notebook loads an encoder with this same name.
learner.save_encoder('adam3_10_enc')
#learner.load_encoder('adam3_10_enc')

Note that language models are usually measured in terms of perplexity, which is simply exp() of our loss.

In [34]:
# Perplexity = exp(loss); 4.44 is the final validation loss of the previous
# fit (4.44058) rounded to two decimals.
np.exp(4.44)

84.77494167382804

In [22]:
# Serialize the TEXT field so it can be reloaded later. The context manager
# ensures the file is flushed and closed even if pickling fails.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

Now let's use the trained language model to do our sentiment analysis. First we will need the vocab, which we can load from the pickle file.

In [28]:
# Reload the pickled TEXT field, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [31]:
# Non-sequential field that holds each review's label.
MOVIE_POL_LABEL = data.Field(sequential=False)
# Build the IMDB dataset splits from the text and label fields; the archive
# is downloaded when needed (see the output below).
splits = torchtext.datasets.IMDB.splits(
    TEXT,
    MOVIE_POL_LABEL,
    'data/',
)

downloading aclImdb_v1.tar.gz


In [32]:
# Take the first example of the first split for inspection.
t = splits[0].examples[0]

In [34]:
# Show the example's label together with its first 12 tokens.
t.label, ' '.join(t.text[:12])

('pos', 'people claim its edited funny but they had to cut it down')

In [35]:
# Wrap the torchtext splits in a fastai TextData object, reusing batch size bs.
md2 = TextData.from_splits(PATH, splits, bs)

In [41]:
# Build the classifier with the same embedding/hidden sizes as the language
# model, so that the saved encoder can be loaded into it.
# NOTE(review): 1500 is passed positionally - presumably a maximum sequence
# length; confirm against TextData.get_model.
m3 = md2.get_model(
    opt, 1500, bptt,
    emb_sz=emb_sz,
    n_hid=nh,
    n_layers=nl,
    dropout=0.1,
    dropouti=0.4,
    wdrop=0.5,
    dropoute=0.05,
    dropouth=0.3,
)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Load the encoder saved from the language model.
m3.load_encoder('adam3_10_enc')

In [42]:
# Gradient-clipping setting for the classifier.
m3.clip = 25.
# Five learning rates: three at 1e-4, then 1e-3 and 1e-2.
lrs = np.array([1e-4] * 3 + [1e-3, 1e-2])

In [43]:
# Stage 1: freeze with freeze_to(-1) (presumably leaving only the last layer
# group trainable - confirm) and fit once at half the learning rates.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
# Stage 2: unfreeze and fit once more at the full learning rates.
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.717523   0.957377   0.511286  



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.685852   0.614746   0.664265  



[array([0.61475]), 0.6642650580167264]

In [44]:
# 7 cycles of 2 epochs each (14 epochs in total, matching the progress bar),
# passing 'movie_polarity_2' as the cycle save name.
# The recorded run below was interrupted manually during epoch 4.
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='movie_polarity_2')

HBox(children=(IntProgress(value=0, description='Epoch', max=14), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.671479   0.709848   0.643709  
    1      0.648107   0.658009   0.67022                     
    2      0.639724   0.662498   0.693186                    
    3      0.620692   0.68616    0.697425                    
  1%|          | 3/391 [00:02<05:43,  1.13it/s, loss=0.619]

KeyboardInterrupt: 