# NLP Using Fastai

In [1]:
! [ -e /content ] && pip install -Uqq fastbook
import fastbook

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/719.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/719.8 kB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.8/719.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from fastbook import *
from IPython.display import display,HTML

### Get the IMDB Data

In [3]:
from fastai.text.all import *
path = untar_data(URLs.IMDB)
path

Path('/root/.fastai/data/imdb')

In [4]:
files = get_text_files(path, folders = ['train', 'test', 'unsup'])
files

(#100000) [Path('/root/.fastai/data/imdb/unsup/34391_0.txt'),Path('/root/.fastai/data/imdb/unsup/7307_0.txt'),Path('/root/.fastai/data/imdb/unsup/22454_0.txt'),Path('/root/.fastai/data/imdb/unsup/40514_0.txt'),Path('/root/.fastai/data/imdb/unsup/48062_0.txt'),Path('/root/.fastai/data/imdb/unsup/7455_0.txt'),Path('/root/.fastai/data/imdb/unsup/2827_0.txt'),Path('/root/.fastai/data/imdb/unsup/41382_0.txt'),Path('/root/.fastai/data/imdb/unsup/39685_0.txt'),Path('/root/.fastai/data/imdb/unsup/9323_0.txt')...]

In [5]:
txt = files[0].open().read()
txt[:75]

'I do not blame the game, but myself, for losing hundreds of hours that with'

### Tokenization

#### Word Level Tokenization

In [6]:
spacy = WordTokenizer()
tokens = first(spacy([txt]))
coll_repr(tokens, 30)

"(#385) ['I','do','not','blame','the','game',',','but','myself',',','for','losing','hundreds','of','hours','that','with','ease','could','have','been','spent','doing','something','constructive','/','more','fun','.','It'...]"

In [7]:
tokenizer = Tokenizer(spacy)
coll_repr(tokenizer(txt), 31)

"(#408) ['xxbos','i','do','not','blame','the','game',',','but','myself',',','for','losing','hundreds','of','hours','that','with','ease','could','have','been','spent','doing','something','constructive','/','more','fun','.','xxmaj'...]"

In [8]:
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

#### Subword Level Tokenization

In [9]:
txts = L(o.open().read() for o in files[:2000])

In [10]:
def subword(sz):
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(txts)
    return ' '.join(first(sp([txt])))

Larger vocabulary size means a sentence will be represented by fewer tokens, whereas smaller vocabulary size means that a sentence will require more tokens to be represented. Picking a subword vocab size represents a compromise: a larger vocab means fewer tokens per sentence, which means faster training, less memory, and less state for the model to remember; but on the downside, it means larger embedding matrices, which require more data to learn.

In [11]:
subword(1000)

'▁I ▁do ▁not ▁b la me ▁the ▁game , ▁but ▁myself , ▁for ▁lo s ing ▁ h un d r ed s ▁of ▁ho ur s ▁that ▁with ▁e a se ▁could ▁have ▁been ▁sp ent ▁do ing ▁something ▁con st r u c t ive / mo re ▁fun . ▁It ▁to ok ▁me ▁a ▁good ▁two ▁years ▁before ▁real iz ing ▁the ▁complete ▁no n s en se ▁of ▁actually ▁de vo ting ▁even ▁a ▁minute ▁to ▁this ▁game , ▁and ▁at ▁the ▁point ▁of ▁re co g n iz ing ▁this ▁fact ▁it ▁to ok ▁me ▁ less ▁than ▁no ▁time ▁to ▁st op ▁play ing ▁for ▁good . ▁W h y ▁is ▁it ▁no n s en se ? ▁< br ▁/> < br ▁/> The ▁game ▁is ▁ho r ri b ly ▁line ar . ▁P la y ▁it ▁through ▁with ▁one ▁character ▁and ▁you \' ve ▁ex p l or ed ▁all ▁there ▁is ▁to ▁it . ▁ Q u est s ▁come ▁in ▁3 ▁vari ation s ▁( " W ow ! " ▁is ▁pretty ▁far ▁from ▁the ▁first ▁thought ▁that ▁come s ▁to ▁mind ▁when ▁see ing ▁that ). ▁Ch ar act er ▁develop ment ▁come s ▁to ▁a ▁ha l t ▁at ▁level ▁2 0 , ▁after ▁which ▁little ▁hope ▁is ▁in ▁s ight . ▁A t ▁to p ▁level ▁all ▁you ▁can ▁do ▁to ▁f ur ther ▁b u il d ▁up ▁your ▁character 

In [12]:
subword(200)

'▁I ▁do ▁not ▁b la m e ▁the ▁ g a m e , ▁but ▁ m y s e l f , ▁for ▁lo s ing ▁ h un d re d s ▁of ▁ h o ur s ▁that ▁with ▁ e a s e ▁co u l d ▁have ▁be en ▁s p ent ▁do ing ▁s o m e th ing ▁co n st r u c t i ve / m o re ▁f un . ▁I t ▁to o k ▁ m e ▁a ▁ g o o d ▁ t w o ▁ y e ar s ▁be f o re ▁ re al i z ing ▁the ▁co m p le t e ▁ n on s en s e ▁of ▁ act u al ly ▁de v o t ing ▁ e ve n ▁a ▁ m in u t e ▁to ▁this ▁ g a m e , ▁and ▁a t ▁the ▁p o in t ▁of ▁ re c o g n i z ing ▁this ▁f act ▁it ▁to o k ▁ m e ▁ le s s ▁ th an ▁ n o ▁ t i m e ▁to ▁ st o p ▁p la y ing ▁for ▁ g o o d . ▁ W h y ▁is ▁it ▁ n on s en s e ? ▁ < br ▁/> < br ▁/> T h e ▁ g a m e ▁is ▁ h or ri b ly ▁ l in e ar . ▁ P la y ▁it ▁ th ro u g h ▁with ▁on e ▁ ch ar act er ▁and ▁you \' ve ▁ e x p l or ed ▁a ll ▁the re ▁is ▁to ▁it . ▁ Q u e st s ▁co m e ▁in ▁ 3 ▁ v ar i at i on s ▁ ( " W o w ! " ▁is ▁p re t t y ▁f ar ▁f ro m ▁the ▁f ir st ▁ th o u g h t ▁that ▁co m es ▁to ▁ m in d ▁w h en ▁see ing ▁that ) . ▁ C h ar act er ▁de ve l o p m e

### Numericalization

Map integers to tokens using Numericalize

In [13]:
toks200 = txts[:200].map(tokenizer)
toks200[0]

(#408) ['xxbos','i','do','not','blame','the','game',',','but','myself'...]

In [14]:
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#2136) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the',',','.','and','a','of','to','is','it','in','i'...]"

In [15]:
num(toks200[0])

TensorText([   2,   19,   62,   36,    0,    9,  289,   10,   31,  483,   10,   26,    0,    0,   14,  754,   20,   30,    0,   99,   39,  105,  755,  484,  134,    0,  114,   63,  238,   11,    8,
              17,  533,   83,   13,   69,  148,  150,  145,    0,    9,  427,  756,   14,  161,    0,   73,   13,  860,   15,   21,  289,   10,   12,   49,    9,  275,   14,    0,   21,  203,   17,
             533,   83,  317,   91,   81,   84,   15,  757,  377,   26,   69,   11,    8,  206,   16,   17,  756,   85,   27,    8,    9,  289,   16, 1547,    0,   11,    8,  358,   17,  165,   30,
              44,  113,   12,   33,  146,    0,   45,   54,   16,   15,   17,   11,    8,    0,  223,   18,  179,    0,   40,   23,  666,   51,   23,   16,  256,  175,   53,    9,  106,  230,   20,
             239,   15,  341,   64,  276,   20,   38,   11,    8,  113, 1015,  239,   15,   13,    0,   49,  342,  758,   10,  109,   74,  102,  485,   16,   18,    0,   11,    8,   49,  343,  342,
          

### Passing Text as Batches

In [16]:
nums200 = toks200.map(num)

In [17]:
dl = LMDataLoader(nums200)
x,y = first(dl)
x.shape,y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [18]:
# Independent Variable i.e. the input
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos i do not xxunk the game , but myself , for xxunk xxunk of hours that with xxunk could'

In [19]:
# The output- offset by one token
' '.join(num.vocab[o] for o in y[0][:20])

'i do not xxunk the game , but myself , for xxunk xxunk of hours that with xxunk could have'

### Language Modeling Using DataBlock

In [None]:
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_length=80)

In [21]:
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj never having seen an xxmaj oliver xxmaj stone film before , nor any films starring xxmaj eric xxmaj bogosian , i did n't know what to expect from this film . xxmaj having toyed with the idea of buying it for a while , i finally got it for free as a supplement with a xxmaj sunday newspaper and i was hugely impressed . \n\n xxmaj it tells the story","xxmaj never having seen an xxmaj oliver xxmaj stone film before , nor any films starring xxmaj eric xxmaj bogosian , i did n't know what to expect from this film . xxmaj having toyed with the idea of buying it for a while , i finally got it for free as a supplement with a xxmaj sunday newspaper and i was hugely impressed . \n\n xxmaj it tells the story of"
1,"mains have escaped the ruin of the city , and instinctively also veer away from the approaching xxmaj europeans . xxmaj they decide to "" go to the forest "" in hopes of living out their lives in peace , away from the destructiveness of all civilization . xxmaj the point in this case , is very bleak , since we can assume that they wo n't be able to escape no","have escaped the ruin of the city , and instinctively also veer away from the approaching xxmaj europeans . xxmaj they decide to "" go to the forest "" in hopes of living out their lives in peace , away from the destructiveness of all civilization . xxmaj the point in this case , is very bleak , since we can assume that they wo n't be able to escape no matter"


### Finetuning The Language Model

In [None]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics=[accuracy, Perplexity()]).to_fp16()

In [23]:
learn.fit_one_cycle(1, 2e-2)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.007043,3.900225,0.300485,49.413586,23:13


In [28]:
learn.unfreeze()
learn.fit_one_cycle(1, 2e-3)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.648069,3.646698,0.330425,38.347851,24:31


In [29]:
learn.save_encoder('finetuned')

#### Generating Movie Reviews Using the Finetuned Language Model

In [None]:
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75)
         for _ in range(N_SENTENCES)]

In [39]:
preds

['i liked this movie because it was shot in a city in Mexico , with a very real backdrop . It freaked me out , i think . So much so , i was surprised to see that it had a very',
 "i liked this movie because Santa Claus was just a boring flick . The plot was just plain stupid and the acting was so bad it 's laughable . i still ca n't stand this movie . It is n't even"]

### Creating the Classifier

In [None]:
dls_class = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab), CategoryBlock),
    get_y=parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test')
).dataloaders(path, path=path, bs=128, seq_len=72)

In [32]:
dls_class.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos * ! ! - xxup spoilers - ! ! * \n\n xxmaj before i begin this , let me say that i have had both the advantages of seeing this movie on the big screen and of having seen the "" authorized xxmaj version "" of this movie , remade by xxmaj stephen xxmaj king , himself , in 1997 . \n\n xxmaj both advantages made me appreciate this version of "" the xxmaj shining , "" all the more . \n\n xxmaj also , let me say that xxmaj i 've read xxmaj mr . xxmaj king 's book , "" the xxmaj shining "" on many occasions over the years , and while i love the book and am a huge fan of his work , xxmaj stanley xxmaj kubrick 's retelling of this story is far more compelling … and xxup scary . \n\n xxmaj kubrick",pos
2,"xxbos xxmaj titanic directed by xxmaj james xxmaj cameron presents a fictional love story on the historical setting of the xxmaj titanic . xxmaj the plot is simple , xxunk , or not for those who love plots that twist and turn and keep you in suspense . xxmaj the end of the movie can be figured out within minutes of the start of the film , but the love story is an interesting one , however . xxmaj kate xxmaj winslett is wonderful as xxmaj rose , an aristocratic young lady betrothed by xxmaj cal ( billy xxmaj zane ) . xxmaj early on the voyage xxmaj rose meets xxmaj jack ( leonardo dicaprio ) , a lower class artist on his way to xxmaj america after winning his ticket aboard xxmaj titanic in a poker game . xxmaj if he wants something , he goes and gets it",pos


In [None]:
learn = text_classifier_learner(dls_class, AWD_LSTM, drop_mult=0.5,
                                metrics=accuracy).to_fp16()

In [None]:
learn = learn.load_encoder('finetuned')

In [35]:
learn.fit_one_cycle(1, 2e-2)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,time
0,0.27668,0.218589,0.91168,01:24


In [36]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,time
0,0.241173,0.188449,0.92688,01:29


In [37]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,time
0,0.204052,0.164709,0.93872,01:36


In [38]:
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


epoch,train_loss,valid_loss,accuracy,time
0,0.16324,0.159841,0.9404,01:55
1,0.143213,0.159431,0.94148,01:55
