In [1]:
# Lecture notes

In [2]:
#hide
# Install/upgrade fastbook quietly, but only when /content exists
# (presumably a Google Colab runtime — confirm), then run fastbook's notebook setup.
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [3]:
# Get IMDB data set
# The star import is the only fastai import in these notes; later cells rely on the names it provides.
from fastai.text.all import *
# presumably fetches/extracts the IMDB archive and returns its local path — confirm in the fastai docs
path = untar_data(URLs.IMDB)

In [4]:
# Collect the review text files found under the train, test and unsup folders
files = get_text_files(path, folders=['train', 'test', 'unsup'])

In [5]:
# Grab a review
# read_text() opens, reads and closes the file in one call, so no file handle
# is left open (the previous .open().read() never closed it explicitly).
txt = files[0].read_text()
# Preview the first 75 characters of the review
txt[:75]

'Alan Rickman & Emma Thompson give good performances with southern/New Orlea'

In [6]:
# WordTokenizer is fastai's word tokenizer; it presumably wraps the spaCy library
# (hence the variable name) — TODO confirm in the fastai docs.
spacy = WordTokenizer()
# The tokenizer is called on a list of documents, so wrap the single review and take the first result
toks = first(spacy([txt]))
# Print the token count followed by the first 30 tokens
print(coll_repr(toks, 30))

(#121) ['Alan','Rickman','&','Emma','Thompson','give','good','performances','with','southern','/','New','Orleans','accents','in','this','detective','flick','.','It',"'s",'worth','seeing','for','their','scenes-','and','Rickman',"'s",'scene'...]


In [7]:
# fastai's Tokenizer wraps the word tokenizer and adds extra processing on top of it.
# Note the special tokens with the xx prefix in the output: xxbos at the very start, and xxmaj
# in front of words that were capitalised in the raw text (the words themselves come out lowercased).
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt), 31))

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson','give','good','performances','with','southern','/','xxmaj','new','xxmaj','orleans','accents','in','this','detective','flick','.','xxmaj','it',"'s",'worth','seeing'...]


In [8]:
# View the default text-processing rules applied during tokenization
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [9]:
# In IPython/Jupyter, `??name` shows the source of `name`; here we inspect fix_html,
# the first rule in defaults.text_proc_rules
??fix_html

In [10]:
# How can text be broken up without relying on spaces?
# Subword Tokenization
# 1. Analyze a corpus of documents to find the most commonly occurring groups of letters. These become the vocab.
# 2. Tokenize the corpus using this vocab of subword units.

In [11]:
# Create a corpus from the first 2000 IMDB movie reviews.
# read_text() closes each file as soon as it has been read, instead of leaving
# up to 2000 open handles for the garbage collector to clean up.
txts = L(o.read_text() for o in files[:2000])

In [12]:
# Train a subword tokenizer of a given vocab size and preview it on the sample review
def subword(sz):
    """Return the first 40 subword tokens of the sample review, joined by spaces.

    A SubwordTokenizer with `vocab_sz=sz` is trained on the notebook-level `txts`
    corpus and then applied to the notebook-level `txt` review.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    # setup() trains the tokenizer: it reads our documents and finds the common
    # sequences of characters, which become the vocab
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [13]:
# ▁ represents a space character in the original text.
# Picking a subword vocab size represents a compromise:
#   - a larger vocab means fewer tokens per sentence, which means faster training,
#     less memory, and less state for the model to remember;
#   - on the downside, it means larger embedding matrices, which require more data to learn.
subword(1000)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tmp/texts.out --vocab_size=1000 --model_prefix=tmp/spm --character_coverage=0.99999 --model_type=unigram --unk_id=9 --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2 --user_defined_symbols=▁xxunk,▁xxpad,▁xxbos,▁xxeos,▁xxfld,▁xxrep,▁xxwrep,▁xxup,▁xxmaj --hard_vocab_limit=false


'▁A l an ▁R ick man ▁ & ▁E mm a ▁Th om p son ▁give ▁good ▁performance s ▁with ▁so u ther n / N e w ▁O r le an s ▁a c cent s ▁in ▁this ▁de'

In [14]:
# Numericalization works on the word-tokenized text from earlier, so tokenize the
# review once and reuse that result for the preview (previously tkn(txt) ran twice).
toks = tkn(txt)
print(coll_repr(toks, 31))

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson','give','good','performances','with','southern','/','xxmaj','new','xxmaj','orleans','accents','in','this','detective','flick','.','xxmaj','it',"'s",'worth','seeing'...]


In [15]:
# Tokenize only the first 200 reviews to get a small tokenized corpus
# (tokenizing takes a while), then show the first tokenized review
toks200 = txts[:200].map(tkn)
toks200[0]

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...]

In [16]:
# Build the vocab by running Numericalize.setup over the tokenized subset,
# then preview its first 20 entries
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab, 20)

"(#1984) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','and','a','to','of','i','it','is','in'...]"

In [17]:
# Numericalize the tokenized review and keep the first 20 token ids
nums = num(toks)[:20]
nums

TensorText([   2,    8,    0,    8, 1442,  234,    8,    0,    8,    0,  199,   64,  731,   29,    0,  122,    8,  253,    8,    0])

In [18]:
# Confirm that the integers map back to tokens by looking each id up in the vocab
' '.join([num.vocab[idx] for idx in nums])

'xxbos xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk'

In [19]:
# Now that we have numbers, we need to put them in batches for our model