In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.source import *
from local.data.external import *
from local.data.pipeline import *
from local.data.load import *
from local.text.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000):
    """Create a vocab from `Counter` `count`, keeping items seen at least `min_freq` times.

    The special tokens in `defaults.text_spec_tok` always come first, the list is cut to
    `max_vocab` entries, and it is then padded with 'xxfake' up to the next multiple of 8
    (so the returned list can be up to 7 entries longer than `max_vocab`).
    """
    # Most frequent tokens first; drop the ones below the frequency threshold
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(defaults.text_spec_tok): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    # `-len % 8` is 0 when the length is already a multiple of 8, so no useless padding is added
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# Token frequencies: a=4, b=2, c=2, d=1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# Default `min_freq=3`: only 'a' is frequent enough to join the special tokens
test_eq(set(make_vocab(count)), set(defaults.text_spec_tok + 'a xxfake'.split()))
# The vocab is padded with 'xxfake' so that its length is a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# `min_freq=1` keeps every token
test_eq(set(make_vocab(count, min_freq=1)), set(defaults.text_spec_tok + 'a b c d xxfake'.split()))
# `max_vocab=12` cuts the vocab before padding, so 'd' is not expected any more
test_eq(set(make_vocab(count,max_vocab=12, min_freq=1)), set(defaults.text_spec_tok + 'a b c xxfake'.split()))

In [None]:
#export
class TensorText(TensorBase):
    "`TensorBase` subclass returned by `Numericalize.encodes`, with helpers to display samples as a table"
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one empty `pd.Series` context per sample, for at most `max_n` samples"
        n_rows = min(self.shape[0], max_n)
        # A frame with an index but no columns: each row is an empty Series named after its position
        frame = pd.DataFrame(index=range(n_rows))
        ctxs = []
        for row in range(n_rows): ctxs.append(frame.iloc[row])
        return ctxs

    def display(self, ctxs):
        "Gather the contexts `ctxs` into a single `DataFrame` and show it with `display_df`"
        display_df(pd.DataFrame(ctxs))

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # `defaultdict(int)` sends unknown tokens to index 0. The 'xxfake' padding entries are
        # left out of the lookup here too, so a given vocab behaves like one built by `setup`.
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from `dsrc` (or from `dsrc.train` when it exists) unless a vocab was given"
        if dsrc is None: return
        if self.vocab is None:
            dsrc = getattr(dsrc,'train',dsrc)
            # Every item of `dsrc` is iterated as a sequence of tokens
            count = Counter(p for o in dsrc for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    def encodes(self, o):
        "Map each token of `o` to its id (0 for tokens missing from the vocab)"
        return TensorText(tensor([self.o2i[o_] for o_ in o]))

    def decodes(self, o):
        "Map ids back to tokens, drop `PAD` and join the rest with `sep`"
        toks = (self.vocab[o_] for o_ in o)  # one vocab lookup per id
        return Str(self.sep.join([t for t in toks if t != PAD]))

In [None]:
# With `min_freq=1` every token of the two training texts ends up in the vocab
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'This is an example of text this another xxfake'.split()))
# The vocab length is padded to a multiple of 8
test_eq(len(num.vocab)%8, 0)
# `start` and `t` are reused by the next cells
start = 'This is an example of text'
t = num(start.split())

In [None]:
# 'is' and 'text' occur in both training texts, so they get the lowest ids after the
# special tokens (presumably ids 0-8 — confirm against `defaults.text_spec_tok`)
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding the ids gives back the original sentence
test_eq(num.decode(t), start)

In [None]:
# With `min_freq=2` only 'is' and 'text' (the tokens present in both texts) are kept
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set(num.vocab), set(defaults.text_spec_tok + 'is text xxfake'.split()))
test_eq(len(num.vocab)%8, 0)
# `start` comes from an earlier cell
t = num(start.split())
# Tokens missing from the vocab are encoded as 0 and decoded as `UNK`
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LMDataLoader -

In [None]:
#export
@delegates()
class LMDataLoader(TfmdDL):
    "Data loader for language modelling: each item is a pair of `seq_len`-token sequences, the second shifted by one token"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # For tuple items only the first element is kept; any other item is used as it is
        self.items = ReindexCollection([(o[0] if isinstance(o, tuple) else o) for o in dataset], cache=cache)
        self.seq_len = seq_len
        # Item lengths are computed here when the caller does not pass `lens`
        if lens is None: lens = [len(o) for o in self.items]
        # `lens` receives the same `idxs` object as `items`
        # (presumably so that both are reordered together — confirm in `ReindexCollection`)
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label
        # `m`: token count, presumably rounded down to a multiple of `bs*seq_len` (see `round_multiple`)
        self.m = round_multiple(sum(lens)-1, bs*seq_len, round_down=True)
        # `n`: number of sequences of `seq_len` tokens that fit in `m`
        self.n = self.m//(self.seq_len)
        # `spb`: number of sequences assigned to each of the `bs` batch rows
        self.spb = self.n//bs

    # Indices are returned unchanged; the reordering happens on `self.items` in `before_iter`
    def shuffle_fn(self,idxs): return idxs
    def before_iter(self):
        super().before_iter()
        if self.shuffle: self.items.shuffle()
        # Rebuilt on every call from the (possibly reshuffled) items and their lengths
        self.chunks = Chunks(self.items, self.lens)

    def create_item(self, seq):
        # No full sequence exists at or after index `n`
        if seq>=self.n: raise IndexError
        # Batch row is `seq%bs`, step inside that row is `seq//bs`;
        # row `r` starts `r*spb` sequences into the token stream
        st = ((seq%self.bs)*self.spb + (seq//self.bs)) * self.seq_len
        # One extra token is read so the second output can be the first shifted by one
        txt = self.chunks[st : st+self.seq_len+1]
        return txt[:-1],txt[1:]

In [None]:
# Batch size and sequence length used by the loader tests
bs,sl = 4,3
# Six "documents" of lengths 5, 6, 8, 2, 3 and 1 that together hold the integers 0..24 in order
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).mapped(tensor)

In [None]:
# Without shuffling, each of the 4 batch rows reads its own contiguous run of the 0..24 stream
# across the two batches (row 0: 0..5, row 1: 6..11, ...); targets are the inputs shifted by one
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

In [None]:
# TODO check shuffled but contiguous

In [None]:
# With `shuffle=True` the documents are reordered before the stream is built; the printed output
# changes from run to run, but each document should still appear as one contiguous run
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
for x,y in list(dl):
    print(x, 'x')
    print(y, 'y')

tensor([[11, 12, 13],
        [17, 18, 24],
        [ 8,  9, 10],
        [19, 20,  0]]) x
tensor([[12, 13, 14],
        [18, 24,  5],
        [ 9, 10, 21],
        [20,  0,  1]]) y
tensor([[14, 15, 16],
        [ 5,  6,  7],
        [21, 22, 23],
        [ 1,  2,  3]]) x
tensor([[15, 16, 17],
        [ 6,  7,  8],
        [22, 23, 19],
        [ 2,  3,  4]]) y


## Integration example

In [None]:
# `untar_data` presumably downloads/extracts the IMDB sample and returns its folder — confirm
path = untar_data(URLs.IMDB_SAMPLE)
# The sample ships a `texts.csv` with `label`, `text` and `is_valid` columns
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the `text` column. `count` is later given to `make_vocab`, so it is presumably a
# token `Counter`; note that it rebinds the `count` used by the `make_vocab` tests above
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text,text_lengths
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462


In [None]:
# `np.int` was only an alias of the builtin `int` and has been removed from NumPy, so use `int`
texts,lengths = df_tok['text'],df_tok['text_lengths'].values.astype(int)

In [None]:
# Random split of the items, used as `filts` below
splits = RandomSplitter()(texts)
# Numericalize with a vocab built from the token counts returned by `tokenize_df`
tfm = Numericalize(make_vocab(count))
# Disabled alternative with two transform lists:
#dsrc = DataSource(texts, [[tfm], [tfm]], filts=splits)
dsrc = DataSource(texts, [tfm], filts=splits)

In [None]:
# Display the first item of the training subset
dsrc.train.show_at(0)

xxbos xxmaj every once in a long while a movie will come along that will be so awful that i feel compelled to warn people . xxmaj if i labor all my days and i can save but one soul from watching this movie , how great will be my joy . 

 xxmaj where to begin my discussion of pain . xxmaj for xxunk , there was a musical montage every five minutes . xxmaj there was no character development . xxmaj every character was a stereotype . xxmaj we had xxunk guy , fat guy who eats donuts , goofy foreign guy , etc . xxmaj the script felt as if it were being written as the movie was being shot . xxmaj the production value was so incredibly low that it felt like i was watching a junior high video presentation . xxmaj have the directors , producers , etc . ever even seen a movie before ? xxmaj xxunk is getting worse and worse with every new entry . xxmaj the concept for this movie sounded so funny . xxmaj how could you go wrong with xxmaj gary xxmaj coleman and a handful of somewhat legitimate actor

1

In [None]:
# Language-model loader over the training subset: 16 rows per batch, default `seq_len`,
# items reshuffled at the start of every iteration
tdl = LMDataLoader(dsrc.train, bs=16, shuffle=True)

In [None]:
# Show at most 6 rows of one batch
tdl.show_batch(max_n=6)

Unnamed: 0,text
0,"xxbos i thought the original of this film was quaint and charming as well as having me sitting on the edge of my seat trying to figure it out . \n\n xxmaj since i had already seen the original , when i saw this on xxmaj sci xxmaj fi xxmaj xxunk i do n't know if this remake was deliberately made for xxmaj sci xxmaj fi - i knew what it was"
1,"connection with the character on the screen . xxmaj you might even feel briefly bored with the xxunk of time as we witness xxmaj brian dealing with his situation through first , primitive means , and then more improved ones ( using xxunk , etc ) for his survival . xxmaj it is more like the ordinary time that passes if you were actually stuck in the situation , and that is"
2,"the films defence xxmaj brad xxmaj pitt , xxmaj eric xxmaj xxunk and xxmaj peter xxup o ' xxmaj xxunk acted very well with a bad script but that is n't enough to save this awful movie . \n\n xxmaj can anybody tell me where the xxunk 200 million budget went ? xxmaj maybe in all the trees they used for the funeral xxunk - where did they get all those trees"
3,"let some of her friends read the screenplay and none of them could predict the ending . xxmaj apparently she hangs out with special kids . xxbos xxmaj the xxmaj blob starts with one of the most bizarre theme songs ever , xxunk by an uncredited xxmaj burt xxmaj xxunk of all people ! xxmaj you really have to hear it to believe it , xxmaj the xxmaj blob may be worth"
4,this movies strongest point is its cast . xxmaj this film needed good actors to deliver the dialog and thrills . xxmaj if they did n't have those actors the film would have been lost and boring . xxmaj we had xxmaj rachel xxunk from xxmaj mean xxmaj girls and xxmaj wedding xxmaj xxunk . xxmaj xxunk xxmaj murphy from xxmaj batman xxmaj begins and 28 days xxmaj later . xxmaj rounding
5,"culture statements just to keep his audience understanding that he was a comedian first , an actor second . xxmaj fuqua should have stopped this immediately . xxmaj foxx 's jokes destroyed his character , which in turn left me with nothing solid to grasp xxunk of . xxmaj instead of character development , he would crack a joke . xxmaj neither style worked , no joke was funny . xxmaj the"


In [None]:
# The batch input keeps the `TensorText` type produced by `Numericalize`
x,y = next(iter(tdl))
test_eq(type(x), TensorText)

## Classification

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    """Collate `samples` into a padded batch. Flips token order if `backwards`.

    Each sample is indexed as `(tokens, label)`. Returns a long tensor of shape
    `(len(samples), longest sequence)` filled with `pad_idx` around the tokens, and a tensor
    of the labels. Padding goes before the tokens when `pad_first`, after them otherwise.
    Raises `ValueError` (from `max`) when `samples` is empty.
    """
    max_len = max([len(s[0]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    # The final flip also moves the padding to the other side, so pad on the opposite
    # side now to honour `pad_first` in the returned batch
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        n = len(s[0])
        # Explicit bounds: a negative slice `-n:` would select the whole row when `n == 0`
        sl = slice(max_len-n, max_len) if pad_first else slice(0, n)
        res[i,sl] = LongTensor(s[0])
    if backwards: res = res.flip(1)
    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
# Random split over the row positions of the tokenized frame
splits = RandomSplitter()(range_of(df_tok))
# Two transform lists per row: `text` -> ids with `Numericalize`, `label` -> `Categorize`
dsrc = DataSource(df_tok.itertuples(), filts=splits, tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]])
# `pad_collate` builds each batch, padding the texts to the longest one in the batch
dl = TfmdDL(dsrc.train, create_batch=pad_collate)

In [None]:
# Show at most 4 samples of one batch with their category
dl.show_batch(max_n=4)

Unnamed: 0,text,category
0,"xxbos xxmaj old xxmaj jane 's mannered tale seems very popular these days . i have lost count of the number of versions going around . xxmaj probably the reason is that her "" xxunk "" are our "" xxunk "" even at this late date . xxmaj this xxup tv mini - series gives it a mannered telling suitable to the novel . xxmaj xxunk , xxunk xxmaj emma is a pretty "" modern "" girl when you think about it , even though the xxunk of xxmaj jane xxmaj austen 's world may seem a tad artificial to us today . xxmaj if you have n't seem xxup emma , xxmaj i 'll only say that self - xxunk xxmaj emma does get her comeuppance . xxmaj it 's worth watching to find out how . xxmaj the acting xxunk here go to the ladies : xxmaj kate , xxmaj xxunk , xxmaj lucy and xxmaj samantha . xxmaj they could almost have had a psychic connection to old xxmaj jane !",positive
1,"xxbos xxmaj after mob boss xxmaj xxunk xxmaj xxunk ( late great xxmaj anthony xxmaj xxunk ) kills his lady whom has been cheating on him with xxmaj derek , their new xxunk / xxmaj vietnam vet , and blames it on the poor guy , xxmaj derek finds himself in jail where he has to xxunk with a corrupt xxunk , xxmaj xxunk 's prisoner brother who runs the jail , and , oh yeah xxunk xxunk xxunk by a shady xxup cia agent ( great genre - xxunk and first time director xxmaj john xxmaj saxon ) to turn various prisoners into super - human invincible zombies . xxmaj of course things get out of hand and it 's up to xxmaj derek , and the rest of the xxunk prisoners , to save the day after the infected ones take the jail over . \n\n xxmaj john xxmaj saxon is a great talented actor & as a director xxmaj saxon is a … great talented actor . xxmaj to say this movie ( john 's sole directorial outing to date ) lacks a certain visual flair would be a bit of an understatement . xxmaj however , the film is n't totally without merit . xxmaj the dialog , while idiotic , is just bad enough to be humorous sometimes . xxmaj sadly , this is n't really enough for the movie to coast by on that alone and it takes forever for the film to even start coming into it 's own ( which is fairly late in the movie ) . xxmaj as such , the most i can recommend this film is to say that if you 're a fan of xxmaj saxon ( which i indeed am ) , it 's worth one watch , just go in with low expectations and you should be fine . \n\n xxmaj eye xxmaj candy : xxmaj dana xxmaj xxunk xxmaj mason and xxmaj xxunk xxunk get topless \n\n xxmaj my xxmaj grade : xxup xxunk",negative
2,"xxbos xxup shallow xxup grave begins with either a tribute or a rip off of the shower scene in xxup psycho . ( i 'm leaning toward rip off . ) xxmaj after that it gets worse and then surprisingly gets better , almost to the point of being original . xxmaj bad acting and amateurish directing xxunk down a fairly interesting little story , but the film already surpasses many in the "" yankee comes down xxmaj south to get killed by a bunch of rednecks "" genre because it is actually shot in the xxmaj south . \n\n a group of college girls head to xxmaj xxunk . xxmaj xxunk for summer vacation and are xxunk in xxmaj georgia by a flat xxunk after getting off the main road . ( note to xxmaj xxunk : stay on the xxunk when you go to xxmaj florida . ) xxmaj sue xxmaj ellen ( lisa xxmaj stahl ) has to xxunk so she heads into the woods . xxmaj when she finally finds a good spot to do her business she witnesses the local sheriff ( tony xxmaj march ) xxunk his mistress ( merry xxmaj xxunk ) to death . ( note to xxmaj xxunk : do not wander off into the woods when in the xxmaj south ; not because you might witness a murder , but you may run across a xxunk plantation . ) xxmaj this is the point where the story , not the movie , actually comes close to being good . \n\n xxmaj while xxmaj tony xxmaj march will never have to practice his xxmaj oscar speech , his xxmaj sheriff xxmaj dean becomes a creepy xxunk of a normal guy torn by what he has done and what he must do . xxmaj tom xxmaj law is likable as xxmaj deputy xxmaj scott and is as authentic a xxmaj southern deputy as xxmaj i 've seen since xxmaj xxunk xxmaj xxunk ( deputy xxmaj steve xxmaj xxunk ) in xxup house xxup of 1 xxrep 3 0 xxup xxunk . \n\n a few scenes in the movie are worth the mention . xxmaj the girls stop at a xxup xxunk in xxmaj south xxmaj xxunk and display their racism when a big black guy xxunk them out . 
xxmaj sue xxmaj ellen runs into a xxunk to hide behind some hay xxunk and in a xxunk realistic moment a large snake is hiding in the hay with her . \n\n xxmaj and in the xxunk scene , xxmaj sheriff xxmaj dean makes like he 's about to rape xxmaj xxunk ( carol xxmaj xxunk ) and tells her to take off her clothes . xxmaj dean has turned the radio up to drown out the noise of what he 's about to do . xxmaj the preacher on the radio needs to go back and read his xxmaj bible . xxmaj his xxunk is about how xxmaj jezebel is saved by the blood of xxmaj jesus xxmaj christ . i feel sorry for this preacher 's xxunk . xxmaj jezebel was in the xxmaj old xxmaj testament a few thousand years before xxmaj christ was born and by no means is she one of the five people you are going to meet in xxmaj heaven .",negative
3,"xxbos i viewed the movie together with a xxunk friend , my wife and her female friend . xxmaj so i had views from all kinds of directions . xxmaj mainly , the film made me laugh , the sexual tension was not really there and the only noticeable actors were xxmaj xxunk xxmaj xxunk and xxmaj maria xxmaj xxunk . xxmaj yes , i do think she played her role well , even if the script was not appropriate . xxmaj there were good xxmaj romanian actors around , they just did n't have complex roles . i applaud xxmaj xxunk 's entering the movie business . i do n't know why , but i think he 's a good guy , i just hope he 'll be a good actor . \n\n xxmaj the wife loved the movie , though , and i think there might have been chords being played and to which i had no ear for . xxmaj if the film tried to present xxunk sexual xxunk and their consequences in xxunk xxmaj xxunk , then it failed miserably . xxmaj there were no consequences . xxmaj just imagine that the girls are actually a boy and a girl , and the same story becomes just a boring , uninteresting plot . \n\n i have no idea why it got all those xxup xxunk awards . xxmaj in my book , it should have gotten the "" better luck next time "" award . ( xxunk = good luck in xxmaj romanian ) .",positive


## Export -

In [None]:
#hide
# Run the notebook export; the output lists every notebook that was converted
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 01b_script.ipynb.
Converted 01c_dataloader.ipynb.
Converted 02_data_transforms.ipynb.
Converted 03_data_pipeline.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_text_models_core.i