In [27]:
# Install or upgrade the fastbook package quietly (-U = upgrade, -qq = minimal output).
!pip install -Uqq fastbook
import fastbook
# Run fastbook's notebook setup helper before anything else in the notebook.
fastbook.setup_book()

In [28]:
from fastbook import *
from IPython.display import display, HTML

# Die Mid-Level API von fastai

In [29]:
from fastai.text.all import *

# Build text DataLoaders directly from the IMDB folder, using the 'test' folder as the validation set.
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')

In [30]:
# The same IMDB DataLoaders, this time assembled through the DataBlock API.
path = untar_data(URLs.IMDB)
dls = DataBlock(
    blocks=(TextBlock.from_folder(path), CategoryBlock),
    # Items are the text files found under the 'train' and 'test' folders.
    get_items=partial(get_text_files, folders=['train', 'test']),
    # The target is derived from each file via parent_label.
    get_y=parent_label,
    # Files whose grandparent folder is 'test' make up the validation set.
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path)

## Transforms

In [31]:
# Collect the text files from both the 'train' and the 'test' folders.
files = get_text_files(path, folders = ['train', 'test'])
# Load the raw text of the first 2000 files.
# `read_text` opens, reads and closes each file, whereas a bare `o.open().read()`
# leaves the handle open until the garbage collector reclaims it. The explicit
# encoding keeps decoding independent of the platform's default locale.
txts = L(o.read_text(encoding='utf8') for o in files[:2000])

In [32]:
# Create a tokenizer from the dataset folder and run its setup on the sample texts.
tok = Tokenizer.from_folder(path)
tok.setup(txts)
# Tokenize every loaded text, then display the tokens of the first one.
toks = txts.map(tok)
toks[0]

(#335) ['xxbos','xxmaj','at','the','time','i','recall','being','quite','startled'...]

In [33]:
# Numericalize turns token lists into integer ids. setup() receives the tokenized
# texts (presumably to build the vocabulary -- confirm against the fastai docs).
num = Numericalize()
num.setup(toks)
nums = toks.map(num)
# First ten ids of the first text.
nums[0][:10]

TensorText([   2,    8,   47,    9,   73,   19, 1713,  132,  178, 8780])

In [34]:
# Map the first ten ids of the first text back to their tokens, then display them.
nums_dec = num.decode(nums[0][:10])
nums_dec

(#10) ['xxbos','xxmaj','at','the','time','i','recall','being','quite','startled']

In [35]:
# Decoding with the tokenizer joins the tokens back into one string.
tok.decode(nums_dec)

'xxbos xxmaj at the time i recall being quite startled'

In [36]:
# Given a tuple, the tokenizer is applied to each element and a tuple is returned.
tok((txts[0], txts[1]))

((#335) ['xxbos','xxmaj','at','the','time','i','recall','being','quite','startled'...],
 (#939) ['xxbos','i','have','seen','a','lot','of','ppv',"'s",'in'...])

### Eigene Transforms implementieren

In [37]:
def f(x:int):
    "Return `x` incremented by one."
    return x + 1
# Wrapped in Transform, `f` is only applied to inputs matching its `int` annotation:
# the int 2 becomes 3 while the float 2.0 is returned unchanged.
tfm = Transform(f)
tfm(2),tfm(2.0)

(3, 2.0)

Eine Funktion an eine andere Funktion übergeben: Pythons Decorator-Syntax.

In [38]:
@Transform
def f(x:int):
    "Add one to `x`; the `int` annotation restricts which inputs the Transform touches."
    return x + 1

# The int is incremented, the float is passed through unchanged.
(f(2), f(2.0))

(3, 2.0)

In [39]:
class NormalizeMean(Transform):
    "Transform that subtracts a mean computed during setup and adds it back on decode."

    def setups(self, items):
        "Store the arithmetic mean of `items` in `self.mean`."
        self.mean = sum(items) / len(items)

    def encodes(self, x):
        "Shift `x` down by the stored mean."
        return x - self.mean

    def decodes(self, x):
        "Undo `encodes` by shifting `x` back up by the stored mean."
        return x + self.mean

In [40]:
tfm = NormalizeMean()
# setup() computes the mean of the sample items (3.0 for 1..5).
tfm.setup([1,2,3,4,5])
start = 2
# Encoding subtracts the mean; decoding adds it back and recovers the start value.
y = tfm(start)
z = tfm.decode(y)
tfm.mean,y,z

(3.0, -1.0, 2.0)

### Pipelines

In [41]:
# Chain the tokenizer and the numericalizer into a single Pipeline.
tfms = Pipeline([tok, num])
# Apply the whole pipeline to the first raw text and show its first 20 ids.
t = tfms(txts[0])
t[:20]

TensorText([   2,    8,   47,    9,   73,   19, 1713,  132,  178, 8780,   12, 4250,   45,   21,   33,   11,   19, 4634,   15,   18])

In [42]:
# Decode the ids back to text through the pipeline; show the first 100 characters.
tfms.decode(t)[:100]

'xxbos xxmaj at the time i recall being quite startled and amused by this movie . i referred to it as'

#### Setup von Pipelines mithilfe von `TfmdLists`




In [43]:
# TfmdLists receives the items plus the list of transforms to apply to them.
# Note that Numericalize is passed as a class here rather than as an instance.
tls = TfmdLists(files, [Tokenizer.from_folder(path), Numericalize])

In [44]:
# Indexing the TfmdLists yields the transformed first item; show its first 20 ids.
t = tls[0]
t[:20]

TensorText([    2,     8,    46,     9,    77,    19,  2301,   129,   206, 14948,    12,  5285,    48,    20,    30,    10,    19,  5164,    15,    17])

In [45]:
# Decode the ids back to text and show the first 100 characters.
tls.decode(t)[:100]

'xxbos xxmaj at the time i recall being quite startled and amused by this movie . i referred to it as'

In [46]:
# show() displays the decoded text of the item.
tls.show(t)

xxbos xxmaj at the time i recall being quite startled and amused by this movie . i referred to it as the most important movie xxmaj i 'd seen in ten years , and found myself bumping into people who said similar things . 

 xxmaj bernhard has an unusually perceptive behavioral notebook . xxmaj and she has shaped the bitter adolescent personality that we all had , into a corrosive , adult world - view . xxmaj the two together provide a startling mix which may be too edgy for some viewers . ( hi xxmaj skip . i wish you were n't my brother so i could xxrep 4 * you ! ) 

 xxmaj xxunk search for herself after returning to xxup la from xxmaj new xxmaj york , results in the immersive trying - on of various personas ( all of which fit poorly ) for our amusement , but enough of them involve acting out to appeal to a " black imperative " values system that the real barometer of her xxunk is whether black culture accepts her . ( it 's been a while . xxmaj nina xxmaj simone comes to mind . xxmaj an

In [47]:
# Sequential split: the first 80% of the file indices form the first subset,
# the remaining indices the second.
cut = int(len(files) * 0.8)
splits = [list(range(0, cut)), list(range(cut, len(files)))]
# Same transforms as before, now with the index split attached.
tls = TfmdLists(
    files,
    [Tokenizer.from_folder(path), Numericalize],
    splits=splits,
)

In [48]:
# First 20 ids of the first item in the validation subset.
tls.valid[0][:20]

TensorText([   2,    8, 2095,    8, 7242,   23,   30,  626,   16,   13, 3112,   15,  510,  464,   10,   19,   42,  100, 7951, 1087])

In [49]:
# Derive one label per file by applying parent_label to every path.
lbls = files.map(parent_label)
lbls

(#50000) ['pos','pos','pos','pos','pos','pos','pos','pos','pos','pos'...]

In [50]:
# Categorize maps labels to category indices; after setup() its vocab lists the distinct labels.
cat = Categorize()
cat.setup(lbls)
# Show the vocab together with the encoded form of the first label.
cat.vocab, cat(lbls[0])

(['neg', 'pos'], TensorCategory(1))

In [51]:
# The label pipeline as a TfmdLists: parent_label on each file, followed by Categorize.
tls_y = TfmdLists(files, [parent_label, Categorize()])
tls_y[0]

TensorCategory(1)

## Datasets

In [52]:
# Two parallel transform lists over the same files: one for the target, one for the input.
y_tfms = [parent_label, Categorize()]
x_tfms = [Tokenizer.from_folder(path), Numericalize]
# Datasets applies every transform list to each item and returns the results as a tuple.
dsets = Datasets(files, [x_tfms, y_tfms])
x, y = dsets[0]
(x[:20], y)

(TensorText([    2,     8,    46,     9,    77,    19,  2301,   129,   206, 14948,    12,  5285,    48,    20,    30,    10,    19,  5164,    15,    17]),
 TensorCategory(1))

In [53]:
# Same input/target transform lists as before; passing `splits` additionally exposes
# the subsets, so the first validation sample can be read from `dsets.valid`.
x_tfms = [Tokenizer.from_folder(path), Numericalize]
y_tfms = [parent_label, Categorize()]
dsets = Datasets(files, [x_tfms, y_tfms], splits=splits)
x,y = dsets.valid[0]
x[:20],y

(TensorText([   2,    8, 2095,    8, 7242,   23,   30,  626,   16,   13, 3112,   15,  510,  464,   10,   19,   42,  100, 7951, 1087]),
 TensorCategory(0))

In [54]:
# Decode a validation sample back into its readable text and label.
t = dsets.valid[0]
dsets.decode(t)

("xxbos xxmaj steven xxmaj segal 's movie career is a tribute to horrible cinema . i have been tragically bored with every one of them as soon as i realized that they were even more unrealistic than xxmaj jean xxmaj claude vandamme 's . xxmaj has anyone else ever noticed that he never gets hit ? ! i mean , give me something to root for … a hard fought battle with a bad guy who 's scary . xxup twenty xxup years and he 's still filming the same fight scenes . xxmaj fight scenes can often distract you from the fact that your hero can not act . xxmaj the boring choreography of a xxmaj segal film places his painful lack of acting skill in sharp relief . xxmaj worse yet , he 's woefully out of shape . xxmaj just what we need , a fat stiff who xxup thinks he 's a leading man . xxmaj there 's not one iota of redeeming cinematic value in all this movies ninety or so minutes . xxmaj do xxup not watch this unless you feel like throwing away an hour and a half of your life .",
 'neg')

In [55]:
# Create DataLoaders with batch size 64. pad_input is registered as the before_batch hook
# (presumably to pad the texts of a batch to a common length -- confirm against the fastai docs).
dls = dsets.dataloaders(bs=64, before_batch=pad_input)

* `after_item`
* `before_batch`
* `after_batch`

In [56]:
# Gather the files and split them by grandparent folder ('test' -> validation).
files = get_text_files(path, folders=['train', 'test'])
splits = GrandparentSplitter(valid_name='test')(files)
# One transform list per tuple element: the first for the text, the second for the label.
tfms = [
    [Tokenizer.from_folder(path), Numericalize],
    [parent_label, Categorize],
]
dsets = Datasets(files, tfms, splits=splits)
# Use SortedDL as the DataLoader type and pad_input as the before_batch hook.
dls = dsets.dataloaders(dl_type=SortedDL, before_batch=pad_input)

In [57]:
# Equivalent construction using the DataBlock API.
path = untar_data(URLs.IMDB)
dls = DataBlock(
    splitter=GrandparentSplitter(valid_name='test'),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    blocks=(TextBlock.from_folder(path), CategoryBlock),
).dataloaders(path)