In [1]:
!pip install -qq datasets transformers[sentencepiece]

[K     |████████████████████████████████| 262 kB 9.9 MB/s 
[K     |████████████████████████████████| 2.5 MB 57.6 MB/s 
[K     |████████████████████████████████| 243 kB 67.8 MB/s 
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[K     |████████████████████████████████| 118 kB 77.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 62.0 MB/s 
[K     |████████████████████████████████| 895 kB 56.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 71.0 MB/s 
[?25h

## Behind using the pipeline api

In [2]:
from transformers import pipeline

In [3]:
classifier = pipeline("sentiment-analysis")
classifier(["I've been waiting for a HuggingFace course my whole life.",
            "I hate this really much"])
print(classifier.model.name_or_path)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


distilbert-base-uncased-finetuned-sst-2-english


## Schritt 1: Tokenization

In [4]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
raw_inputs = [
  "I've been waiting for a HuggingFace course my whole life.",
  "I hate this so much!"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


## Schritt 2: Inputs durch das Modell senden

In [7]:
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [10]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [11]:
print(outputs.logits.shape)
print(outputs.logits)

torch.Size([2, 2])
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward>)


## Processing Outputs

In [12]:
import torch
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward>)


In [13]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

## Hinter `pipeline`: Blurr

In [14]:
!pip install -qq fastai
!pip install -qq ohmeow-blurr

[K     |████████████████████████████████| 75 kB 3.5 MB/s 
[K     |████████████████████████████████| 188 kB 45.9 MB/s 
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
[K     |████████████████████████████████| 53 kB 2.8 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [15]:
from fastai.text.all import *
from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

### Vorbereitungen

In [16]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [17]:
path = untar_data(URLs.IMDB_SAMPLE)
imdb_df = pd.read_csv(path/'texts.csv')

imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


In [18]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(checkpoint, model_cls=AutoModelForSequenceClassification)

print(hf_arch)
print(type(hf_config))
print(type(hf_tokenizer))
print(type(hf_model))

distilbert
<class 'transformers.models.distilbert.configuration_distilbert.DistilBertConfig'>
<class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>
<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>


In [19]:
blocks = (HF_TextBlock(hf_arch,hf_config, hf_tokenizer, 
                       hf_model, max_length=None, padding=True,
                       truncation=True), 
          CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader('text'), get_y=ColReader('label'),
                   splitter=ColSplitter())

In [20]:
dls = dblock.dataloaders(imdb_df, bs=4)

In [21]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do in all the years they've toured. the movie starts off in ` ugly fat'donna's bedroom where victor is sure to seduce her, but a cry from outside disrupts his plans when his best - friend harold ( kevin rivera ) comes - a - looking for him. caught in the attempt by harold and his sister, victor vargas runs off for damage control. yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young victor from going off on the hunt for more fresh meat. on a hot, new york city day they make way to the local public swimming pool where victor's eyes catch a glimpse of the lovely young nymph judy ( judy marte ), who's not just pretty, but a strong and independent too. the relationship that develops between victor and judy becomes the focus of the film. the story also focuses on victor's family that is comprised of his grandmother or abuelita ( altagracia guzman ), his brother nino ( also played by real life brother to victor, silvestre rasuk ) and his sister vicky ( krystal rodriguez ). the action follows victor between scenes with judy and scenes with his family. victor tries to cope with being an oversexed pimp - daddy, his feelings for judy and his grandmother's conservative catholic upbringing. < br / > < br / > the problems that arise from raising victor vargas are a few, but glaring errors. throughout the film you get to know certain characters like vicky, nino, grandma, judy and even",negative
1,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself. james is your typical young and successful gay guy with a nice smile and body, the one you'd probably give two glances towards at your local bar before grazing for greener pastures. why they chose to cast james as the leading man is beyond me. god knows there's so many other hotter and vivacious homosexual men out there dying to be on tv. < br / > < br / > aside from his rather average physical appearance, james is about as interesting and exciting as a piece of chalk. even as such, he has this arrogant, smugly condescending aura about him. however, if james were standing up against a blank, white wall he'd meld right into in it. i honestly can't recall a single interesting or noteworthy thing james said during the course of the show. he is that boring and forgettable. in fact, one of the mates flat out advised him he wasn't feeling a connection. i thought that was the best part of the show. also, james speaks with an excruciatingly annoying lilt. sound feminine or sound masculine, but don't * * * * ing segue tones in the middle of sentences... so painful to sit through. i hated him so much all throughout the show i kept thinking, "" please choose a straight guy and humiliate yourself and your unfortunate looking hag "" < br / > < br / > then we have the suitors. a remarkably bland bunch of men who don't seem to care either way what is happening. equally vapid, they seem to be indistinguishable from one guy to the next except,",negative


In [22]:
xb, yb = dls.one_batch()

In [23]:
xb

{'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
 'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
         [  101,  1996,  1038,  ...,  6300, 10376,   102],
         [  101,  1045, 12524,  ...,  1999,  2008,   102],
         [  101,  1045,  3427,  ...,  2091,  1012,   102]], device='cuda:0')}

In [24]:
len(xb), xb['input_ids'].shape, xb['attention_mask'].shape, len(xb['input_ids']), yb.shape

(2, torch.Size([4, 512]), torch.Size([4, 512]), 4, torch.Size([4]))

### Inputs in das HuggingFace Modell füttern

In [28]:
hf_model.cuda()
outputs = hf_model(**xb)

In [29]:
print(outputs.logits.shape)
print(outputs.logits)

torch.Size([4, 2])
tensor([[-1.0525,  1.2515],
        [ 1.2890, -1.0377],
        [ 4.4430, -3.5959],
        [ 3.9893, -3.3104]], device='cuda:0', grad_fn=<AddmmBackward>)


### Output Verarbeitung (Postprocessing)

In [30]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[9.0788e-02, 9.0921e-01],
        [9.1107e-01, 8.8934e-02],
        [9.9968e-01, 3.2256e-04],
        [9.9932e-01, 6.7533e-04]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [31]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

### FastAI Learning mit BLURR

In [34]:
model = HF_BaseModelWrapper(hf_model)

learn = Learner(dls,
                model,
                opt_func=partial(OptimWrapper, opt=torch.optim.Adam),
                loss_func=CrossEntropyLossFlat(),
                metrics=[accuracy],
                cbs=[HF_BaseModelCallback],
                splitter=hf_splitter)

learn.freeze()

In [36]:
learn.show_results(learner=learn, max_n=2, trunc_at=500)

Unnamed: 0,text,category,target
0,"the trouble with the book, "" memoirs of a geisha "" is that it had japanese surfaces but underneath the surfaces it was all an american man's way of thinking. reading the book is like watching a magnificent ballet with great music, sets, and costumes yet performed by barnyard animals dressed in those costumesso far from japanese ways of thinking were the characters. < br / > < br / > the movie isn't about japan or real geisha. it is a story about a few american men's mistaken ideas about japan an",negative,negative
1,"< br / > < br / > i'm sure things didn't exactly go the same way in the real life of homer hickam as they did in the film adaptation of his book, rocket boys, but the movie "" october sky "" ( an anagram of the book's title ) is good enough to stand alone. i have not read hickam's memoirs, but i am still able to enjoy and understand their film adaptation. the film, directed by joe johnston and written by lewis colick, records the story of teenager homer hickam ( jake gyllenhaal ), beginning in oct",positive,positive


In [38]:
learn.blurr_predict([
    "I've been waiting for a HuggingFace course my whole life",
    "I hate this so much!"
])

[(('positive',), (#1) [tensor(1)], (#1) [tensor([0.0484, 0.9516])]),
 (('negative',), (#1) [tensor(0)], (#1) [tensor([9.9946e-01, 5.4418e-04])])]

## Modelle verwenden

In [39]:
!mkdir -p 'my_model'

In [40]:
learn.model.hf_model.save_pretrained('my_model')
hf_tokenizer.save_pretrained('my_model')

('my_model/tokenizer_config.json',
 'my_model/special_tokens_map.json',
 'my_model/vocab.txt',
 'my_model/added_tokens.json',
 'my_model/tokenizer.json')

In [41]:
hf_model is learn.model.hf_model

True

In [42]:
!ls -lsha 'my_model'

total 257M
4.0K drwxr-xr-x 2 root root 4.0K Jul 20 09:00 .
4.0K drwxr-xr-x 1 root root 4.0K Jul 20 09:00 ..
4.0K -rw-r--r-- 1 root root  734 Jul 20 09:00 config.json
256M -rw-r--r-- 1 root root 256M Jul 20 09:00 pytorch_model.bin
4.0K -rw-r--r-- 1 root root  112 Jul 20 09:00 special_tokens_map.json
4.0K -rw-r--r-- 1 root root  405 Jul 20 09:00 tokenizer_config.json
456K -rw-r--r-- 1 root root 456K Jul 20 09:00 tokenizer.json
228K -rw-r--r-- 1 root root 227K Jul 20 09:00 vocab.txt


In [43]:
hf_arch2, hf_config2, hf_tokenizer2, hf_model2 = BLURR.get_hf_objects('my_model', model_cls=AutoModelForSequenceClassification)

print(hf_arch)
print(type(hf_config))
print(type(hf_tokenizer))
print(type(hf_model))

distilbert
<class 'transformers.models.distilbert.configuration_distilbert.DistilBertConfig'>
<class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>
<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>
