In [2]:
# The overall pipeline
from transformers import pipeline

# The tokenizer
from transformers import AutoTokenizer
# Model architectures 
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification

import torch

# Model overview

In [62]:
# Example sentences reused as model input by later cells.
text = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "Great! I messed it up again!",
]

# Checkpoint used for the sentiment-analysis examples.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Alternative checkpoint, kept for reference:
# checkpoint = "distilbert-base-cased-distilled-squad"

## Running a model directly

In [74]:
# End to end: raw text in, one {'label', 'score'} dict per sentence out.
classifier = pipeline(task="sentiment-analysis", model=checkpoint)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455},
 {'label': 'POSITIVE', 'score': 0.9654763340950012}]

## Running a model by step

#### Tokenization
Preprocess the text the same way it was preprocessed when the model was pretrained: `from_pretrained` loads the tokenizer with the pretraining info from the original model.  
`raw_inputs` is the text; the output of the tokenizer is the input of the model. It contains the input IDs (the ID of each token/word for the checkpoint) as a PyTorch tensor, and the `attention_mask`.

In [75]:
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = text
# Pad to a common length, truncate over-long inputs, and return PyTorch
# tensors so the result can be fed straight to the model.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2307,   999,  1045, 18358,  2009,  2039,  2153,   999,   102,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


#### Modeling
The output of the base model is its hidden states, not the final label  

Dimensions of the output
- Batch size: The number of sequences processed at a time (3 in our example).
- Sequence length: The length of the numerical representation of the sequence (16 in our example).
- Hidden size: The vector dimension of each model input.

In [76]:
# AutoModel yields hidden states rather than class scores.
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
# `outputs` behaves like a named tuple; outputs.keys() lists its fields.
print(outputs.last_hidden_state.size())

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([3, 16, 768])


#### Redo the modeling with AutoModelForSequenceClassification
Now the output contains the logits, with shape [N_row, N_label]  
Use softmax to convert them to probabilities

In [77]:
# Same checkpoint, this time loaded with its sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
# One row of raw scores (logits) per input sentence.
print(outputs["logits"])

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464],
        [-1.6187,  1.7123]], grad_fn=<AddmmBackward0>)


In [78]:
# Normalize over the label axis (last dimension) so each row sums to 1.
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04],
        [3.4524e-02, 9.6548e-01]], grad_fn=<SoftmaxBackward0>)


In [79]:
# Get labels: the model config maps each class index to its label name
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


## Running a specific model

In [80]:
# Switch to a plain BERT checkpoint for this section; this overrides the
# sentiment-analysis checkpoint assigned earlier in the notebook.
checkpoint = "bert-base-cased"

#### The automatic way

In [81]:
from transformers import AutoConfig, AutoModel

# The Auto* classes resolve the concrete architecture from the checkpoint.
bert_model = AutoModel.from_pretrained(checkpoint)
print(type(bert_model))

bert_config = AutoConfig.from_pretrained(checkpoint)
print(type(bert_config))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.bert.modeling_bert.BertModel'>
<class 'transformers.models.bert.configuration_bert.BertConfig'>


#### The specific way - better for customization

In [82]:
from transformers import BertConfig, BertModel

# Option 1: build the architecture from a default config (random weights).
config = BertConfig()
model = BertModel(config=config)

# Option 2: load a pre-trained model; this replaces the random one above.
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [83]:
# Show the default BertConfig() created in the previous cell
# (not the configuration stored with the pretrained checkpoint).
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



## Save a model

In [84]:
# Save the current model into the current working directory ("./").
model.save_pretrained("./")

# Tokenizers

## Overview

In [85]:
from transformers import BertTokenizer  # AutoTokenizer would work as well

tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Without padding / return_tensors the tokenizer returns plain Python lists.
inputs = tokenizer(text)
print(inputs)

# Save the tokenizer files into the current working directory.
tokenizer.save_pretrained(save_directory="./")

{'input_ids': [[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102], [101, 146, 4819, 1142, 1177, 1277, 106, 102], [101, 2038, 106, 146, 20147, 1122, 1146, 1254, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json')

## Step by step

### The detailed way

In [63]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Step 1: split the raw text into (sub)word tokens.
tokens = tokenizer.tokenize(text[0])
print(tokens)

# Step 2: map every token to its vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# Step 3: add the sentence marks (special tokens) the model expects.
inputs = tokenizer.prepare_for_model(ids)
print(inputs["input_ids"])

# Decode both id lists: first without, then with the special tokens.
for id_list in (ids, inputs["input_ids"]):
    decoded_string = tokenizer.decode(id_list)
    print(decoded_string)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
i've been waiting for a huggingface course my whole life.
[CLS] i've been waiting for a huggingface course my whole life. [SEP]


### Put it all together

In [64]:
# Calling the tokenizer instance on the text gives the same result as the
# manual tokenize -> convert_tokens_to_ids -> prepare_for_model chain.
inputs = tokenizer(text[0])
print(inputs["input_ids"])

# Round-trip back to text; the special tokens show up in the decoded string.
decoded_string = tokenizer.decode(token_ids=inputs["input_ids"])
print(decoded_string)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[CLS] i've been waiting for a huggingface course my whole life. [SEP]


## Process multiple sequences

In [7]:
# Pin the checkpoint explicitly instead of relying on whatever value is left
# in the kernel: an earlier section reassigns `checkpoint` to
# "bert-base-cased", which has no fine-tuned classification head. The cells
# below need the SST-2 sentiment model to give meaningful logits.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
print(checkpoint)

# Tokenizer and classification model must come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Id the tokenizer uses for padding; needed for the manual padding demo.
print(tokenizer.pad_token_id)

distilbert-base-uncased-finetuned-sst-2-english
0


In [20]:
# Tokenize one sentence by hand: raw text -> tokens -> vocabulary ids.
# Unlike calling tokenizer(sequence), this path adds no special tokens.
sequence = "I've been waiting for a HuggingFace course my whole life."
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

### The input of a model has to be a list of sequences

In [19]:
# It works when passing a list of sequences (of 1 element): the tensor has
# a leading batch dimension.
input_ids = torch.tensor([ids])
print(f'Input IDs: {input_ids};\nLogits: {model(input_ids).logits}\n')

# It fails when passing a bare sequence (1-D tensor, no batch dimension).
input_ids = torch.tensor(ids)
print(input_ids)
try:
    print(model(input_ids))
except (RuntimeError, IndexError, ValueError) as err:
    # The failure is the point of this demo: show the error message but do
    # not let it abort "Restart Kernel -> Run All".
    print(f"{type(err).__name__}: {err}")

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]]);
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)

tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


RuntimeError: The size of tensor a (14) must match the size of tensor b (512) at non-singleton dimension 1

In [28]:
# Batch the same sequence twice; both rows should get identical logits.
batched_ids = [ids] * 2
inputs = torch.tensor(batched_ids)
model(input_ids=inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Padding
Pad the shorter sequences so that they all have the same length as the longest one

In [29]:
# Ask the tokenizer for its padding id rather than hard-coding a value.
padding_id = tokenizer.pad_token_id  # What padding id to use when padding
print(padding_id)

0


In [34]:
seq1_ids = [200, 200, 200]
seq2_ids = [200, 200]
# Pad the shorter sequence by hand so the batch is rectangular.
batched_ids = [seq1_ids, seq2_ids + [padding_id]]

# Each sequence on its own first, then both together as one padded batch.
for id_rows in ([seq1_ids], [seq2_ids], batched_ids):
    print(model(torch.tensor(id_rows)).logits)
print("The results are not the same after batching!!!")

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)
The results are not the same after batching!!!


### Attention masks

In [41]:
# 1 marks a real token, 0 marks the padding position to be ignored.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]
print(f'batched_ids: {batched_ids};\nattention_mask: {attention_mask}')

outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)
print("Now the logits are the same")

batched_ids: [[200, 200, 200], [200, 200, 999]];
attention_mask: [[1, 1, 1], [1, 1, 0]]
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
Now the logits are the same


### Do the above automatically

In [60]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
model_inputs = tokenizer(sequences)
print(f"No padding: {model_inputs}\n")

# Padding strategies, tried in order:
#   "longest"                    - pad up to the longest sequence in the batch
#   "max_length"                 - pad up to the model's default maximum
#   "max_length" + max_length=8  - pad up to an explicitly defined maximum
for padding_type, length_kwargs in (
    ("longest", {}),
    ("max_length", {}),
    ("max_length", {"max_length": 8}),
):
    model_inputs = tokenizer(sequences, padding=padding_type, **length_kwargs)
    print(f"{padding_type} padding: {model_inputs}\n")

# Truncate: the limit defaults to the model's max_length, but it can be redefined.
model_inputs = tokenizer(sequences, padding=padding_type, truncation=True, max_length=8)
print(f"Truncated: {model_inputs}\n")

# Return PyTorch tensors instead of Python lists.
model_inputs = tokenizer(sequences, padding=padding_type, max_length=16, return_tensors="pt")
print(f"max_length padding, max_length = 16, return a PT tensor: \n{model_inputs}\n")

No padding: {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

longest padding: {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

max_length padding: {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Wrapping up: From tokenizer to model

In [73]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# `sequences` is the two-sentence list defined in the padding demo cell.
# One tokenizer call pads, truncates and converts to tensors.
inputs = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True)
output = model(**inputs)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
