# Transformers Basics


# Tokenizers
## AutoTokenizer

In [1]:
from transformers import AutoTokenizer

In [2]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
raw_inputs = [
    "Throwing a disc is one of the true pleasures in life",
    "Losing my disc in the woods is frustrating",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="tf")
inputs




{'input_ids': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[  101,  6886,  1037,  5860,  2003,  2028,  1997,  1996,  2995,
        26552,  1999,  2166,   102],
       [  101,  3974,  2026,  5860,  1999,  1996,  5249,  2003, 25198,
          102,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])>}

## Tokenizer without AutoTokenizer
If you want to have more control over the tokenizer you can load a specific one

In [41]:
from transformers import BertTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# this does the same thing
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [42]:
tokenizer("Using a Transformer network is simple")==bert_tokenizer("Using a Transformer network is simple")

True

In [43]:
bert_tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [44]:
# saving local tokenizer
folder = "bert-base-cased-tokenizer"
bert_tokenizer.save_pretrained(folder)

('bert-base-cased-tokenizer\\tokenizer_config.json',
 'bert-base-cased-tokenizer\\special_tokens_map.json',
 'bert-base-cased-tokenizer\\vocab.txt',
 'bert-base-cased-tokenizer\\added_tokens.json')

## Tokenizer Steps
Calling `tokenizer.tokenize()` takes several discrete steps

In [49]:
sequence = "Using a Transformer network is simple!"
tokens = tokenizer.tokenize(sequence)
tokens

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple', '!']

In [50]:
# now look at it in steps

# 1 converts tokens to the ids of the vocabular
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7993, 170, 13809, 23763, 2443, 1110, 3014, 106]

In [52]:
# 2 Add special tokens
model_inputs = tokenizer.prepare_for_model(ids)
model_inputs

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [54]:
# 3 Decode ids (done with model outputs to convert back into words)
decoded_string = tokenizer.decode(model_inputs["input_ids"])
print(decoded_string)

[CLS] Using a Transformer network is simple! [SEP]


## Padding 
When batching multiple sequences of different lengths, the shorter ones must be padded with null characters to make it the same length as the longest one.

Note: if you pass the padded short sequnce to a predictive model, it will not produce the same output as an unpadded one. BUT this is why we have the attention mask, which tells the model to ignore the pad tokens

In [55]:
tokenizer.pad_token_id

0

In [56]:
bert_tokenizer.pad_token_id

0

In [65]:
sequences = [
    "This is a long sequence, as you can see.", 
    "This is short."
]

model_inputs = tokenizer(sequences, padding=True)
print(model_inputs['attention_mask'])


[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]


In [66]:
# pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")
print(model_inputs['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]


In [67]:
# pad the sequences up to the model max length(512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print(model_inputs['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [68]:
# pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print(model_inputs['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0]]


### Truncate Sequences


In [69]:
# truncate the sequences that are longer than the model max length (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)
print(model_inputs['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]


In [70]:
# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)
print(model_inputs['attention_mask'])

[[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]


# AutoModel
This allows us to download a pretrained model with weights EXCLUDING the head using the `from_pretrained()` method. This model accepts the tokenized inputs and outputs features (ie  embeddings) for each input token.

In [4]:
from transformers import TFAutoModel

In [5]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModel.from_pretrained(checkpoint)
model




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel at 0x21e9d0a23b0>

In [6]:
model.summary()

Model: "tf_distil_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
Total params: 66362880 (253.15 MB)
Trainable params: 66362880 (253.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# run tokenized input through the model to get output
outputs = model(inputs)

In [8]:
# Shape is (batch_size, sequence_length, hidden size)
outputs.last_hidden_state.shape

TensorShape([2, 13, 768])

In [9]:
outputs[0].shape

TensorShape([2, 13, 768])

Similar to AutoModel, which returns the hidden states, there are AutoX for other tasks:
- AutoModelForCausalML
- AutoModelForMaskedLM
- AutoModelForMultipleChoice
- AutoModelForQuestionAnswering
- AutoModelForSequenceClassification
- AutoModelForTokenClassification
- etc

In [10]:
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np


checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [11]:
# predict outputs
outputs = model(inputs)
outputs.logits.shape   # shape=(batch_size, num_classes)

TensorShape([2, 2])

In [12]:
outputs.logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.1785216,  4.487308 ],
       [ 4.3378005, -3.5411556]], dtype=float32)>

In [13]:
# What do the logit positions represent?
label_dict = model.config.id2label
label_dict

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [14]:
predictions = tf.math.softmax(outputs.logits, axis=-1).numpy()
predictions

array([[1.7234674e-04, 9.9982762e-01],
       [9.9962151e-01, 3.7848490e-04]], dtype=float32)

In [15]:
predicted_labels = np.argmax(predictions, axis = -1)
predicted_labels

array([1, 0], dtype=int64)

In [16]:
for pred, raw_input in zip(predicted_labels, raw_inputs):
    print(label_dict[pred], raw_input)

POSITIVE Throwing a disc is one of the true pleasures in life
NEGATIVE Losing my disc in the woods is frustrating


# Models without AutoModel

This gives you more control over the specific model/checkpoint you want to use. Concepts:

A Model is made up of: 
- `Config` object that has the details/parameters needed to build the model and instantiate the `Model` class
- `Model` class (the architecture but no weights
- model file containing te weights


In [17]:
from transformers import BertConfig, TFBertModel

In [None]:
# METHOD 1: initialize untrained model based on config
# Building the config
config = BertConfig()

# Building the model from the config (THIS WILL HAVE RANDOM WEIGHTS BECAUSE IT"S NOT USING ANY PRETRAINED WEIGHTS
model = TFBertModel(config)

config

In [18]:
# METHOD 1: initialize model based with pretrained weights
model = TFBertModel.from_pretrained("bert-base-cased")
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108310272 (413.17 MB)
Trainable params: 108310272 (413.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Saving model locally to folder

In [20]:
folder = "bert-base-cased"
model.save_pretrained(folder)

In [27]:
# Run model
sequences = ["Hello!", "Cool.", "Nice!"]
encoded_sequences  = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf")
encoded_sequences

{'input_ids': <tf.Tensor: shape=(3, 4), dtype=int32, numpy=
array([[ 101, 7592,  999,  102],
       [ 101, 4658, 1012,  102],
       [ 101, 3835,  999,  102]])>, 'attention_mask': <tf.Tensor: shape=(3, 4), dtype=int32, numpy=
array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]])>}

In [28]:
model_inputs = tf.constant(encoded_sequences.input_ids.numpy())
model_inputs

<tf.Tensor: shape=(3, 4), dtype=int32, numpy=
array([[ 101, 7592,  999,  102],
       [ 101, 4658, 1012,  102],
       [ 101, 3835,  999,  102]])>

In [30]:
output = model(model_inputs)
output.last_hidden_state.shape

TensorShape([3, 4, 768])

In [71]:
## All together now
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf")
output = model(**tokens)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [72]:
output

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-1.5606984,  1.6122835],
       [-3.6183178,  3.9137492]], dtype=float32)>, hidden_states=None, attentions=None)