# HuggingFace Transformers

# I - Import tokenizers, create/import huggingface models

In this part we learn how to load tokenizers and models from the HF hub.
We then give an example of a simple inference pipeline: from a batch of text sentences to a batch of sequence embeddings.

In [51]:
# Import a tokenizer from a specific checkpoint
from transformers import AutoTokenizer

# `checkpoint` and `tokenizer` are reused by the following cells.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Convert text to model input: calling the tokenizer on a batch of sentences
# returns a dict-like object holding `input_ids` and `attention_mask`.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# padding=True pads the shorter sentence with the pad id (0 in the output below),
# return_tensors="pt" yields PyTorch tensors instead of Python lists.
tokenized_sentences = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(tokenized_sentences)

# Decode the token IDs of the first sentence back to a string.
# The decoded text is lowercased and wrapped in [CLS] ... [SEP] (see output below).
decoded = tokenizer.decode(tokenized_sentences["input_ids"][0])
print(decoded)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
[CLS] i've been waiting for a huggingface course my whole life. [SEP]


In [None]:
# Import a model from a specific checkpoint
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)
# The tokenizer output is unpacked as keyword arguments (input_ids, attention_mask).
outputs = model(**tokenized_sentences)
print(outputs.last_hidden_state.shape) # (B, T, hidden_size)

# You can import a model for a different task but from the same checkpoint.
# AutoModel is the bare model : sentence --> hidden states (an encoder for this DistilBERT checkpoint, not a decoder).
# AutoModelForSequenceClassification adds a classification head : sentence --> logits.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**tokenized_sentences)
print(outputs.logits)
print(outputs.logits.shape) # (B, num_labels)

torch.Size([2, 16, 768])
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)
torch.Size([2, 2])


In [None]:
# Creating a transformer
from transformers import BertConfig, BertModel

# Building the config (default BertConfig values, printed below)
config = BertConfig()
print(f"config: {config}")

# Building the model : two alternative ways are shown.
# NOTE: the second assignment overwrites the first, so only the pretrained model is kept in `model`.
model = BertModel(config) # from the config (architecture) : random weights
model = BertModel.from_pretrained("bert-base-cased") # from a checkpoint : pretrained weights
# model = AutoModel.from_pretrained("bert-base-cased") # from a checkpoint : pretrained weights (same as above)

model.save_pretrained("models/my_bert_model") # save the model (architecture + weights) to a directory

config: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
# Perform a forward pass :

# NOTE(review): if the cells run in file order, `tokenizer` comes from the
# DistilBERT SST-2 checkpoint while `model` comes from "bert-base-cased";
# these do not share a vocabulary -- confirm which pairing is intended.

# 1. We have some input data
sentences = [
    "Hello, I'm a single sentence",
    "And another sentence",
]

# 2. Tokenize the inputs : from a list of sentences to a matrix (thanks to padding) of token ids of shape (B, T_max)
tokenized_sentences = tokenizer(
    sentences,
    padding=True,  # ensure shorter sentences are padded to the length of the longest sentence
    truncation=True,  # truncate the sentences to the maximum length of the model
    return_tensors="pt",  # return PyTorch tensors instead of Python lists
)
print(tokenized_sentences.input_ids)
print(tokenized_sentences.input_ids.shape) # (B, T_max) with T_max = max length of the sentences of the batch

# 3. Make sure the input_ids are a PyTorch tensor. This step is actually not necessary here as the
#    tokenizer already returned tensors. `torch.as_tensor` is used instead of `torch.tensor` because
#    re-wrapping an existing tensor with `torch.tensor(...)` copies it and emits a UserWarning;
#    `as_tensor` accepts both nested lists and tensors.
import torch  # kept in this cell: the following cells rely on `torch` being imported
model_inputs = torch.as_tensor(tokenized_sentences.input_ids)
print(model_inputs.shape) # (B, T_max)

# 4. Perform a forward pass of the model.
#    The attention mask is passed along so that padding tokens do not influence the real tokens.
outputs = model(model_inputs, attention_mask=tokenized_sentences.attention_mask)
print(outputs.last_hidden_state.shape) # (B, T_max, hidden_size)

tensor([[ 101, 7592, 1010, 1045, 1005, 1049, 1037, 2309, 6251,  102],
        [ 101, 1998, 2178, 6251,  102,    0,    0,    0,    0,    0]])
torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 10, 768])


  model_inputs = torch.tensor(inputs.input_ids) # convert the input_ids to a tensor


The call x --> tokenizer(x, padding=True, truncation=True, return_tensors='pt') is a very powerful method, because:
- it can handle non-batched inputs (a single string, treated as a batch of size 1)
- it pads and truncates the inputs so that they can be processed by the model
- it can return the inputs as PyTorch tensors directly

At a slightly lower level, the following code is generally used:

In [56]:
sentences = [
    "Hello, I'm a single sentence",
    "And another sentence",
]
print(f"Original sentences : {sentences}")

# Sentences to tokens
# Pitfall: passing the whole list at once yields a single flat list of tokens,
# so the sentence boundaries are lost (see the first "Tokens Object" output).
tokens = tokenizer.tokenize(sentences)
print(f"Tokens Object (wrong concatenating way) : {tokens}")

# Tokenizing sentence by sentence keeps one list of tokens per sentence.
tokens = list(map(tokenizer.tokenize, sentences))
print(f"Tokens Object (correct batching way) : {tokens}")

# Tokens to IDs, again one list per sentence
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))
print(f"IDs of tokens: list of list of lens : {[len(i) for i in ids]} with each list containing types : {type(ids[0][0])}")

# Padding and truncating
padding_id = tokenizer.pad_token_id  # id used to fill the shorter sentences
max_context_length = model.config.max_position_embeddings  # longest sequence the model accepts
print(f"Padding ID : {padding_id}")
print(f"Max Context Length : {max_context_length}")
T_max = max(map(len, ids))  # length of the longest sentence of the batch
# Right-pad every sentence up to T_max ...
ids_padded = [seq + (T_max - len(seq)) * [padding_id] for seq in ids]
# ... then cut anything longer than what the model can handle.
ids_padded_truncated = [seq[:max_context_length] for seq in ids_padded]
print(f"IDs of tokens padded and truncated : {ids_padded_truncated}")
inputs_model = torch.tensor(ids_padded_truncated)
print(f"Inputs model shape: {inputs_model.shape}") # (B, min(T_max, max_context_length))

# Getting attention masks : 1.0 where there is a real token, 0.0 where there is padding
attention_mask = inputs_model.ne(padding_id).float()
print(f"Attention mask : {attention_mask}")

# Infer hidden states
outputs = model(inputs_model, attention_mask=attention_mask)
print(f"Last hidden states shape: {outputs.last_hidden_state.shape}") # (B, T, hidden_size)
    

Original sentences : ["Hello, I'm a single sentence", 'And another sentence']
Tokens Object (wrong concatenating way) : ['hello', ',', 'i', "'", 'm', 'a', 'single', 'sentence', 'and', 'another', 'sentence']
Tokens Object (correct batching way) : [['hello', ',', 'i', "'", 'm', 'a', 'single', 'sentence'], ['and', 'another', 'sentence']]
IDs of tokens: list of list of lens : [8, 3] with each list containing types : <class 'int'>
Padding ID : 0
Max Context Length : 512
IDs of tokens padded and truncated : [[7592, 1010, 1045, 1005, 1049, 1037, 2309, 6251], [1998, 2178, 6251, 0, 0, 0, 0, 0]]
Inputs model shape: torch.Size([2, 8])
Attention mask : tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 0., 0., 0., 0., 0.]])
Last hidden states shape: torch.Size([2, 8, 768])


## The attention mask

The attention mask can be accessed through the returned dictionary of the tokenizer : ```tokenizer(sentences).attention_mask```. 

It is a tensor of 0s and 1s: 1s mark the real tokens, which the other tokens of the same sentence are allowed to attend to, and 0s mark the tokens that must be ignored (padding).

For example, at inference time without attention masking, ["hello", "world"] and ["hello", "world", "[PAD]"] would get different embeddings, because the padding token would influence the attention of the other tokens. Masking also has a role to play in training: autoregressive models additionally use a causal mask to hide the future tokens.

## Decoding tokens IDs

This is the procedure to decode the token IDs back to strings :

In [69]:
# Decode the first sentence. We can see capital letters are removed.
# `inputs_model` is the manually padded (B, T) tensor of token ids built in the previous cell.
tokens_ids_sentence_1 = inputs_model[0, :]
decoded_sentence_1 = tokenizer.decode(tokens_ids_sentence_1)
print(f"Decoded sentence 1 : {decoded_sentence_1}")

# Concerning sentence 2, we can see that padding tokens appear in the decoded text.
tokens_ids_sentence_2 = inputs_model[1, :]
decoded_sentence_2 = tokenizer.decode(tokens_ids_sentence_2)
print(f"Decoded sentence 2 : {decoded_sentence_2}")

# You can remove them using "skip_special_tokens=True"
decoded_sentence_2 = tokenizer.decode(tokens_ids_sentence_2, skip_special_tokens=True)
print(f"Decoded sentence 2 without special tokens : {decoded_sentence_2}")

# If you use tokenizer(), because this high-level method prepares the input for the model,
# it adds the [CLS] (beginning of the sentence) and [SEP] (end of the sentence) tokens.
# Note: this rebinds `ids`, which held the manually built lists of ids in the previous cell.
ids = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")["input_ids"]
sentences_decoded = tokenizer.decode(ids[0])
print(f"Decoded sentence 1 : {sentences_decoded}")

Decoded sentence 1 : hello, i'm a single sentence
Decoded sentence 2 : and another sentence [PAD] [PAD] [PAD] [PAD] [PAD]
Decoded sentence 2 without special tokens : and another sentence
Decoded sentence 1 : [CLS] hello, i'm a single sentence [SEP]


# Wrapping up

In [None]:
# End-to-end recap: load tokenizer + classification model from the same checkpoint,
# tokenize a batch of sentences and print the classification logits.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Padded / truncated batch returned as PyTorch tensors (input_ids + attention_mask, see output below)
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(tokens)
# Unpack the tokenizer output as keyword arguments of the model
output = model(**tokens)
print(output.logits)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)


# Training

In [7]:
# Training and training args

# Fine-tuning with Adapters

In [None]:
# Load adapters from the hub on top of a model

from adapters import AutoAdapterModel

def count_parameters(model):
    """Print the number of trainable and frozen parameters of ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set;
    every other parameter is reported as not trainable. Sizes are measured
    in scalar elements (``numel``). Nothing is returned.
    """
    trainable_total = 0
    frozen_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
        else:
            frozen_total += param.numel()
    print(f"Number of trainable parameters : {trainable_total}")
    print(f"Number of not trainable parameters : {frozen_total}")

# 1. Load any model
model = AutoAdapterModel.from_pretrained("FacebookAI/roberta-base")
count_parameters(model)

# 2. Load an adapter to the model (this will add the adapter to the model)
adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-imdb")
count_parameters(model) # the total grows: the adapter parameters were added (see recorded output)

# 3. Set the adapter as active
model.active_adapters = adapter_name # for inference
count_parameters(model) # counts unchanged in the recorded output: activating alone freezes nothing
model.train_adapter(adapter_name) # for training
count_parameters(model) # most parameters are now frozen (presumably everything except the adapter and the head -- TODO confirm)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters : 125288025
Number of not trainable parameters : 0


Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5856.60it/s]

Number of trainable parameters : 126774683
Number of not trainable parameters : 0
None
Number of trainable parameters : 126774683
Number of not trainable parameters : 0
Number of trainable parameters : 2129051
Number of not trainable parameters : 124645632





In [None]:
# Initialize a LoRA adapter from scratch with PEFT
from peft import LoraConfig, get_peft_model

# LoraConfig only accepts LoRA hyperparameters; the previous `meta_learn_adapter*`
# keyword arguments are not fields of LoraConfig and made the constructor raise a TypeError.
peft_config = LoraConfig(
    r=8,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor applied to the LoRA update
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    # assumes `model` is the RoBERTa model loaded above, whose attention
    # projections are named "query" / "value" -- adjust for other architectures
    target_modules=["query", "value"],
    bias="none",  # do not train any bias term
)
# Wrap the model: the base weights are frozen and only the LoRA weights stay trainable.
model = get_peft_model(model, peft_config)

In [14]:
from adapters import list_adapters

# List the adapters that are available for download.
# source can be "ah" (AdapterHub), "hf" (hf.co) or None (for both, default)
adapter_infos = list_adapters()
# Bare expression: the notebook displays the list of AdapterInfo entries as the cell output.
adapter_infos

[AdapterInfo(source='hf', adapter_id='BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny', model_name=None, task=None, subtask=None, username='BramVanroy', adapter_config=None, sha1_checksum='0b6b07e68973136e4c29c2f0ee120546b3cb6c60'),
 AdapterInfo(source='hf', adapter_id='TheBloke/Llama-2-13B-Chat-Dutch-GPTQ', model_name=None, task=None, subtask=None, username='TheBloke', adapter_config=None, sha1_checksum='5f0b1b031f6b70b5c670b882d442f218c69bcbd6'),
 AdapterInfo(source='hf', adapter_id='TheBloke/Llama-2-13B-Chat-Dutch-GGUF', model_name=None, task=None, subtask=None, username='TheBloke', adapter_config=None, sha1_checksum='615b4ff967d510388e41a23f69d26ed0d5a6671c'),
 AdapterInfo(source='hf', adapter_id='TheBloke/Llama-2-13B-Chat-Dutch-AWQ', model_name=None, task=None, subtask=None, username='TheBloke', adapter_config=None, sha1_checksum='ddf1068fb40a4299e1823a9fbd85a8f0f07c044e'),
 AdapterInfo(source='hf', adapter_id='JakeTurner616/Adonalsium-Mistral-Adapters', model_name=None, task=None, s