<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/ufidon/nlp/blob/main/hftrans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/ufidon/nlp/blob/main/hftrans.ipynb"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/></a>
  </td>
</table>
<br>


- 📝 Using transformers

In [None]:
# Install Huggingface core libraries
!pip install tokenizers transformers datasets accelerate

from transformers import pipeline

In [None]:
# 1.1 Behind the pipeline
# Tokenizer → Model → Post Processing

# a) Tokenizer
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization: pad the two sentences to a common length and return PyTorch tensors
raw_inputs = [
    "What did the grape say when it got stepped on?",
    "Nothing, it just let out a little wine!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

# b) Model
# AutoModel loads the base transformer only (no task head): its output exposes
# hidden states, not logits.
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

# Inference
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

# b.1) Another architecture
# AutoModelForSequenceClassification adds the classification head that produces logits.
from transformers import AutoModelForSequenceClassification

checkpoint2 = "distilbert-base-uncased-finetuned-sst-2-english"
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint2)
# Run the classification model (model2), not the headless `model`,
# otherwise the output has no `.logits` attribute.
outputs2 = model2(**inputs)
print(outputs2.logits.shape)

# c) Postprocessing
# Only the classification output carries logits; `outputs` (from the base model) does not.
print(outputs2.logits)


# c.1) logits ⧐ probabilities
import torch

# Softmax over the last dimension turns each row of logits into class probabilities
predictions = torch.nn.functional.softmax(outputs2.logits, dim=-1)
print(predictions)

# Mapping from class index to label name for the classification model
model2.config.id2label

In [None]:
# 1.2 Models
# a) Creating a Transformer
from transformers import BertConfig, BertModel

# Building the default config (no arguments: library default hyperparameters)
config = BertConfig()

# Building the model from the default config.
# No pretrained weights are loaded here: the model is randomly initialized!
# Training such a model from scratch is resource-consuming.
# NOTE: this rebinds the notebook-wide name `model`.
model = BertModel(config)

# Show the hyperparameters stored in the config
print(config)

In [None]:
# b) Reusing models that have already been trained saves resources.
# Any model on the Model Hub compatible with the BERT architecture can be used;
#   check here: https://huggingface.co/models?other=bert
# The weights are downloaded once and cached locally.
# NOTE(review): the default cache location depends on the library version
#   (`~/.cache/huggingface/transformers` in older releases, `~/.cache/huggingface/hub`
#   in more recent ones) — confirm for the installed version.
# The cache folder can be set by the HF_HOME environment variable.
# NOTE: this rebinds `model`, replacing any model created in earlier cells.
model = BertModel.from_pretrained("bert-base-cased")

In [None]:
# c) Saving models
# Write the model currently bound to `model` into the local directory "models_dir",
# then list that directory to see what was written.
model.save_pretrained("models_dir")
!ls models_dir 
# Two files are expected:
#   config.json        - the model's architecture
#   pytorch_model.bin  - the model's parameters
# NOTE(review): recent transformers releases may write `model.safetensors` instead of
#   `pytorch_model.bin` — confirm against the `ls` output above.

In [None]:
# 1.3 Tokenizers
# a) Word-based tokenizers split on spaces and punctuation;
# each word is identified by its index in the vocabulary.
# NOTE: str.split() below splits on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. "on?").
tokenized_text = "What did the grape say when it got stepped on?".split()
print(tokenized_text)

# b) Character-based tokenizers are too granular while word-based tokenizers are too coarse.
# The trade-off is
# c) Subword tokenization

# d) Load, use, and save a tokenizer
# Loading the BERT tokenizer trained with the same checkpoint as BERT
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Load with AutoTokenizer, similar to AutoModel
# (this rebinds `tokenizer`, replacing the BertTokenizer instance created just above)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenization: calling the tokenizer on raw text; the result is the cell's displayed output
text = "Nothing, it just let out a little wine!"
tokenizer(text)

In [None]:
# Tokenization step by step
# Step 1: text → tokens
# Uses `tokenizer` and `text` defined in the previous cell.
tokens = tokenizer.tokenize(text)
print(tokens)

In [None]:
# Step 2: tokens → numerical IDs
# Uses `tokens` produced by the previous cell.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

In [None]:
# Save a tokenizer into the local directory "tokenizer_dir"
# (same save_pretrained call as used for the model)
tokenizer.save_pretrained("tokenizer_dir")

In [None]:
# e) Decoding: numerical IDs → text
# Uses `ids` produced by the "tokens → numerical IDs" cell.
decoded_string = tokenizer.decode(ids)
print(decoded_string)

In [None]:
# 1.4 Handling multiple sequences: the model is fed a batch of sequences
# Padding with the `padding token` is required to make all sentences in a batch have the same length

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE: rebinds the notebook-wide `tokenizer` and `model` to the sentiment checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "What did the grape say when it got stepped on?"

# Manual two-step encoding: text → tokens → IDs
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Wrap the ID list in another list so the tensor represents a batch containing one sequence
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

In [None]:
# Padding tokens must be ignored using an attention mask (indicator)
# 1 - pay attention, 0 - ignore
# Two sequences of token ID 200; the second is one token shorter and padded with the pad token.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Same layout as batched_ids: the 0 marks the padded position in the second row.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

# Longer sequences can either be truncated to fit length-limited models
# or fed to models that support longer sequences, such as
# [longformer](https://huggingface.co/docs/transformers/model_doc/longformer)

In [None]:
# 1.5 A complete example
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One sentence: call the tokenizer directly on the raw string.
# `sequence` is reused by the "Special tokens" cell further down.
sequence = "What did the grape say when it got stepped on?"
model_inputs = tokenizer(sequence)

In [None]:
# Multiple sentences: the tokenizer also accepts a list of strings.
# `sequences` is reused by the padding / truncation / tensor cells below.
sequences = ["What did the grape say when it got stepped on?", "Nothing, it just let out a little wine!"]
model_inputs = tokenizer(sequences)

In [None]:
# Padding
# Each call below overwrites `model_inputs`; only the last result is kept.

# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
# NOTE(review): truncation is not enabled here, so sequences already longer than
# max_length are presumably left unshortened — confirm by inspecting the lengths.
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [None]:
# Truncating
# Each call below overwrites `model_inputs`; only the last result is kept.

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [None]:
# Conversion to specific framework tensors
# Each call below overwrites `model_inputs`; only the last result (NumPy) is kept.

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
# NOTE(review): this presumably requires TensorFlow in the kernel, which the install
# cell at the top of the notebook does not list — confirm it is available.
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [None]:
# Special tokens
# Compare two encodings of the same `sequence`:
# 1) calling the tokenizer directly
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

# 2) the manual two-step path (tokenize, then convert tokens to IDs)
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# Find the special tokens: decode both ID lists back to text and compare the printed strings
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

In [None]:
# From tokenizer to model
# AutoTokenizer and AutoModelForSequenceClassification are imported in earlier cells.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["What did the grape say when it got stepped on?", "Nothing, it just let out a little wine!"]

# NOTE: `tokens` here holds the tokenizer's full output (padded, truncated, PyTorch tensors),
# not a list of token strings as in earlier cells.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# Unpack the tokenizer output as keyword arguments to the model
output = model(**tokens)