### L0: Using Pipeline

In [None]:
import torch
import torch.nn.functional as F

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline. No model is passed, so the checkpoint
# is whatever the library selects by default for this task.
classifier = pipeline("sentiment-analysis")
# Bare last expression: the notebook displays the returned prediction.
classifier("I think I'm in love")

### L1: Using Tokenizer

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer and the models loaded further down.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Two example inputs of different length, so padding has something to do.
sentences = ["I think I am in love", "I am annoyed"]
# Tokenize both sentences in one call with padding and truncation enabled,
# returning PyTorch tensors ('pt') that can be unpacked into the model call.
tokenized = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [None]:
# Inspect the tokenizer output that is later unpacked into the model call.
print(tokenized)

In [None]:
# --- AutoModel - base model only (no task-specific head); exposes hidden states
from transformers import AutoModel

model = AutoModel.from_pretrained(checkpoint)
# Unpack the tokenizer output into keyword arguments of the forward pass.
outputs = model(**tokenized)
# Show the shape of the final hidden states.
print(outputs.last_hidden_state.shape)

In [None]:
# --- AutoModelFor - Final Layer for specific task
from transformers import AutoModelForSequenceClassification

# Rebinds `model`: from here on it is the sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Same tokenized batch as before; the result exposes `.logits`, read below.
outputs = model(**tokenized)

In [None]:
# Turn the logits into probabilities along the last dimension.
predictions = torch.softmax(outputs.logits, dim=-1)

In [None]:
# Sanity check: sum the probabilities of each row (expected to be ~1 per row).
predictions.sum(dim=1)

In [None]:
# Index of the highest-probability class for each input.
predicted_labels = predictions.argmax(dim=-1)
# Translate the class indices into label names through the model config.
predicted_labels_names = [
    model.config.id2label[label_id] for label_id in predicted_labels.tolist()
]

In [None]:
# Dump the model configuration (it holds the `id2label` mapping used above).
print(model.config)

### L2: Using Tokenizer.tokenize()

In [None]:
# Manual path, one sentence at a time: split into tokens, map them to ids,
# then add a leading batch dimension. This cell neither inserts special
# tokens nor builds an attention mask.
tokens = tokenizer.tokenize(sentences[0])
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids).unsqueeze(0)
outputs = model(input_ids=input_ids)

In [None]:
# Probabilities for the manually tokenized sentence.
pred = torch.softmax(outputs.logits, dim=-1)

In [None]:
# Compare the manual single-sentence result (`pred`) with the result obtained
# from the batched tokenizer() call (`predictions`).
print(pred)
print(predictions)

Notes:
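- Even though the processed sentence is the same, the model inputs are not: the padding applied differs, and `tokenizer.tokenize()` + `convert_tokens_to_ids()` does not add the `[CLS]`/`[SEP]` special tokens that calling `tokenizer(...)` adds. Theoretically, padding alone should not change the output, because the attention mask should remove the padding's influence; however, some models have *positional embeddings* which interact with padding, even if masked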
- The solution to get consistent results is to always use the same padding strategy, pass the attention mask, and call `model.eval()` to disable dropout
- Padding
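    - `padding="max_length"`: pad every sequence to `max_length` (or to the model's maximum length when `max_length` is not given)
    - `padding="longest"` (same as `padding=True`): pad to the longest sequence in the batch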

### L3: Getting consistent tokenizer results by applying similar padding strategy

- https://huggingface.co/learn/llm-course/chapter2/6

In [None]:
# -- using tokenizer()
# padding="max_length" is requested without an explicit max_length argument.
# NOTE(review): presumably this pads up to the tokenizer's model maximum
# length — confirm against the tokenizer documentation.
tokenized1 = tokenizer(sentences[0], padding="max_length", truncation=True, return_tensors='pt')
# Forward pass with every field the tokenizer produced.
outputs1 = model(**tokenized1)

In [None]:
def get_attention_mask(word: str, max_length: int) -> torch.Tensor:
    """Build a 1-D attention mask with exactly ``max_length`` entries.

    The first ``len(word)`` positions are 1 (real items) and the remaining
    positions are 0 (padding). When ``word`` is longer than ``max_length``
    the mask is all ones.

    Args:
        word: Any sized sequence; only its length is used, so a string or a
            list of tokens both work.
        max_length: Total number of entries in the returned mask.

    Returns:
        An integer (``torch.long``) tensor holding the mask. The dtype is the
        same for every input length.
    """
    # Clamp so over-long inputs fill the whole mask instead of overflowing it.
    real_count = min(len(word), max_length)
    mask = [1] * real_count + [0] * (max_length - real_count)
    # torch.tensor with an explicit dtype keeps both the clamped and the
    # padded case on the same integer type (torch.Tensor(...) would give float32).
    return torch.tensor(mask, dtype=torch.long)

In [None]:
# Display the maximum number of positions declared by the model config; the
# manual tokenization cell below uses it as the padding target.
model.config.max_position_embeddings

In [None]:
# -- using tokenizer.tokenize() with manual special tokens, padding and mask
max_length = model.config.max_position_embeddings
tokens = tokenizer.tokenize(sentences[0])
# Always keep two positions free for [CLS] and [SEP]. Truncating only when
# len(tokens) > max_length would let inputs of max_length - 1 or max_length
# tokens overflow, which makes padding_length below negative.
tokens = tokens[:max_length - 2]
tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]

ids = tokenizer.convert_tokens_to_ids(tokens)
# Right-pad with the pad token id up to max_length.
padding_length = max_length - len(ids)
ids = ids + [tokenizer.pad_token_id] * padding_length

# Wrap in an outer list to add the batch dimension.
input_ids = torch.tensor([ids])
# 1 for real tokens (special tokens included), 0 for padding positions.
attention_mask = torch.tensor([[1] * len(tokens) + [0] * padding_length])
# Keyword arguments so the call does not depend on the positional order of
# the model's forward() parameters.
outputs2 = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Compare three lengths: the token list, the id list, and the number of
# characters in the raw sentence.
print(len(tokens), len(ids), len(sentences[0]))

In [None]:
# -- using encode_plus()
# The tokenizer adds the special tokens, pads to 512 positions and truncates.
# NOTE(review): 512 is hard-coded here, while the manual tokenization cell
# reads model.config.max_position_embeddings — confirm both agree.
# NOTE(review): the Transformers docs mark encode_plus as deprecated in
# favour of calling the tokenizer directly; kept here for the comparison.
encoded = tokenizer.encode_plus(
    text=sentences[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=512,
    add_special_tokens=True,
)

# Pull out the two tensors the model call needs.
input_ids, attention_mask = encoded['input_ids'], encoded['attention_mask']

outputs3 = model(attention_mask=attention_mask, input_ids=input_ids)

In [None]:
# Print the results of the three preprocessing routes (tokenizer(), manual
# tokenize + padding, encode_plus) so they can be compared.
print(outputs1)
print(outputs2)
print(outputs3)