In [1]:
from transformers import pipeline

#by pipeline
classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HUggingFace course my whole life.",
        "I hate this so much!",
    ]
)

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9598046541213989},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [4]:
from transformers import AutoTokenizer

#tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [6]:
#pretrained model
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint) 

#unpack dictionary
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [10]:
#model with classification head
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)
print(outputs)

torch.Size([2, 2])
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
#post processing
import torch
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

#model config for output
model.config.id2label

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


{0: 'NEGATIVE', 1: 'POSITIVE'}

Creating a Transformer

In [19]:
from transformers import BertModel #can be AutoModel as well

#saving model
model = BertModel.from_pretrained("bert-base-cased")
model.save_pretrained("saved_model_1")

In [16]:
model.push_to_hub("testing_mode_1")

model.safetensors: 100%|██████████| 433M/433M [00:54<00:00, 8.01MB/s] 


CommitInfo(commit_url='https://huggingface.co/teohyc/testing_mode_1/commit/3e7eb440c3cb93a89a0ee9f63e41ed6675602450', commit_message='Upload model', commit_description='', oid='3e7eb440c3cb93a89a0ee9f63e41ed6675602450', pr_url=None, repo_url=RepoUrl('https://huggingface.co/teohyc/testing_mode_1', endpoint='https://huggingface.co', repo_type='model', repo_id='teohyc/testing_mode_1'), pr_revision=None, pr_num=None)

In [51]:
from transformers import AutoTokenizer

#encoding text
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Hello, I'm a single sentence!")
print(encoded_input)

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [52]:
#decode
tokenizer.decode(encoded_input["input_ids"])

"[CLS] Hello, I ' m a single sentence! [SEP]"

In [56]:
encoded_input = tokenizer("How are you?", "I'm fine, thank you!", return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,  146,  112,  182, 2503,  117, 6243,
         1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
#same length for both sentences, use padding
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"], padding=True, return_tensors="pt"
)
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [58]:
#truncation is for sentences too long near 512 tokens
encoded_input = tokenizer(
    "This is a very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.",
    truncation=True,
)
print(encoded_input["input_ids"])

[101, 1188, 1110, 170, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1263, 5650, 119, 102]


In [59]:
#combine padding and truncation
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"],
    padding=True,
    truncation=True,
    max_length=5,
    return_tensors="pt"
)

print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  102],
        [ 101,  146,  112,  182,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}


In [62]:
output = model(**encoded_input)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1500,  0.2100,  0.2770,  ..., -0.1971,  0.2418, -0.1118],
         [ 0.0130, -0.5233,  0.6800,  ..., -0.0204, -0.2299, -0.2385],
         [ 0.1876, -0.4915, -0.2659,  ...,  0.0801, -0.4456,  0.9273],
         [ 0.6587, -0.1734, -0.0356,  ..., -0.4110, -0.3439,  0.2987],
         [ 0.8629,  0.1814,  0.1749,  ...,  0.0693,  1.2299, -0.2707]],

        [[ 0.5111,  0.4098,  0.2642,  ..., -0.0678,  0.4584, -0.3087],
         [ 0.5361,  0.1227,  0.4718,  ...,  0.0496,  0.3040,  0.1918],
         [ 0.2898,  0.4114, -0.0288,  ..., -0.0173,  0.6955, -0.1688],
         [ 0.2900, -0.0814,  0.1336,  ..., -0.1883,  0.2689, -0.0132],
         [ 1.3207,  0.3622, -0.0432,  ...,  0.1485,  1.1995, -0.9229]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7487,  0.3865,  0.9999,  ...,  1.0000, -0.7804,  0.9881],
        [-0.6507,  0.4607,  0.9999,  ...,  1.0000, -0.8657,  0.9938]],
       grad_fn=<TanhBack