Inspecting the transformers pipeline:
- from sentence to tokens to ids (tokenizer)
- from ids to logits (model)
- from logits to probabilities (softmax)

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

# Load the tokenizer and the sentiment classifier from the same checkpoint so that
# the vocabulary ids produced by the tokenizer match what the model was trained on.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# NOTE(review): `encode` works on a single input, not on a batch. Passing a list of
# two strings does NOT produce two rows: the recorded output is one row with a
# separator id (102) in the middle, i.e. the two strings were joined into a single
# sequence pair. If two independent sentences were intended, call the tokenizer
# object directly (tokenizer([...], padding=True, return_tensors='pt')) instead.
ids= tokenizer.encode(["I love you", 'i am hungry'], return_tensors='pt' ,padding=True, truncation=True)
ids

tensor([[ 101, 1045, 2293, 2017,  102, 1045, 2572, 7501,  102]])

In [4]:
# Forward pass: feed the token ids to the classifier (no attention mask is supplied here).
answer = model(input_ids=ids)

In [5]:
# Raw, unnormalized class scores (the first field of the model output).
answer.logits

tensor([[-1.5156,  1.6027]], grad_fn=<AddmmBackward0>)

In [6]:
import torch

# Normalize the logits over the last (class) dimension so each row sums to 1.
torch.softmax(answer.logits, dim=-1)

tensor([[0.0424, 0.9576]], grad_fn=<SoftmaxBackward0>)

In [7]:
# Mapping from class index to human-readable label; used to interpret the
# positions of the probabilities computed above.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [8]:
# Two-step route: split the sentence into vocabulary tokens, then look up their ids.
# Note that the resulting id list has no leading 101 / trailing 102 special ids,
# unlike the output of `tokenizer.encode` on the same sentence.
tokens = tokenizer.tokenize("I love you omg man really i mean im tomas")
tokenizer.convert_tokens_to_ids(tokens)

[1045, 2293, 2017, 18168, 2290, 2158, 2428, 1045, 2812, 10047, 12675]

In [9]:
# One-step route: `encode` yields the same ids as tokenize + convert_tokens_to_ids,
# wrapped in the special ids 101 (start) and 102 (end).
tokenizer.encode("I love you omg man really i mean im tomas")

[101, 1045, 2293, 2017, 18168, 2290, 2158, 2428, 1045, 2812, 10047, 12675, 102]

In [10]:
# Inverse direction: map ids back to text. The result comes back lowercased
# ("I" -> "i"), consistent with the checkpoint being an uncased model.
tokenizer.decode([1045, 2293, 2017, 18168, 2290, 2158, 2428, 1045, 2812, 10047, 12675])

'i love you omg man really i mean im tomas'

In [11]:
# Batch-encode two sentences by calling the tokenizer directly. The shorter
# sentence is padded to the length of the longest one, and an attention mask
# marks real tokens (1) versus padding (0).
ids = tokenizer(
    ["I love you omg man really i mean im tomas", 'hello'],
    padding="longest",
    return_tensors="pt",
)
ids

{'input_ids': tensor([[  101,  1045,  2293,  2017, 18168,  2290,  2158,  2428,  1045,  2812,
         10047, 12675,   102],
        [  101,  7592,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

`truncation=True`: truncates token sequences that are longer than the model's maximum input length or a user-specified `max_length`.

In [13]:
# Unpack the tokenizer output so that `input_ids` and `attention_mask` are passed
# to the model as keyword arguments.
result = model(**ids)

In [None]:
# Show the full output object; `logits` holds one row of class scores per input sentence.
result

SequenceClassifierOutput(loss=None, logits=tensor([[-3.6339,  3.9340],
        [-3.6785,  3.9597]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Without `padding` and `return_tensors` the tokenizer returns plain Python lists,
# one per sentence, each with its own length (no padding ids are added).
ids = tokenizer(["I love you omg man really i mean im tomas", 'hello'])
ids

{'input_ids': [[101, 1045, 2293, 2017, 18168, 2290, 2158, 2428, 1045, 2812, 10047, 12675, 102], [101, 7592, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1]]}