In [28]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

In [21]:
from transformers import pipeline

In [22]:
classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [None]:
以下是手动实现上面pipeline的功能：

In [26]:
from transformers import AutoTokenizer

In [48]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [49]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
         12172,  2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,
             0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [50]:
inputs.input_ids, inputs.input_ids.shape

(tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
          12172,  2607,  2026,  2878,  2166,  1012,   102],
         [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,
              0,     0,     0,     0,     0,     0,     0]]),
 torch.Size([2, 16]))

In [51]:
from transformers import AutoModelForSequenceClassification

In [52]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

In [53]:
outputs, outputs.logits, outputs.logits.shape

(SequenceClassifierOutput(loss=None, logits=tensor([[-1.56,  1.61],
         [ 4.17, -3.35]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 tensor([[-1.56,  1.61],
         [ 4.17, -3.35]], grad_fn=<AddmmBackward0>),
 torch.Size([2, 2]))

In [54]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions

tensor([[4.02e-02, 9.60e-01],
        [9.99e-01, 5.44e-04]], grad_fn=<SoftmaxBackward0>)

In [55]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [66]:
s1 = "First sentence: {0}: {1:.3f}, {2}: {3:.3f}".format(model.config.id2label[0], predictions[0,0], model.config.id2label[1], predictions[0,1])
s1

'First sentence: NEGATIVE: 0.040, POSITIVE: 0.960'

In [67]:
s2 = "Second sentence: {0}: {1:.3f}, {2}: {3:.3f}".format(model.config.id2label[0], predictions[1,0], model.config.id2label[1], predictions[1,1])
s2

'Second sentence: NEGATIVE: 0.999, POSITIVE: 0.001'