# 使用 pip 安装 Transformers

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 20.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 19.1 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 46.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninst

# 2.5 Overview + 视频学习

使用 tokenizer 对每个句子逐句进行 tokenize（分词），再把得到的 token 转换成对应的 id

In [3]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# Step 1: split every sentence into tokens, one sentence at a time.
tokens = list(map(tokenizer.tokenize, sentences))
# Step 2: map the tokens of every sentence to their vocabulary ids.
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))

# Show the id sequence of each sentence on its own line.
print(ids[0], ids[1], sep="\n")

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 1012]


PyTorch 的 torch.tensor() 不接受子列表长度不一致的嵌套列表（输入必须是规整的矩形结构），否则会抛出 ValueError

In [4]:
from transformers import AutoTokenizer
import torch

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this."
]

# Tokenize each sentence separately, then map its tokens to vocabulary ids.
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]
print(ids)

# The two id lists have different lengths, so torch.tensor() rejects them with
# a ValueError. The error is the point of this cell: catch it and print it so
# that "Restart & Run All" can continue past this demonstration.
try:
    input_ids = torch.tensor(ids)
except ValueError as err:
    print(f"ValueError: {err}")

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 1012]]


ValueError: ignored

通过 AutoTokenizer 实例的 pad_token 和 pad_token_id 属性，可以查看该 tokenizer 使用哪个 token（及其 id）进行 padding

In [7]:
from transformers import AutoTokenizer

# First checkpoint: print its padding token and that token's id, one per line.
ckpt1 = "bert-base-uncased"
tokenizer1 = AutoTokenizer.from_pretrained(ckpt1)
print(tokenizer1.pad_token, tokenizer1.pad_token_id, sep="\n")

# Second checkpoint: same two lines, for comparison with the first.
ckpt2 = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer2 = AutoTokenizer.from_pretrained(ckpt2)
print(tokenizer2.pad_token, tokenizer2.pad_token_id, sep="\n")

[PAD]
0
[PAD]
0


如果只进行 padding 而不提供 attention mask，模型会把 padding token 也当作正常输入参与注意力计算，从而导致被 padding 的句子的结果不准确

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this."
]

# Tokenize each sentence separately, then map its tokens to vocabulary ids.
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]

# Each single sentence is wrapped in an outer list so the tensor is 2-D
# (a batch containing one sequence), i.e. [[...]].
tensor_ids0 = torch.tensor([ids[0]])
tensor_ids1 = torch.tensor([ids[1]])

print(tensor_ids0)
print(tensor_ids1)

# Build the padded batch from `ids` instead of hard-coding token ids:
# right-pad every sequence with the tokenizer's pad id up to the longest one.
max_len = max(len(sentence_ids) for sentence_ids in ids)
padded_ids = [
    sentence_ids + [tokenizer.pad_token_id] * (max_len - len(sentence_ids))
    for sentence_ids in ids
]
tensor_idsall = torch.tensor(padded_ids)
print(tensor_idsall)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model(tensor_ids0).logits)
print(model(tensor_ids1).logits)
# No attention mask is passed here, so the logits of the padded second row
# differ from the ones computed for that sentence on its own.
print(model(tensor_idsall).logits)



tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[1045, 5223, 2023, 1012]])
tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  5223,  2023,  1012,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
tensor([[-2.7276,  2.8789],
        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)


加入 attention mask 后，批量输入得到的 logits 与逐句单独输入时得到的 logits 相同

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this."
]

# Tokenize each sentence separately, then map its tokens to vocabulary ids.
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]

# Each single sentence is wrapped in an outer list so the tensor is 2-D
# (a batch containing one sequence), i.e. [[...]].
tensor_ids0 = torch.tensor([ids[0]])
tensor_ids1 = torch.tensor([ids[1]])

print(tensor_ids0)
print(tensor_ids1)

# Build the padded batch from `ids` instead of hard-coding token ids:
# right-pad every sequence with the tokenizer's pad id up to the longest one.
max_len = max(len(sentence_ids) for sentence_ids in ids)
padded_ids = [
    sentence_ids + [tokenizer.pad_token_id] * (max_len - len(sentence_ids))
    for sentence_ids in ids
]
tensor_idsall = torch.tensor(padded_ids)
print(tensor_idsall)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print("gt ids0的 logits", model(tensor_ids0).logits)
print("gt ids1的 logits", model(tensor_ids1).logits)
# No attention mask is passed here, so the logits of the padded second row
# differ from the ones computed for that sentence on its own.
print("在model中不携带attention_mask参数：", model(tensor_idsall).logits)

# Derive the mask from the real sequence lengths: 1 marks a real token,
# 0 marks a padding position added above.
attention_mask = torch.tensor(
    [
        [1] * len(sentence_ids) + [0] * (max_len - len(sentence_ids))
        for sentence_ids in ids
    ]
)
print("在model中带上attention_mask参数后：", model(tensor_idsall, attention_mask=attention_mask).logits)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[1045, 5223, 2023, 1012]])
tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  5223,  2023,  1012,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])
gt ids0的 logits tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
gt ids1的 logits tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
在model中不携带attention_mask参数： tensor([[-2.7276,  2.8789],
        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)
在model中带上attention_mask参数后： tensor([[-2.7276,  2.8789],
        [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)


无需手动拆分上述步骤：直接调用 tokenizer 并传入 padding=True，即可一次性完成分词、添加特殊 token、padding，并同时生成 attention_mask

In [16]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# Call the tokenizer object directly on the whole list: this is the
# all-in-one path, as opposed to calling tokenize() and
# convert_tokens_to_ids() by hand. Despite its name, the printed result
# holds both 'input_ids' and 'attention_mask'.
input_ids = tokenizer(sentences, padding=True)
print(input_ids)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


# 2.5.1 Models expect a batch of inputs

模型的输入应当是一个 batch，即二维的嵌套列表 [[ ... ]]（即使只有一个句子也要多套一层）

In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokens first, then their vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Passing the flat list, torch.tensor(ids), fails inside the model with
#   IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
# Nesting the ids in an outer list gives the 2-D input the model accepts.
input_ids = torch.tensor([ids])

model(input_ids)

SequenceClassifierOutput([('logits',
                           tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>))])

当我们同样加入一个新的维度（batch 维）后，打印 input_ids 和 logits，都能正常输出了：

In [24]:
# A simple sentiment-classification example.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokens first, then their vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# The outer list adds the extra dimension so the tensor is 2-D.
input_ids = torch.tensor([ids])
print(f"input_ids:  {input_ids}")

output = model(input_ids)
print(f"logits:  {output.logits}")

input_ids:  tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
logits:  tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
