# A test demo

In [3]:
from transformers import BertTokenizer, BertModel

# 加载预训练的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 准备输入文本
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors='pt')

# 获取模型输出
outputs = model(**inputs)

# 输出的最后一个隐藏状态
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)



tensor([[[-0.0824,  0.0667, -0.2880,  ..., -0.3566,  0.1960,  0.5381],
         [ 0.0310, -0.1448,  0.0952,  ..., -0.1560,  1.0151,  0.0947],
         [-0.8935,  0.3240,  0.4184,  ..., -0.5498,  0.2853,  0.1149],
         ...,
         [-0.2812, -0.8531,  0.6912,  ..., -0.5051,  0.4716, -0.6854],
         [-0.4429, -0.7820, -0.8055,  ...,  0.1949,  0.1081,  0.0130],
         [ 0.5570, -0.1080, -0.2412,  ...,  0.2817, -0.3996, -0.1882]]],
       grad_fn=<NativeLayerNormBackward0>)


In [5]:
from transformers import pipeline

# 创建一个文本分类管道
classifier = pipeline('sentiment-analysis')

# 进行情感分析
result = classifier("I love using Hugging Face transformers!")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9971315860748291}]


# 使用模型进行文本分类


## 词元化

In [21]:
from transformers import BertTokenizer 


tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese', 
    cache_dir=None,
    force_download=False
)

sents = ['你在桥上看风景', '看风景的人在楼上看你', '明月装饰了你的窗子', '你装饰了别人的梦']

out = tokenizer.encode(
    text=sents[0], 
    text_pair=sents[1], 
    # text_pair=None,
    truncation=True, 
    padding='max_length', 
    max_length=25, 
    add_special_tokens=True, 
    return_tensors=None
)

print(out)
print(tokenizer.decode(out))

[101, 872, 1762, 3441, 677, 4692, 7599, 3250, 102, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 102, 0, 0, 0, 0, 0]
[CLS] 你 在 桥 上 看 风 景 [SEP] 看 风 景 的 人 在 楼 上 看 你 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]


In [23]:
out = tokenizer.encode_plus(
    text=sents[0], 
    text_pair=sents[1],
    truncation=True,
    padding='max_length',
    max_length=25, 
    add_special_tokens=True,
    return_tensors=None, 
    return_token_type_ids=True, 
    return_attention_mask=True, 
    return_special_tokens_mask=True, 
    return_length=True
)

for k, v in out.items():
    print(k, ':', v)
tokenizer.decode(out['input_ids'])

input_ids : [101, 872, 1762, 3441, 677, 4692, 7599, 3250, 102, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 102, 0, 0, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
length : 25


'[CLS] 你 在 桥 上 看 风 景 [SEP] 看 风 景 的 人 在 楼 上 看 你 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [27]:
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[i] for i in range(len(sents))], 
    add_special_tokens=True, 
    truncation=True,  
    padding='max_length', 
    max_length=25, 
    return_tensors=None, 
    return_token_type_ids=True, 
    return_attention_mask=True, 
    return_special_tokens_mask=True, 
    return_length=True
)

for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'][0])

input_ids : [[101, 872, 1762, 3441, 677, 4692, 7599, 3250, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 3209, 3299, 6163, 7652, 749, 872, 4638, 4970, 2094, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 872, 6163, 7652, 749, 1166, 782, 4638, 3457, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

'[CLS] 你 在 桥 上 看 风 景 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## 对vocab操作

In [32]:
vocab = tokenizer.get_vocab()

type(vocab), len(vocab), '明月' in vocab

(dict, 21128, False)

# 使用数据集工具

In [42]:
from datasets import load_dataset 


# dataset = load_dataset(path='seamew/ChnSentiCorp')
raw_datasets = load_dataset(path='glue', name='mrpc')

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [52]:
from transformers import (AutoTokenizer, 
                          DataCollatorWithPadding, 
                          AutoModelForSequenceClassification, 
                          TrainingArguments, 
                          Trainer)


checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

def toknize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

tokenized_datasets = raw_datasets.map(toknize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint, 
    num_labels=2
)

training_args = TrainingArguments('test-trainer')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
) 

trainer.train()

loading configuration file config.json from cache at C:\Users\zhao\.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\zhao\.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`