# 基于Transformers的NLP解决方案

# 文本分类实例

## Step1 导入相关包

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:
# Read the hotel-review CSV as a single "train" split, then discard the rows
# whose review text is missing.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating train split: 7766 examples [00:00, 224020.06 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 459369.39 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [3]:
# Hold out 10% of the examples for evaluation. The fixed seed keeps the split,
# and therefore the reported metrics, reproducible across kernel restarts.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 数据集预处理

In [4]:
import torch

# Load the tokenizer published with the hfl/chinese-macbert-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-large")


def process_function(examples):
    """Tokenize a batch of reviews and attach their labels under ``labels``.

    Args:
        examples: Batch mapping with a ``"review"`` entry (the texts) and a
            ``"label"`` entry (the matching labels).

    Returns:
        The tokenizer output for the reviews, truncated and padded to 32
        tokens, with the batch labels stored under the ``"labels"`` key.
    """
    encoding_options = {"max_length": 32, "truncation": True, "padding": "max_length"}
    batch = tokenizer(examples["review"], **encoding_options)
    batch["labels"] = examples["label"]
    return batch


# Tokenize both splits in batches. The original columns are removed so that
# only the tokenizer outputs and the `labels` column remain.
tokenized_datasets = datasets.map(
    process_function, batched=True, remove_columns=datasets["train"].column_names
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 26045.00 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 20453.86 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 创建模型

In [5]:
# Load MacBERT-large with a sequence-classification head. The head weights are
# not part of the checkpoint, so they are newly initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-large")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step6 创建评估函数

In [6]:
import evaluate

# If the network connection is poor, the metrics can also be loaded from local scripts.
# NOTE: `f1_metirc` (sic) is the exact name eval_metric looks up; keep both in sync if renaming.
acc_metric = evaluate.load("./metric_accuracy.py")
f1_metirc = evaluate.load("./metric_f1.py")

In [7]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A ``(predictions, labels)`` pair; the predictions are
            reduced to class ids with an argmax over the last axis.

    Returns:
        The accuracy result dict, updated in place with the F1 result.
    """
    logits, gold = eval_predict
    predicted_ids = logits.argmax(axis=-1)
    scores = acc_metric.compute(predictions=predicted_ids, references=gold)
    scores.update(f1_metirc.compute(predictions=predicted_ids, references=gold))
    return scores

## Step7 创建TrainingArguments

In [8]:
# Training configuration. The options marked *** are the memory-saving
# techniques this notebook demonstrates.
train_args = TrainingArguments(
    # --- output location and schedule ---
    output_dir="./checkpoints",  # output directory
    num_train_epochs=1,  # number of training epochs
    learning_rate=2e-5,  # learning rate
    weight_decay=0.001,  # weight decay
    # --- batch sizes and memory-saving options ---
    per_device_train_batch_size=2,  # batch size used for training
    per_device_eval_batch_size=4,  # batch size used for evaluation
    gradient_accumulation_steps=32,  # *** gradient accumulation ***
    gradient_checkpointing=True,  # *** gradient checkpointing ***
    optim="adafactor",  # *** Adafactor optimizer ***
    # --- logging, evaluation and checkpointing ---
    logging_steps=10,  # logging frequency, in steps
    eval_strategy="epoch",  # evaluation strategy
    save_strategy="epoch",  # checkpoint saving strategy
    save_total_limit=3,  # maximum number of checkpoints kept
    metric_for_best_model="f1",  # metric used to pick the best model
    load_best_model_at_end=True,  # reload the best model when training finishes
)
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp

## Step8 创建Trainer

In [9]:
from transformers import DataCollatorWithPadding

# *** Parameter freezing ***
# Disable gradients for every parameter under `model.bert` in one call, so the
# backbone is not updated during training. `requires_grad_` replaces the manual
# loop, which bound an unused `name` and leaked loop variables into the notebook.
model.bert.requires_grad_(False)

# The collator pads each batch with the tokenizer; eval_metric supplies the
# accuracy/F1 values reported at evaluation time.
trainer = Trainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

## Step9 模型训练

In [10]:
trainer.train()
# Troubleshooting: if evaluation fails on the line
#   return {"f1": float(score) if score.size == 1 else score}
# with "AttributeError: 'float' object has no attribute 'size'",
# pinning scikit-learn resolved it here:
#   pip install scikit-learn==1.5.2

  9%|▉         | 10/109 [00:04<00:36,  2.71it/s]

{'loss': 0.6486, 'grad_norm': 1.1082851886749268, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}


 18%|█▊        | 20/109 [00:07<00:32,  2.70it/s]

{'loss': 0.6346, 'grad_norm': 1.2000371217727661, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}


 28%|██▊       | 30/109 [00:11<00:29,  2.65it/s]

{'loss': 0.6354, 'grad_norm': 0.6770465970039368, 'learning_rate': 1.4495412844036698e-05, 'epoch': 0.27}


 37%|███▋      | 40/109 [00:15<00:25,  2.72it/s]

{'loss': 0.6319, 'grad_norm': 1.0347671508789062, 'learning_rate': 1.2660550458715597e-05, 'epoch': 0.37}


 46%|████▌     | 50/109 [00:19<00:21,  2.74it/s]

{'loss': 0.6038, 'grad_norm': 1.0839542150497437, 'learning_rate': 1.0825688073394496e-05, 'epoch': 0.46}


 55%|█████▌    | 60/109 [00:22<00:17,  2.77it/s]

{'loss': 0.6287, 'grad_norm': 1.0236989259719849, 'learning_rate': 8.990825688073395e-06, 'epoch': 0.55}


 64%|██████▍   | 70/109 [00:26<00:14,  2.71it/s]

{'loss': 0.6176, 'grad_norm': 1.2010418176651, 'learning_rate': 7.155963302752295e-06, 'epoch': 0.64}


 73%|███████▎  | 80/109 [00:30<00:10,  2.69it/s]

{'loss': 0.6194, 'grad_norm': 0.6772569417953491, 'learning_rate': 5.3211009174311936e-06, 'epoch': 0.73}


 83%|████████▎ | 90/109 [00:33<00:07,  2.66it/s]

{'loss': 0.6359, 'grad_norm': 0.8607439994812012, 'learning_rate': 3.486238532110092e-06, 'epoch': 0.82}


 92%|█████████▏| 100/109 [00:37<00:03,  2.67it/s]

{'loss': 0.6407, 'grad_norm': 1.1414278745651245, 'learning_rate': 1.6513761467889911e-06, 'epoch': 0.92}


                                                 
100%|██████████| 109/109 [00:44<00:00,  2.68it/s]

{'eval_loss': 0.6310529112815857, 'eval_accuracy': 0.6731016731016731, 'eval_f1': 0.8046153846153846, 'eval_runtime': 2.5306, 'eval_samples_per_second': 307.036, 'eval_steps_per_second': 77.055, 'epoch': 1.0}


100%|██████████| 109/109 [00:45<00:00,  2.37it/s]

{'train_runtime': 45.9961, 'train_samples_per_second': 151.926, 'train_steps_per_second': 2.37, 'train_loss': 0.6286082748973042, 'epoch': 1.0}





TrainOutput(global_step=109, training_loss=0.6286082748973042, metrics={'train_runtime': 45.9961, 'train_samples_per_second': 151.926, 'train_steps_per_second': 2.37, 'total_flos': 406322074411008.0, 'train_loss': 0.6286082748973042, 'epoch': 0.998282770463652})

In [11]:
# Re-run evaluation on the held-out test split.
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/195 [00:00<?, ?it/s]

100%|██████████| 195/195 [00:02<00:00, 77.77it/s]


{'eval_loss': 0.6310529112815857,
 'eval_accuracy': 0.6731016731016731,
 'eval_f1': 0.8046153846153846,
 'eval_runtime': 2.5202,
 'eval_samples_per_second': 308.308,
 'eval_steps_per_second': 77.375,
 'epoch': 0.998282770463652}

In [12]:
# Run inference on the test split; the result carries the raw logits and the label ids.
trainer.predict(tokenized_datasets["test"])

  0%|          | 0/195 [00:00<?, ?it/s]

100%|██████████| 195/195 [00:02<00:00, 77.50it/s]


PredictionOutput(predictions=array([[-0.05655795,  0.4457591 ],
       [-0.0096358 ,  0.7620296 ],
       [ 0.03572067,  0.6134868 ],
       ...,
       [-0.08316922,  0.62343144],
       [ 0.10598477,  0.63989866],
       [-0.13379288,  0.48901066]], dtype=float32), label_ids=array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1

## Step10 模型预测

In [13]:
# Classify one sample sentence by hand: tokenize, run a forward pass, take the argmax.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Send the inputs to whichever device the model is on instead of hard-coding
    # .cuda(), so the cell also runs on CPU-only (or non-default-GPU) setups.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [14]:
from transformers import pipeline

# Attach the human-readable label names so the pipeline reports them.
model.config.id2label = id2_label
# Reuse the model's current device rather than hard-coding GPU 0, so the
# pipeline also works when the model lives on CPU or on another GPU.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

In [15]:
# Classify the sample sentence through the pipeline.
pipe(sen)

[{'label': '好评！', 'score': 0.6788651347160339}]