# 文本分类实例

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:
# Load ./ChnSentiCorp_htl_all.csv as a single split named "train".
raw_reviews = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
# Keep only the rows whose "review" field is not None.
dataset = raw_reviews.filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [3]:
# Hold out 10% of the rows as the "test" split. A fixed seed makes the
# shuffle deterministic, so the same rows land in each split after a
# kernel restart and metrics stay comparable between runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 数据集预处理

In [None]:
import torch

# Name of the pretrained checkpoint whose tokenizer is loaded.
tokenizer_checkpoint = "hfl/chinese-macbert-large"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A mapping with a "review" entry (passed to the tokenizer)
            and a "label" entry (copied to the output unchanged).

    Returns:
        The tokenizer output with an added "labels" entry. Every sequence is
        truncated and padded to 32 tokens.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        padding="max_length",
        max_length=32,
    )
    # Stored under "labels" (plural) rather than the source column name "label".
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits in batches. The original columns are removed so that
# only the fields returned by process_function remain.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)

# Only a cell's final expression is displayed, so show the whole DatasetDict
# once instead of leaving a bare expression in the middle of the cell.
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 7381.02 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 5762.71 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 创建模型

In [5]:
# Sequence-classification model loaded from the same checkpoint name as the tokenizer.
model_checkpoint = "hfl/chinese-macbert-large"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step6 创建评估函数

In [6]:
import evaluate

# If the network connection is poor, the metrics can also be loaded from
# local metric scripts, as done here.
acc_metric = evaluate.load("./metric_accuracy.py")
# NOTE(review): "f1_metirc" is a misspelling of "f1_metric"; the name is kept
# because eval_metric refers to it under this spelling.
f1_metirc = evaluate.load("./metric_f1.py")

In [7]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``. ``predictions`` must
            support ``argmax(axis=-1)``, which selects the predicted class.

    Returns:
        The accuracy result, updated in place with the entries of the F1 result.
    """
    scores, references = eval_predict
    predicted_classes = scores.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_classes, references=references)
    results.update(f1_metirc.compute(predictions=predicted_classes, references=references))
    return results

## Step7 创建TrainingArguments

In [8]:
train_args = TrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir="./results",              # directory for saved models
    logging_dir="./logs",                # directory for logs
    logging_steps=10,                    # log once every 10 steps
    # --- evaluation / checkpoint schedule ---
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save once per epoch
    load_best_model_at_end=True,         # reload the best model when training ends
    metric_for_best_model="f1",          # "best" is chosen by F1 score
    # --- optimisation hyper-parameters ---
    num_train_epochs=3,                  # number of training epochs
    learning_rate=2e-5,                  # learning rate
    weight_decay=0.01,                   # weight decay
    per_device_train_batch_size=16,      # training batch size
    per_device_eval_batch_size=16,       # evaluation batch size
)




## Step8 创建Trainer

In [9]:
from transformers import DataCollatorWithPadding

# *** Parameter freezing ***
# Turn off gradients for every parameter under model.bert.
model.bert.requires_grad_(False)

# Collator built from the tokenizer; it is used to assemble each batch.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=padding_collator,
    compute_metrics=eval_metric,
)

## Step9 模型训练

In [10]:
# Start training with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [15]:
# Evaluate on the held-out "test" split; the result includes the values
# produced by eval_metric (accuracy and F1).
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/777 [00:00<?, ?it/s]

{'eval_loss': 0.6191020011901855,
 'eval_accuracy': 0.6924066924066924,
 'eval_f1': 0.8182509505703421,
 'eval_runtime': 10.615,
 'eval_samples_per_second': 73.198,
 'eval_steps_per_second': 73.198,
 'epoch': 1.0}

In [16]:
# Run prediction on the "test" split; the returned object carries the raw
# per-class prediction scores and the reference label ids.
trainer.predict(tokenized_datasets["test"])

  0%|          | 0/777 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-0.72761387, -0.07979526],
       [-0.77411073,  0.03051195],
       [-0.7125532 ,  0.02187884],
       ...,
       [-0.69379896, -0.02149625],
       [-0.6627295 ,  0.03299953],
       [-0.6437226 , -0.04126655]], dtype=float32), label_ids=array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1

## Step10 模型预测

In [17]:
# Classify one sentence by hand: tokenize, forward pass, argmax over the logits.
sen = "我觉得这家酒店不错，饭很好吃！"
# Maps the predicted class index to a human-readable label.
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Send the inputs to the device the model is on. A hard-coded .cuda()
    # crashes on CPU-only machines and breaks if the model is on another device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [18]:
from transformers import pipeline

# Give the model config readable label names so the pipeline reports them
# instead of generic label ids.
model.config.id2label = id2_label
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise;
# a hard-coded device=0 fails on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [19]:
# Classify the sample sentence through the pipeline.
pipe(sen)

[{'label': '好评！', 'score': 0.672111451625824}]