#### 数据预处理

In [None]:
# uv 环境下可运行如下命令下载模型：
# uv run modelscope download --model pengzhendong/chinese-lert-base --local_dir models/chinese-lert-base

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
EPOCHS = 5  # 模型学习训练集的轮次
BATCH_SIZE = 4  # 模型并行学习的样本数量
LABEL_HEADER = "sentiment"  # 标签的表头名
TEXT_HEADER = "text_a"  # 文本句子的表头名
PRETRAINED_MODEL_NAME_OR_PATH = "../models/chinese-lert-base"  # 预训练模型路径
CSV_PATH = (
    "../datasets/label_studio文本分类数据集.csv"  # label-studio标注的文本分类数据集路径
)
OUPUT_CHECKPOINT_PATH = "../models/text_classification_checkpoint"  # 训练过程中产生的模型文件
OUTPUT_MODEL_PATH = "../models/text_classification_model"  # 微调后的文本分类模型路径
VAL_SIZE = 0.1  # 验证集的占比
MAX_LENGTH = 256  # 输入文本的token长度，lert模型最大只能是512
ACCURACY_PATH = "../common/accuracy.py"  # 评估脚本本地路径

In [3]:
# 从本地加载数据集
dataset = load_dataset("csv", data_files=CSV_PATH)
dataset = dataset.select_columns([LABEL_HEADER, TEXT_HEADER])
dataset = dataset.rename_column(LABEL_HEADER, "label")
dataset = dataset.rename_column(TEXT_HEADER, "text")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 99
    })
})


In [4]:
# 获取标签和数字的映射关系
labels = dataset["train"]["label"]
id2label = {}
label2id = {}
unique_labels = set(labels)
num_labels = len(unique_labels)
for index, label in enumerate(unique_labels):
    id2label[index] = label
    label2id[label] = index
print("文本分类映射关系：", label2id)

文本分类映射关系： {'其他': 0, '差评': 1, '好评': 2}


In [5]:
# 将中文标签转换为数字
dataset = dataset.map(
    lambda str_value: {"label": label2id[str_value]}, input_columns="label"
)

# 拆分训练集和验证集
splits = dataset["train"].train_test_split(test_size=VAL_SIZE)

# 加载文本转数字工具
tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    use_fast=True,
    max_length=MAX_LENGTH,
    truncation=True,
    padding="max_length",
)


# 定义文本转数字的函数，其中设置了过长文本的截断
def preprocess_function(examples):
    return tokenizer(examples["text"])


tokenized_imdb = splits.map(preprocess_function, batched=True)

# 设置短文本动态填充的参数
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 89/89 [00:00<00:00, 6357.50 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 1428.82 examples/s]


#### 模型微调

In [6]:
accuracy = evaluate.load(ACCURACY_PATH)

# 定义评估模型的函数
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [7]:
# 加载预训练模型，将其应用文本分类任务
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# 使得模型的tensor连续，避免报错
for param in model.parameters():
    param.data = param.data.contiguous()

# 定义模型训练参数
training_args = TrainingArguments(
    output_dir=OUPUT_CHECKPOINT_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
model.half()
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/chinese-lert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.950046,0.7
2,No log,0.779834,0.6
3,No log,0.718241,0.7
4,No log,0.758033,0.7
5,No log,0.726873,0.7




('../models/text_classification_model\\tokenizer_config.json',
 '../models/text_classification_model\\special_tokens_map.json',
 '../models/text_classification_model\\vocab.txt',
 '../models/text_classification_model\\added_tokens.json',
 '../models/text_classification_model\\tokenizer.json')

#### 模型推理

In [8]:
import torch
from transformers import pipeline
 
classifier = pipeline(
    task="text-classification",
    model=OUTPUT_MODEL_PATH,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    torch_dtype=torch.float16
)

Device set to use cpu


In [9]:
print(classifier("猩红女巫美哭了，为何要弄我大快银 嘤嘤嘤"))

[{'label': '好评', 'score': 0.7231137156486511}]
