In [1]:
%pip install -U -q paddlenlp

In [2]:
import os
import paddle
import paddlenlp
import numpy as np

In [3]:
# 读取数据，并封装为MapDataSet数据类型
from paddlenlp.datasets import load_dataset

def read(data_path): # 读取数据
    i = 1
    with open(data_path, 'r', encoding='utf-8') as f:
        next(f)  # 跳过列名
        for line in f:
            label,text = line.strip().split('\t')
            yield {'text': text, 'label': int(label)}


train_ds = load_dataset(read, data_path='./data/data172897/train.txt', lazy=False) # 读取训练集数据
dev_ds = load_dataset(read, data_path='./data/data172897/dev.txt', lazy=False)  # 读取开发集数据
test_ds = load_dataset(read, data_path='./data/data172897/test.txt', lazy=False) # 读取测试集数据

train_ds.label_list=test_ds.label_list=[1,0] #设置数据集标签

print("数据类型:", type(train_ds))
print("训练集样例:", train_ds[0])
print("验证集样例:", dev_ds[0])
print("测试集样例:", test_ds[0])

数据类型: <class 'paddlenlp.datasets.dataset.MapDataset'>
训练集样例: {'text': '很好看，很喜欢，演技特别棒！！', 'label': 1}
验证集样例: {'text': '一般般，没有看出表达的东西', 'label': 0}
测试集样例: {'text': '还行 还差3个字', 'label': 1}


In [4]:
# 加载预训练模型和分词器
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-3.0-medium-zh" # 预训练模型名称
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=len(train_ds.label_list)) # 加载预训练模型，设置类别数量为当前数据集数量
tokenizer = AutoTokenizer.from_pretrained(model_name) # 加载预训练模型对应的分词器

[2022-10-18 09:19:31,725] [    INFO] - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-medium-zh'.
[2022-10-18 09:19:31,728] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh.pdparams
W1018 09:19:31.730484  4914 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.2, Runtime API Version: 11.2
W1018 09:19:31.733821  4914 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.
[2022-10-18 09:19:34,325] [    INFO] - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-medium-zh'.
[2022-10-18 09:19:34,327] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh_vocab.txt
[2022-10-18 09:19:34,353] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/ernie-3.0-medium-zh/tokenizer_config.json
[2022-10-18 09:19:34,355] [    INFO] - 

In [5]:
# 数据处理

import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# 数据预处理函数，利用分词器将文本转化为整数序列
def preprocess_function(examples, tokenizer, max_seq_length, is_test=False):

    result = tokenizer(text=examples["text"], max_seq_len=max_seq_length)
    if not is_test:
        result["labels"] = examples["label"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=256)
train_ds = train_ds.map(trans_func)
dev_ds = dev_ds.map(trans_func)

# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠
collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=128, shuffle=True)
dev_batch_sampler = BatchSampler(dev_ds, batch_size=128, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=collate_fn)

In [6]:
import paddle
# Adam优化器、交叉熵损失函数、accuracy评价指标
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

In [7]:
# 构建验证集evaluate函数
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
        
    accu = metric.accumulate()
    print("eval loss: %.5f, accuracy: %.5f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return accu

In [8]:
# 开始训练
import time
import paddle.nn.functional as F

epochs = 5 # 训练轮次
ckpt_dir = "../Model" #训练过程中保存模型参数的文件夹
best_acc = 0
best_step = 0
global_step = 0 #迭代次数
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        # 每迭代10次，打印损失函数值、准确率、计算速度
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # 每迭代100次，评估当前训练的模型、保存当前模型参数和分词器的词表等
        if global_step % 100 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            print(global_step, end=' ')
            acc_eval = evaluate(model, criterion, metric, dev_data_loader)
            if acc_eval > best_acc:
                best_acc = acc_eval
                best_step = global_step

                model.save_pretrained(save_dir) # 保存模型参数
                tokenizer.save_pretrained(save_dir) # 保存分词器数据

global step 10, epoch: 1, batch: 10, loss: 0.46610, accu: 0.84688, speed: 3.12 step/s
global step 20, epoch: 1, batch: 20, loss: 0.24254, accu: 0.85430, speed: 7.19 step/s
global step 30, epoch: 1, batch: 30, loss: 0.16339, accu: 0.86823, speed: 6.42 step/s
global step 40, epoch: 1, batch: 40, loss: 0.22133, accu: 0.87871, speed: 6.38 step/s
global step 50, epoch: 1, batch: 50, loss: 0.20629, accu: 0.88734, speed: 6.11 step/s
global step 60, epoch: 1, batch: 60, loss: 0.15578, accu: 0.89375, speed: 6.82 step/s
global step 70, epoch: 1, batch: 70, loss: 0.09900, accu: 0.90056, speed: 6.34 step/s
global step 80, epoch: 1, batch: 80, loss: 0.21836, accu: 0.90430, speed: 7.37 step/s
global step 90, epoch: 1, batch: 90, loss: 0.13752, accu: 0.90825, speed: 6.82 step/s
global step 100, epoch: 1, batch: 100, loss: 0.13874, accu: 0.91094, speed: 6.56 step/s
100 eval loss: 0.16625, accuracy: 0.93538


[2022-10-18 09:19:56,027] [    INFO] - tokenizer config file saved in ./Model/tokenizer_config.json
[2022-10-18 09:19:56,029] [    INFO] - Special tokens file saved in ./Model/special_tokens_map.json


global step 110, epoch: 1, batch: 110, loss: 0.10387, accu: 0.94453, speed: 1.62 step/s
global step 120, epoch: 1, batch: 120, loss: 0.15639, accu: 0.93281, speed: 6.60 step/s
global step 130, epoch: 1, batch: 130, loss: 0.30000, accu: 0.93073, speed: 7.17 step/s
global step 140, epoch: 1, batch: 140, loss: 0.11646, accu: 0.93301, speed: 7.07 step/s
global step 150, epoch: 1, batch: 150, loss: 0.13978, accu: 0.93281, speed: 7.37 step/s
global step 160, epoch: 1, batch: 160, loss: 0.10998, accu: 0.93372, speed: 6.41 step/s
global step 170, epoch: 1, batch: 170, loss: 0.11028, accu: 0.93504, speed: 6.13 step/s
global step 180, epoch: 1, batch: 180, loss: 0.16190, accu: 0.93613, speed: 6.34 step/s
global step 190, epoch: 1, batch: 190, loss: 0.10121, accu: 0.93628, speed: 6.57 step/s
global step 200, epoch: 1, batch: 200, loss: 0.16141, accu: 0.93688, speed: 6.97 step/s
200 eval loss: 0.16394, accuracy: 0.93484
global step 210, epoch: 1, batch: 210, loss: 0.07769, accu: 0.93984, speed: 1.

[2022-10-18 09:20:34,166] [    INFO] - tokenizer config file saved in ./Model/tokenizer_config.json
[2022-10-18 09:20:34,168] [    INFO] - Special tokens file saved in ./Model/special_tokens_map.json


global step 310, epoch: 1, batch: 310, loss: 0.18496, accu: 0.94063, speed: 1.58 step/s
global step 320, epoch: 1, batch: 320, loss: 0.11154, accu: 0.94102, speed: 6.82 step/s
global step 330, epoch: 1, batch: 330, loss: 0.18363, accu: 0.93776, speed: 6.91 step/s
global step 340, epoch: 1, batch: 340, loss: 0.15061, accu: 0.93848, speed: 7.85 step/s
global step 350, epoch: 2, batch: 4, loss: 0.18845, accu: 0.93848, speed: 7.08 step/s
global step 360, epoch: 2, batch: 14, loss: 0.13058, accu: 0.93818, speed: 7.70 step/s
global step 370, epoch: 2, batch: 24, loss: 0.14742, accu: 0.94057, speed: 7.20 step/s
global step 380, epoch: 2, batch: 34, loss: 0.19695, accu: 0.94067, speed: 6.87 step/s
global step 390, epoch: 2, batch: 44, loss: 0.12494, accu: 0.94215, speed: 7.18 step/s
global step 400, epoch: 2, batch: 54, loss: 0.24893, accu: 0.94224, speed: 7.24 step/s
400 eval loss: 0.15793, accuracy: 0.93991
global step 410, epoch: 2, batch: 64, loss: 0.15922, accu: 0.92969, speed: 1.74 step/

In [9]:
# 加载训练的模型最佳模型参数
params_path = 'Model/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

print(f'ERNIE 3.0-Medium 在开发集上表现为：{evaluate(model, criterion, metric, dev_data_loader)}', )

eval loss: 0.15316, accuracy: 0.94281
ERNIE 3.0-Medium 在开发集上表现为：0.9428054298642534


In [10]:
# 测试集数据预处理，利用分词器将文本转化为整数序列
trans_func_test = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=256, is_test=True)
test_ds_trans = test_ds.map(trans_func_test)

# 进行采样组batch
collate_fn_test = DataCollatorWithPadding(tokenizer)
test_batch_sampler = BatchSampler(test_ds_trans, batch_size=32, shuffle=False)
test_data_loader = DataLoader(dataset=test_ds_trans, batch_sampler=test_batch_sampler, collate_fn=collate_fn_test)

In [11]:
# 模型预测分类结果
import paddle.nn.functional as F

label_map = {0: '负面', 1: '正面'}
results = []
model.eval()
for batch in test_data_loader:
    input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
    logits = model(batch['input_ids'], batch['token_type_ids'])
    probs = F.softmax(logits, axis=-1)
    idx = paddle.argmax(probs, axis=1).numpy()
    idx = idx.tolist()
    preds = [label_map[i] for i in idx]
    results.extend(preds)

# 存储预测结果
test_ds = load_dataset(read, data_path='./data/data172897/test.txt', lazy=False)
res_dir = "./results"
if not os.path.exists(res_dir):
    os.makedirs(res_dir)
with open(os.path.join(res_dir, "result.tsv"), 'w', encoding="utf8") as f:
    f.write("text\tlabel\tprediction\n")
    for i, pred in enumerate(results):
        f.write(test_ds[i]['text']+"\t"+label_map[test_ds[i]['label']]+"\t"+pred+"\n")