# In [None]:
import csv
import os
import random
from functools import partial

import numpy as np
import paddle
import paddle.nn.functional as F
import paddlenlp as ppnlp
import pandas as pd
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple
from paddlenlp.datasets import DatasetBuilder
from paddlenlp.transformers import LinearDecayWithWarmup
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# In [None]:
# Dataset definition: maps each split to its file and describes the file format.
class NewsData(DatasetBuilder):
    """Binary text-classification dataset stored as tab-separated files.

    Every split file is expected to start with a header row, followed by
    two-column records: the text content and an integer label.
    """

    # Split name -> file name (relative to the current working directory).
    SPLITS = {
        'train': 'train.csv',  # training set
        'dev': 'dev.csv',      # validation set
    }

    def _get_data(self, mode, **kwargs):
        """Return the file name registered for split ``mode``.

        Raises:
            KeyError: if ``mode`` is not a key of ``SPLITS``.
        """
        return self.SPLITS[mode]

    def _read(self, filename):
        """Yield one ``{"content": str, "label": int}`` dict per data row.

        The file is parsed with the ``csv`` module rather than a plain
        ``split("\\t")`` so that fields quoted by the writer (content holding
        a tab, a newline or a double quote) are decoded back to a single
        value instead of corrupting the row. ``utf-8-sig`` transparently
        drops a leading BOM when present and behaves like UTF-8 otherwise.

        Raises:
            ValueError: if a non-empty row does not have exactly two columns
                or its label is not an integer literal.
        """
        # newline='' lets the csv module handle newlines inside quoted fields.
        with open(filename, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader, None)  # the first row is the header; skip it
            for row in reader:
                if not row:
                    # csv.reader returns [] for blank lines; nothing to yield.
                    continue
                content, label = row
                yield {"content": content, "label": int(label)}

    def get_labels(self):
        """Return the list of class ids: 0 and 1."""
        return [0, 1]

# Dataset loading helper.
def load_dataset(name=None,
                 data_files=None,
                 splits=None,
                 lazy=None,
                 **kwargs):
    """Instantiate a ``NewsData`` builder and read the requested datasets.

    Args:
        name: Optional name; forwarded to the builder only when truthy.
        data_files: Passed through to ``read_datasets``.
        splits: Passed through to ``read_datasets``.
        lazy: Passed to the builder constructor.
        **kwargs: Extra keyword arguments for the builder constructor.

    Returns:
        Whatever ``read_datasets`` of the builder returns.
    """
    builder_kwargs = dict(kwargs, lazy=lazy)
    if name:
        builder_kwargs["name"] = name

    builder = NewsData(**builder_kwargs)
    return builder.read_datasets(data_files=data_files, splits=splits)

# Converts one raw example into model inputs.
def convert_example(example, tokenizer, max_seq_length=128, is_test=False):
    """Tokenize ``example["content"]`` and optionally attach its label.

    Args:
        example: Mapping with a ``"content"`` entry and, unless ``is_test``
            is true, a ``"label"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...)``
            that returns a mapping with ``"input_ids"`` and
            ``"token_type_ids"``.
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.
        is_test: When true, no label is read or returned.

    Returns:
        ``(input_ids, token_type_ids)`` when ``is_test`` is true, otherwise
        ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element ``int64`` numpy array.
    """
    encoding = tokenizer(text=example["content"], max_seq_len=max_seq_length)
    features = (encoding["input_ids"], encoding["token_type_ids"])

    if is_test:
        return features

    # The label is used as-is and wrapped in a one-element array.
    target = np.array([example["label"]], dtype="int64")
    return features + (target,)

# Builds a paddle DataLoader for a dataset.
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    """Wrap ``dataset`` in a ``paddle.io.DataLoader``.

    Args:
        dataset: Dataset object; must provide ``map`` when ``trans_fn`` is set.
        mode: ``'train'`` selects a shuffling ``DistributedBatchSampler``;
            any other value selects a non-shuffling ``BatchSampler``.
        batch_size: Number of samples per batch.
        batchify_fn: Collate function handed to the loader.
        trans_fn: Optional per-example transform applied via ``dataset.map``.

    Returns:
        The configured ``paddle.io.DataLoader`` (``return_list=True``).
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    # Only training data is shuffled; other modes keep the original order.
    is_train = mode == 'train'
    sampler_cls = (paddle.io.DistributedBatchSampler
                   if is_train else paddle.io.BatchSampler)
    batch_sampler = sampler_cls(
        dataset, batch_size=batch_size, shuffle=is_train)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

# Evaluation routine used during and after training.
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print a one-line summary.

    The model is switched to eval mode for the pass and back to train mode
    afterwards; ``metric`` is reset both before and after, so counts from the
    caller's training loop are discarded.

    Args:
        model: Callable as ``model(input_ids, token_type_ids)`` returning logits.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        metric: Accuracy-style metric exposing ``reset``, ``compute``,
            ``update`` and ``accumulate``.
        data_loader: Iterable of ``(input_ids, token_type_ids, labels)`` batches.

    Returns:
        Tuple ``(accu, precision, recall, f1)``. Precision, recall and F1 are
        computed by scikit-learn with its default arguments.
    """
    model.eval()
    metric.reset()
    losses = []
    all_labels = []
    all_preds = []

    for input_ids, token_type_ids, labels in data_loader:
        logits = model(input_ids, token_type_ids)
        losses.append(criterion(logits, labels).numpy())

        probs = F.softmax(logits, axis=1)
        # Feed the label tensor directly (as the training loop does) instead
        # of converting it to numpy and back to a tensor.
        metric.update(metric.compute(probs, labels))

        all_preds.extend(paddle.argmax(probs, axis=1).numpy().tolist())
        # Flatten so scikit-learn receives a plain 1-D list of class ids
        # regardless of whether labels arrive as (batch,) or (batch, 1).
        all_labels.extend(labels.numpy().reshape(-1).tolist())

    accu = metric.accumulate()

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"eval loss: {np.mean(losses):.4f}, accu: {accu:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}")

    # Hand the model and metric back in the state the training loop expects.
    model.train()
    metric.reset()

    return accu, precision, recall, f1

# Batch prediction routine.
def predict(model, data, tokenizer, batch_size=1, max_seq_length=128):
    """Predict a class id for every example in ``data``.

    Args:
        model: Callable as ``model(input_ids, segment_ids)`` returning logits.
        data: Iterable of examples accepted by ``convert_example`` in test
            mode (mappings with a ``"content"`` entry).
        tokenizer: Tokenizer handed to ``convert_example``; also supplies
            ``pad_token_id`` and ``pad_token_type_id`` for padding.
        batch_size: Number of examples per forward pass; must be >= 1.
        max_seq_length: Maximum sequence length forwarded to
            ``convert_example``. Defaults to 128, the value that used to be
            hard-coded here.

    Returns:
        List of predicted class indices, in the order of ``data``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))

    # Convert the raw inputs into (input_ids, segment_ids) pairs.
    examples = [
        convert_example(
            text,
            tokenizer,
            max_seq_length=max_seq_length,
            is_test=True)
        for text in data
    ]

    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input ids
        # Segment ids are padded with the token-type pad value, matching the
        # collate function used for training.
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    results = []
    model.eval()
    # Inference only: no gradients are needed.
    with paddle.no_grad():
        # The final slice may be shorter than batch_size.
        for start in range(0, len(examples), batch_size):
            input_ids, segment_ids = batchify_fn(
                examples[start:start + batch_size])
            logits = model(paddle.to_tensor(input_ids),
                           paddle.to_tensor(segment_ids))
            # Softmax is monotonic, so the argmax of the logits is the
            # predicted class.
            results.extend(paddle.argmax(logits, axis=1).numpy().tolist())
    return results

# Wraps raw texts into the list-of-dicts format expected by the prediction code.
def preprocess_prediction_data(data):
    """Return ``[{"content": item}, ...]`` for every item of ``data``."""
    return [{"content": text} for text in data]

# Stores predictions as a text file; submission format: one class per line.
def write_results(labels, file_path):
    """Write ``labels`` to ``file_path``, one per line, without a trailing newline.

    Args:
        labels: Iterable of values; each is converted with ``str``.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, "w", encoding="utf8") as f:
        # A single write of the joined text; writelines() on a str would
        # iterate over it character by character.
        f.write("\n".join(map(str, labels)))

if __name__ == '__main__':
    # Script entry point: for every pretrained model and every label column,
    # fine-tune a binary classifier, record the best validation metrics, then
    # label the prediction file with the best checkpoint.

    # Run on the first GPU.
    paddle.device.set_device('gpu:0')

    df = pd.read_excel("social support_coding scheme_0313.xlsx", sheet_name=1)
    df['content'] = df['content'].astype(str)

    # The texts to classify are the same for every label, so load and wrap
    # them once instead of once per loop iteration.
    test = pd.read_csv('./2107_2302_comment_predict.csv')
    examples = preprocess_prediction_data(list(test.content))

    # Hyperparameters (identical for every model/label run).
    batch_size = 16
    max_seq_length = 48
    learning_rate = 4e-5
    epochs = 4
    warmup_proportion = 0.1
    weight_decay = 0.0
    seed = 1024

    save_dir = "checkpoint"
    os.makedirs(save_dir, exist_ok=True)
    params_path = os.path.join(save_dir, "model_state.pdparams")

    labels_to_process = ['transp_e', 'emp_e', 'symp_e', 'symptom_i', 'experience_i', 'objective_i']
    pretrained_models = ['ernie-3.0-base-zh']
    for MODEL_NAME in pretrained_models:
        for label in labels_to_process:
            print(f"Processing label: {label}")
            with open("best_model_metrics.csv", "a", encoding="utf-8") as f:
                f.write(f"{label}\n")

            data = df[['content', label]]
            train, dev = train_test_split(data, test_size=0.2, random_state=22)

            # Persist the splits; NewsData reads them back from these files.
            train.to_csv('train.csv', sep='\t', encoding = 'utf_8_sig', index=False)
            dev.to_csv('dev.csv', sep='\t', encoding = 'utf_8_sig', index=False)

            # Fix the random seeds BEFORE the model is built so that the
            # randomly initialised classification head is reproducible too.
            random.seed(seed)
            np.random.seed(seed)
            paddle.seed(seed)

            model = ppnlp.transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)  # num_classes is the number of classes
            tokenizer = ppnlp.transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

            # Load the training and validation sets.
            train_ds, dev_ds = load_dataset(splits=["train", "dev"])

            # Per-example conversion into model inputs.
            trans_func = partial(
                convert_example,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length)

            # Collate function: pad both id sequences, stack the labels.
            batchify_fn = lambda samples, fn=Tuple(
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
                Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
                Stack()  # labels
            ): list(fn(samples))

            # Training-set iterator.
            train_data_loader = create_dataloader(
                train_ds,
                mode='train',
                batch_size=batch_size,
                batchify_fn=batchify_fn,
                trans_fn=trans_func)

            # Validation-set iterator.
            dev_data_loader = create_dataloader(
                dev_ds,
                mode='dev',
                batch_size=batch_size,
                batchify_fn=batchify_fn,
                trans_fn=trans_func)

            num_training_steps = len(train_data_loader) * epochs
            lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)

            # Parameters whose name contains neither "bias" nor "norm" get
            # weight decay. Collected once so the optimizer callback is a
            # cheap set lookup instead of rebuilding the list on every call.
            decay_params = {
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            }
            optimizer = paddle.optimizer.AdamW(
                learning_rate=lr_scheduler,
                parameters=model.parameters(),
                weight_decay=weight_decay,
                apply_decay_param_fun=lambda x, names=decay_params: x in names)

            criterion = paddle.nn.loss.CrossEntropyLoss()  # cross-entropy loss
            metric = paddle.metric.Accuracy()              # accuracy metric

            # Start below any possible accuracy so the first epoch always
            # saves a checkpoint and fills best_metrics. This avoids a
            # KeyError below and avoids reloading a checkpoint left behind
            # by a previous label.
            best_accu = float("-inf")
            best_metrics = {}
            for epoch in range(1, epochs + 1):
                for step, batch in enumerate(train_data_loader, start=1):
                    input_ids, token_type_ids, labels = batch
                    logits = model(input_ids, token_type_ids)

                    loss = criterion(logits, labels)
                    probs = F.softmax(logits, axis=1)
                    correct = metric.compute(probs, labels)
                    metric.update(correct)
                    acc = metric.accumulate()

                    loss.backward()
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()

                    if step % 10 == 0:
                        print("epoch: %d, step: %d, loss: %.4f, accu: %.4f" % (epoch, step, loss, acc))

                accu, precision, recall, f1 = evaluate(model, criterion, metric, dev_data_loader)
                print("end of epoch:{}, accu:{}".format(epoch, accu))

                # Keep only the checkpoint with the best validation accuracy.
                if accu > best_accu:
                    paddle.save(model.state_dict(), params_path)
                    tokenizer.save_pretrained(save_dir)
                    best_accu = accu
                    best_metrics = {"model_name": MODEL_NAME, "accuracy": accu, "precision": precision, "recall": recall, "f1": f1}

            with open("best_model_metrics.csv", "a", encoding="utf-8") as f:
                f.write(f"model: {best_metrics['model_name']}\n")
                f.write(f"Accuracy: {best_metrics['accuracy']:.4f}\n")
                f.write(f"Precision: {best_metrics['precision']:.4f}\n")
                f.write(f"Recall: {best_metrics['recall']:.4f}\n")
                f.write(f"F1 Score: {best_metrics['f1']:.4f}\n")

            if os.path.isfile(params_path):
                # Restore the best checkpoint before the final evaluation
                # and prediction.
                state_dict = paddle.load(params_path)
                model.set_dict(state_dict)
                print("Loaded parameters from %s" % params_path)

            # Score of the best checkpoint on the validation set.
            evaluate(model, criterion, metric, dev_data_loader)

            # Predict the test set and store one class per line.
            results = predict(model, examples, tokenizer, batch_size=16)
            write_results(results, f"./predict_{label}.txt")