In [None]:
from paddlenlp.transformers import AutoTokenizer, AutoModel
from paddle.io import DataLoader
import paddle
import json
import tqdm
import os
from paddlenlp.layers import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder
from seqeval import metrics
from easydict import EasyDict as edict
from src.dataset import SequenceTaggingDataset
from src.utils import get_metric
from src.model import EHEModel

In [None]:
# Experiment configuration. The feature switches (pos_emb, emotion_emb,
# bilstm, crf) are also encoded into the run name built further down.
config = edict(
    {
        "bert_model_name": "bert-base-chinese",
        "batch_size": 32,
        "gpu": 1,  # GPU index used to build the device string below
        "dataset": "ehe",
        "data_name": "data_with_features",  # ./data/<data_name>.json is loaded later
        "early_stopping": 20,  # initial value of the early-stopping counter
        "eval_step": 10,  # evaluate every `eval_step` training steps
        "lr": 1e-5,  # learning rate for parameters whose name contains 'bert'
        "crflr": 1e-3,  # learning rate for every other parameter
        "log_base": "./log",
        # The comma after this entry was missing, which made the whole dict
        # literal a SyntaxError.
        "result_base": "./result",
        "pos_emb": True,
        "emotion_emb": True,
        "bilstm": True,
        "crf": True,
        "dropout": 0.2,
        "emotion_vocab_size": 2,
        "checkpoint_path": "checkpoints"
    }
)

# Pick the device string: the configured GPU when Paddle was built with CUDA
# and a GPU index is configured, otherwise the CPU.
# NOTE(review): `device` is only computed here; no visible cell passes it to
# paddle.set_device(), so Paddle's default device is presumably what is used.
# Confirm whether paddle.set_device(device) was intended.
if paddle.is_compiled_with_cuda() and config.gpu is not None:
    device = f"gpu:{config.gpu}"
else:
    device = "cpu"

# Build the run name: dataset name, one suffix per enabled feature switch,
# then the hyper-parameters (with "-" replaced by "_", e.g. 1e-05 -> 1e_05).
config.task_name = config.dataset
for enabled, suffix in (
    (config.pos_emb, "_pos"),
    (config.emotion_emb, "_emotion"),
    (config.bilstm, "_bilstm"),
    (config.crf, "_crf"),
):
    if enabled:
        config.task_name += suffix
config.task_name += "_lr%s_crflr%s_dropout%s" % (str(config.lr).replace("-", "_"), str(config.crflr).replace("-", "_"), str(config.dropout))
print(config.task_name)

# Per-run log directory: <log_base>/<task_name>.
config.log_dir = os.path.join(config.log_base, config.task_name)

In [None]:
# Create the log, checkpoint and test-result directories (no-op if present).
for output_dir in (config.log_dir, config.checkpoint_path, config.result_base):
    os.makedirs(output_dir, exist_ok=True)

# File the model parameters of this run are saved to / loaded from.
config.checkpoint = "./%s/%s.pdparams" % (config.checkpoint_path, config.task_name)

In [None]:
def _load_json(path):
    """Parse a UTF-8 encoded JSON file, closing the handle once it is read."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Dataset (train / eval / test splits are read from it in a later cell) and
# the two vocabularies that map POS tags and labels to ids.
d = _load_json("./data/%s.json" % (config.data_name))
config.pos_map = _load_json("./data/pos_map.json")
config.label_map = _load_json("./data/label_map.json")
config.pos_vocab_size = len(config.pos_map)
config.num_classes = len(config.label_map)

# Inverse mapping id -> label, used when computing metrics.
id2label = {label_id: label for label, label_id in config.label_map.items()}
# Two ids beyond the label vocabulary are decoded as "O".
# NOTE(review): presumably these are the extra start/stop tags added by the
# CRF layer - confirm against src.model.
id2label.update({config.num_classes: "O", config.num_classes+1: "O"})

tokenizer = AutoTokenizer.from_pretrained(config.bert_model_name)

In [None]:
model = EHEModel(config)

# Separate the BERT parameters from all other parameters: a parameter goes to
# the first group when its name contains 'bert', to the second one otherwise.
_named_parameters = list(model.named_parameters())
bert_params = [weight for label, weight in _named_parameters if 'bert' in label]
crf_params = [weight for label, weight in _named_parameters if 'bert' not in label]

# One AdamW optimizer per group so each group gets its own learning rate.
optimizer = paddle.optimizer.AdamW(parameters=bert_params, learning_rate=config.lr)
crf_optimizer = paddle.optimizer.AdamW(parameters=crf_params, learning_rate=config.crflr)

In [None]:
# Pull the three splits out of the loaded dataset.
train_data, eval_data, test_data = d["train"], d["eval"], d["test"]

# Longest "input_tokens" sequence across all splits (0 when every split is
# empty); later passed to SequenceTaggingDataset.
max_length = max(
    (
        len(sample["input_tokens"])
        for split in (train_data, eval_data, test_data)
        for sample in split
    ),
    default=0,
)
print(max_length)

In [None]:
# Positional arguments shared by every SequenceTaggingDataset built here.
_dataset_args = (config.bert_model_name, config.pos_map, config.label_map, max_length)

train_dataset = SequenceTaggingDataset(train_data, *_dataset_args, tokenizer=tokenizer)
eval_dataset = SequenceTaggingDataset(eval_data, *_dataset_args, tokenizer=tokenizer)

# Batch both splits with the configured batch size.
train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size)
eval_dataloader = DataLoader(eval_dataset, batch_size=config.batch_size)

In [None]:
# Model training.
# Runs epochs until early stopping triggers. Every `config.eval_step` steps the
# model is evaluated on the eval split; the checkpoint is overwritten whenever
# the eval F1 improves, and training stops once the counter drops below zero,
# i.e. after more than `config["early_stopping"]` consecutive evaluations
# without improvement. One JSON log file is written per training step.
step = 0
best_f1 = 0.
stop_count = config["early_stopping"]
epoch = 1
to_stop = False
model.train()
while not to_stop:
    progress_bar = tqdm.tqdm(total=len(train_dataloader), desc=f'Training Epoch {epoch}, Best F1: {best_f1:.4f}')
    for batch_data in train_dataloader:
        log_item = {}
        preds, loss = model(**batch_data)
        metric = get_metric(batch_data["labels"].numpy(), preds.numpy(), batch_data["lengths"].numpy(), id2label)
        metric["loss"] = float(loss)
        log_item["train"] = metric

        # A single backward pass feeds both optimizers; each one updates and
        # then clears the gradients of its own parameter group.
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        crf_optimizer.step()
        crf_optimizer.clear_grad()

        if step % config.eval_step == 0:
            model.eval()
            eval_loss = 0.
            eval_num = 0
            preds_list = []
            labels_list = []
            lengths_list = []
            # Evaluation needs no gradients. Distinct loop names keep the
            # training-step variables (batch_data / preds / loss) intact.
            with paddle.no_grad():
                for eval_batch in eval_dataloader:
                    eval_preds, eval_batch_loss = model(**eval_batch)
                    preds_list.append(eval_preds.numpy())
                    labels_list.append(eval_batch["labels"].numpy())
                    lengths_list.append(eval_batch["lengths"].numpy())

                    # Weight each batch loss by its size so the final value
                    # is a per-sample average even if the last batch is short.
                    n = eval_batch["input_ids"].shape[0]
                    eval_loss += (float(eval_batch_loss) * n)
                    eval_num += n
            eval_loss = eval_loss / eval_num
            metric = get_metric(labels_list, preds_list, lengths_list, id2label)
            metric["loss"] = float(eval_loss)
            if best_f1 < metric["f1"]:
                # New best: reset the patience counter and save the weights.
                best_f1 = metric["f1"]
                stop_count = config["early_stopping"]
                paddle.save(model.state_dict(), config.checkpoint)
            else:
                stop_count -= 1
            metric["best_f1"] = best_f1
            log_item["eval"] = metric
            model.train()

        # Write the per-step log and close the file right away instead of
        # leaving one open handle per step.
        with open(os.path.join(config.log_dir, "%d.json" % step), "w") as log_file:
            json.dump(log_item, log_file)
        if stop_count < 0:
            to_stop = True
            break

        step += 1
        progress_bar.update(1)  # Move progress bar
        progress_bar.set_description(f'Training Epoch {epoch}, Best F1: {best_f1:.4f}')  # Update description
    progress_bar.close()  # Close the progress bar at the end of each epoch

    epoch += 1

In [None]:
# Model testing.
# Reload the saved checkpoint, evaluate it on the test split and store the
# configuration together with the test metrics as a JSON file in the result
# directory.
state_dict = paddle.load(config.checkpoint)
model.set_state_dict(state_dict)
model.eval()
eval_loss = 0.
eval_num = 0
preds_list = []
labels_list = []
lengths_list = []
test_dataset = SequenceTaggingDataset(
    test_data, config.bert_model_name, config.pos_map, config.label_map,
    max_length, tokenizer=tokenizer
)
test_dataloader = DataLoader(test_dataset, batch_size=config.batch_size)
# Inference only: no gradients are needed.
with paddle.no_grad():
    for batch_data in test_dataloader:
        preds, loss = model(**batch_data)
        preds_list.append(preds.numpy())
        labels_list.append(batch_data["labels"].numpy())
        lengths_list.append(batch_data["lengths"].numpy())

        # Weight each batch loss by its size to get a per-sample average.
        n = batch_data["input_ids"].shape[0]
        eval_loss += (float(loss) * n)
        eval_num += n
eval_loss = eval_loss / eval_num
metric = get_metric(labels_list, preds_list, lengths_list, id2label)
metric["loss"] = float(eval_loss)
config["test_metric"] = metric

# Write into the directory configured (and created) via config.result_base
# rather than a hard-coded "./result"; with the default setting the path is
# unchanged. UTF-8 is explicit because ensure_ascii=False emits non-ASCII
# characters verbatim.
result_path = os.path.join(config.result_base, "%s.json" % config.task_name)
with open(result_path, "w", encoding="utf-8") as result_file:
    json.dump(config, result_file, ensure_ascii=False)