## 0. 模型叠加预训练 
### 0.1 数据处理

![%E5%BE%AE%E4%BF%A1%E6%88%AA%E5%9B%BE_20230620175523.png](attachment:%E5%BE%AE%E4%BF%A1%E6%88%AA%E5%9B%BE_20230620175523.png)

### 0.2 模型预训练 

In [None]:
import os
from transformers import BertTokenizer, BertForPretraining, TextDatasetForNextSentencePrediction, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# 加载预训练模型和分词器
model = BertForPretraining.from_pretrained(os.path.join('.', 'model_bin'))
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin'))

# 创建数据集
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="your_corpus.txt", # 你的语料库文件路径
    block_size=128,
)

# 创建数据处理器
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# 创建训练参数
training_args = TrainingArguments(
    output_dir=os.path.join('.', 'model_bin_pretrain'), # 你的输出目录路径
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_steps=100,
)

# 创建训练器
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# 开始训练
trainer.train()

## 1. 模型构建与训练 

In [1]:
import torch
print(torch.__version__)

1.9.0+cu111


In [2]:
import torch
device=torch.device('cuda:0')

In [3]:
# 读取数据
def read_data(file):
    all_token_list = []
    all_label_list = []

    token_list = []
    label_list = []
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    lenth_sentence_list = [] #用于存储句子的长度
    for data in all_data:
        # 这里进行数据处理时，每一句话结束后都会有一个空行，当判断为空行时，说明结束，这一句话存入列表
        # 但是在我重新处理的语料中，根据句号.来对句子进行划分
        if data == ". O":
            # 统计一下大多数的句子的长度，太长或者太短的句子直接丢掉，不要保存在语料列表当中
            lenth_sentence_list.append(len(token_list))
            if len(token_list) > 10 and len(token_list) < 70:
                all_token_list.append(token_list)
                all_label_list.append(label_list)
            token_list = []
            label_list = []
        else:
            # print(data)
            token, lable = data.split(" ")
            token_list.append(token)
            label_list.append(lable)
    return all_token_list, all_label_list, lenth_sentence_list


# 以训练集为例构建标签及标签ID映射关系
def build_label(train_label):
    # PAD用于填充，UNK 出现不认识的label,则给一个UNK的标识
    label2idx = {"PAD": 0, "UNK": 1}
    for label_list in train_label:
        for lable in label_list:
            if lable not in label2idx:
                label2idx[lable] = len(label2idx)
    return label2idx, list(label2idx)

In [4]:
import os
from transformers import BertTokenizer

# train_text, train_label, train_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_train.txt")
# test_text, test_label, test_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_dev.txt")
# dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_dev.txt")

train_text, train_label, train_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_train.txt")
test_text, test_label, test_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_dev.txt")
dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_dev.txt")


# print(test_label, test_text)

# 定义标签和对应的数字ID
# PAD --> padding
# UNK --> unknown
# labels_list = ['PAD', 'UNK', 'O', 'B-BFA', 'I-BFA', 'B-BFO', 'I-BFO', 'B-BOP', 'I-BOP', 'B-BEN', 'I-BEN', 'B-BOR', 'I-BOR']
# FA 功能行为； FO 功能对象； OP 生物行为；OR 器官；EN 生物体
# 将生物器官和生物体合并为生物结构，看看生物行为的数量多不多，不多的话看一下用其他的办法获得吧

label2idx, idx2label = build_label(train_label)

print("label2idx为：", label2idx)
print("idx2label为：", idx2label)
print(len(train_lenth_sentence_list))    # 原本有525句子
print(len(train_text))       # 符合长度要求的只有254句子
print(len(dev_lenth_sentence_list))    # 原本有525句子
print(len(dev_text))       # 符合长度要求的只有254句子
print("train_lenth_sentence_list为：", train_lenth_sentence_list)

label2idx为： {'PAD': 0, 'UNK': 1, 'O': 2, 'B-BFA': 3, 'B-BFO': 4, 'I-BFO': 5, 'I-BFA': 6}
idx2label为： ['PAD', 'UNK', 'O', 'B-BFA', 'B-BFO', 'I-BFO', 'I-BFA']
164
157
41
38
train_lenth_sentence_list为： [15, 13, 48, 20, 16, 27, 31, 38, 27, 25, 23, 33, 13, 13, 25, 17, 20, 18, 34, 42, 21, 17, 25, 17, 23, 15, 22, 27, 9, 25, 28, 32, 38, 16, 17, 20, 35, 30, 35, 22, 23, 28, 31, 22, 55, 27, 20, 32, 14, 16, 23, 12, 20, 14, 9, 22, 30, 18, 24, 20, 27, 22, 33, 19, 30, 18, 26, 17, 14, 27, 13, 21, 18, 45, 52, 21, 21, 18, 20, 35, 23, 21, 24, 16, 17, 13, 27, 15, 16, 20, 27, 38, 27, 27, 16, 12, 12, 22, 12, 10, 23, 15, 14, 13, 25, 29, 10, 12, 16, 20, 35, 31, 47, 27, 23, 18, 19, 21, 32, 24, 20, 21, 19, 28, 10, 11, 27, 27, 23, 16, 34, 13, 19, 15, 19, 16, 23, 32, 10, 11, 19, 27, 17, 26, 19, 26, 29, 16, 22, 14, 21, 13, 25, 17, 70, 44, 31, 22, 29, 27, 30, 18, 24, 37]


In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 调用torch中bert的分词器
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin'))

# 构建数据集
class BertDataset(Dataset):
    def __init__(self, all_text, all_label, label2idx, max_len, tokenizer, is_test=False):
        self.all_text = all_text
        self.all_label = all_label
        self.label2idx = label2idx
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __getitem__(self, index):
        if self.is_test:
            self.max_len = len(self.all_label[index])

        # 按批次获得数据
        text = self.all_text[index]
        # 0到max_len的词有对应的label,后面的用padding（转换为数字张量之后就是0了）补充
        label = self.all_label[index][:self.max_len]
        
        # 这里的encode函数要做的事：
        # 对词进行编码，将词转换为在词表中对应的数字
        # 对长度大于max_length的语句进行截断（超出了max_length的直接没有了），对长度不足max_length的语句进行padding填充
        # 添加特殊的标记符
        # 返回tensor类型的张量，即将词进行编码为torch张量
        text_index = self.tokenizer.encode(text,
                                           add_special_tokens=True,
                                           max_length=self.max_len + 2,        # 加上两个左右标识符
                                           padding="max_length",
                                           truncation=True,
                                           return_tensors="pt")        # pt 返回pytorch的张量
        label_index = [0] + [self.label2idx.get(l, 1) for l in label] + [0] + [0] * (max_len - len(text))
        # 将编码后的label转换为torch张量
        label_index = torch.tensor(label_index)
        # 统一text_index和label_index的形状和维度
        return text_index.reshape(-1), label_index, len(label)

    def __len__(self):
        return self.all_text.__len__()

2023-06-13 22:56:18.111102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-13 22:56:18.238539: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-13 22:56:18.780902: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-13 22:56:18.780950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [6]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF


batch_size = 16
max_len = 70

# 训练+测试

train_dataset = BertDataset(train_text,
                            train_label,
                            label2idx,
                            max_len,
                            tokenizer)
# 在处理好的语料中随机获得batch_size个句子，作为一批语料输入模型，shuffle是否要随机获得语料
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=False)

dev_dataset = BertDataset(dev_text,
                          dev_label,
                          label2idx,
                          max_len,
                          tokenizer)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=batch_size,
                            shuffle=False)

print("train_dataset的数据类型为： ", type(train_dataset))

train_dataset的数据类型为：  <class '__main__.BertDataset'>


In [7]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 构建基于Bert_LSTM_CRF的模型
class Bert_LSTM_CRF_NerModel(nn.Module):
    def __init__(self, lstm_hidden, class_num, dropout_rate=0.3):
        super().__init__()

        self.bert = BertModel.from_pretrained(os.path.join('.', 'model_bin'))
        
        # 为了省显存做的，如果不想省可以删掉
        for name, param in self.bert.named_parameters():
            param.requires_grad = False
            
        # bidirectional 调成True的时候，Linear层的lstm_hidden需要乘2
        self.lstm = nn.LSTM(768,
                            lstm_hidden,
                            batch_first=True,
                            num_layers=1,
                            bidirectional=True)
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # 线性分类器
        self.classifier = nn.Linear(lstm_hidden*2, class_num)
        
        # 条件随机场
        self.crf = CRF(class_num, batch_first=True)

    # 前馈层
    def forward(self, batch_index, batch_label=None):   
        # 传了label数据的时候，就是训练，就要看loss,没传label数据的时候，就直接对分类预测的结果pre进行解码
        if batch_label is not None:
            # 按批次传入数据到bert编码层
            bert_out = self.bert(batch_index)
            # bert_out0 字符级别的特征信息 bert_out1 篇章级别的特征信息
            bert_out0, bert_out1 = bert_out[0], bert_out[1]  
            # 命名实体识别是序列标注任务，因此需要bert_out0，字符级别的特征信息
            lstm_out, _ = self.lstm(bert_out0)
            lstm_out = self.dropout(lstm_out)
            # 调用线性分类器完成预测任务
            pre = self.classifier(lstm_out)
            # loss = self.loss_fun(pre.reshape(-1, pre.shape[-1]),batch_label.reshape(-1))
            loss = -self.crf(pre, batch_label)
            return loss
        else:
            # 按批次传入数据到bert编码层
            bert_out = self.bert(batch_index)
            # bert_out0 字符级别的特征信息 bert_out1 篇章级别的特征信息
            bert_out0, bert_out1 = bert_out[0], bert_out[1]  
            # 命名实体识别是序列标注任务，因此需要bert_out0，字符级别的特征信息
            lstm_out, _ = self.lstm(bert_out0)
            # 调用线性分类器完成预测任务
            pre = self.classifier(lstm_out)
            
            pre = self.crf.decode(pre)
            return pre

In [8]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

epoch = 200
lr = 0.0005
lstm_hidden = 128
# 根据“训练+验证” “测试” “输入” 三种状态选择True或False
do_train = True
do_test = False
do_input = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:0


In [9]:
if do_train:
        # 加载模型，并将模型放到cuda上
        model = Bert_LSTM_CRF_NerModel(lstm_hidden, len(label2idx)).cuda()
        
        # opt是优化器，这里采用Adam的优化方式
        opt = AdamW(model.parameters(), lr)

        best_score = -1
        for e in range(epoch):
            model.train()
            batch_index = 1    # 设置该变量记录当前是训练模型的第几批，每一批都有16个句子，所以总共有12批    
            for batch_text_index, batch_label_index, batch_len in train_dataloader:
                # 按批获得数据，数据放入cuda中
                batch_text_index = batch_text_index.cuda()
                batch_label_index = batch_label_index.cuda()
                # 数据读入读入模型中，获得训练的loss
                loss = model.forward(batch_text_index, batch_label_index)
                loss.backward()
                # opt优化器优化
                opt.step()
                # 梯度归0
                opt.zero_grad()
                
                print("当前的批次为：", batch_index)
                print(f'loss{loss:.2f}')
                batch_index = batch_index + 1
            
            print('-----------------------------------------------')
            print('********  have trained ' + str(e+1) + ' epoch!  ********')
            print('-----------------------------------------------')
            
            # 模型训练结束之后，开始进行测试
            model.eval()
            all_pre = []    # 记录所有的预测label
            all_tag = []    # 记录所有的真实label
            for batch_text_index, batch_label_index, batch_len in dev_dataloader:
                batch_text_index = batch_text_index.cuda()
                batch_label_index = batch_label_index.cuda()
                # 获得模型预测的结果
                pre = model.forward(batch_text_index)
                # 加载真实的label
                tag = batch_label_index.tolist()

                for p, t, l in zip(pre, tag, batch_len):
                    p = p[1:1 + l]
                    t = t[1:1 + l]

                    p = [idx2label[i] for i in p]
                    t = [idx2label[i] for i in t]

                    all_pre.append(p)
                    all_tag.append(t)

                # 获得f1_score
                f1_score = seqeval_f1_score(all_tag, all_pre)
                # 记录最好的f1_score结果，并将得到该结果的模型文件.pt保存下来
                if f1_score > best_score:
                    torch.save(model, 'best_modelBio.pt')
                    best_score = f1_score
                    
                print(f'********  best_score:{best_score}  ********')
                print('-----------------------------------------------')
                print(f'********  f1_score:{f1_score}  ********')
                print('-----------------------------------------------')
                

Some weights of the model checkpoint at ./model_bin were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  score = torch.where(mask[i].unsqueeze(1), next_score, score)


当前的批次为： 1
loss1949.40
当前的批次为： 2
loss1503.77
当前的批次为： 3
loss1187.31
当前的批次为： 4
loss862.86
当前的批次为： 5
loss784.64
当前的批次为： 6
loss611.41
当前的批次为： 7
loss540.64
当前的批次为： 8
loss537.69
当前的批次为： 9
loss396.58
当前的批次为： 10
loss368.39
-----------------------------------------------
********  have trained 1 epoch!  ********
-----------------------------------------------




********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
当前的批次为： 1
loss480.73
当前的批次为： 2
loss428.58
当前的批次为： 3
loss355.35
当前的批次为： 4
loss271.70
当前的批次为： 5
loss389.60
当前的批次为： 6
loss318.78
当前的批次为： 7
loss304.48
当前的批次为： 8
loss382.01
当前的批次为： 9
loss266.94
当前的批次为： 10
loss287.47
-----------------------------------------------
********  have trained 2 epoch!  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-------------------------------------------

当前的批次为： 8
loss232.31
当前的批次为： 9
loss174.46
当前的批次为： 10
loss194.81
-----------------------------------------------
********  have trained 11 epoch!  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
当前的批次为： 1
loss259.90
当前的批次为： 2
loss243.78
当前的批次为： 3
loss206.80
当前的批次为： 4
loss172.12
当前的批次为： 5
loss223.40
当前的批次为： 6
loss179.32
当前的批次为： 7
loss195.06
当前的批次为： 8
loss235.49
当前的批次为： 9
loss172.68
当前的批次为： 10
loss190.50
-----------------------------------------------
********  have trained 12 epoch!  ********
---

当前的批次为： 2
loss215.91
当前的批次为： 3
loss173.98
当前的批次为： 4
loss155.91
当前的批次为： 5
loss171.62
当前的批次为： 6
loss148.81
当前的批次为： 7
loss157.58
当前的批次为： 8
loss201.20
当前的批次为： 9
loss149.08
当前的批次为： 10
loss162.55
-----------------------------------------------
********  have trained 21 epoch!  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
当前的批次为： 1
loss199.43
当前的批次为： 2
loss212.36
当前的批次为： 3
loss171.71
当前的批次为： 4
loss157.20
当前的批次为： 5
loss165.35
当前的批次为： 6
loss143.04
当前的批次为： 7
loss156.24
当前的批次为： 8
loss187.37
当前的批次为： 9
l

当前的批次为： 2
loss178.72
当前的批次为： 3
loss138.07
当前的批次为： 4
loss115.94
当前的批次为： 5
loss126.83
当前的批次为： 6
loss117.74
当前的批次为： 7
loss135.44
当前的批次为： 8
loss155.24
当前的批次为： 9
loss119.58
当前的批次为： 10
loss126.59
-----------------------------------------------
********  have trained 31 epoch!  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.0  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
当前的批次为： 1
loss127.50
当前的批次为： 2
loss160.73
当前的批次为： 3
loss125.71
当前的批次为： 4
loss121.67
当前的批次为： 5
loss119.12
当前的批次为： 6
loss103.26
当前的批次为： 7
loss118.41
当前的批次为： 8
loss136.93
当前的批次为： 9
l

当前的批次为： 1
loss102.46
当前的批次为： 2
loss141.46
当前的批次为： 3
loss96.04
当前的批次为： 4
loss82.68
当前的批次为： 5
loss90.38
当前的批次为： 6
loss75.30
当前的批次为： 7
loss91.72
当前的批次为： 8
loss107.20
当前的批次为： 9
loss78.81
当前的批次为： 10
loss116.95
-----------------------------------------------
********  have trained 41 epoch!  ********
-----------------------------------------------
********  best_score:0.12500000000000003  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.12500000000000003  ********
-----------------------------------------------
********  f1_score:0.0392156862745098  ********
-----------------------------------------------
********  best_score:0.12500000000000003  ********
-----------------------------------------------
********  f1_score:0.03508771929824562  ********
-----------------------------------------------
当前的批次为： 1
loss104.46
当前的批次为： 2
loss119.55
当前的批次为： 3
loss101.97
当前的批次为： 4
loss77.33
当前

当前的批次为： 3
loss75.62
当前的批次为： 4
loss87.18
当前的批次为： 5
loss69.47
当前的批次为： 6
loss68.46
当前的批次为： 7
loss69.81
当前的批次为： 8
loss77.60
当前的批次为： 9
loss65.40
当前的批次为： 10
loss94.45
-----------------------------------------------
********  have trained 50 epoch!  ********
-----------------------------------------------
********  best_score:0.1818181818181818  ********
-----------------------------------------------
********  f1_score:0.1818181818181818  ********
-----------------------------------------------
********  best_score:0.1818181818181818  ********
-----------------------------------------------
********  f1_score:0.12698412698412698  ********
-----------------------------------------------
********  best_score:0.1818181818181818  ********
-----------------------------------------------
********  f1_score:0.1643835616438356  ********
-----------------------------------------------
当前的批次为： 1
loss65.63
当前的批次为： 2
loss102.50
当前的批次为： 3
loss73.31
当前的批次为： 4
loss65.28
当前的批次为： 5
loss71.38
当前的批次为： 6
loss64

当前的批次为： 1
loss55.73
当前的批次为： 2
loss72.61
当前的批次为： 3
loss48.22
当前的批次为： 4
loss57.99
当前的批次为： 5
loss62.54
当前的批次为： 6
loss47.56
当前的批次为： 7
loss65.63
当前的批次为： 8
loss66.60
当前的批次为： 9
loss53.87
当前的批次为： 10
loss66.56
-----------------------------------------------
********  have trained 59 epoch!  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.06451612903225805  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.06349206349206349  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.08333333333333334  ********
-----------------------------------------------
当前的批次为： 1
loss48.50
当前的批次为： 2
loss65.92
当前的批次为： 3
loss45.04
当前的批次为： 4
lo

当前的批次为： 1
loss91.60
当前的批次为： 2
loss103.60
当前的批次为： 3
loss66.25
当前的批次为： 4
loss45.29
当前的批次为： 5
loss44.26
当前的批次为： 6
loss100.56
当前的批次为： 7
loss109.20
当前的批次为： 8
loss114.53
当前的批次为： 9
loss66.30
当前的批次为： 10
loss70.61
-----------------------------------------------
********  have trained 68 epoch!  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.11764705882352941  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.16071428571428573  ********
-----------------------------------------------
********  best_score:0.18749999999999997  ********
-----------------------------------------------
********  f1_score:0.15748031496062992  ********
-----------------------------------------------
当前的批次为： 1
loss51.93
当前的批次为： 2
loss130.24
当前的批次为： 3
loss95.37
当前的批次为：

当前的批次为： 3
loss37.83
当前的批次为： 4
loss35.46
当前的批次为： 5
loss37.17
当前的批次为： 6
loss28.42
当前的批次为： 7
loss44.71
当前的批次为： 8
loss37.42
当前的批次为： 9
loss34.49
当前的批次为： 10
loss47.72
-----------------------------------------------
********  have trained 77 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05555555555555555  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.07792207792207793  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0898876404494382  ********
-----------------------------------------------
当前的批次为： 1
loss36.25
当前的批次为： 2
loss46.80
当前的批次为： 3
loss35.18
当前的批次为： 4
loss45.01
当前的批次为： 5
loss32.45
当前的批次为： 6
loss28

当前的批次为： 3
loss27.75
当前的批次为： 4
loss29.64
当前的批次为： 5
loss32.90
当前的批次为： 6
loss28.69
当前的批次为： 7
loss35.33
当前的批次为： 8
loss30.76
当前的批次为： 9
loss23.13
当前的批次为： 10
loss35.59
-----------------------------------------------
********  have trained 86 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.060606060606060615  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.08450704225352111  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0975609756097561  ********
-----------------------------------------------
当前的批次为： 1
loss33.82
当前的批次为： 2
loss33.77
当前的批次为： 3
loss26.02
当前的批次为： 4
loss25.70
当前的批次为： 5
loss29.55
当前的批次为： 6
loss2

当前的批次为： 3
loss23.67
当前的批次为： 4
loss22.41
当前的批次为： 5
loss18.90
当前的批次为： 6
loss25.88
当前的批次为： 7
loss26.30
当前的批次为： 8
loss29.06
当前的批次为： 9
loss19.31
当前的批次为： 10
loss27.86
-----------------------------------------------
********  have trained 95 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06250000000000001  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06060606060606061  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.08  ********
-----------------------------------------------
当前的批次为： 1
loss59.96
当前的批次为： 2
loss26.79
当前的批次为： 3
loss19.34
当前的批次为： 4
loss24.88
当前的批次为： 5
loss22.89
当前的批次为： 6
loss22.39
当前的批次为： 7


当前的批次为： 3
loss19.14
当前的批次为： 4
loss23.43
当前的批次为： 5
loss25.90
当前的批次为： 6
loss25.60
当前的批次为： 7
loss23.06
当前的批次为： 8
loss20.00
当前的批次为： 9
loss20.20
当前的批次为： 10
loss30.23
-----------------------------------------------
********  have trained 104 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.046511627906976744  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.08421052631578947  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.09174311926605505  ********
-----------------------------------------------
当前的批次为： 1
loss26.43
当前的批次为： 2
loss34.64
当前的批次为： 3
loss23.87
当前的批次为： 4
loss23.26
当前的批次为： 5
loss23.98
当前的批次为： 6
los

当前的批次为： 2
loss26.63
当前的批次为： 3
loss13.83
当前的批次为： 4
loss17.96
当前的批次为： 5
loss14.88
当前的批次为： 6
loss15.85
当前的批次为： 7
loss29.08
当前的批次为： 8
loss17.31
当前的批次为： 9
loss16.37
当前的批次为： 10
loss20.88
-----------------------------------------------
********  have trained 113 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.12500000000000003  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.11594202898550723  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.1282051282051282  ********
-----------------------------------------------
当前的批次为： 1
loss18.98
当前的批次为： 2
loss28.56
当前的批次为： 3
loss14.71
当前的批次为： 4
loss16.26
当前的批次为： 5
loss2

当前的批次为： 2
loss17.06
当前的批次为： 3
loss15.24
当前的批次为： 4
loss16.87
当前的批次为： 5
loss12.31
当前的批次为： 6
loss16.24
当前的批次为： 7
loss16.37
当前的批次为： 8
loss17.97
当前的批次为： 9
loss11.75
当前的批次为： 10
loss19.65
-----------------------------------------------
********  have trained 122 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.029850746268656712  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05263157894736842  ********
-----------------------------------------------
当前的批次为： 1
loss17.96
当前的批次为： 2
loss16.50
当前的批次为： 3
loss17.06
当前的批次为： 4
loss17.99
当前的批次为： 5
loss13.56
当前的批次为： 6

当前的批次为： 2
loss17.30
当前的批次为： 3
loss19.03
当前的批次为： 4
loss10.86
当前的批次为： 5
loss13.90
当前的批次为： 6
loss11.86
当前的批次为： 7
loss18.33
当前的批次为： 8
loss12.42
当前的批次为： 9
loss12.34
当前的批次为： 10
loss22.31
-----------------------------------------------
********  have trained 131 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.052631578947368425  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06818181818181818  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0808080808080808  ********
-----------------------------------------------
当前的批次为： 1
loss16.68
当前的批次为： 2
loss14.34
当前的批次为： 3
loss12.77
当前的批次为： 4
loss13.10
当前的批次为： 5
loss

当前的批次为： 2
loss17.85
当前的批次为： 3
loss8.27
当前的批次为： 4
loss9.60
当前的批次为： 5
loss9.05
当前的批次为： 6
loss12.82
当前的批次为： 7
loss17.67
当前的批次为： 8
loss9.56
当前的批次为： 9
loss9.37
当前的批次为： 10
loss14.80
-----------------------------------------------
********  have trained 140 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.049999999999999996  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.04494382022471911  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.058823529411764705  ********
-----------------------------------------------
当前的批次为： 1
loss14.25
当前的批次为： 2
loss12.29
当前的批次为： 3
loss9.06
当前的批次为： 4
loss9.85
当前的批次为： 5
loss9.58


当前的批次为： 2
loss12.48
当前的批次为： 3
loss9.30
当前的批次为： 4
loss10.99
当前的批次为： 5
loss8.77
当前的批次为： 6
loss7.15
当前的批次为： 7
loss15.46
当前的批次为： 8
loss8.25
当前的批次为： 9
loss15.68
当前的批次为： 10
loss11.79
-----------------------------------------------
********  have trained 149 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05882352941176471  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.08450704225352111  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.1  ********
-----------------------------------------------
当前的批次为： 1
loss10.99
当前的批次为： 2
loss16.44
当前的批次为： 3
loss7.96
当前的批次为： 4
loss8.26
当前的批次为： 5
loss7.74
当前的批次为： 6
loss8.7

当前的批次为： 2
loss9.94
当前的批次为： 3
loss4.83
当前的批次为： 4
loss11.40
当前的批次为： 5
loss6.61
当前的批次为： 6
loss15.95
当前的批次为： 7
loss16.61
当前的批次为： 8
loss6.19
当前的批次为： 9
loss5.04
当前的批次为： 10
loss11.30
-----------------------------------------------
********  have trained 158 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05882352941176471  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05128205128205128  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06896551724137932  ********
-----------------------------------------------
当前的批次为： 1
loss12.42
当前的批次为： 2
loss12.34
当前的批次为： 3
loss5.56
当前的批次为： 4
loss5.53
当前的批次为： 5
loss11.11
当

当前的批次为： 2
loss9.80
当前的批次为： 3
loss7.00
当前的批次为： 4
loss9.57
当前的批次为： 5
loss4.99
当前的批次为： 6
loss5.73
当前的批次为： 7
loss11.91
当前的批次为： 8
loss5.56
当前的批次为： 9
loss8.31
当前的批次为： 10
loss12.13
-----------------------------------------------
********  have trained 167 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.07142857142857144  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06153846153846154  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0821917808219178  ********
-----------------------------------------------
当前的批次为： 1
loss9.56
当前的批次为： 2
loss18.67
当前的批次为： 3
loss5.73
当前的批次为： 4
loss13.35
当前的批次为： 5
loss4.64
当前的批次

当前的批次为： 1
loss8.55
当前的批次为： 2
loss8.21
当前的批次为： 3
loss5.67
当前的批次为： 4
loss6.09
当前的批次为： 5
loss5.10
当前的批次为： 6
loss4.36
当前的批次为： 7
loss7.97
当前的批次为： 8
loss5.38
当前的批次为： 9
loss8.69
当前的批次为： 10
loss7.99
-----------------------------------------------
********  have trained 176 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05555555555555555  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.048192771084337345  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06521739130434782  ********
-----------------------------------------------
当前的批次为： 1
loss7.73
当前的批次为： 2
loss9.09
当前的批次为： 3
loss4.11
当前的批次为： 4
loss5.33
当前的批次为：

当前的批次为： 1
loss6.24
当前的批次为： 2
loss6.28
当前的批次为： 3
loss4.17
当前的批次为： 4
loss5.36
当前的批次为： 5
loss3.55
当前的批次为： 6
loss6.19
当前的批次为： 7
loss7.25
当前的批次为： 8
loss4.77
当前的批次为： 9
loss3.87
当前的批次为： 10
loss6.23
-----------------------------------------------
********  have trained 185 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05714285714285715  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.05063291139240506  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.06818181818181819  ********
-----------------------------------------------
当前的批次为： 1
loss6.35
当前的批次为： 2
loss7.47
当前的批次为： 3
loss7.35
当前的批次为： 4
loss6.15
当前的批次为： 

当前的批次为： 1
loss4.50
当前的批次为： 2
loss8.11
当前的批次为： 3
loss6.86
当前的批次为： 4
loss6.66
当前的批次为： 5
loss5.80
当前的批次为： 6
loss5.43
当前的批次为： 7
loss9.97
当前的批次为： 8
loss5.15
当前的批次为： 9
loss4.47
当前的批次为： 10
loss10.13
-----------------------------------------------
********  have trained 194 epoch!  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.0975609756097561  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.10869565217391304  ********
-----------------------------------------------
********  best_score:0.2580645161290322  ********
-----------------------------------------------
********  f1_score:0.11881188118811882  ********
-----------------------------------------------
当前的批次为： 1
loss6.95
当前的批次为： 2
loss5.82
当前的批次为： 3
loss3.89
当前的批次为： 4
loss6.88
当前的批次为： 

In [None]:
print(best_score)
print(len(all_pre))

In [None]:
all_pre

In [None]:
all_tag

## 2 使用新的模型预测数据 

In [None]:
# 读取数据
def read_data(file):
    all_token_list = []
    all_label_list = []

    token_list = []
    label_list = []
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    lenth_sentence_list = [] #用于存储句子的长度
    for data in all_data:
        # 但是在我重新处理的语料中，根据句号.来对句子进行划分
        if len(data) > 0:
            if data == ". O":
                # 统计一下大多数的句子的长度，太长或者太短的句子直接丢掉，不要保存在语料列表当中
                if len(token_list) > 0: 
                    lenth_sentence_list.append(len(token_list))
                    all_token_list.append(token_list)
                    all_label_list.append(label_list)
                token_list = []
                label_list = []
            else:
                token, lable = data.split(" ")
                token_list.append(token)
                label_list.append(lable)
    return all_token_list, all_label_list, lenth_sentence_list
dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/original_data/biological/bio_dev_data.txt")
print("最长的句子为：", max(dev_lenth_sentence_list))
print("有多少条句子：", len(dev_lenth_sentence_list))

In [None]:
import os
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 调用torch中bert的分词器
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin'))

# 构建数据集
class BertDataset(Dataset):
    def __init__(self, all_text, all_label, label2idx, max_len, tokenizer, is_test=False):
        self.all_text = all_text
        self.all_label = all_label
        self.label2idx = label2idx
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __getitem__(self, index):
        if self.is_test:
            self.max_len = len(self.all_label[index])

        # 按批次获得数据
        text = self.all_text[index]
        # 0到max_len的词有对应的label,后面的用padding（转换为数字张量之后就是0了）补充
        label = self.all_label[index][:self.max_len]
        
        # 这里的encode函数要做的事：
        # 对词进行编码，将词转换为在词表中对应的数字
        # 对长度大于max_length的语句进行截断（超出了max_length的直接没有了），对长度不足max_length的语句进行padding填充
        # 添加特殊的标记符
        # 返回tensor类型的张量，即将词进行编码为torch张量
        text_index = self.tokenizer.encode(text,
                                           add_special_tokens=True,
                                           max_length=self.max_len + 2,        # 加上两个左右标识符
                                           padding="max_length",
                                           truncation=True,
                                           return_tensors="pt")        # pt 返回pytorch的张量
        label_index = [0] + [self.label2idx.get(l, 1) for l in label] + [0] + [0] * (max_len - len(text))
        # 将编码后的label转换为torch张量
        label_index = torch.tensor(label_index)
        # 统一text_index和label_index的形状和维度
        return text_index.reshape(-1), label_index, len(label)

    def __len__(self):
        return self.all_text.__len__()

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF


batch_size = 16
max_len = max(dev_lenth_sentence_list)

dev_dataset = BertDataset(dev_text,
                          dev_label,
                          label2idx,
                          max_len,
                          tokenizer)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=batch_size,
                            shuffle=False)

print("train_dataset的数据类型为： ", type(train_dataset))

In [None]:
model.eval()
all_pre = []    # 记录所有的预测label
all_tag = []    # 记录所有的真实label
batch_index = 1
for batch_text_index, batch_label_index, batch_len in dev_dataloader:
    batch_text_index = batch_text_index.cuda()
    batch_label_index = batch_label_index.cuda()
    # 获得模型预测的结果(这里是一个批次的数据)
    pre = model.forward(batch_text_index)
    # 加载真实的label
    tag = batch_label_index.tolist()

    for p, t, l in zip(pre, tag, batch_len):
        p = p[1:1 + l]
        t = t[1:1 + l]

        p = [idx2label[i] for i in p]
        t = [idx2label[i] for i in t]

        all_pre.append(p)
        all_tag.append(t)

    batch_index = batch_index + 1

In [None]:
print("语料的总批次为：", batch_index)
print("label的总个数为：", len(all_pre))
print("字符的总个数为：", len(dev_text))

## 3 将模型预测的结果写入文件 

In [None]:
word2label_list = []
for i in range(len(dev_text)):
    word2label_dic = {}
    for j in range(len(dev_text[i])):
        word2label_dic[dev_text[i][j]] = all_pre[i][j]
    word2label_list.append(word2label_dic)
print("字符-label对的总个数为：", len(word2label_list))

In [None]:
word2label_list

In [None]:
with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/1.txt", "w", encoding="utf-8") as file:
    for i in range(len(word2label_list)):
        for key, value in word2label_list[i].items():
            file.write(f"{key} {value}\n")
        file.write(". O\n")

## 4 将预测的结果转换格式（有预处理的格式转换为一篇文章下属的实体及label的格式） 

In [None]:
def parse_predict_results(file_path):
    """
    Parse results from predicted file.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    id2results = {}
    category = ""
    category_set = set()
    for line in lines:
        try:
            category = line.split(' ')[1].split('-')[1].strip()
        except:
            pass
        if category in ('BFA', 'BFO', 'BOR', 'BEN', 'BOP'):
            category_set.add(category)

    cur_id = ""
    extracted_cnt = 0
    category = ""
    for line in lines:
        if line.startswith('##id:'):
            new_id = line[5:].split(" ")[0].strip()
            if cur_id != new_id:
                if extracted_cnt > 0 :
                    id2results[cur_id] = result_dict
                result_dict = {}
                for category in category_set:
                    result_dict[category] = set()
                cur_id = new_id
                extracted_cnt += 1
            label_word = ''

        else:
            label = line.split(' ')[1].strip()
            if line.split(' ')[0].startswith('##'):
                token = line.split(' ')[0][2:]
            else:
                token = line.split(' ')[0]
            if (label == 'O') or (len(label) == 0):
                if len(label_word) != 0:
                    result_dict[category].add(label_word)
                    category = ""
                    label_word = ""
            elif label.startswith('B-'):
                if len(label_word) != 0:
                    result_dict[category].add(label_word)
                    category = ""
                label_word = token
                category = label.split('-')[1].strip()
            elif label.startswith('I-'):
                if len(category) > 0:
                    label_word += " " + token
    id2results[cur_id] = result_dict
    return id2results

In [None]:
doc_word_dict = parse_predict_results("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/标注后的数据/1.txt")

In [None]:
doc_word_dict

## 5.将最终格式写入csv文件，这里包括了文章和关键词实体，以及文章和关键词实体之间的关系 

In [None]:
import json
with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/original_data/biological/id和标题的对应关系.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        id2doc_dict = json.loads(line)
print("文章的篇数为：", len(id2doc_dict))

# 获得 id2doc_dict中的所有键，也就是所有的文章id
id_list = list(id2doc_dict.keys())

with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/格式转换后的数据/doc_BEN.csv", "w", encoding="utf-8") as file:
    file.write("文章,BEN,LABEL\n")
    for key, value in doc_word_dict.items():
        id = key
        for i in range(len(id_list)):
            if id == id_list[i][5:]:
                title = id2doc_dict[id_list[i]]    
        try:
            # BEN的实体类别是集合
            BEN_set = value["BEN"]
            for ele in BEN_set:
                file.write(title)
                file.write(",")
                file.write(ele)
                file.write(",")
                file.write("has_BEN")
                file.write("\n")
        except:
            continue
        