## 0. 模型叠加预训练 
### 0.1 数据处理


In [1]:
import csv

def remove_quotes(string):
    if string.startswith("'") and string.endswith("'"):
        string = string.strip("'")
    elif string.startswith('"') and string.endswith('"'):
        string = string.strip('"')
    return string

with open('/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/data/original_data/before_translate/biological_strategy.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # 跳过表头
    next(reader)
    
    # 逐行读取数据, 并将需要叙述存入列表中
    text_list = []
    
    for row in reader:
        id = '##id:'+ row[0]
        first_fun = row[1]
        second_fun = row[2]
        third_fun = row[3]
        title = row[4]
        URL = row[5]
        introduction = row[6]
        strategy = row[7]
        potential = row[8]
        related_innovation = row[9]
        related_strategy = row[10]
        reference = row[11]
        
        # 删除字符串首尾的‘’符号
        introduction = remove_quotes(introduction)
        strategy = remove_quotes(strategy).replace("', '",", ")
        potential = remove_quotes(potential).replace("', '",". ")
        
        # 将introduction、strategy、potential合并为段落
        content = introduction + strategy + potential
        
        # 按照标点符号划分句子
        content_list = []
        content_list = content.split('. ')
        
        
        # 将文章编号、标题、introduction、strategy、potential合并，并处理成标注需要的格式
        for i in range(len(content_list)):
            text = content_list[i] + '.'
            text_list.append(text)
print("语料总量的句子数为：", len(text_list))

语料总量的句子数为： 8001


In [2]:
content_list

['Shellfish that live in salt water environments have an easily accessible source of the calcium and carbonate ions needed to build new shells',
 'In contrast, those that live in freshwater environments, such as the common pond snail, need to develop clever mechanisms for obtaining those resources since the availability of dissolved calcium and carbonate ions is significantly less than that of their marine cousins.When resources of calcium ions are particularly low, the organism maintains critical calcium requirements for new shell formation by cycling internal sources from previously formed shell',
 'Cells create a driving force for the uptake of calcium ions by utilizing the hydrogen ions generated from dissolved carbon dioxide',
 'The hydrogen ions essentially exit the cell through a revolving ,  door through which calcium ions summarily enter the cell.']

### 0.2 模型预训练 

In [3]:
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import os

# 自定义语料库的示例数据
corpus = text_list

# 定义自定义数据集类
class CustomDataset(Dataset):
    def __init__(self, corpus, tokenizer):
        self.corpus = corpus
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, index):
        sentence = self.corpus[index]
        tokens = self.tokenizer.tokenize(sentence)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = [self.tokenizer.cls_token_id] + token_ids + [self.tokenizer.sep_token_id]
        segment_ids = [0] * len(input_ids)  # 单句子任务，所有token属于同一个segment
        return torch.tensor(input_ids), torch.tensor(segment_ids)

# 初始化BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin'))
model = BertForMaskedLM.from_pretrained(os.path.join('.', 'model_bin'))

# 定义自定义数据加载器
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    segment_ids = [item[1] for item in batch]
    input_ids = pad_sequence(input_ids, batch_first=True)
    segment_ids = pad_sequence(segment_ids, batch_first=True)
    attention_mask = (input_ids != tokenizer.pad_token_id)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': segment_ids
    }

# 创建数据集和数据加载器
dataset = CustomDataset(corpus, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)

# 模型训练设置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
total_epochs = 20

# 模型训练
for epoch in range(total_epochs):
    model.train()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=input_ids  # 使用标签代替masked_lm_labels参数
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1}/{total_epochs} - Loss: {loss.item()}")

# 保存训练好的模型
model.save_pretrained(os.path.join('.', 'model_bin_pretrain'))
tokenizer.save_pretrained(os.path.join('.', 'model_bin_pretrain'))

Some weights of the model checkpoint at ./model_bin were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/20 - Loss: 9.045442857313901e-05
Epoch 2/20 - Loss: 0.0001546507264720276
Epoch 3/20 - Loss: 3.7101839552633464e-05
Epoch 4/20 - Loss: 0.00014673463010694832
Epoch 5/20 - Loss: 9.13749317987822e-06
Epoch 6/20 - Loss: 5.9845806390512735e-06
Epoch 7/20 - Loss: 7.906241080490872e-06
Epoch 8/20 - Loss: 2.547132453400991e-06
Epoch 9/20 - Loss: 1.2645373317354824e-05
Epoch 10/20 - Loss: 4.191711468592985e-06
Epoch 11/20 - Loss: 2.033827968261903e-06
Epoch 12/20 - Loss: 1.411029188602697e-06
Epoch 13/20 - Loss: 8.417607091359969e-07
Epoch 14/20 - Loss: 8.101310413621832e-07
Epoch 15/20 - Loss: 5.157610303285765e-07
Epoch 16/20 - Loss: 6.519995849885163e-07
Epoch 17/20 - Loss: 5.230601232142362e-07
Epoch 18/20 - Loss: 2.031359372267616e-06
Epoch 19/20 - Loss: 2.5495044155832147e-06
Epoch 20/20 - Loss: 6.106406544859055e-07


('./model_bin_pretrain/tokenizer_config.json',
 './model_bin_pretrain/special_tokens_map.json',
 './model_bin_pretrain/vocab.txt',
 './model_bin_pretrain/added_tokens.json')

## 1. 模型构建与训练 

In [4]:
import torch
print(torch.__version__)

1.9.0+cu111


In [5]:
import torch
device=torch.device('cuda:0')

In [6]:
# 读取数据
def read_data(file):
    all_token_list = []
    all_label_list = []

    token_list = []
    label_list = []
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    lenth_sentence_list = [] #用于存储句子的长度
    for data in all_data:
        # 这里进行数据处理时，每一句话结束后都会有一个空行，当判断为空行时，说明结束，这一句话存入列表
        # 但是在我重新处理的语料中，根据句号.来对句子进行划分
        if data == ". O":
            # 统计一下大多数的句子的长度，太长或者太短的句子直接丢掉，不要保存在语料列表当中
            lenth_sentence_list.append(len(token_list))
            if len(token_list) > 10 and len(token_list) < 70:
                all_token_list.append(token_list)
                all_label_list.append(label_list)
            token_list = []
            label_list = []
        else:
            # print(data)
            token, lable = data.split(" ")
            token_list.append(token)
            label_list.append(lable)
    return all_token_list, all_label_list, lenth_sentence_list


# 以训练集为例构建标签及标签ID映射关系
def build_label(train_label):
    # PAD用于填充，UNK 出现不认识的label,则给一个UNK的标识
    label2idx = {"PAD": 0, "UNK": 1}
    for label_list in train_label:
        for lable in label_list:
            if lable not in label2idx:
                label2idx[lable] = len(label2idx)
    return label2idx, list(label2idx)

In [7]:
import os
from transformers import BertTokenizer

# train_text, train_label, train_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_train.txt")
# test_text, test_label, test_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_dev.txt")
# dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_S_dev.txt")

train_text, train_label, train_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_train.txt")
test_text, test_label, test_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_dev.txt")
dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_dev.txt")


# print(test_label, test_text)

# 定义标签和对应的数字ID
# PAD --> padding
# UNK --> unknown
# labels_list = ['PAD', 'UNK', 'O', 'B-BFA', 'I-BFA', 'B-BFO', 'I-BFO', 'B-BOP', 'I-BOP', 'B-BEN', 'I-BEN', 'B-BOR', 'I-BOR']
# FA 功能行为； FO 功能对象； OP 生物行为；OR 器官；EN 生物体
# 将生物器官和生物体合并为生物结构，看看生物行为的数量多不多，不多的话看一下用其他的办法获得吧

label2idx, idx2label = build_label(train_label)

print("label2idx为：", label2idx)
print("idx2label为：", idx2label)
print(len(train_lenth_sentence_list))    # 原本有525句子
print(len(train_text))       # 符合长度要求的只有254句子
print(len(dev_lenth_sentence_list))    # 原本有525句子
print(len(dev_text))       # 符合长度要求的只有254句子
print("train_lenth_sentence_list为：", train_lenth_sentence_list)

FileNotFoundError: [Errno 2] No such file or directory: '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/fineturn_data/biological_data/bio_data_F_train.txt'

In [8]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 调用torch中bert的分词器
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin_pretrain'))

# 构建数据集
class BertDataset(Dataset):
    def __init__(self, all_text, all_label, label2idx, max_len, tokenizer, is_test=False):
        self.all_text = all_text
        self.all_label = all_label
        self.label2idx = label2idx
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __getitem__(self, index):
        if self.is_test:
            self.max_len = len(self.all_label[index])

        # 按批次获得数据
        text = self.all_text[index]
        # 0到max_len的词有对应的label,后面的用padding（转换为数字张量之后就是0了）补充
        label = self.all_label[index][:self.max_len]
        
        # 这里的encode函数要做的事：
        # 对词进行编码，将词转换为在词表中对应的数字
        # 对长度大于max_length的语句进行截断（超出了max_length的直接没有了），对长度不足max_length的语句进行padding填充
        # 添加特殊的标记符
        # 返回tensor类型的张量，即将词进行编码为torch张量
        text_index = self.tokenizer.encode(text,
                                           add_special_tokens=True,
                                           max_length=self.max_len + 2,        # 加上两个左右标识符
                                           padding="max_length",
                                           truncation=True,
                                           return_tensors="pt")        # pt 返回pytorch的张量
        label_index = [0] + [self.label2idx.get(l, 1) for l in label] + [0] + [0] * (max_len - len(text))
        # 将编码后的label转换为torch张量
        label_index = torch.tensor(label_index)
        # 统一text_index和label_index的形状和维度
        return text_index.reshape(-1), label_index, len(label)

    def __len__(self):
        return self.all_text.__len__()

2023-06-21 01:11:06.169986: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-21 01:11:06.298625: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-21 01:11:06.792002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-21 01:11:06.792051: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [9]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF


batch_size = 16
max_len = 70

# 训练+测试

train_dataset = BertDataset(train_text,
                            train_label,
                            label2idx,
                            max_len,
                            tokenizer)
# 在处理好的语料中随机获得batch_size个句子，作为一批语料输入模型，shuffle是否要随机获得语料
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=False)

dev_dataset = BertDataset(dev_text,
                          dev_label,
                          label2idx,
                          max_len,
                          tokenizer)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=batch_size,
                            shuffle=False)

print("train_dataset的数据类型为： ", type(train_dataset))

NameError: name 'train_text' is not defined

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 构建基于Bert_LSTM_CRF的模型
class Bert_LSTM_CRF_NerModel(nn.Module):
    def __init__(self, lstm_hidden, class_num, dropout_rate=0.3):
        super().__init__()

        self.bert = BertModel.from_pretrained(os.path.join('.', 'model_bin_pretrain'))
        
        # 为了省显存做的，如果不想省可以删掉
        for name, param in self.bert.named_parameters():
            param.requires_grad = False
            
        # bidirectional 调成True的时候，Linear层的lstm_hidden需要乘2
        self.lstm = nn.LSTM(768,
                            lstm_hidden,
                            batch_first=True,
                            num_layers=1,
                            bidirectional=True)
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # 线性分类器
        self.classifier = nn.Linear(lstm_hidden*2, class_num)
        
        # 条件随机场
        self.crf = CRF(class_num, batch_first=True)

    # 前馈层
    def forward(self, batch_index, batch_label=None):   
        # 传了label数据的时候，就是训练，就要看loss,没传label数据的时候，就直接对分类预测的结果pre进行解码
        if batch_label is not None:
            # 按批次传入数据到bert编码层
            bert_out = self.bert(batch_index)
            # bert_out0 字符级别的特征信息 bert_out1 篇章级别的特征信息
            bert_out0, bert_out1 = bert_out[0], bert_out[1]  
            # 命名实体识别是序列标注任务，因此需要bert_out0，字符级别的特征信息
            lstm_out, _ = self.lstm(bert_out0)
            lstm_out = self.dropout(lstm_out)
            # 调用线性分类器完成预测任务
            pre = self.classifier(lstm_out)
            # loss = self.loss_fun(pre.reshape(-1, pre.shape[-1]),batch_label.reshape(-1))
            loss = -self.crf(pre, batch_label)
            return loss
        else:
            # 按批次传入数据到bert编码层
            bert_out = self.bert(batch_index)
            # bert_out0 字符级别的特征信息 bert_out1 篇章级别的特征信息
            bert_out0, bert_out1 = bert_out[0], bert_out[1]  
            # 命名实体识别是序列标注任务，因此需要bert_out0，字符级别的特征信息
            lstm_out, _ = self.lstm(bert_out0)
            # 调用线性分类器完成预测任务
            pre = self.classifier(lstm_out)
            
            pre = self.crf.decode(pre)
            return pre

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

epoch = 200
lr = 0.0005
lstm_hidden = 128
# 根据“训练+验证” “测试” “输入” 三种状态选择True或False
do_train = True
do_test = False
do_input = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

In [None]:
if do_train:
        # 加载模型，并将模型放到cuda上
        model = Bert_LSTM_CRF_NerModel(lstm_hidden, len(label2idx)).cuda()
        
        # opt是优化器，这里采用Adam的优化方式
        opt = AdamW(model.parameters(), lr)

        best_score = -1
        for e in range(epoch):
            model.train()
            batch_index = 1    # 设置该变量记录当前是训练模型的第几批，每一批都有16个句子，所以总共有12批    
            for batch_text_index, batch_label_index, batch_len in train_dataloader:
                # 按批获得数据，数据放入cuda中
                batch_text_index = batch_text_index.cuda()
                batch_label_index = batch_label_index.cuda()
                # 数据读入读入模型中，获得训练的loss
                loss = model.forward(batch_text_index, batch_label_index)
                loss.backward()
                # opt优化器优化
                opt.step()
                # 梯度归0
                opt.zero_grad()
                
                print("当前的批次为：", batch_index)
                print(f'loss{loss:.2f}')
                batch_index = batch_index + 1
            
            print('-----------------------------------------------')
            print('********  have trained ' + str(e+1) + ' epoch!  ********')
            print('-----------------------------------------------')
            
            # 模型训练结束之后，开始进行测试
            model.eval()
            all_pre = []    # 记录所有的预测label
            all_tag = []    # 记录所有的真实label
            for batch_text_index, batch_label_index, batch_len in dev_dataloader:
                batch_text_index = batch_text_index.cuda()
                batch_label_index = batch_label_index.cuda()
                # 获得模型预测的结果
                pre = model.forward(batch_text_index)
                # 加载真实的label
                tag = batch_label_index.tolist()

                for p, t, l in zip(pre, tag, batch_len):
                    p = p[1:1 + l]
                    t = t[1:1 + l]

                    p = [idx2label[i] for i in p]
                    t = [idx2label[i] for i in t]

                    all_pre.append(p)
                    all_tag.append(t)

                # 获得f1_score
                f1_score = seqeval_f1_score(all_tag, all_pre)
                # 记录最好的f1_score结果，并将得到该结果的模型文件.pt保存下来
                if f1_score > best_score:
                    torch.save(model, 'best_modelBio_pretrain.pt')
                    best_score = f1_score
                    
                print(f'********  best_score:{best_score}  ********')
                print('-----------------------------------------------')
                print(f'********  f1_score:{f1_score}  ********')
                print('-----------------------------------------------')
                

In [None]:
print(best_score)
print(len(all_pre))

In [None]:
all_pre

In [None]:
all_tag

## 2 使用新的模型预测数据 

In [None]:
# 读取数据
def read_data(file):
    all_token_list = []
    all_label_list = []

    token_list = []
    label_list = []
    with open(file, "r", encoding="utf-8") as f:
        all_data = f.read().split("\n")
    lenth_sentence_list = [] #用于存储句子的长度
    for data in all_data:
        # 但是在我重新处理的语料中，根据句号.来对句子进行划分
        if len(data) > 0:
            if data == ". O":
                # 统计一下大多数的句子的长度，太长或者太短的句子直接丢掉，不要保存在语料列表当中
                if len(token_list) > 0: 
                    lenth_sentence_list.append(len(token_list))
                    all_token_list.append(token_list)
                    all_label_list.append(label_list)
                token_list = []
                label_list = []
            else:
                token, lable = data.split(" ")
                token_list.append(token)
                label_list.append(lable)
    return all_token_list, all_label_list, lenth_sentence_list
dev_text, dev_label, dev_lenth_sentence_list = read_data("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/original_data/biological/bio_dev_data.txt")
print("最长的句子为：", max(dev_lenth_sentence_list))
print("有多少条句子：", len(dev_lenth_sentence_list))

In [None]:
import os
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF

# 调用torch中bert的分词器
tokenizer = BertTokenizer.from_pretrained(os.path.join('.', 'model_bin_pretrain'))

# 构建数据集
class BertDataset(Dataset):
    def __init__(self, all_text, all_label, label2idx, max_len, tokenizer, is_test=False):
        self.all_text = all_text
        self.all_label = all_label
        self.label2idx = label2idx
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __getitem__(self, index):
        if self.is_test:
            self.max_len = len(self.all_label[index])

        # 按批次获得数据
        text = self.all_text[index]
        # 0到max_len的词有对应的label,后面的用padding（转换为数字张量之后就是0了）补充
        label = self.all_label[index][:self.max_len]
        
        # 这里的encode函数要做的事：
        # 对词进行编码，将词转换为在词表中对应的数字
        # 对长度大于max_length的语句进行截断（超出了max_length的直接没有了），对长度不足max_length的语句进行padding填充
        # 添加特殊的标记符
        # 返回tensor类型的张量，即将词进行编码为torch张量
        text_index = self.tokenizer.encode(text,
                                           add_special_tokens=True,
                                           max_length=self.max_len + 2,        # 加上两个左右标识符
                                           padding="max_length",
                                           truncation=True,
                                           return_tensors="pt")        # pt 返回pytorch的张量
        label_index = [0] + [self.label2idx.get(l, 1) for l in label] + [0] + [0] * (max_len - len(text))
        # 将编码后的label转换为torch张量
        label_index = torch.tensor(label_index)
        # 统一text_index和label_index的形状和维度
        return text_index.reshape(-1), label_index, len(label)

    def __len__(self):
        return self.all_text.__len__()

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from transformers import AdamW
from seqeval.metrics import f1_score as seqeval_f1_score
from torchcrf import CRF


batch_size = 16
max_len = max(dev_lenth_sentence_list)

dev_dataset = BertDataset(dev_text,
                          dev_label,
                          label2idx,
                          max_len,
                          tokenizer)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=batch_size,
                            shuffle=False)

print("train_dataset的数据类型为： ", type(train_dataset))

In [None]:
model.eval()
all_pre = []    # 记录所有的预测label
all_tag = []    # 记录所有的真实label
batch_index = 1
for batch_text_index, batch_label_index, batch_len in dev_dataloader:
    batch_text_index = batch_text_index.cuda()
    batch_label_index = batch_label_index.cuda()
    # 获得模型预测的结果(这里是一个批次的数据)
    pre = model.forward(batch_text_index)
    # 加载真实的label
    tag = batch_label_index.tolist()

    for p, t, l in zip(pre, tag, batch_len):
        p = p[1:1 + l]
        t = t[1:1 + l]

        p = [idx2label[i] for i in p]
        t = [idx2label[i] for i in t]

        all_pre.append(p)
        all_tag.append(t)

    batch_index = batch_index + 1

In [None]:
print("语料的总批次为：", batch_index)
print("label的总个数为：", len(all_pre))
print("字符的总个数为：", len(dev_text))

## 3 将模型预测的结果写入文件 

In [None]:
word2label_list = []
for i in range(len(dev_text)):
    word2label_dic = {}
    for j in range(len(dev_text[i])):
        word2label_dic[dev_text[i][j]] = all_pre[i][j]
    word2label_list.append(word2label_dic)
print("字符-label对的总个数为：", len(word2label_list))

In [None]:
word2label_list

In [None]:
with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/1.txt", "w", encoding="utf-8") as file:
    for i in range(len(word2label_list)):
        for key, value in word2label_list[i].items():
            file.write(f"{key} {value}\n")
        file.write(". O\n")

## 4 将预测的结果转换格式（有预处理的格式转换为一篇文章下属的实体及label的格式） 

In [None]:
def parse_predict_results(file_path):
    """
    Parse results from predicted file.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    id2results = {}
    category = ""
    category_set = set()
    for line in lines:
        try:
            category = line.split(' ')[1].split('-')[1].strip()
        except:
            pass
        if category in ('BFA', 'BFO', 'BOR', 'BEN', 'BOP'):
            category_set.add(category)

    cur_id = ""
    extracted_cnt = 0
    category = ""
    for line in lines:
        if line.startswith('##id:'):
            new_id = line[5:].split(" ")[0].strip()
            if cur_id != new_id:
                if extracted_cnt > 0 :
                    id2results[cur_id] = result_dict
                result_dict = {}
                for category in category_set:
                    result_dict[category] = set()
                cur_id = new_id
                extracted_cnt += 1
            label_word = ''

        else:
            label = line.split(' ')[1].strip()
            if line.split(' ')[0].startswith('##'):
                token = line.split(' ')[0][2:]
            else:
                token = line.split(' ')[0]
            if (label == 'O') or (len(label) == 0):
                if len(label_word) != 0:
                    result_dict[category].add(label_word)
                    category = ""
                    label_word = ""
            elif label.startswith('B-'):
                if len(label_word) != 0:
                    result_dict[category].add(label_word)
                    category = ""
                label_word = token
                category = label.split('-')[1].strip()
            elif label.startswith('I-'):
                if len(category) > 0:
                    label_word += " " + token
    id2results[cur_id] = result_dict
    return id2results

In [None]:
doc_word_dict = parse_predict_results("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/标注后的数据/1.txt")

In [None]:
doc_word_dict

## 5.将最终格式写入csv文件，这里包括了文章和关键词实体，以及文章和关键词实体之间的关系 

In [None]:
import json
with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/original_data/biological/id和标题的对应关系.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        id2doc_dict = json.loads(line)
print("文章的篇数为：", len(id2doc_dict))

# 获得 id2doc_dict中的所有键，也就是所有的文章id
id_list = list(id2doc_dict.keys())

with open("/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/datas/result_data/biological_data/格式转换后的数据/doc_BEN.csv", "w", encoding="utf-8") as file:
    file.write("文章,BEN,LABEL\n")
    for key, value in doc_word_dict.items():
        id = key
        for i in range(len(id_list)):
            if id == id_list[i][5:]:
                title = id2doc_dict[id_list[i]]    
        try:
            # BEN的实体类别是集合
            BEN_set = value["BEN"]
            for ele in BEN_set:
                file.write(title)
                file.write(",")
                file.write(ele)
                file.write(",")
                file.write("has_BEN")
                file.write("\n")
        except:
            continue
        