In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch
import re
from nltk.corpus import stopwords

In [2]:
# Location of the pre-screened US/EU PBAT patent export (Excel workbook).
PATENT_XLSX_PATH = r'C:\Users\RIC_ZX\Desktop\papers\ZLW\美国和欧盟 已筛选TAC_(poly butylene adipate-co-terephthalate) OR TAC_(PBAT).XLSX'
# Maximum number of characters kept from each abstract / claims text.
MAX_CHARS = 512

# Keep only the claims (权利要求), abstract (摘要) and title (标题) columns.
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
df = pd.read_excel(PATENT_XLSX_PATH)
df = df[['权利要求', '摘要', '标题']].copy()

# Original note: BERT models usually have a 512-character input limit and the
# data statistics showed a maximum length of 598, so longer texts are cut to 512
# characters.
# NOTE(review): the encoder limit is counted in tokens, not characters - confirm
# that a character cut-off is what is wanted here.
# Blank cells are read as NaN (a float); slicing NaN raises TypeError, so they
# are normalised to '' before truncating.
for column in ('摘要', '权利要求'):
    df[column] = df[column].fillna('').astype(str).str[:MAX_CHARS]
df['标题'] = df['标题'].fillna('').astype(str)
# To inspect lengths, run df['权利要求'].str.len().describe() as the last
# expression of a cell (mid-cell expressions are not displayed).

# Claims are kept only when they contain at least one Latin letter.
summaries = df['权利要求'].tolist()
english_summaries = [summary for summary in summaries
                     if re.search('[a-zA-Z]', summary)]

# Corpus order: English claims first, then every title, then every abstract.
abstract = df['摘要'].tolist()
title = df['标题'].tolist()
abstracts = english_summaries + title + abstract

# Pronouns and conjunctions removed in addition to NLTK's English stop words.
# Multi-word entries and punctuation marks are not listed: tokens are single
# alphabetic words, so such entries could never match.
_EXTRA_EXCLUDED_WORDS = frozenset([
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
    'and', 'or', 'but', 'so', 'yet', 'for', 'nor', 'as', 'if', 'when', 'because',
    'while', 'although', 'since',
])
# Filled on the first call so the NLTK corpus is read once, not once per word.
_excluded_words_cache = None


def _excluded_words():
    """Return the lower-cased set of words to drop, building it on first use."""
    global _excluded_words_cache
    if _excluded_words_cache is None:
        nltk_words = (word.lower() for word in stopwords.words('english'))
        _excluded_words_cache = _EXTRA_EXCLUDED_WORDS.union(nltk_words)
    return _excluded_words_cache


def filter_text(text):
    """Reduce a text to its space-joined content words.

    Steps:
      1. remove parenthesised spans, brackets included;
      2. keep only runs of ASCII letters (digits, symbols and non-Latin
         characters act as separators);
      3. drop NLTK English stop words plus the extra pronouns/conjunctions
         above. The comparison is case-insensitive, so sentence-initial words
         such as "The" are removed too; upper-case tokens that spell a stop
         word (e.g. "US", "IT") are removed as well.

    Args:
        text: the raw text. Anything that is not a ``str`` (``None``, NaN from
            a blank spreadsheet cell, ...) is treated as empty.

    Returns:
        The surviving words in their original order and casing, joined by
        single spaces; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        return ''
    # Drop parenthesised spans first so their contents never become tokens.
    without_parens = re.sub(r'\([^)]*\)', '', text)
    excluded = _excluded_words()
    kept_words = [word for word in re.findall(r'[a-zA-Z]+', without_parens)
                  if word.lower() not in excluded]
    return ' '.join(kept_words)
# Cleaned corpus: one filtered string per entry of `abstracts`, in the same order.
data = list(map(filter_text, abstracts))

In [3]:
# Preview the first few cleaned texts instead of dumping the whole corpus into
# the cell output.
data[:5]

In [None]:
import torch
# The DistilBert* names stay imported so existing references to them elsewhere
# in the notebook keep resolving.
from transformers import DistilBertTokenizer, DistilBertModel, MPNetTokenizer, MPNetModel
from sklearn.metrics.pairwise import cosine_similarity

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained encoder and its tokenizer.
# 'all-mpnet-base-v2' is an MPNet checkpoint, so it is loaded with the MPNet
# classes; the DistilBERT classes do not match that checkpoint's architecture.
tokenizer = MPNetTokenizer.from_pretrained('all-mpnet-base-v2')
model = MPNetModel.from_pretrained('all-mpnet-base-v2').to(device)

# Fine-tuning hyper-parameters.
lr = 0.00002
batch_size = 16
num_epochs = 30

# Optimizer and loss function.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.BCEWithLogitsLoss()

# With a single text the "pick a different index" loop below can never finish,
# and with none there is nothing to train on.
if len(data) < 2:
    raise ValueError("fine-tuning needs at least two texts in `data`")

# Fine-tuning loop.
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    # NOTE(review): only data[i] for every batch_size-th i is visited, so
    # batch_size acts as a sampling stride rather than a mini-batch size -
    # confirm this is intended.
    for i in range(0, len(data), batch_size):
        # Anchor ("positive") text for this step.
        pos_text = data[i]
        # Draw a random comparison text with a different index.
        neg_text_index = torch.randint(len(data), (1,)).item()
        while neg_text_index == i:
            neg_text_index = torch.randint(len(data), (1,)).item()
        neg_text = data[neg_text_index]
        # Tokenise (truncated to the 512-token limit), move the ids to the
        # device and mean-pool the last hidden state into one vector per text.
        pos_ids = tokenizer(pos_text, return_tensors='pt', truncation=True, max_length=512)['input_ids'].to(device)
        neg_ids = tokenizer(neg_text, return_tensors='pt', truncation=True, max_length=512)['input_ids'].to(device)
        pos_vector = model(pos_ids).last_hidden_state.mean(dim=1)
        neg_vector = model(neg_ids).last_hidden_state.mean(dim=1)
        # Target: 1 when the current embeddings have non-negative cosine
        # similarity, else 0. It is computed on detached copies, so it carries
        # no gradient.
        # NOTE(review): the target comes from the model's own output rather
        # than from annotated pairs - confirm this objective is intended.
        similarity = cosine_similarity(pos_vector.detach().cpu().numpy(), neg_vector.detach().cpu().numpy())[0][0]
        labels = torch.tensor([1.0 if similarity >= 0 else 0.0], dtype=torch.float).to(device)
        # Logit fed to the loss: the raw dot product of the two embeddings.
        outputs = torch.sum(pos_vector * neg_vector).unsqueeze(0)
        loss = loss_fn(outputs, labels)
        epoch_loss += loss.item()
        # Back-propagate, update the weights, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Summed (not averaged) loss over the steps of this epoch.
    print("Epoch {} loss: {}".format(epoch+1, epoch_loss))

In [None]:
# Load the fine-tuned model.
# "all-mpnet-base-v2_model.pth" holds the state dict of an MPNetModel, so the
# same architecture is rebuilt before loading; a DistilBertModel has different
# parameter names and load_state_dict would reject the file.
from transformers import MPNetModel

model = MPNetModel.from_pretrained('all-mpnet-base-v2')
# map_location="cpu" lets a checkpoint saved from a GPU load on a machine
# without one; move the model to a device afterwards if needed.
model.load_state_dict(torch.load("all-mpnet-base-v2_model.pth", map_location="cpu"))

In [6]:
import torch
from transformers import MPNetTokenizer, MPNetModel
from sklearn.metrics.pairwise import cosine_similarity

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained encoder and its tokenizer.
tokenizer = MPNetTokenizer.from_pretrained('all-mpnet-base-v2')
model = MPNetModel.from_pretrained('all-mpnet-base-v2').to(device)

# Fine-tuning hyper-parameters.
lr = 0.00002
batch_size = 32
num_epochs = 60

# Optimizer and loss function.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.BCEWithLogitsLoss()

# With a single text the "pick a different index" loop below can never finish,
# and with none there is nothing to train on.
if len(data) < 2:
    raise ValueError("fine-tuning needs at least two texts in `data`")

# Fine-tuning loop.
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    # NOTE(review): only data[i] for every batch_size-th i is visited, so
    # batch_size acts as a sampling stride rather than a mini-batch size -
    # confirm this is intended.
    for i in range(0, len(data), batch_size):
        # Anchor ("positive") text for this step.
        pos_text = data[i]
        # Draw a random comparison text with a different index.
        neg_text_index = torch.randint(len(data), (1,)).item()
        while neg_text_index == i:
            neg_text_index = torch.randint(len(data), (1,)).item()
        neg_text = data[neg_text_index]
        # Tokenise (truncated to the 512-token limit), move the ids to the
        # device and mean-pool the last hidden state into one vector per text.
        pos_ids = tokenizer(pos_text, return_tensors='pt', truncation=True, max_length=512)['input_ids'].to(device)
        neg_ids = tokenizer(neg_text, return_tensors='pt', truncation=True, max_length=512)['input_ids'].to(device)
        pos_vector = model(pos_ids).last_hidden_state.mean(dim=1)
        neg_vector = model(neg_ids).last_hidden_state.mean(dim=1)
        # Target: 1 when the current embeddings have non-negative cosine
        # similarity, else 0. It is computed on detached copies, so it carries
        # no gradient.
        # NOTE(review): the target comes from the model's own output rather
        # than from annotated pairs - confirm this objective is intended.
        similarity = cosine_similarity(pos_vector.detach().cpu().numpy(), neg_vector.detach().cpu().numpy())[0][0]
        labels = torch.tensor([1.0 if similarity >= 0 else 0.0], dtype=torch.float).to(device)
        # Logit fed to the loss: the raw dot product of the two embeddings.
        outputs = torch.sum(pos_vector * neg_vector).unsqueeze(0)
        loss = loss_fn(outputs, labels)
        epoch_loss += loss.item()
        # Back-propagate, update the weights, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Summed (not averaged) loss over the steps of this epoch.
    print("Epoch {} loss: {}".format(epoch+1, epoch_loss))

In [7]:
# Persist the current weights of `model` (state dict only) to disk.
torch.save(obj=model.state_dict(), f="all-mpnet-base-v2_model.pth")

In [9]:
# Persist the current weights of `model` (state dict only) under a second file name.
torch.save(obj=model.state_dict(), f="all-mpnet-base-v2_60_model.pth")