## 按照词汇计算相似度

In [1]:
import re
def clean_sentence(sentence):
    intervention_list = sentence.split('|')
    cleaned_sentence=''
    for intervention in intervention_list:
        cleaned = re.sub(r'[()|]', '', intervention).strip()
        if ':' in cleaned:
            cleaned = cleaned.split(':', 1)[1].strip()  # Get content after the first ':'
        cleaned_sentence+=' '+cleaned
    return cleaned_sentence



In [2]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

# 确保你已经下载了WordNet词库
nltk.download('punkt')
nltk.download('wordnet')

def clean_sentence(sentence):
    # 清理干预措施中的不必要字符，并提取有效内容
    intervention_list = sentence.split('|')
    cleaned_sentence = ''
    for intervention in intervention_list:
        cleaned = re.sub(r'[()|]', '', intervention).strip()
        if ':' in cleaned:
            cleaned = cleaned.split(':', 1)[1].strip()  # Get content after the first ':'
        cleaned_sentence += ' ' + cleaned
    return cleaned_sentence.strip()  # 返回去掉前后空格的结果

def lemmatize_text(text):
    # 词形还原函数
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words]
    return set(lemmatized_words)  # 返回去重的词集

def filter_content(content, retrieved):
    # 转换内容为小写并去掉标点符号，方便匹配
    content_lower = re.sub(r'[^\w\s]', '', content.lower())
    content_words = lemmatize_text(content_lower)  # 词形还原处理

    filtered_participants = []
    filtered_interventions = []
    filtered_outcomes = []

    # 过滤参与者
    for participant in retrieved['participants']:
        if participant:  # 只处理非空字符串
            participant_lower = participant.lower()
            lemmatized_participant = lemmatize_text(participant_lower)
            if lemmatized_participant.intersection(content_words):  # 检查是否有交集
                print(lemmatized_participant.intersection(content_words))
                filtered_participants.append(' '.join(lemmatized_participant.intersection(content_words)))

    # 处理干预措施
    intervention_sentence = retrieved['interventions']
    cleaned_interventions = clean_sentence(intervention_sentence).split()  # 清理并拆分干预措施
    for intervention in cleaned_interventions:
        intervention_lower = intervention.lower()
        lemmatized_intervention = lemmatize_text(intervention_lower)
        if lemmatized_intervention.intersection(content_words):  # 检查是否有交集
            filtered_interventions.append(' '.join(lemmatized_intervention.intersection(content_words)))

    # 过滤结果
    for outcome in retrieved['outcomes']:
        outcome_lower = outcome.lower()
        lemmatized_outcome = lemmatize_text(outcome_lower)
        if lemmatized_outcome.intersection(content_words):  # 检查是否有交集
            filtered_outcomes.append(' '.join(lemmatized_outcome.intersection(content_words)))

    return {
        'participants': filtered_participants,
        'interventions': filtered_interventions,
        'outcomes': filtered_outcomes
    }

# 示例内容和检索到的内容
content = "Title : Effect of tarenflurbil on cognitive decline and activities of daily living in patients with mild Alzheimer disease : a randomized controlled trial . METHODS : A multicenter , randomized , double - blind , placebo - controlled trial enrolling patients with mild AD was conducted at 133 trial sites in the United States between February 21 , 2005 , and April 30 , 2008 . Concomitant treatment with cholinesterase inhibitors or memantine was permitted . Tarenflurbil , 800 mg , or placebo , administered twice a day . Co - primary efficacy end points were the change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale ( ADAS - Cog , 80 - point version ) and Alzheimer Disease Cooperative Studies - activities of daily living ( ADCS - ADL ) scale . Additional prespecified slope analyses explored the possibility of disease modification . -DOCSTART-"

retrieved = {
    'participants': ['', 'patients with mild alzheimers disease', 'alzheimers disease'],
    'interventions': 'drug tarenflurbil|drug placebo',
    'outcomes': [
        'alzheimer disease assessment scale cognitive subscale adascog change in total score from baseline to month 18',
        'alzheimer disease cooperative studies activities of daily living adcsadl change in total score from baseline to month 18'
    ]
}

# 执行过滤
filtered_output = filter_content(content, retrieved)
filtered_output

[nltk_data] Downloading package punkt to
[nltk_data]     /local/home/sumyao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /local/home/sumyao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'patient', 'mild', 'disease', 'with'}
{'disease'}


{'participants': ['patient mild disease with', 'disease'],
 'interventions': ['tarenflurbil', 'placebo'],
 'outcomes': ['alzheimer total month cognitive baseline score in to assessment scale 18 change subscale disease from',
  'alzheimer living study month total daily activity baseline score in to cooperative 18 change disease of from']}

In [35]:
contents = "Title  Effectiveness and tolerability of high - dose ( 23 mg / d ) versus standard - dose ( 10 mg / d ) donepezil in moderate to severe Alzheimer  s disease : A 24 - week , randomized , double - blind study . METHODS : This randomized , double - blind study was conducted at 219 sites in Asia , Europe , Australia , North America , South Africa , and South America from June 6 , 2007 , to March 27 , 2009 . Patients aged 45 to 90 years with probable AD , Mini - Mental State Examination score 0 to 20 ( moderate to severe impairment ) , and who were receiving donepezil 10 mg once daily for > or =12 weeks before the start of the study were eligible . Patients ( n = 1467 ) were randomly assigned to receive high - dose donepezil ( 23 mg once daily ) or standard - dose donepezil ( 10 mg once daily ) for 24 weeks . Coprimary effectiveness measures were changes in cognition and global functioning , as assessed using least squares mean changes from baseline ( LSM [ SE ] A ) scores ( last observation carried forward ) on the Severe Impairment Battery ( SIB ; cognition ) and the Clinician s Interview - Based Impression of Change Plus Caregiver Input scale ( CIBIC + ; global function rating ) overall change score ( mean [ SD ] ) at week 24 . Treatment - emergent adverse events ( TEAEs ) were assessed using spontaneous patient / caregiver reporting and open - ended questioning ; clinical laboratory testing ( hematology , biochemistry , and urinalysis panels analyzed by a central laboratory ) ; 12 - lead ECG ; and physical and neurologic examinations , including vital sign measurements . -DOCSTART-"
print(type(contents))
retrieved =  {
            "participants": [
                "",
                "ALL",
                "Alzheimer's Disease"
            ],
            "interventions": "DRUG: Aricept (donepezil SR 23 mg)|DRUG: Aricept (donepezil IR 10 mg)",
            "outcomes": [
                "Change From Baseline to Week 24 in SIB Total Score, The SIB is an assessment of cognitive dysfunction across nine domains such as memory, language, and orientation. The score ranges from 0 (worst) to 100 (best). This outcome was calculated using the LOCF (last observation carried forward) method., Baseline and Week 24|Overall Change From Baseline in Modified CIBIC+ to Week 24, The CIBIC+ is a rating scale derived from an interview with the patient and caregiver with an independent rater designed to measure several domains of patient function, such as mental/cognitive state, behavior, and activities of daily living. The scores range from 1 (marked improvement) to 7 (marked worsening)., Baseline and Week 24",
                "Change From Baseline to Week 24 in ADCS-ADL Total Score, The ADCS-ADL (Alzhemier's Disease Cooperative Study-Activities of Daily Living) is a 19-item assessment scale used to measure a patient's basic functional abilities, such as walking, grooming, and bathing.Scores range from 0 to 54, with a higher score indicating greater functional ability., Baseline and Week 24|Change From Baseline to Week 24 in MMSE Total Score, The MMSE (Mini-Mental State Examination) is a 30-item test that evaluates 5 domains of cognitive function (orientation to time and place, immediate and delayed recall, attention, calculation, and language). The scores range from 0 (most impaired) to 30 (no impaiment)., Baseline and Week 24"
            ]
        }

# 执行过滤
filtered_output = filter_content(contents, retrieved)
filtered_output

<class 'str'>
{'alzheimer', 'disease'}


{'participants': ['alzheimer disease'],
 'interventions': ['donepezil', '23', 'mg', 'donepezil', '10', 'mg'],
 'outcomes': ['wa the patient week daily 24 scale with change baseline this observation a caregiver to measure state rating interview carried and function using 0 last forward sib in score of from',
  'the patient week daily 24 score scale with change baseline a to measure state and function 0 disease in examination of from']}

## 句子_TF-IDF计算相似度

In [3]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def clean_sentence(sentence):
    intervention_list = sentence.split('|')
    cleaned_sentence = ''
    for intervention in intervention_list:
        cleaned = re.sub(r'[()|]', '', intervention).strip()
        if ':' in cleaned:
            cleaned = cleaned.split(':', 1)[1].strip()  # Get content after the first ':'
        cleaned_sentence += ' ' + cleaned
    return cleaned_sentence.strip()

def find_most_similar_sentence(content, retrieved):
    # 清理参与者、干预措施和结果
    participants = [p for p in retrieved['participants'] if p]  # 去掉空字符串
    interventions = clean_sentence(retrieved['interventions']).split()
    outcomes = retrieved['outcomes']

    # 将所有句子合并为列表
    all_sentences = content.split('. ')
    sentences = [s.strip() for s in all_sentences if s]  # 去掉空句子

    # 计算TF-IDF矩阵
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences + participants + interventions + outcomes)
    
    # 计算句子与其他项之间的相似度
    similarity_scores = cosine_similarity(tfidf_matrix[:len(sentences)], tfidf_matrix[len(sentences):])

    # 找到最相似的句子
    similar_sentences = {}
    for idx, sentence in enumerate(sentences):
        max_score_index = np.argmax(similarity_scores[idx])  # 找到最大相似度的索引
        if max_score_index < len(participants):
            similar_sentences[sentence] = {
                'participants': participants[max_score_index],
                'interventions': None,
                'outcomes': None
            }
        elif max_score_index < len(participants) + len(interventions):
            similar_sentences[sentence] = {
                'participants': None,
                'interventions': interventions[max_score_index - len(participants)],
                'outcomes': None
            }
        else:
            similar_sentences[sentence] = {
                'participants': None,
                'interventions': None,
                'outcomes': outcomes[max_score_index - len(participants) - len(interventions)]
            }

    return similar_sentences

# 示例内容和检索到的内容
content = "Title : Effect of tarenflurbil on cognitive decline and activities of daily living in patients with mild Alzheimer disease : a randomized controlled trial. METHODS : A multicenter, randomized, double-blind, placebo-controlled trial enrolling patients with mild AD was conducted at 133 trial sites in the United States between February 21, 2005, and April 30, 2008. Concomitant treatment with cholinesterase inhibitors or memantine was permitted. Tarenflurbil, 800 mg, or placebo, administered twice a day. Co-primary efficacy end points were the change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale (ADAS-Cog, 80-point version) and Alzheimer Disease Cooperative Studies - activities of daily living (ADCS-ADL) scale. Additional prespecified slope analyses explored the possibility of disease modification."

retrieved = {
    'participants': ['', 'patients with mild alzheimers disease', 'alzheimers disease'],
    'interventions': 'drug tarenflurbil|drug placebo',
    'outcomes': [
        'alzheimer disease assessment scale cognitive subscale adascog change in total score from baseline to month 18',
        'alzheimer disease cooperative studies activities of daily living adcsadl change in total score from baseline to month 18'
    ]
}

# 执行查找最相似句子
most_similar_sentences = find_most_similar_sentence(content, retrieved)
print(most_similar_sentences)


{'Title : Effect of tarenflurbil on cognitive decline and activities of daily living in patients with mild Alzheimer disease : a randomized controlled trial': {'participants': None, 'interventions': None, 'outcomes': 'alzheimer disease cooperative studies activities of daily living adcsadl change in total score from baseline to month 18'}, 'METHODS : A multicenter, randomized, double-blind, placebo-controlled trial enrolling patients with mild AD was conducted at 133 trial sites in the United States between February 21, 2005, and April 30, 2008': {'participants': 'patients with mild alzheimers disease', 'interventions': None, 'outcomes': None}, 'Concomitant treatment with cholinesterase inhibitors or memantine was permitted': {'participants': 'patients with mild alzheimers disease', 'interventions': None, 'outcomes': None}, 'Tarenflurbil, 800 mg, or placebo, administered twice a day': {'participants': None, 'interventions': 'tarenflurbil', 'outcomes': None}, 'Co-primary efficacy end po

In [None]:
import re
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# 加载 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def clean_sentence(sentence):
    intervention_list = sentence.split('|')
    cleaned_sentence = ''
    for intervention in intervention_list:
        cleaned = re.sub(r'[()|]', '', intervention).strip()
        if ':' in cleaned:
            cleaned = cleaned.split(':', 1)[1].strip()  # Get content after the first ':'
        cleaned_sentence += ' ' + cleaned
    return cleaned_sentence.strip()

def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # 使用平均池化获取句子嵌入

def find_most_similar_sentence(content, retrieved, weights, match_boost=0.3):
    # 清理参与者、干预措施和结果
    participants = [p for p in retrieved['participants'] if p]  # 去掉空字符串
    interventions = clean_sentence(retrieved['interventions']).split()
    print(interventions)
    outcomes = retrieved['outcomes']

    # 将所有句子合并为列表
    sentences = content.split('. ')
    sentences = [s.strip() for s in sentences if s]  # 去掉空句子

    # 计算句子嵌入
    sentence_embeddings = [get_sentence_embedding(s) for s in sentences]
    participant_embeddings = [get_sentence_embedding(p) for p in participants]
    intervention_embeddings = [get_sentence_embedding(i) for i in interventions]
    outcome_embeddings = [get_sentence_embedding(o) for o in outcomes]

    # 初始化相关结果列表
    participants_related, interventions_related, outcomes_related = [], [], []

    # 计算相似度，结合包含关键词的优先级
    for idx, sentence_embedding in enumerate(sentence_embeddings):
        sentence = sentences[idx]

        # 计算与参与者的相似度
        participant_scores = cosine_similarity([sentence_embedding], participant_embeddings)[0]
        max_participant_score = max(participant_scores) * weights['participants']
        if any(p.lower() in sentence.lower() for p in participants):  # 检查是否包含参与者关键词
            max_participant_score += match_boost  # 提升分数

        participants_related.append((sentence, max_participant_score))

        # 计算与干预措施的相似度
        intervention_scores = cosine_similarity([sentence_embedding], intervention_embeddings)[0]
        max_intervention_score = max(intervention_scores) * weights['interventions']
        if any(i.lower() in sentence.lower() for i in interventions):  # 检查是否包含干预关键词
            max_intervention_score += match_boost  # 提升分数

        interventions_related.append((sentence, max_intervention_score))

        # 计算与结果的相似度
        outcome_scores = cosine_similarity([sentence_embedding], outcome_embeddings)[0]
        max_outcome_score = max(outcome_scores) * weights['outcomes']
        if any(o.lower() in sentence.lower() for o in outcomes):  # 检查是否包含结果关键词
            max_outcome_score += match_boost  # 提升分数
        outcomes_related.append((sentence, max_outcome_score))

    # 排序并选择最相关的两个句子
    participants_related = sorted(participants_related, key=lambda x: x[1], reverse=True)[:2]
    interventions_related = sorted(interventions_related, key=lambda x: x[1], reverse=True)[:2]
    outcomes_related = sorted(outcomes_related, key=lambda x: x[1], reverse=True)[:2]

    return {
        'participants_related': participants_related,
        'interventions_related': interventions_related,
        'outcomes_related': outcomes_related
    }

# 示例内容和检索到的内容
content = "Title: Effect of tarenflurbil on cognitive decline and activities of daily living in patients with mild Alzheimer disease: a randomized controlled trial. METHODS: A multicenter, randomized, double-blind, placebo-controlled trial enrolling patients with mild AD was conducted at 133 trial sites in the United States between February 21, 2005, and April 30, 2008. Concomitant treatment with cholinesterase inhibitors or memantine was permitted. Tarenflurbil, 800 mg, or placebo, administered twice a day. Co-primary efficacy end points were the change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale (ADAS-Cog, 80-point version) and Alzheimer Disease Cooperative Studies - activities of daily living (ADCS-ADL) scale. Additional prespecified slope analyses explored the possibility of disease modification."

retrieved = {
    'participants': ['', 'patients with mild alzheimers disease', 'alzheimers disease'],
    'interventions': 'drug tarenflurbil|drug placebo',
    'outcomes': [
        'alzheimer disease assessment scale cognitive subscale adascog change in total score from baseline to month 18',
        'alzheimer disease cooperative studies activities of daily living adcsadl change in total score from baseline to month 18'
    ]
}

# 定义关键词的权重和匹配提升系数
weights = {
    'participants': 1.0,
    'interventions': 1.0,
    'outcomes': 1.0
}

# 执行查找最相似句子
most_similar_sentences = find_most_similar_sentence(content, retrieved, weights)

## 句子_BM25计算相似度

### 1.声明BM25类

In [2]:
import math
from collections import defaultdict, Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

"""
documents = [
    ["bm25", "算法", "是", "一种", "检索", "模型"],
    ["如何", "实现", "bm25", "算法"],
    ["bm25", "在", "信息检索", "中", "非常", "重要"],] # 示例文档
bm25 = BM25(documents) # 创建BM25实例
query = ["bm25", "算法"]
results = bm25.search(query) # 查询
for doc_idx, score in results:
    print(f"文档ID: {doc_idx}, 分数: {score}")#输出结果
"""
class BM25:
    def __init__(self, documents, k1=1.5, b=0.75):
        """初始化BM25类，接受文档列表和两个调节参数"""
        self.documents = documents
        self.N = len(documents)  # 文档总数
        self.avgdl = sum(len(doc) for doc in documents) / self.N  # 文档平均长度
        self.k1 = k1
        self.b = b
        self.inverted_index = defaultdict(list)  # 倒排索引
        self.doc_lengths = []  # 记录每个文档的长度
        self.build_index()

    def build_index(self):
        """构建倒排索引并计算每个词项的文档频率和文档长度"""
        for idx, doc in enumerate(self.documents):
            self.doc_lengths.append(len(doc))
            term_counts = Counter(doc)
            for term, freq in term_counts.items():
                self.inverted_index[term].append((idx, freq))

    def idf(self, term):
        """计算词项的逆文档频率（IDF）"""
        df = len(self.inverted_index.get(term, []))  # 包含该词的文档数
        return math.log((self.N - df + 0.5) / (df + 0.5) + 1)

    def score(self, query, doc_idx):
        """计算查询与指定文档之间的BM25得分"""
        score = 0.0
        doc_length = self.doc_lengths[doc_idx]
        
        # 获取当前文档中所有词项的词频
        term_freqs = {term: freq for term, doc_freqs in self.inverted_index.items() 
                      for doc, freq in doc_freqs if doc == doc_idx}

        for term in query:
            if term in term_freqs:
                f = term_freqs[term]  # 词频
                idf = self.idf(term)  # 逆文档频率
                numerator = f * (self.k1 + 1)
                denominator = f + self.k1 * (1 - self.b + self.b * doc_length / self.avgdl)
                score += idf * (numerator / denominator)

        return score

    def search(self, query, top_n=10):
        """对查询进行BM25检索并返回相关性最高的前N个文档"""
        scores = {idx: self.score(query, idx) for idx in range(self.N)}
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

[nltk_data] Downloading package punkt to
[nltk_data]     /local/home/sumyao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 句子_BM25-bert

In [4]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

class BM25_BERT:
    def __init__(self, documents, k1=1.5, b=0.75, bert_model="bert-base-uncased"):
        self.bm25 = BM25(documents)  # 初始化 BM25
        self.documents = documents
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)  # BERT Tokenizer
        self.model = BertModel.from_pretrained(bert_model)  # BERT Model
        self.document_embeddings = self.encode_documents()  # 预编码文档

    def encode_documents(self):
        """使用 BERT 对文档进行编码"""
        embeddings = []
        for doc in self.documents:
            sentence = " ".join(doc)  # 将文档拼接为单个字符串
            inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        return np.array(embeddings)

    def encode_query(self, query):
        """使用 BERT 对查询进行编码"""
        sentence = " ".join(query)
        inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def bert_similarity(self, query_embedding, doc_idx):
        """计算 BERT 相似度（余弦相似度）"""
        doc_embedding = self.document_embeddings[doc_idx]
        cos_sim = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
        return cos_sim

    def search(self, query, top_n=10, alpha=0.5):
        """BM25 和 BERT 集成检索"""
        bm25_scores = {idx: self.bm25.score(query, idx) for idx in range(self.bm25.N)}
        query_embedding = self.encode_query(query)
        final_scores = {}
        for idx, bm25_score in bm25_scores.items():
            bert_score = self.bert_similarity(query_embedding, idx)
            final_scores[idx] = alpha * bm25_score + (1 - alpha) * bert_score
        return sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

# 示例文档
documents = [
    ["bm25", "算法", "是", "一种", "检索", "模型"],
    ["如何", "实现", "bm25", "算法"],
    ["bm25", "在", "信息检索", "中", "非常", "重要"],
]
query = ["bm25", "算法"]

# 初始化 BM25 + BERT
bm25_bert = BM25_BERT(documents)

# 检索
results = bm25_bert.search(query)
for doc_idx, score in results:
    print(f"文档代号: Doc-{doc_idx}, 分数: {score}")
    print(documents[doc_idx])




文档代号: Doc-1, 分数: 0.7748780250549316
['如何', '实现', 'bm25', '算法']
文档代号: Doc-0, 分数: 0.6661134958267212
['bm25', '算法', '是', '一种', '检索', '模型']
文档代号: Doc-2, 分数: 0.41082605719566345
['bm25', '在', '信息检索', '中', '非常', '重要']


### json2nlpSentence

In [5]:
import re
"""
# 示例数据
retrieved = {
    "age": "",
    "gender": "ALL",
    "conditions": "Alzheimer Disease|Dementia",
    "interventions": "DRUG: MPC-7869|DRUG: MPC-7869",
    "primary outcome measures": "Cognition and activities of daily living, 18 mos",
    "secondary outcome measures": "Global function and behavior, 18 mos"
}
# 输出清洗后的句子
sentences = create_sentences(retrieved)
for sentence in sentences:
    print(sentence)
## 输出
gender is ALL
conditions is Alzheimer Disease
conditions is Dementia
interventions is MPC-7869
interventions is MPC-7869
primary outcome measures is Cognition and activities of daily living, 18 mos
secondary outcome measures is Global function and behavior, 18 mos
"""

def clean_value(value):
    if not value:
        return []
    # 如果包含 '|', 按 '|' 分割
    parts = value.split('|')
    cleaned_parts = []
    # 处理每个部分，只保留 ':' 右边的内容
    for part in parts:
        cleaned = re.sub(r'[()|]', '', part).strip()  # 去除多余字符
        if ':' in cleaned:
            cleaned = cleaned.split(':', 1)[1].strip()  # 保留冒号右边的内容
        cleaned_parts.append(cleaned)
    
    return cleaned_parts

def create_sentences(retrieved):
    sentences = []
    for key, value in retrieved.items():
        cleaned_values = clean_value(value)
        for cleaned_value in cleaned_values:
            if cleaned_value:  # 跳过空值
                sentences.append(f"{key} is {cleaned_value}")
    return sentences

### 3.转换json为自然语言+输出相似句子

##### BM25算法检索

In [3]:
def retrived_pio_BM25sentence(content, retrived_label):
    content_sentences = sent_tokenize(content) #段落分句子
    documents=[item.split() for item in content_sentences] #句子转词汇列表,[[]]
    bm25 = BM25(documents)

    participants = []
    outcomes = []
    interventions = []

    sentences=create_sentences(retrived_label)
    for sentence in sentences:
        query = sentence.split()
        #print(query)
        results = bm25.search(query)
        
        # clinical transfer and match
        for doc_idx, score in results:
            if score > 0.9:
                sim_document_content = " ".join(documents[doc_idx])
                first_word = query[0].lower()  # 将第一个词汇转为小写以便匹配
                if first_word in ["age", "gender", "conditions"]:
                    participants.append(sim_document_content)
                elif first_word.startswith("primary") or first_word.startswith("secondary"):
                    outcomes.append(sim_document_content)
                else:
                    interventions.append(sim_document_content)
    retrieved_participants=" ".join(participants)
    retrieved_outcomes=" ".join(outcomes)
    retrieved_interventions=" ".join(interventions)
    return retrieved_participants,retrieved_interventions,retrieved_outcomes

### 4.循环运行

In [None]:
import json
from tqdm import tqdm
import pandas as pd
import os
import openai

openai.api_key = "sk-2f96vHBUFQnCImcB7b1dC1C741D04c308a0d5dA91d140b3f"
openai.base_url = "https://free.gpt.ge/v1/"
openai.default_headers = {"x-foo": "true"}

api_key="sk-hZuBOOA4Ohv18QPNxV4OAhx8VE1A32m1LvWeUKpGWl2dMez4",
base_url="https://api.chatanywhere.tech/v1"


with open("/local/home/sumyao/YSforGIT/dataset/Filtered2Added/sectionspecific_nct_filtered_added_withnocluster.json",'r') as f:
    datasets=json.load(f)

predictions=[]
#for id in tqdm(range(len(datasets)),desc="Processing examples"):
for id in tqdm(range(27),desc="Processing examples"):
#for id in tqdm([4,5],desc="Processing examples"):
    content,retrived_label=datasets[id].get("content"),datasets[id].get("retrieved")
    retrieved_participants,retrieved_interventions,retrieved_outcomes= retrived_pio_BM25sentence(content,retrived_label)  
    instruction='''
                Please extract the PICO elements. You can refer to the annotation guidelines to understand what words shoud be extracted.
                    --**Output in JSON format as follows,make sure do not include the irrealavent values for the key:
                    {
                        "participants": "",
                        "interventions": "",
                        "comparator": "",
                        "outcomes": ""
                    }
        ---**Do not reply with anything beyond this JSON file.'''
    retrieved=f'''sentences related to Participants is {retrieved_participants}.
                sentences related to Interventions and comparator is{retrieved_interventions}.
            sentences related to Outcomes is  {retrieved_outcomes}.
            You will be punished if you mismatch the real words mislead by this paragraph'''
    retrieved_outcomes=    f'''sentences related to Outcomes is  {retrieved_outcomes}.
            You will be punished if you mismatch the real words mislead by this paragraph'''
    prompt='''
            ### Instruction: {}
            ### Input:{}
            ### Retrived:{}
            ### Response '''.format(instruction,content,retrieved_outcomes,"")
    #print(prompt)
    completion = openai.chat.completions.create(
                                        model="gpt-3.5-turbo",
                                        messages=[
                                            {
                                                "role": "user",
                                                "content": prompt,
                                            },
                                        ],
                                    )
    response=completion.choices[0].message.content
    #print(response)
    predictions.append(response)

with open('/local/home/sumyao/YSforGIT/output/predict_sectionspecific_chatgpt35_outcomesretrieved1.json', 'w') as json_file:
    json.dump(predictions, json_file)

In [11]:
with open('predict_sectionspecific_chatgpt35_retrieved.json', 'w') as json_file:
    json.dump(predictions, json_file,indent=4)

## ---评价函数

In [24]:
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from enum import Enum

def preprocess_text(text):
    """---input:sentence---output: cleaned sentence"""
    text = text.lower()
    text = re.sub(r'[-/]', ' ', text)
    text = re.sub(r'[(),;:-]', '', text)
    text = re.sub(r'Not specified', '', text)
    text = re.sub(r'not specified', '', text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

class LabelSet(Enum):
    SET1 = ["participants", "interventions", "comparator", "outcomes"]
    SET2=["participants", "interventions",  "outcomes"]


def model_assess(predictions,annotations, labels=LabelSet.SET1.value):
    """
    labels = ["Age", "Gender", "Conditions", "InterventionName","InterventionProtocol", "ComparatorName", "Outcomes", "OutcomeMeasures"]
    labels = [ "Age", "Gender", "Conditions", "Intervention", "samplesize", "Outcomes"]
    labels = [ "participants",'interventions','comparator','outcomes']
    """
    # step1 clean text
    for prediction in predictions:
        for key in prediction:
            prediction[key] = preprocess_text(prediction[key])
    
    for annotation in annotations:
        for key in annotation:
            annotation[key] = preprocess_text(annotation[key])
    
    
    # step2 calculate mean_token_lenth for each label
    average_tokens, total_tokens, count_labels = ({label: 0 for label in labels} for _ in range(3)) #重复生成三个字典，键都是labels中的每个label.对应值都是 0
    for anno in annotations:
        for label in labels:
            anno_tokens = set(anno.get(label, "").split())
            total_tokens[label] += len(anno_tokens)
            if len(anno_tokens) > 0:
                count_labels[label] += 1
    for label in labels:
        if count_labels[label] > 0:
            average_tokens[label] = total_tokens[label] / count_labels[label]
 
    # step3: Calculate TP, FP, FN for each label
    token_counts = {label: {"TP": 0, "FP": 0, "FN": 0} for label in labels}
    for pred, anno in zip(predictions, annotations):
        for label in labels:
            pred_tokens = set(pred.get(label, "").split())
            anno_tokens = set(anno.get(label, "").split())

            if not pred_tokens and not anno_tokens: #均为set() 空集合的时候
                TP = average_tokens[label]
                #TP=0
                FP, FN = 0, 0
            elif not pred_tokens and anno_tokens:
                TP, FP, FN = 0, 0, len(anno_tokens)
            elif pred_tokens and not anno_tokens:
                TP, FP, FN = 0, len(pred_tokens), 0
            else:
                TP = len(pred_tokens.intersection(anno_tokens))
                FP = len(pred_tokens - anno_tokens)
                FN = len(anno_tokens - pred_tokens)

            # Update counters
            token_counts[label]["TP"] += TP
            token_counts[label]["FP"] += FP
            token_counts[label]["FN"] += FN

    # Calculate precision, recall, and F1 for each label
    metrics = {}
    for label in labels:
        TP = token_counts[label]["TP"]
        FP = token_counts[label]["FP"]
        FN = token_counts[label]["FN"]

        if TP == 0 and FP == 0 and FN == 0:
            precision, recall, f1 = 1, 1, 1
        elif TP == 0 and FN > 0:
            precision, recall, f1 = 1, 0, 0
        elif TP == 0 and FP > 0:
            precision, recall, f1 = 0, 0, 0
        else:
            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        metrics[label] = {"precision": precision, "recall": recall, "f1": f1}

    # Calculate micro-average metrics
    total_TP = sum(token_counts[label]["TP"] for label in labels)
    total_FP = sum(token_counts[label]["FP"] for label in labels)
    total_FN = sum(token_counts[label]["FN"] for label in labels)

    micro_precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
    micro_recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

    micro_metrics = {"precision": micro_precision, "recall": micro_recall, "f1": micro_f1}

    return micro_metrics, metrics


In [28]:
## 计算rag-pico--zero
import os
aug_data=json.load(open("/local/home/sumyao/YSforGIT/dataset/Filtered2Added/sectionspecific_nct_filtered_added_withnocluster.json"))
annotations=[item['labels'] for item in aug_data][:27]

predictions=json.load(open('/local/home/sumyao/YSforGIT/output/predict_sectionspecific_chatgpt35_outcomesretrieved1.json'))

micro_metrics, metrics=model_assess(parsed_json_objects,annotations, labels=LabelSet.SET1.value)
micro_metrics, metrics

({'precision': 0.619144035838121,
  'recall': 0.6226107760046831,
  'f1': 0.6208725666861248},
 {'participants': {'precision': 0.9213872832369943,
   'recall': 0.5624558927311221,
   'f1': 0.6985100788781771},
  'interventions': {'precision': 0.30097087378640774,
   'recall': 0.4626865671641791,
   'f1': 0.36470588235294116},
  'comparator': {'precision': 0.46166394779771613,
   'recall': 0.5752032520325203,
   'f1': 0.5122171945701357},
  'outcomes': {'precision': 0.6404761904761904,
   'recall': 0.7472222222222222,
   'f1': 0.6897435897435897}})

In [27]:
predictions
def parse_json_strings(json_strings):
    json_objects = []
    for json_string in json_strings:
        json_string = re.sub(r'^```json\n', '', json_string)
        json_string = re.sub(r'\n```$', '', json_string)
        json_string = json_string.strip()
        try:
            # 解析 JSON 字符串
            json_object = json.loads(json_string)
            json_objects.append(json_object)
        except json.JSONDecodeError as e:
            print(f"JSON 解析错误: {e} 在字符串: {json_string}")
    return json_objects

# 使用函数解析 JSON 字符串
parsed_json_objects = parse_json_strings(predictions)
with open('ss_predictions_chatgpt35.json', 'w') as json_file:
    json.dump(parsed_json_objects, json_file)

## ---案例分析

In [None]:
## 第一个案例 纯提示
'{\n    "participants": "patients with mild Alzheimer disease",\n   
 "interventions": "Tarenflurbil, 800 mg, or placebo, administered twice a day",\n   
  "comparator": "placebo",\n    
  "outcomes": "change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale (ADAS-Cog, 80-point version) and Alzheimer Disease Cooperative Studies - activities of daily living (ADCS-ADL) scale"\n}'
## 第一个案例 只加三句话
'{\n    "participants": "patients with mild Alzheimer disease",\n  
  "interventions": "Tarenflurbil, 800 mg, or placebo, administered twice a day. Concomitant treatment with cholinesterase inhibitors or memantine was permitted.",\n   
   "comparator": "",\n   
    "outcomes": "change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale (ADAS-Cog, 80-point version) and Alzheimer Disease Cooperative Studies - activities of daily living (ADCS-ADL) scale"\n}'
## 第一个案例 加上对检索内容的批判
'{\n    "participants": "patients with mild Alzheimer disease",\n    
"interventions": "Tarenflurbil, 800 mg, or placebo, administered twice a day",\n   
 "comparator": "placebo",\n   
  "outcomes": "change from baseline to month 18 in total score on the subscale of the Alzheimer Disease Assessment Scale - Cognitive Subscale (ADAS-Cog, 80-point version) and Alzheimer Disease Cooperative Studies - activities of daily living (ADCS-ADL) scale"\n}'
##第二个案例 纯提示
  '{\n    "participants": "patients aged 45 to 90 years with probable AD, Mini-Mental State Examination score 0 to 20 (moderate to severe impairment), and who were receiving donepezil 10 mg once daily for > or =12 weeks before the start of the study",\n    
  "interventions": "high-dose donepezil (23 mg once daily) or standard-dose donepezil (10 mg once daily)",\n    
  "comparator": "standard-dose donepezil",\n    
  "outcomes": "changes in cognition and global functioning"\n}'

  
  