In [1]:
import re
import torch
import pandas as pd
from pyhpo import Ontology

from phenodp import PhenoDP, PhenoDP_Initial
from phenodp.encoders import PCL_HPOEncoder
from phenodp.utils import load_similarity_matrix, load_node_embeddings

In [2]:
ontology = Ontology(data_folder='../data/hpo-2025-05-06')

In [3]:
# Initialize the PhenoDP model
recommender = PCL_HPOEncoder(input_dim=256, num_heads=8, num_layers=3, hidden_dim=512, output_dim=1, max_seq_length=128)
pre_model = PhenoDP_Initial(ontology, ic_type='omim')
hp2d_sim_dict = load_similarity_matrix('../data/JC_sim_dict.pkl')
node_embedding = load_node_embeddings('../data/node_embedding_dict.pkl')
recommender.load_state_dict(torch.load('../data/transformer_encoder_infoNCE.pth'))
phenodp = PhenoDP(pre_model, hp2d_sim_dict, node_embedding, recommender)



generate disease dict...
related hpo num: 9216
generate disease ic dict... 
calculating hp weights
PCL_HPOEncoder is a pre-trained model


In [4]:
def calculate_coefficient_of_variation(results, top_n=3):
    """
    Computes the mean, standard deviation, and coefficient of variation (CV) for the 'Total_Similarity' column 
    of the top n rows in a given DataFrame.

    Parameters:
    results (pd.DataFrame): A DataFrame containing the 'Total_Similarity' column.
    top_n (int): The number of top rows to consider for the calculation. Default is 3.

    Returns:
    mean (float): The mean value of the 'Total_Similarity' column for the top n rows.
    std (float): The standard deviation of the 'Total_Similarity' column for the top n rows.
    cv (float): The coefficient of variation (CV) expressed as a percentage.
    """
    # Extract the top n rows of the 'Total_Similarity' column
    data = results.head(top_n)['Total_Similarity']
    
    # Calculate mean, standard deviation, and coefficient of variation
    mean = data.mean()
    std = data.std()
    cv = (std / mean) * 100
    
    return cv

def Get_Definition(hpo_list):
    definition_list = []
    for t in hpo_list:
        definition = Ontology.get_hpo_object(t).definition
        match = re.search(r'"(.*?)"', definition)
        if match:
            definition_list.append(match.group(1))
    return ' '.join(definition_list)

def generate_diagnosis_prompt(Patient_hps, results, Top_n=3, Top_Recom=2):
    """
    Generate a prompt for explaining potential symptoms to differentiate between candidate diseases.

    Args:
        Patient_hps (list): List of patient's observed HPO terms.
        results (pd.DataFrame): DataFrame containing candidate diseases and their details.
        Top_n (int): Number of top candidate diseases to consider. Default is 3.
        Top_Recom (int): Number of top recommended symptoms for each disease. Default is 2.

    Returns:
        str: A formatted prompt for disease differentiation.
    """
    # Get observed symptoms
    observered_syn = Get_Definition(Patient_hps)
    
    # Get top candidate diseases
    Condidate_diseases = results.head(Top_n)['Disease'].values
    
    # Initialize lists for diseases and recommendations
    diseases_list = []
    Recom_list = []
    txt_inputs = []
    
    # Process each candidate disease
    for index, t in enumerate(Condidate_diseases):
        # Get recommended symptoms for the disease
        recom = phenodp.run_Recommender(Patient_hps, target_disease=t, candidate_diseases=Condidate_diseases)
        Recom_list.append([Ontology.get_hpo_object(t).name for t in recom.head(Top_Recom).hp.values])
        
        # Get disease details and append to diseases_list
        diseases_list.extend([str(index + 1) + '. [OMIM:' + str(t) + '] ' + j.name for j in Ontology.omim_diseases if j.id == t])
        
        # Format disease and symptoms for txt_inputs
        txt_inputs.append(diseases_list[-1] + ' : ' + ', '.join(Recom_list[-1]))
    
    # Format diseases_list and txt_inputs as strings
    diseases_list_str = "\n".join(diseases_list)
    txt_inputs_str = "\n".join(txt_inputs)
    
    # Generate the prompt
    prompt = f"""
Assume you are an experienced clinical physician. Below is a patient’s symptom description using HPO (Human Phenotype Ontology) terms, along with three candidate diagnoses. To further differentiate between these diagnoses, the physician has provided potential symptoms that the patient does not currently exhibit but could help clarify or confirm the diagnosis. Your task is to explain why these potential symptoms are critical for distinguishing between the three diseases.  

**Patient’s Symptom Description**:  
{observered_syn}  

**Three Most Likely Disease Diagnoses**:  
{diseases_list_str}  

**Potential Symptoms for Further Differentiation**:  
{txt_inputs_str}  

**Instructions**:  
1. **Explain Potential Symptoms**: Provide a clear and concise rationale for why the listed potential symptoms are critical for distinguishing between the three diseases. Focus on how these symptoms are specific to or more prevalent in one disease compared to the others.  
2. **Do Not Diagnose**: Do not make any new diagnoses or suggest additional diseases. Your response should focus solely on explaining the potential symptoms for differentiation.  
3. **Length and Style**: The report should be approximately 200–300 words in length, written in a professional and authentic tone that mimics a human expert.  
4. **No References**: Do not include any references in the report.   
"""
    return prompt

### Ranker

In [5]:
patient_data = pd.read_pickle('../data/dataset_real-world1.pkl')
Patient_hps = patient_data[0][1]

In [6]:
results = phenodp.run_Ranker(Patient_hps)
results

Find Candidate Diseases: 100%|██████████| 2581/2581 [00:00<00:00, 18147.72it/s]
Calculating Phi Scores: 100%|██████████| 200/200 [00:00<00:00, 570.85it/s]
Calculating Embedding Similarity: 100%|██████████| 200/200 [00:07<00:00, 27.45it/s]


Unnamed: 0,Disease,Total_Similarity
0,616395,0.733203
1,619488,0.590312
2,608013,0.565359
3,619991,0.560130
4,608104,0.553133
...,...,...
195,251100,0.456102
196,608670,0.451051
197,612474,0.448265
198,601812,0.442209


In [7]:
calculate_coefficient_of_variation(results, top_n=3)

np.float64(14.383970851360875)

### Recommender

In [8]:
candidates_d = results.head()['Disease'].values[:3]
candidates_d

array([616395, 619488, 608013])

In [9]:
phenodp.run_Recommender(
    given_hps=Patient_hps,
    target_disease=candidates_d[0],
    candidate_diseases=candidates_d
)

using default setting...


Calculating NCE Loss: 100%|██████████| 20/20 [00:00<00:00, 128.97it/s]


Unnamed: 0,hp,importance
0,HP:0009886,1.542683
1,HP:0003261,1.493406
2,HP:0002188,1.304774
3,HP:0000518,0.755919
4,HP:0000519,0.74663
5,HP:0002719,0.69561
6,HP:0008069,0.653429
7,HP:0001263,0.644845
8,HP:0003577,0.639668
9,HP:0001249,0.632255


In [10]:
# 测试修复后的推荐器
print("=== 测试修复后的推荐器 ===")

# 再次运行推荐器
recommendations_fixed = phenodp.run_Recommender(
    given_hps=Patient_hps,
    target_disease=candidates_d[0],
    candidate_diseases=candidates_d
)

print("修复后的推荐结果 (前10个):")
print(recommendations_fixed.head(10))

# 检查importance值的分布
importance_values = recommendations_fixed['importance'].values
print(f"\nImportance值统计:")
print(f"最小值: {importance_values.min():.6f}")
print(f"最大值: {importance_values.max():.6f}")
print(f"平均值: {importance_values.mean():.6f}")
print(f"标准差: {importance_values.std():.6f}")
print(f"唯一值数量: {len(set(importance_values))}")

if len(set(importance_values)) == 1:
    print("❌ 所有importance值仍然相同!")
elif len(set(importance_values)) < 5:
    print(f"⚠️  只有{len(set(importance_values))}个不同的importance值，可能还有其他问题")
else:
    print("✅ importance值现在有足够的差异了!")


=== 测试修复后的推荐器 ===
using default setting...


Calculating NCE Loss: 100%|██████████| 20/20 [00:00<00:00, 135.47it/s]

修复后的推荐结果 (前10个):
           hp  importance
0  HP:0009886    1.542683
1  HP:0003261    1.493406
2  HP:0002188    1.304774
3  HP:0000518    0.755919
4  HP:0000519    0.746630
5  HP:0002719    0.695610
6  HP:0008069    0.653429
7  HP:0001263    0.644845
8  HP:0003577    0.639668
9  HP:0001249    0.632255

Importance值统计:
最小值: 0.489695
最大值: 1.542683
平均值: 0.738086
标准差: 0.307619
唯一值数量: 20
✅ importance值现在有足够的差异了!





### Summarizer

In [11]:
prompt = generate_diagnosis_prompt(Patient_hps, results, Top_n=5, Top_Recom=3)
print(prompt)

using default setting...


Calculating NCE Loss: 100%|██████████| 20/20 [00:00<00:00, 144.36it/s]

using default setting...



Calculating NCE Loss: 100%|██████████| 104/104 [00:00<00:00, 141.69it/s]


using default setting...


Calculating NCE Loss: 100%|██████████| 37/37 [00:00<00:00, 92.04it/s] 


using default setting...


Calculating NCE Loss: 100%|██████████| 88/88 [00:00<00:00, 153.32it/s]


using default setting...


Calculating NCE Loss: 100%|██████████| 25/25 [00:00<00:00, 152.73it/s]


Assume you are an experienced clinical physician. Below is a patient’s symptom description using HPO (Human Phenotype Ontology) terms, along with three candidate diagnoses. To further differentiate between these diagnoses, the physician has provided potential symptoms that the patient does not currently exhibit but could help clarify or confirm the diagnosis. Your task is to explain why these potential symptoms are critical for distinguishing between the three diseases.  

**Patient’s Symptom Description**:  
Premature rupture of membranes (PROM) is a condition which occurs in pregnancy when the amniotic sac ruptures more than an hour before the onset of labor. An outward turning (eversion) or rotation of the eyelid margin. A foot where the longitudinal arch of the foot is in contact with the ground or floor when the individual is standing; or, in a patient lying supine, a foot where the arch is in contact with the surface of a flat board pressed against the sole of the foot by the ex




In [12]:
with open('../data/case_report_prompt.txt', 'w', encoding='utf-8') as f:
    f.write(prompt)