In [1]:
from PhenoDP_Preprocess import PhenoDP_Initial
from pyhpo import Ontology
import pickle
import pandas as pd

# Notebook Overview: Term-to-Disease Similarity Matrix

In this notebook, we demonstrate how to compute (or update) the term-to-disease similarity matrix. This process involves leveraging the Human Phenotype Ontology (HPO) and its associated data files.

## Prerequisites

To proceed, you will need to download the following files from the HPO GitHub release page:  
[HPO GitHub Releases](https://github.com/obophenotype/human-phenotype-ontology/releases/)

1. **`hp.obo`**: The Human Phenotype Ontology file, which contains the hierarchical structure of HPO terms.  
2. **`phenotype.hpoa`**: The phenotype annotations file, which maps diseases to HPO terms.  
3. **`phenotype_to_genes.txt`**: The phenotype-to-genes file, which links HPO terms to associated genes.

These files are essential for constructing the term-to-disease similarity matrix.

## Steps Overview

1. **Download the Required Files**:  
   Ensure you have the latest versions of `hp.obo`, `phenotype.hpoa`, and `phenotype_to_genes.txt` from the HPO GitHub release page.

2. **Load the HPO Ontology**:  
   Use the `hp.obo` file to load the HPO ontology into a graph structure for further processing.

3. **Process Annotations**:  
   Parse the `phenotype.hpoa` file to extract disease-to-HPO term mappings.

4. **Compute Term Embeddings**:  
   Use a pre-trained model to compute embeddings for each HPO term.

5. **Construct the Similarity Matrix**:  
   Calculate the similarity between HPO terms and diseases based on their embeddings.

6. **Update or Save the Matrix**:  
   Save the computed similarity matrix for future use or update an existing matrix with new data.

## Next Steps

Proceed to the code cells to implement the above steps and generate the term-to-disease similarity matrix.


## Computing Disease Similarity: Single-process or Multi-process Implementation (the Single-process Run Takes Over Ten Hours)

In [None]:
# Single-process run: load the HPO release from ./HPO_2025_3_3/ and compute
# every term-to-disease similarity in this kernel. Per the section heading,
# this path takes over ten hours.
_ = Ontology("./HPO_2025_3_3/")
# PhenoDP_Initial is handed the module-level Ontology object loaded above.
Initial = PhenoDP_Initial(Ontology)
# Number of rows in the ontology dataframe (presumably one per HPO term);
# not used again in this cell.
hpo_len = len(Ontology.to_dataframe())
# Produces the same two names that the multi-process cell builds by merging chunks.
hp2d_sim_dict, processed_list = Initial.initial_sim_singlecore()

## Multi-process Implementation

In [2]:
from concurrent.futures import ProcessPoolExecutor

# --- Parallel run configuration ---

# Load the HPO release in the parent process; here it is only used to count terms.
_ = Ontology("./HPO_2025_3_3/")
hpo_len = len(Ontology.to_dataframe())

num_groups = 30   # how many contiguous index chunks the term range is cut into
max_workers = 30  # size of the worker *process* pool (ProcessPoolExecutor, not threads)

# Indices 0 .. total_range - 1 are what gets distributed across the groups.
total_range = hpo_len

# Floor-divided chunk length; total_range % num_groups indices are left over.
group_size = total_range // num_groups

# Worker entry point: called once per (start, end) chunk by the process pool
def run_initial_sim(start, end):
    """Run ``PhenoDP_Initial.initial_sim`` for one chunk of HPO term indices.

    The ontology and the PhenoDP_Initial helper are constructed inside the
    call so that every worker process sets up its own copies.

    NOTE(review): ProcessPoolExecutor needs this function to be importable by
    the workers; with the "spawn" start method (Windows/macOS default) a
    function defined in a notebook cell may not be picklable -- confirm on
    the target platform.

    Args:
        start: start index of the chunk, forwarded to ``initial_sim``.
        end: end index of the chunk, forwarded to ``initial_sim``.

    Returns:
        Whatever ``initial_sim`` returns; the calling cell reads element 0
        into ``hp2d_sim_dict`` and element 1 into ``processed_list``.
    """
    worker_initial = PhenoDP_Initial(Ontology("./HPO_2025_3_3/"))
    return worker_initial.initial_sim(start=start, end=end)

# Split [0, total_range) into contiguous (start, end) chunks that cover EVERY index.
# Plain floor division (total_range // num_groups) silently drops the last
# total_range % num_groups indices -- e.g. 19533 terms over 30 groups covers only
# 30 * 651 = 19530 -- so the remainder is handed out one extra index at a time
# to the leading chunks instead.
# Assumes initial_sim treats `end` as exclusive (the 651-item progress bars for
# 651-wide chunks suggest so) -- TODO confirm.
base_size, leftover = divmod(total_range, num_groups)
ranges = []
chunk_start = 0
for group_idx in range(num_groups):
    chunk_end = chunk_start + base_size + (1 if group_idx < leftover else 0)
    if chunk_end > chunk_start:  # skip empty chunks when total_range < num_groups
        ranges.append((chunk_start, chunk_end))
    chunk_start = chunk_end

# Fan the chunks out over a pool of worker processes
with ProcessPoolExecutor(max_workers=max_workers) as pool:
    # One task per (start, end) chunk
    futures = [pool.submit(run_initial_sim, lo, hi) for lo, hi in ranges]

    # Wait on each future in submission order, so `results` lines up with `ranges`
    results = [task.result() for task in futures]

# Concatenate the per-chunk outputs into two flat lists
hp2d_sim_dict = []   # despite the name this is a list, grown chunk by chunk
processed_list = []
for chunk_output in results:
    hp2d_sim_dict += chunk_output[0]   # element 0 of each worker result
    processed_list += chunk_output[1]  # element 1 of each worker result

generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
related hpo num: 9211
generate disease ic dict... 
calculating hp weights
related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
calculating hp weights
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

calculating hp weights
related hpo num: 9211
generate disease ic dict... 
total hpo len: 19533
calculating hp weights


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num: 9211
generate disease ic dict... 
generate disease dict...
total hpo len:calculating hp weights 19533



HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
calculating hp weights
calculating hp weights
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num: 9211
generate disease ic dict... 
calculating hp weights


HPO Processing:   0%|          | 1/651 [00:05<57:55,  5.35s/it]

total hpo len: 19533


HPO Processing:   0%|          | 2/651 [00:04<26:47,  2.48s/it]

generate disease dict...
generate disease dict...generate disease dict...generate disease dict...


generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...
generate disease dict...


HPO Processing:   0%|          | 1/651 [00:05<59:06,  5.46s/it]

generate disease dict...
generate disease dict...
generate disease dict...


HPO Processing:   0%|          | 1/651 [00:05<54:18,  5.01s/it]

generate disease dict...
generate disease dict...
generate disease dict...


HPO Processing:   0%|          | 1/651 [00:05<57:54,  5.34s/it]

generate disease dict...
generate disease dict...
related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211related hpo num:
 9211
generate disease ic dict... 
generate disease ic dict... 
related hpo num: calculating hp weights
related hpo num:9211 9211

generate disease ic dict... generate disease ic dict... 

related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
related hpo num:related hpo num: 9211
generate disease ic dict... 
 9211
generate disease ic dict... 
related hpo num: 9211related hpo num:
generate disease ic dict...  
9211
generate disease ic dict... 
calculating hp weights


HPO Processing:   0%|          | 2/651 [00:10<54:17,  5.02s/it]

calculating hp weights
calculating hp weights
related hpo num:calculating hp weights 
calculating hp weightscalculating hp weights
9211
generate disease ic dict... 

calculating hp weights
calculating hp weights
calculating hp weights
calculating hp weights
related hpo num: 9211
generate disease ic dict... 
related hpo num: 9211
generate disease ic dict... 
calculating hp weights
related hpo num: 9211calculating hp weights


HPO Processing:   0%|          | 3/651 [00:10<38:44,  3.59s/it]


generate disease ic dict... 


HPO Processing:   0%|          | 1/651 [00:05<59:35,  5.50s/it]

calculating hp weights
calculating hp weights
total hpo len:calculating hp weights
 19533
calculating hp weights


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num: 9211
generate disease ic dict... 
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num:total hpo len: 19533
 related hpo num:9211 
9211


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

generate disease ic dict... generate disease ic dict... 

total hpo len: total hpo len: 19533
19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: total hpo len:19533 total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num:19533 
9211



HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

generate disease ic dict... 
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

calculating hp weights
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

related hpo num: 9211
generate disease ic dict... 
total hpo len: 19533

HPO Processing:   0%|          | 2/651 [00:10<58:43,  5.43s/it]




HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

calculating hp weights
calculating hp weights
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

calculating hp weights
total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

calculating hp weightstotal hpo len:
 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 2/651 [00:10<55:54,  5.17s/it]

total hpo len: 

HPO Processing:   0%|          | 2/651 [00:10<57:29,  5.32s/it]

19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing:   0%|          | 0/651 [00:00<?, ?it/s]

total hpo len: 19533


HPO Processing: 100%|██████████| 651/651 [55:37<00:00,  5.13s/it]  


end


HPO Processing: 100%|██████████| 651/651 [56:01<00:00,  5.16s/it]


end


HPO Processing: 100%|██████████| 651/651 [56:19<00:00,  5.19s/it]


end


HPO Processing: 100%|██████████| 651/651 [56:17<00:00,  5.19s/it]


end


HPO Processing: 100%|██████████| 651/651 [56:25<00:00,  5.20s/it]


end


HPO Processing: 100%|██████████| 651/651 [56:31<00:00,  4.72s/it]


end


HPO Processing: 100%|██████████| 651/651 [56:55<00:00,  5.25s/it]


end


HPO Processing:  78%|███████▊  | 510/651 [57:31<14:27,  6.15s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:06:35<00:00,  6.14s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:07:20<00:00,  6.21s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:07:32<00:00,  6.22s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:10:36<00:00,  6.51s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:10:54<00:00,  6.53s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:10:58<00:00,  6.54s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:01<00:00,  6.55s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:02<00:00,  6.55s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:02<00:00,  6.55s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:14<00:00,  6.57s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:29<00:00,  6.59s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:36<00:00,  6.60s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:37<00:00,  6.60s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:44<00:00,  6.61s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:45<00:00,  6.61s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:45<00:00,  6.61s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:49<00:00,  6.62s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:11:48<00:00,  6.62s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:12:11<00:00,  6.65s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:12:15<00:00,  6.66s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:12:33<00:00,  6.69s/it]


end


HPO Processing: 100%|██████████| 651/651 [1:12:56<00:00,  6.72s/it]


end


In [3]:
# Build a PhenoDP_Initial in the parent process; the following cell reads its
# `disease_list` attribute to label the similarity table's columns.
Initial = PhenoDP_Initial(Ontology)

generate disease dict...
related hpo num: 9211
generate disease ic dict... 
calculating hp weights


In [4]:
# Column labels: string form of every entry in Initial.disease_list
disease_columns = list(map(str, Initial.disease_list))

# Rows are labelled by processed_list, columns by disease
df = pd.DataFrame(hp2d_sim_dict, index=processed_list, columns=disease_columns)

# Nested mapping {row label -> {disease label -> value}}
JC_sim_dict = df.to_dict(orient='index')

In [5]:
# Persist the nested similarity dict next to the notebook as a binary pickle
with open('./JC_sim_dict_test.pkl', 'wb') as pkl_out:
    pickle.dump(JC_sim_dict, pkl_out)