In [None]:
import pickle
import pandas as pd
from pyhpo import Ontology
from concurrent.futures import ProcessPoolExecutor

from phenodp import PhenoDP_Initial

# Load the HPO ontology from the local data folder; the PhenoDP_Initial
# instance below is constructed from this object.
ontology = Ontology(data_folder='../data/hpo-2025-05-06')

# Precomputation helper. This notebook uses its `hpo_list`, `initial_sim`
# and `disease_list` members.
pre_model = PhenoDP_Initial(ontology)
hpo_len = len(pre_model.hpo_list)  # Total number of entries to split across tasks

num_groups = 30  # Number of index ranges (tasks) to divide the work into
max_workers = 30  # Number of worker processes given to ProcessPoolExecutor

# Floor division: hpo_len can exceed num_groups * group_size by up to
# num_groups - 1 entries, so whatever builds the index ranges has to account
# for that remainder.
group_size = hpo_len // num_groups

def run_initial_sim(start, end):
    """Worker task: run ``pre_model.initial_sim`` over one index range.

    Kept as a plain module-level function so it can be submitted to a
    ``ProcessPoolExecutor``. It reads the module-level ``pre_model`` rather
    than receiving it as an argument, so each worker process must be able to
    see that global (NOTE(review): this works with the ``fork`` start method;
    confirm behaviour on platforms that default to ``spawn``).

    Args:
        start: Range start, forwarded as ``start=``.
        end: Range end, forwarded as ``end=``.

    Returns:
        Whatever ``pre_model.initial_sim`` returns, unchanged. The merging
        code in this notebook indexes it as ``[0]`` and ``[1]``.
    """
    chunk_result = pre_model.initial_sim(start=start, end=end)
    return chunk_result

# Build one (start, end) index range per group.
# group_size comes from floor division, so num_groups chunks of exactly
# group_size would leave the final hpo_len % num_groups entries unprocessed.
# The last range is therefore stretched to end at hpo_len so every entry is
# covered exactly once.
# Assumes initial_sim treats (start, end) as a half-open range, as the
# contiguous i*size .. (i+1)*size layout implies - TODO confirm.
ranges = [
    (i * group_size, hpo_len if i == num_groups - 1 else (i + 1) * group_size)
    for i in range(num_groups)
]

with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(run_initial_sim, start, end) for start, end in ranges]

    # Collect in submission order (not completion order) so the merged rows
    # below stay aligned with the ascending index ranges.
    results = [future.result() for future in futures]

# NOTE: despite its name, hp2d_sim_dict is a list. It is later passed to
# pd.DataFrame as row data, with processed_list as the matching row index,
# so the two lists must stay in the same order.
hp2d_sim_dict = []
processed_list = []

for result in results:
    hp2d_sim_dict.extend(result[0])   # similarity rows produced for this range
    processed_list.extend(result[1])  # row labels matching those rows, same order

In [3]:
# Column labels: the string form of every entry in pre_model.disease_list.
disease_columns = list(map(str, pre_model.disease_list))

# One row per processed entry (labelled by processed_list), one column per disease.
df = pd.DataFrame(data=hp2d_sim_dict, index=processed_list, columns=disease_columns)

# orient='index' yields a nested mapping: {row_label: {column_label: value}}.
JC_sim_dict = df.to_dict(orient='index')

# Persist the nested dict for later reuse.
with open('../data/JC_sim_dict.pkl', 'wb') as pkl_out:
    pickle.dump(JC_sim_dict, pkl_out)