In [1]:
# download link: https://github.com/obophenotype/human-phenotype-ontology/releases

import obonet
import pandas as pd
import polars as pl

# Parse the HPO ontology file into a graph object.
graph = obonet.read_obo("docs/hp.obo")


def _read_hpo_table(path):
    """Read a tab-separated HPO annotation file into a polars DataFrame.

    The file is parsed with pandas first and then converted to polars.
    Every column is read as a string (``dtype=str``), and ``#`` is passed
    to pandas as the comment character, so ``#``-prefixed header lines are
    skipped.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.

    Returns
    -------
    polars.DataFrame
        The table, one column per field in the file.
    """
    return pl.from_pandas(
        pd.read_csv(path, sep="\t", comment="#", dtype=str)
    )


# Phenotype-disease mappings
df_hpoa = _read_hpo_table("docs/phenotype.hpoa")

# Phenotype-gene mappings
df_genes = _read_hpo_table("docs/phenotype_to_genes.txt")



In [13]:
def _snomed_xrefs(term_attrs):
    """Return the xref strings of one ontology term that start with "SNOMEDCT".

    The term's ``xref`` attribute is looked up (defaulting to an empty list);
    a bare string is treated as a one-element list.
    """
    raw = term_attrs.get("xref", [])
    candidates = [raw] if isinstance(raw, str) else raw
    return [ref for ref in candidates if ref.startswith("SNOMEDCT")]


# One (hpo_id, snomed_code, hpo_name) tuple per SNOMED cross-reference, in
# graph iteration order; a term with several SNOMED xrefs yields several rows.
snomed_rows = [
    (term_id, ref.split(":")[-1], term_attrs.get("name", ""))
    for term_id, term_attrs in graph.nodes(data=True)
    for ref in _snomed_xrefs(term_attrs)
]

# Column-wise views of the rows above.
hpo_ids = [row[0] for row in snomed_rows]
snomed_ids = [row[1] for row in snomed_rows]
hpo_names = [row[2] for row in snomed_rows]

df_snomed_mapping = pl.DataFrame({
    "hpo_id": hpo_ids,
    "hpo_map_to_snomed_id": snomed_ids,
    "hpo_name": hpo_names
})

In [21]:
# Build the phenotype -> disease table for every HPO term that has a SNOMED
# mapping. A left join keeps mapped terms that have no disease annotation
# (their disease columns are null).
#
# Two corrections relative to a plain join + select:
#   1. In the HPO annotation format a "NOT" qualifier marks a phenotype that is
#      explicitly *excluded* for the disease; such rows must not be reported as
#      associations, so they are dropped before joining.
#      NOTE(review): assumes the annotation table exposes a `qualifier` column;
#      when it does not, the table is used unfiltered.
#   2. Only five columns are kept, so annotation rows that differed solely in
#      dropped columns collapse into identical rows; `.unique` removes those
#      repeats while preserving row order.
hpoa_positive = (
    df_hpoa.filter(pl.col("qualifier").fill_null("") != "NOT")
    if "qualifier" in df_hpoa.columns
    else df_hpoa
)

pheno_disease_association = (
    df_snomed_mapping
    .join(hpoa_positive, on="hpo_id", how="left")
    .select("hpo_id", "hpo_name", "hpo_map_to_snomed_id", "database_id", "disease_name")
    .rename({"database_id": "disease_associated_id", "disease_name": "disease_associated_name"})
    .unique(maintain_order=True)
)

In [None]:
# Build the phenotype -> gene table for every HPO term that has a SNOMED
# mapping. A left join keeps mapped terms with no gene annotation (their
# gene column is null).
#
# Only `gene_symbol` is kept from the gene table, so any rows that differed
# solely in the dropped columns become exact duplicates after the select
# (presumably one row per linking disease — confirm against the file header).
# `.unique` collapses them to one row per (term, SNOMED id, gene) while
# preserving row order.
pheno_gene_association = (
    df_snomed_mapping
    .join(df_genes, on="hpo_id", how="left")
    .select("hpo_id", "hpo_name", "hpo_map_to_snomed_id", "gene_symbol")
    .rename({"gene_symbol": "gene_associated_symbol"})
    .unique(maintain_order=True)
)


In [28]:
# Persist both association tables as CSV files, disease table first.
csv_exports = (
    (pheno_disease_association, "docs/pheno_disease_association.csv"),
    (pheno_gene_association, "docs/pheno_gene_association.csv"),
)
for association_frame, csv_path in csv_exports:
    association_frame.write_csv(csv_path)
