# HPO parser

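This notebook parses the HPO ontology file (`hpo.obo`) to extract term names, cross references, and alternate ids. The cross references provide the HPO to MeSH id mapping needed to link in rare disease phenotypes.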

In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np

In [2]:
def read_file(fname, encoding="utf-8"):
    """Lazily yield the lines of a text file without their trailing newline.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that the
        result does not depend on the platform's locale encoding.

    Yields
    ------
    str
        Each line in file order with the trailing newline removed. Blank
        lines are yielded as empty strings.
    """
    with open(fname, "r", encoding=encoding) as fin:
        for line in fin:
            yield line.rstrip("\n")

## Parse out all names

In [3]:
# Collect the id and name of every [Term] stanza into column-wise lists.
data = defaultdict(list)

stream = read_file("../data/raw/hpo.obo")

for row in stream:
    if row != "[Term]":
        continue

    # The line directly after the stanza header must carry the term id.
    id_line = next(stream)
    assert id_line.startswith("id:")

    # The name is expected on the next line, or failing that the one after.
    name_line = next(stream)
    if not name_line.startswith("name:"):
        name_line = next(stream)
    assert name_line.startswith("name:"), name_line

    # Slice off the tag prefixes (assumes a single space after the colon).
    data["hpo_id"].append(id_line[4:])
    data["hpo_name"].append(name_line[6:])

terms = pd.DataFrame(data)

## Parse out cross references

In [4]:
# Collect one (hpo_id, xref) row for every "xref:" line of every [Term] stanza.
data = defaultdict(list)

stream = read_file("../data/raw/hpo.obo")

for row in stream:
    if row != "[Term]":
        continue

    # The line directly after the stanza header must carry the term id.
    id_line = next(stream)
    assert id_line.startswith("id:")
    term_id = id_line[4:]

    # Keep consuming the same stream; an empty line ends the current stanza.
    for field in stream:
        if field == "":
            break

        if field.startswith("xref:"):
            data["hpo_id"].append(term_id)
            # Slice off the tag prefix (assumes a single space after the colon).
            data["xref"].append(field[6:])

xrefs = pd.DataFrame(data)

## Parse out alternate ids

In [5]:
# Collect one (hpo_id, alt_id) row for every "alt_id:" line of every [Term] stanza.
data = defaultdict(list)

stream = read_file("../data/raw/hpo.obo")

for row in stream:
    if row != "[Term]":
        continue

    # The line directly after the stanza header must carry the term id.
    id_line = next(stream)
    assert id_line.startswith("id:"), id_line
    term_id = id_line[4:]

    # Keep consuming the same stream; an empty line ends the current stanza.
    for field in stream:
        if field == "":
            break

        if field.startswith("alt_id:"):
            data["hpo_id"].append(term_id)
            # Slice off the tag prefix (assumes a single space after the colon).
            data["alt_id"].append(field[8:])

alts = pd.DataFrame(data)

---

## Total number of HPO terms

In [6]:
# (rows, columns) of the term table: one row per parsed [Term] stanza.
terms.shape

(12786, 2)

In [7]:
# Preview the first rows of the term table.
terms.head()

Unnamed: 0,hpo_id,hpo_name
0,HP:0000001,All
1,HP:0000002,Abnormality of body height
2,HP:0000003,Multicystic kidney dysplasia
3,HP:0000005,Mode of inheritance
4,HP:0000006,Autosomal dominant inheritance


---

## Cross references to other ids

In [8]:
# Derive the source prefix of each cross reference (the text before the first
# ":"). Splitting once keeps the whole value when an xref contains no colon;
# the previous slice `v[:v.find(":")]` dropped the last character in that case
# because `str.find` returns -1 when nothing is found.
xrefs["xtype"] = xrefs["xref"].str.split(":", n=1).str[0]

In [9]:
# (rows, columns) of the cross-reference table: one row per parsed xref line.
xrefs.shape

(20268, 3)

In [10]:
# Preview the first rows of the cross-reference table.
xrefs.head()

Unnamed: 0,hpo_id,xref,xtype
0,HP:0000001,UMLS:C0444868,UMLS
1,HP:0000002,UMLS:C4025901,UMLS
2,HP:0000003,MSH:D021782,MSH
3,HP:0000003,SNOMEDCT_US:204962002,SNOMEDCT_US
4,HP:0000003,SNOMEDCT_US:82525005,SNOMEDCT_US


In [11]:
# Number of distinct HPO ids that have at least one cross-reference row.
xrefs["hpo_id"].nunique()

11760

1026 of the 12786 HPO terms (12786 − 11760) do not have any sort of cross reference to other ontologies. These can still be included in the network, but will probably have no connection to any other concepts.

In [12]:
# Count cross-reference rows per xtype value, most frequent first.
xrefs["xtype"].value_counts()

UMLS                                     13167
SNOMEDCT_US                               4703
MSH                                       2186
MEDDRA                                      93
ICD-10                                      38
pmid                                        23
EPCC                                        13
NCIT                                        10
MP                                          10
ORPHA                                        6
MPATH                                        4
ICD-O                                        3
ICD-9                                        3
PMID                                         3
DOI                                          2
NCIT_C345                                    1
http                                         1
DOID                                         1
Absence of ceruloplasmin in the blood        1
Name: xtype, dtype: int64

---

## Alternate identifiers

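Some HPO identifiers are outdated because the official id of the term has changed. Each row below pairs an alternate identifier (`alt_id`) with the id of the term that lists it (`hpo_id`).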

In [13]:
# (rows, columns) of the alternate-id table: one row per parsed alt_id line.
alts.shape

(3518, 2)

In [14]:
# Preview the first rows of the alternate-id table.
alts.head()

Unnamed: 0,alt_id,hpo_id
0,HP:0004715,HP:0000003
1,HP:0001453,HP:0000005
2,HP:0001461,HP:0000005
3,HP:0001415,HP:0000006
4,HP:0001447,HP:0000006


## Save to file

In [15]:
# Write the term table as a tab-separated file without the index column.
# NOTE(review): the input is read from "../data/raw/" but this writes under
# "data/" relative to the working directory — confirm that is the intended
# location and that the directory exists before running.
terms.to_csv("data/hpo_terms.tsv", sep='\t', index=False)

In [16]:
# Write the cross-reference table as a tab-separated file without the index column.
xrefs.to_csv("data/hpo_xrefs.tsv", sep='\t', index=False)

In [17]:
# Write the alternate-id table as a tab-separated file without the index column.
alts.to_csv("data/hpo_alts.tsv", sep='\t', index=False)