# Map DOIDs to CUIs

In [1]:
import pandas as pd

In [2]:
def rawgit(handle, repo, commit, *args):
    """Build the URL of a raw file hosted in a GitHub repository.

    The raw-content host, the account ``handle``, the ``repo`` name, the
    ``commit`` reference and any further path components given in ``args``
    are joined with forward slashes, in that order.
    """
    parts = ['https://raw.githubusercontent.com', handle, repo, commit]
    parts.extend(args)
    return '/'.join(parts)

In [3]:
# Pin the download to a fixed commit so re-running the notebook always
# reads the same version of the slim-terms table.
commit = '75050ea2d4f60e745d3f3578ae03560a2cc0e444'
url = rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')

# The file is tab-separated; read it straight from the raw GitHub URL.
disease_df = pd.read_csv(url, sep='\t')

disease_df.head()

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic
2,DOID:1324,lung cancer,DOcancerslim,neoplastic
3,DOID:263,kidney cancer,DOcancerslim,neoplastic
4,DOID:1793,pancreatic cancer,DOcancerslim,neoplastic


In [4]:
# (rows, columns) of the slim disease table.
disease_df.shape

(137, 4)

---

In [5]:
def parse_doid(path="../data/doid.obo"):
    """Extract cross-references and alternate ids from a Disease Ontology OBO file.

    The file is scanned line by line. Each ``id: DOID...`` line sets the
    current term; every following ``xref:`` or ``alt_id:`` line is recorded
    against that term until the next stanza begins.

    Parameters
    ----------
    path : str, optional
        Location of the OBO file. Defaults to ``"../data/doid.obo"``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The first frame has columns ``doid`` and ``xref``; the second has
        columns ``doid`` and ``alt_id``. Each row is one tagged line of the
        file paired with the DOID of the stanza it appeared in.
    """
    xref_pairs = []
    alt_pairs = []
    # DOID of the stanza currently being read; None while outside a DOID stanza.
    doid = None

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(path, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip("\n")

            if line.startswith("["):
                # A stanza header such as [Term] or [Typedef]: forget the
                # previous id so tags from a non-DOID stanza are not
                # attributed to the preceding term.
                doid = None
            elif line.startswith("id: DOID"):
                doid = line[4:]  # drop the leading "id: "
            elif doid is None:
                # Header lines, or tags of a stanza without a DOID id.
                continue
            elif line.startswith("xref:"):
                xref_pairs.append((doid, line[6:]))  # drop "xref: "
            elif line.startswith("alt_id:"):
                alt_pairs.append((doid, line[8:]))  # drop "alt_id: "

    return (pd.DataFrame(xref_pairs, columns=["doid", "xref"]),
            pd.DataFrame(alt_pairs, columns=["doid", "alt_id"])
           )

In [6]:
# Parse the ontology file into two tables: (doid, xref) pairs and
# (doid, alt_id) pairs.
xrefs, alts = parse_doid()

In [7]:
# (rows, columns) of the DOID-to-xref table.
xrefs.shape

(39177, 2)

In [8]:
# Preview the first three DOID-to-xref rows.
xrefs.head(3)

Unnamed: 0,doid,xref
0,DOID:0001816,MESH:D006394
1,DOID:0001816,NCI:C3088
2,DOID:0001816,NCI:C9275


In [9]:
# (rows, columns) of the DOID-to-alt_id table.
alts.shape

(1609, 2)

In [10]:
# Preview the first three alternate-id rows.
alts.head(3)

Unnamed: 0,doid,alt_id
0,DOID:0001816,DOID:267
1,DOID:0001816,DOID:4508
2,DOID:0050025,DOID:0050021


---

In [11]:
# Sanity check: every alternate id should itself start with "DOID".
alts["alt_id"].str.startswith("DOID").all()

True

## CUIs

In [12]:
# Keep only the UMLS cross-references and reduce each one to its bare CUI
# (the part after the first colon), paired with the DOID it belongs to.
cuis = (
    xrefs
    .loc[lambda df: df["xref"].str.startswith("UMLS_CUI")]
    .assign(cui=lambda df: df["xref"].str.split(":").str[1])
    .loc[:, ["doid", "cui"]]
    .reset_index(drop=True)
)

In [13]:
# Preview the DOID-to-CUI mapping.
cuis.head()

Unnamed: 0,doid,cui
0,DOID:0001816,C0018923
1,DOID:0001816,C0854893
2,DOID:0002116,C0033999
3,DOID:0014667,C0025517
4,DOID:0050004,C0578661


In [14]:
# (rows, columns) of the DOID-to-CUI table.
cuis.shape

(6585, 2)

---

In [15]:
# DOIDs in the slim disease table that have no row in the CUI mapping.
set(disease_df["doid"]).difference(cuis["doid"])

{'DOID:0060073', 'DOID:9917'}

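Two DOIDs from Hetionet have no CUI cross-reference in the ontology file. One of them, DOID:9917, is an outdated identifier: as the next two cells show, it is now listed as an alternate id of DOID:5158, which does have CUIs.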

In [16]:
# Look up which current DOID lists DOID:9917 as an alternate id.
alts.loc[alts["alt_id"] == 'DOID:9917']

Unnamed: 0,doid,alt_id
1055,DOID:5158,DOID:9917


In [17]:
# CUIs recorded for DOID:5158.
cuis.loc[cuis["doid"] == 'DOID:5158']

Unnamed: 0,doid,cui
4427,DOID:5158,C0032229
4428,DOID:5158,C0153494


In [18]:
# Save both tables as tab-separated files, omitting the row index.
cuis.to_csv("../maps/doid_to_cui.tsv", sep='\t', index=False)
alts.to_csv("../maps/doid_alts.tsv", sep='\t', index=False)