# Map DOIDs to CUIs

In [1]:
import pandas as pd

In [2]:
def rawgit(handle, repo, commit, *args):
    """Build the raw.githubusercontent.com URL of a file in a GitHub repository.

    `handle`, `repo` and `commit` are joined, in that order, after the host;
    any further positional arguments are appended as extra path components.
    """
    components = ['https://raw.githubusercontent.com', handle, repo, commit, *args]
    return '/'.join(components)

In [3]:
# Pin the repository to a fixed commit so the same term list is read on every run.
commit = '75050ea2d4f60e745d3f3578ae03560a2cc0e444'
url = rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')

# The file is tab-separated.
disease_df = pd.read_csv(url, sep='\t')

disease_df.head()

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic
2,DOID:1324,lung cancer,DOcancerslim,neoplastic
3,DOID:263,kidney cancer,DOcancerslim,neoplastic
4,DOID:1793,pancreatic cancer,DOcancerslim,neoplastic


In [4]:
disease_df.shape

(137, 4)

There are 137 unique diseases in Hetionet that we need to map from DOID to UMLS CUIs.

---

## Disease Ontology source file parser

In [5]:
def parse_doid(fname):
    """Extract cross references and alternate ids from a Disease Ontology OBO file.

    The file is scanned line by line. An ``id: DOID...`` line sets the current
    term, and every following ``xref:`` or ``alt_id:`` line is recorded against
    it. A stanza header (a line starting with ``[``, such as ``[Term]`` or
    ``[Typedef]``) clears the current term, so tags that belong to a stanza
    without a DOID id, or that appear before the first term, are skipped
    instead of being credited to the previous term.

    Parameters
    ----------
    fname : str or path-like
        Path of the OBO file; it is read as UTF-8.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(xrefs, alts)`` with columns ``["doid", "xref"]`` and
        ``["doid", "alt_id"]``. Values are the text following the tag and one
        separator character, otherwise unmodified (no whitespace stripping).
    """
    xref_rows = []
    alt_rows = []
    doid = None  # id of the DOID term currently being read, if any

    with open(fname, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip("\n")

            if line.startswith("["):
                # New stanza: forget the previous term until a DOID id is seen.
                doid = None
            elif line.startswith("id: DOID"):
                doid = line[4:]
            elif doid is None:
                # Header line or a stanza without a DOID id: nothing to attach to.
                continue
            elif line.startswith("xref:"):
                xref_rows.append((doid, line[6:]))
            elif line.startswith("alt_id:"):
                alt_rows.append((doid, line[8:]))

    return (pd.DataFrame(xref_rows, columns=["doid", "xref"]),
            pd.DataFrame(alt_rows, columns=["doid", "alt_id"])
           )

In [6]:
xrefs, alts = parse_doid("../../data/ontologies/doid.obo")

In [7]:
xrefs.shape

(39177, 2)

In [8]:
xrefs.head(3)

Unnamed: 0,doid,xref
0,DOID:0001816,MESH:D006394
1,DOID:0001816,NCI:C3088
2,DOID:0001816,NCI:C9275


In [9]:
alts.shape

(1609, 2)

In [10]:
alts.head(3)

Unnamed: 0,doid,alt_id
0,DOID:0001816,DOID:267
1,DOID:0001816,DOID:4508
2,DOID:0050025,DOID:0050021


---

## Data validation

In [11]:
xrefs["doid"].str.startswith("DOID").all()

True

In [12]:
alts["alt_id"].str.startswith("DOID").all()

True

### Number of cross references

In [13]:
xrefs["xref"].str.split(":").str[0].value_counts().head()

SNOMEDCT_US_2016_03_01    12444
UMLS_CUI                   6585
NCI                        4599
ICD10CM                    4227
OMIM                       4166
Name: xref, dtype: int64

Many of the cross references map directly to UMLS CUIs.

## CUIs

In [14]:
# Keep only the cross references whose value starts with the UMLS_CUI prefix.
umls_mask = xrefs["xref"].str.startswith("UMLS_CUI")
cuis = xrefs[umls_mask]

In [15]:
cuis.shape

(6585, 2)

In [16]:
cuis.head()

Unnamed: 0,doid,xref
6,DOID:0001816,UMLS_CUI:C0018923
7,DOID:0001816,UMLS_CUI:C0854893
18,DOID:0002116,UMLS_CUI:C0033999
28,DOID:0014667,UMLS_CUI:C0025517
30,DOID:0050004,UMLS_CUI:C0578661


### Check data

In [17]:
cuis["doid"].str.startswith("DOID:").all()

True

In [18]:
cuis["xref"].str.startswith("UMLS_CUI:C").value_counts()

True     6584
False       1
Name: xref, dtype: int64

### One CUI has an incorrect format:

In [19]:
cuis[~cuis["xref"].str.startswith("UMLS_CUI:C")]

Unnamed: 0,doid,xref
3326,DOID:0060573,UMLS_CUI: C1264039


There is an extra space between the `UMLS_CUI:` prefix and the identifier; it is stripped when the map is formatted below.

## Format DOID to CUI map

In [20]:
# Derive the map from `xrefs` (not from `cuis` itself) so this cell can be re-run:
# it drops the "xref" column, which a second pass over its own output would not find.
cuis = (xrefs
    [xrefs["xref"].str.startswith("UMLS_CUI")]
    # Take the identifier after the prefix and strip stray whitespace on both sides
    # (at least one source record has a space after the colon).
    .assign(cui = lambda df: df["xref"].str.split(":").str[1].str.strip())
    .drop("xref", axis=1)
    # Re-prefix every identifier as "UMLS:<CUI>".
    .assign(cui = lambda df: df["cui"].map("UMLS:{}".format))
    .sort_values(["doid", "cui"])
    .reset_index(drop=True)
)

In [21]:
cuis.head()

Unnamed: 0,doid,cui
0,DOID:0001816,UMLS:C0018923
1,DOID:0001816,UMLS:C0854893
2,DOID:0002116,UMLS:C0033999
3,DOID:0014667,UMLS:C0025517
4,DOID:0050004,UMLS:C0578661


In [22]:
cuis.shape

(6585, 2)

---

## Check that we can map all the DOIDs in Hetionet

In [23]:
set(disease_df["doid"]) - set(cuis["doid"])

{'DOID:0060073', 'DOID:9917'}

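Two DOIDs from Hetionet have no direct CUI mapping. One of them, DOID:9917, is an outdated id: as shown below, it is listed as an alternate id of DOID:5158, which does map to CUIs. The other, DOID:0060073, remains unmapped.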

In [24]:
alts.query("alt_id == 'DOID:9917'")

Unnamed: 0,doid,alt_id
1055,DOID:5158,DOID:9917


In [25]:
cuis.query("doid == 'DOID:5158'")

Unnamed: 0,doid,cui
4427,DOID:5158,UMLS:C0032229
4428,DOID:5158,UMLS:C0153494


---

## Validate the final mappings

In [26]:
cuis["doid"].str.startswith("DOID:").all()

True

In [27]:
cuis["cui"].str.startswith("UMLS:C").all()

True

In [28]:
alts["doid"].str.startswith("DOID:").all()

True

In [29]:
alts["alt_id"].str.startswith("DOID:").all()

True

## Save mapping files to disk

In [30]:
# Write both maps as tab-separated files without the row index.
for frame, path in (
    (cuis, "../../pipeline/maps/doid_to_cui.tsv"),
    (alts, "../../pipeline/maps/doid_alts.tsv"),
):
    frame.to_csv(path, sep='\t', index=False)