# Parse Uberon for Uberon ID to CUI mappings

Uberon's provided direct mappings to CUIs don't match the mappings we get when we go through MeSH as an intermediate. We will combine the results of both since there doesn't seem to be a clear consensus between the two. This may be a result of outdated CUIs, which is an upstream problem.

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [2]:
def read_umls(fname):
    res = defaultdict(list)
    with open(fname, "r") as fin:
        for line in tqdm(fin, total=13897048):
            vals = line.rstrip("\n").split("|")

            cui, sab, code = vals[0], vals[11], vals[13]

            if sab == "MSH":
                res["cui"].append(cui)
                res["code"].append(code)

    return (pd
        .DataFrame(res)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [3]:
umls = read_umls("../../data/ontologies/MRCONSO.RRF")

100%|██████████| 13897048/13897048 [00:25<00:00, 554401.92it/s]


In [4]:
umls.head()

Unnamed: 0,cui,code
0,C0000005,D012711
1,C0000039,D015060
2,C0000052,D015061
3,C0000074,D010742
4,C0000084,D015055


In [5]:
umls.shape

(370689, 2)

---

## Extract cross references from Uberon

Input file: the extended version of the core ontology found at http://uberon.github.io/downloads.html

In [6]:
def parse_uberon(fname):
    temp = []
    uid = None
    with open(fname, "r") as fin:
        for line in fin:
            line = line.rstrip("\n")

            if line.startswith("id: UBERON:"):
                uid = line[4:]
            elif uid is not None and line.startswith("xref:"):
                temp.append((uid, line[6:]))

    return pd.DataFrame(temp, columns=["uberon_id", "xref"])

In [7]:
uber = parse_uberon("../../data/ontologies/uberon_extended.obo")

In [8]:
uber.shape

(52459, 2)

In [9]:
uber.head()

Unnamed: 0,uberon_id,xref
0,UBERON:0000002,BTO:0001421
1,UBERON:0000002,BTO:0002249
2,UBERON:0000002,CALOHA:TS-0134
3,UBERON:0000002,EFO:0000979
4,UBERON:0000002,EMAPA:29927


---

## Extract Uberon to UMLS and MeSH mappings

In [10]:
temp = (uber
    .assign(src = lambda df: df["xref"].str.split(":").str[0])
    .query("src == 'UMLS' or src == 'MESH'")
)

In [11]:
temp.shape

(3973, 3)

In [12]:
temp.head()

Unnamed: 0,uberon_id,xref,src
14,UBERON:0000002,MESH:D002584,MESH
17,UBERON:0000002,"UMLS:C0007874 {source=""ncithesaurus:Cervix""}",UMLS
42,UBERON:0000004,MESH:D009666,MESH
47,UBERON:0000004,"UMLS:C0028429 {source=""ncithesaurus:Nose""}",UMLS
62,UBERON:0000006,MESH:D007515,MESH


In [13]:
mesh = (temp
    .query("src == 'MESH'")
    .assign(num = lambda df: df["xref"].str.len())
    .query("num == 12")
    .drop(["src", "num"], axis=1)
    .rename(columns={"xref": "mesh_id"})
)

In [14]:
cuis = (temp
    .query("src == 'UMLS'")
    .assign(cui = lambda df: df["xref"].str.split(" ").str[0])
    .drop(["xref", "src"], axis=1)
    .drop_duplicates()
)

In [15]:
mesh.shape

(655, 2)

In [16]:
mesh.head(2)

Unnamed: 0,uberon_id,mesh_id
14,UBERON:0000002,MESH:D002584
42,UBERON:0000004,MESH:D009666


In [17]:
cuis.shape

(2891, 2)

In [18]:
cuis.head(2)

Unnamed: 0,uberon_id,cui
17,UBERON:0000002,UMLS:C0007874
47,UBERON:0000004,UMLS:C0028429


---

## Map MeSH to CUIs

In [19]:
mapper = (umls
    .assign(mesh_id = lambda df: df["code"].map(lambda v: "MESH:{}".format(v)))
    .assign(ncui = lambda df: df["cui"].map(lambda v: "UMLS:{}".format(v)))
    [["mesh_id", "ncui"]]
    .rename(columns={"ncui": "cui"})
)

In [20]:
mapper.shape

(370689, 2)

In [21]:
mapper.head(2)

Unnamed: 0,mesh_id,cui
0,MESH:D012711,UMLS:C0000005
1,MESH:D015060,UMLS:C0000039


---

In [22]:
indirect = (mesh
    .merge(mapper, how="inner", on="mesh_id")
    .drop("mesh_id", axis=1)
)

In [23]:
indirect.shape

(893, 2)

In [24]:
indirect.head(2)

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429


---

## Final mapping from Uberon to UMLS

In [25]:
final = (cuis
    .append(indirect)
    .drop_duplicates()
    .sort_values(["uberon_id", "cui"])
    .reset_index(drop=True)
)

In [26]:
final.shape

(3301, 2)

In [27]:
final.head()

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429
2,UBERON:0000004,UMLS:C0458561
3,UBERON:0000006,UMLS:C0022131
4,UBERON:0000007,UMLS:C0032005


### Check that data fields are good

In [28]:
final["uberon_id"].str.startswith("UBERON:").value_counts()

True    3301
Name: uberon_id, dtype: int64

In [29]:
final["cui"].str.startswith("UMLS:").value_counts()

True    3301
Name: cui, dtype: int64

## Save mapping file to disk

In [30]:
final.to_csv("../../pipeline/maps/uberon.tsv", sep='\t', index=False)

---

# Exploratory analysis below

## How do we map Uberon ids to UMLS ids?

Start with the data Daniel had and go from there.

In [31]:
def rawgit(handle, repo, commit, *args):
    """Returns url for a raw file in a github reposotory."""
    url_head = 'https://raw.githubusercontent.com'
    return '/'.join((url_head, handle, repo, commit) + args)

In [32]:
commit = '134f23479186abba03ba340fc6dc90e16c781920'
url = rawgit('dhimmel', 'uberon', commit, 'data/hetio-slim.tsv')
uberon_df = pd.read_table(url)
uberon_df.head(2)

Unnamed: 0,uberon_id,uberon_name,mesh_id,mesh_name,bto_id
0,UBERON:0000002,uterine cervix,D002584,Cervix Uteri,BTO:0001421
1,UBERON:0000004,nose,D009666,Nose,BTO:0000840


In [33]:
uberon_df.shape

(402, 5)

In [34]:
uberon_df["uberon_id"].nunique()

402

### Check all ids accounted for

In [35]:
set(uberon_df["uberon_id"]) <= set(uber["uberon_id"])

True

All Uberon ids in Hetionet still exist in the latest Uberon data.

---

## Which cross references should we use to map Uberon to UMLS?

In [36]:
temp = (uberon_df
    [["uberon_id", "uberon_name"]]
    .merge(uber, how="inner", on="uberon_id")
    .assign(src = lambda df: df["xref"].str.split(":").str[0])
)

In [37]:
temp.shape

(6918, 4)

In [38]:
temp.head()

Unnamed: 0,uberon_id,uberon_name,xref,src
0,UBERON:0000002,uterine cervix,BTO:0001421,BTO
1,UBERON:0000002,uterine cervix,BTO:0002249,BTO
2,UBERON:0000002,uterine cervix,CALOHA:TS-0134,CALOHA
3,UBERON:0000002,uterine cervix,EFO:0000979,EFO
4,UBERON:0000002,uterine cervix,EMAPA:29927,EMAPA


### Most common Uberon cross references for our concepts

In [39]:
temp["src"].value_counts().head()

http    1016
UMLS     447
MESH     403
FMA      395
MA       383
Name: src, dtype: int64

In [40]:
temp.query("src == 'UMLS'")["uberon_id"].nunique()

365

In [41]:
temp.query("src == 'MESH'")["uberon_id"].nunique()

402

All terms have MeSH mappings, but not all of them have direct UMLS mappings.

---

# Determine if the UMLS and MeSH mappings align

### Extract out the MeSH and UMLS cross references

In [42]:
xrefs = (temp
    .query("src == 'MESH' or src == 'UMLS'")
    .assign(num = lambda df: df["xref"].str.len())
)

In [43]:
xrefs.shape

(850, 5)

In [44]:
xrefs.head()

Unnamed: 0,uberon_id,uberon_name,xref,src,num
14,UBERON:0000002,uterine cervix,MESH:D002584,MESH,12
17,UBERON:0000002,uterine cervix,"UMLS:C0007874 {source=""ncithesaurus:Cervix""}",UMLS,44
34,UBERON:0000004,nose,MESH:D009666,MESH,12
39,UBERON:0000004,nose,"UMLS:C0028429 {source=""ncithesaurus:Nose""}",UMLS,42
53,UBERON:0000006,islet of Langerhans,MESH:D007515,MESH,12


### Check MeSH id formatting

One MeSH id is too long. We will drop this id.

In [45]:
xrefs.query("src == 'MESH'")["num"].value_counts()

12    402
16      1
Name: num, dtype: int64

In [46]:
xrefs.query("src == 'MESH'").groupby("uberon_id").filter(lambda df: len(df) > 1)

Unnamed: 0,uberon_id,uberon_name,xref,src,num
2304,UBERON:0001460,arm,MESH:A01.378.800,MESH,16
2305,UBERON:0001460,arm,MESH:D001132,MESH,12


In [47]:
good = xrefs.query("xref != 'MESH:A01.378.800'")

In [48]:
good.shape

(849, 5)

In [49]:
good.head()

Unnamed: 0,uberon_id,uberon_name,xref,src,num
14,UBERON:0000002,uterine cervix,MESH:D002584,MESH,12
17,UBERON:0000002,uterine cervix,"UMLS:C0007874 {source=""ncithesaurus:Cervix""}",UMLS,44
34,UBERON:0000004,nose,MESH:D009666,MESH,12
39,UBERON:0000004,nose,"UMLS:C0028429 {source=""ncithesaurus:Nose""}",UMLS,42
53,UBERON:0000006,islet of Langerhans,MESH:D007515,MESH,12


---

## Do Daniel's Uberon to MeSH mappings match the Uberon provided Uberon to MeSH mappings?

In [50]:
official = (good
    .query("src == 'MESH'")
    [["uberon_id", "xref"]]
    .rename(columns={"xref": "mesh_id"})
    .reset_index(drop=True)
)    

In [51]:
official.shape

(402, 2)

In [52]:
official.head()

Unnamed: 0,uberon_id,mesh_id
0,UBERON:0000002,MESH:D002584
1,UBERON:0000004,MESH:D009666
2,UBERON:0000006,MESH:D007515
3,UBERON:0000007,MESH:D010902
4,UBERON:0000010,MESH:D017933


In [53]:
daniel = (uberon_df
    [["uberon_id", "mesh_id"]]
    .assign(mesh_id = lambda df: df["mesh_id"].map(
        lambda v: "MESH:{}".format(v)
    ))
)

In [54]:
daniel.shape

(402, 2)

In [55]:
daniel.head(2)

Unnamed: 0,uberon_id,mesh_id
0,UBERON:0000002,MESH:D002584
1,UBERON:0000004,MESH:D009666


In [56]:
official.equals(daniel)

True

Daniel's provided MeSH mappings exactly match the mappings provided by Uberon.

---

### Direct MeSH mappings from UMLS

In [57]:
mesh = (umls
    .assign(mesh_id = lambda df: df["code"].map(lambda v: "MESH:{}".format(v)))
    .drop("code", axis=1)
    .reset_index(drop=True)
)

In [58]:
mesh.shape

(370689, 2)

In [59]:
mesh.head()

Unnamed: 0,cui,mesh_id
0,C0000005,MESH:D012711
1,C0000039,MESH:D015060
2,C0000052,MESH:D015061
3,C0000074,MESH:D010742
4,C0000084,MESH:D015055


### Add official Uberon map to the mapping via MeSH

In [60]:
indirect = official.merge(mesh, how="inner", on="mesh_id")

In [61]:
indirect.shape

(544, 3)

In [62]:
indirect.head()

Unnamed: 0,uberon_id,mesh_id,cui
0,UBERON:0000002,MESH:D002584,C0007874
1,UBERON:0000004,MESH:D009666,C0028429
2,UBERON:0000004,MESH:D009666,C0458561
3,UBERON:0000006,MESH:D007515,C0022131
4,UBERON:0000007,MESH:D010902,C0032005


In [63]:
indirect["uberon_id"].nunique()

402

---

### Uberon provided mappings to UMLS

In [64]:
# direct map given by uberon to cuis

direct = (good
    .query("src == 'UMLS'")
    .drop(["uberon_name", "src"], axis=1)
    .assign(cui = lambda df: df["xref"].str.split(" ").str[0])
    .drop(["num", "xref"], axis=1)
    .drop_duplicates()
    .sort_values(["uberon_id", "cui"])
    .reset_index(drop=True)
)

In [65]:
direct.head()

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429
2,UBERON:0000006,UMLS:C0022131
3,UBERON:0000007,UMLS:C0032005
4,UBERON:0000010,UMLS:C0206417


In [66]:
direct["uberon_id"].nunique()

365

In [67]:
direct.shape

(398, 2)

---

## Compare direct and indirect mappings for consistency

In [68]:
indir = (indirect
    .assign(indirect_cui = lambda df: df["cui"].map(lambda v: "UMLS:{}".format(v)))
    .drop(["cui", "mesh_id"], axis=1)
    .sort_values(["uberon_id", "indirect_cui"])
    .reset_index(drop=True)
)

In [69]:
indir.head()

Unnamed: 0,uberon_id,indirect_cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429
2,UBERON:0000004,UMLS:C0458561
3,UBERON:0000006,UMLS:C0022131
4,UBERON:0000007,UMLS:C0032005


---

In [70]:
res = (direct
    .rename(columns={"cui": "direct_cui"})
    .merge(
        indir, how="inner", on="uberon_id"
    )
    .assign(same = lambda df: df["direct_cui"] == df["indirect_cui"])
)

In [71]:
res.head()

Unnamed: 0,uberon_id,direct_cui,indirect_cui,same
0,UBERON:0000002,UMLS:C0007874,UMLS:C0007874,True
1,UBERON:0000004,UMLS:C0028429,UMLS:C0028429,True
2,UBERON:0000004,UMLS:C0028429,UMLS:C0458561,False
3,UBERON:0000006,UMLS:C0022131,UMLS:C0022131,True
4,UBERON:0000007,UMLS:C0032005,UMLS:C0032005,True


In [72]:
res["same"].value_counts()

True     346
False    197
Name: same, dtype: int64

In [73]:
res.query("~same")["uberon_id"].nunique()

114

In [74]:
res.query("~same").sample(10)

Unnamed: 0,uberon_id,direct_cui,indirect_cui,same
104,UBERON:0001093,UMLS:C0004457,UMLS:C2919848,False
81,UBERON:0001021,UMLS:C0027740,UMLS:C0031119,False
344,UBERON:0001896,UMLS:C1269575,UMLS:C0152397,False
93,UBERON:0001064,UMLS:C0030288,UMLS:C0013294,False
128,UBERON:0001228,UMLS:C0022666,UMLS:C0022664,False
299,UBERON:0001785,UMLS:C1269897,UMLS:C0010268,False
248,UBERON:0001700,UMLS:C0017406,UMLS:C1744550,False
133,UBERON:0001237,UMLS:C0442134,UMLS:C0030378,False
307,UBERON:0001815,UMLS:C0024093,UMLS:C0205826,False
348,UBERON:0001908,UMLS:C0152405,UMLS:C0042829,False


No real clear indicator of which source of Uberon id mappings is more correct. We will just just merge both the direct and indirect mappings together and expand to both.