# parse uberon

The CUIs that Uberon cross-references directly (UMLS xrefs) do not always match the CUIs obtained indirectly (Uberon → MeSH → CUI), so we compute both mappings and merge them together.

In [1]:
import pandas as pd

from collections import defaultdict

from tqdm import tqdm

In [2]:
def read_umls(fname, total=None):
    """Collect the distinct (CUI, MeSH code) pairs from a UMLS MRCONSO.RRF file.

    Each line is split on ``|``; field 0 is read as the CUI, field 11 as the
    source abbreviation (SAB) and field 13 as the source code. Only rows whose
    SAB is ``"MSH"`` are kept.

    Parameters
    ----------
    fname : str
        Path to the pipe-delimited MRCONSO.RRF file.
    total : int, optional
        Expected number of lines. Only used to size the tqdm progress bar;
        when omitted the bar shows a running count instead of a percentage.
        (The line count differs between UMLS releases, so it is not baked in.)

    Returns
    -------
    pandas.DataFrame
        Columns ``cui`` and ``code`` with duplicate pairs removed. The columns
        are always present, even when the file contains no MSH rows, so
        downstream ``df["code"]`` / ``df["cui"]`` lookups do not fail.
    """
    cuis = []
    codes = []

    # MRCONSO.RRF is distributed as UTF-8; do not rely on the platform's
    # default encoding, which may fail on non-ASCII term strings.
    with open(fname, "r", encoding="utf-8") as fin:
        for line in tqdm(fin, total=total):
            vals = line.rstrip("\n").split("|")

            cui, sab, code = vals[0], vals[11], vals[13]

            if sab == "MSH":
                cuis.append(cui)
                codes.append(code)

    # Explicit column list keeps the schema stable for empty input.
    return pd.DataFrame(
        {"cui": cuis, "code": codes}, columns=["cui", "code"]
    ).drop_duplicates()

In [3]:
info = read_umls("data/MRCONSO.RRF")

100%|██████████| 13897048/13897048 [00:33<00:00, 420626.05it/s]


In [4]:
info.head()

Unnamed: 0,code,cui
0,D012711,C0000005
2,D015060,C0000039
16,D015061,C0000052
30,D010742,C0000074
32,D015055,C0000084


In [5]:
info.shape

(370689, 2)

---

In [6]:
def parse_uberon(fname="data/uberon_extended.obo"):
    """Extract every (UBERON id, xref) pair from an Uberon OBO file.

    Parameters
    ----------
    fname : str, optional
        Path to the OBO file. Defaults to the location this notebook has
        always read from.

    Returns
    -------
    pandas.DataFrame
        Columns ``uberon_id`` and ``xref``, one row per ``xref:`` line that
        belongs to a stanza whose ``id:`` starts with ``UBERON:``. The xref
        text is kept verbatim (everything after ``"xref: "``), including any
        trailing ``{source=...}`` modifier.
    """
    # input file:
    # http://uberon.github.io/downloads.html
    # go for:
    # Core Ontology
    # Extended Version
    # uberon/ext is the recommended version. It imports subsets of other ontologies such as GO, and includes all of the cell ontology (CL). This is the version available for browsing on OntoBee.

    records = []
    uid = None
    with open(fname, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip("\n")

            if line.startswith("["):
                # New stanza ([Term], [Typedef], ...): forget the previous id
                # so its xrefs cannot be credited to an earlier UBERON term.
                uid = None
            elif line.startswith("id: "):
                # Only UBERON terms are of interest; any other id (CL, GO,
                # relation ids, ...) switches collection off for that stanza.
                uid = line[4:] if line.startswith("id: UBERON:") else None
            elif uid is not None and line.startswith("xref:"):
                records.append((uid, line[6:]))

    return pd.DataFrame(records, columns=["uberon_id", "xref"])

In [7]:
uber = parse_uberon()

In [8]:
uber.shape

(52459, 2)

In [9]:
uber.head()

Unnamed: 0,uberon_id,xref
0,UBERON:0000002,BTO:0001421
1,UBERON:0000002,BTO:0002249
2,UBERON:0000002,CALOHA:TS-0134
3,UBERON:0000002,EFO:0000979
4,UBERON:0000002,EMAPA:29927


---

## extract umls and mesh mappings from uberon

In [10]:
# Label each xref with its source prefix (the text before the first ":")
# and keep only the UMLS and MeSH cross-references.
temp = (uber
    .assign(src = lambda df: df["xref"].str.partition(":")[0])
    .loc[lambda df: df["src"].isin(["UMLS", "MESH"])]
)

In [11]:
temp.shape

(3973, 3)

In [12]:
temp.head()

Unnamed: 0,uberon_id,xref,src
14,UBERON:0000002,MESH:D002584,MESH
17,UBERON:0000002,"UMLS:C0007874 {source=""ncithesaurus:Cervix""}",UMLS
42,UBERON:0000004,MESH:D009666,MESH
47,UBERON:0000004,"UMLS:C0028429 {source=""ncithesaurus:Nose""}",UMLS
62,UBERON:0000006,MESH:D007515,MESH


In [13]:
# Keep the MeSH xrefs that are descriptor unique IDs.
# The previous `len(xref) == 12` test only accepted the six-digit form
# (e.g. MESH:D002584); descriptor IDs can also have nine digits
# (e.g. MESH:D000068079), and an xref may carry a trailing OBO modifier
# such as ` {source="..."}`. Match on the ID pattern instead, which still
# rejects tree numbers like MESH:A01.378.800.
mesh = (temp
    .query("src == 'MESH'")
    # the identifier is the first whitespace-delimited token of the xref
    .assign(mesh_id = lambda df: df["xref"].str.split(" ").str[0])
    .loc[lambda df: df["mesh_id"].str.match(r"^MESH:D\d{6}(?:\d{3})?$")]
    .drop(["src", "xref"], axis=1)
    .drop_duplicates()
)

In [14]:
# Direct Uberon -> UMLS mappings: the CUI is the first space-delimited token
# of the xref; anything after it (e.g. a {source=...} modifier) is discarded.
cuis = (temp
    .loc[lambda df: df["src"] == "UMLS"]
    .assign(cui = lambda df: df["xref"].str.partition(" ")[0])
    .drop(["xref", "src"], axis=1)
    .drop_duplicates()
)

In [15]:
mesh.shape

(655, 2)

In [16]:
mesh.head(2)

Unnamed: 0,uberon_id,mesh_id
14,UBERON:0000002,MESH:D002584
42,UBERON:0000004,MESH:D009666


In [17]:
cuis.shape

(2891, 2)

In [18]:
cuis.head(2)

Unnamed: 0,uberon_id,cui
17,UBERON:0000002,UMLS:C0007874
47,UBERON:0000004,UMLS:C0028429


---

## Map mesh to cuis

In [19]:
# MeSH -> CUI lookup table, with both identifiers written in the same
# "PREFIX:id" form that the Uberon xrefs use.
mapper = pd.DataFrame(
    {
        "mesh_id": "MESH:" + info["code"].astype(str),
        "cui": "UMLS:" + info["cui"].astype(str),
    },
    columns=["mesh_id", "cui"],
)

In [20]:
mapper.shape

(370689, 2)

In [21]:
mapper.head(2)

Unnamed: 0,mesh_id,cui
0,MESH:D012711,UMLS:C0000005
2,MESH:D015060,UMLS:C0000039


---

In [22]:
# Indirect mapping: Uberon -> MeSH -> CUI. The MeSH id is only the join key,
# so it is dropped once the CUIs have been attached.
indirect = pd.merge(mesh, mapper, how="inner", on="mesh_id").drop("mesh_id", axis=1)

In [23]:
indirect.shape

(893, 2)

In [24]:
indirect.head(2)

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429


---

In [25]:
# Union of the direct (Uberon -> UMLS) and indirect (Uberon -> MeSH -> UMLS)
# mappings. `DataFrame.append` was removed in pandas 2.0, so the two frames
# are stacked with `pd.concat`, which behaves the same on older versions.
final = (pd.concat([cuis, indirect])
    .drop_duplicates()
    .sort_values(["uberon_id", "cui"])
    .reset_index(drop=True)
)

In [26]:
final.shape

(3301, 2)

In [27]:
final.head()

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429
2,UBERON:0000004,UMLS:C0458561
3,UBERON:0000006,UMLS:C0022131
4,UBERON:0000007,UMLS:C0032005


In [28]:
final.to_csv("maps/uberon.tsv", sep='\t', index=False)

---

# below is analysis of how to map

In [29]:
def rawgit(handle, repo, commit, *args):
    """Build the raw.githubusercontent.com URL of a file in a GitHub repository.

    ``handle``, ``repo`` and ``commit`` are followed by the remaining path
    components in ``args``; all parts are joined with ``/``.
    """
    segments = ['https://raw.githubusercontent.com', handle, repo, commit]
    segments.extend(args)
    return '/'.join(segments)

In [30]:
# Load the tab-separated hetio-slim table from dhimmel/uberon,
# pinned to a fixed commit.
commit = '134f23479186abba03ba340fc6dc90e16c781920'
url = rawgit('dhimmel', 'uberon', commit, 'data/hetio-slim.tsv')
uberon_df = pd.read_csv(url, sep='\t')
uberon_df.head(2)

Unnamed: 0,uberon_id,uberon_name,mesh_id,mesh_name,bto_id
0,UBERON:0000002,uterine cervix,D002584,Cervix Uteri,BTO:0001421
1,UBERON:0000004,nose,D009666,Nose,BTO:0000840


In [31]:
uberon_df.shape

(402, 5)

In [32]:
uberon_df["uberon_id"].nunique()

402

### Check all ids accounted for

In [33]:
set(uberon_df["uberon_id"]) <= set(uber["uberon_id"])

True

---

## Only need to map the 402 ones in hetionet, not all of them

In [34]:
# Attach every Uberon xref to the terms listed in uberon_df and tag each
# xref with its source prefix (the text before the first ":").
temp = (
    pd.merge(
        uberon_df[["uberon_id", "uberon_name"]],
        uber,
        how="inner",
        on="uberon_id",
    )
    .assign(src = lambda df: df["xref"].str.partition(":")[0])
)

In [35]:
temp.shape

(6918, 4)

In [36]:
temp.head()

Unnamed: 0,uberon_id,uberon_name,xref,src
0,UBERON:0000002,uterine cervix,BTO:0001421,BTO
1,UBERON:0000002,uterine cervix,BTO:0002249,BTO
2,UBERON:0000002,uterine cervix,CALOHA:TS-0134,CALOHA
3,UBERON:0000002,uterine cervix,EFO:0000979,EFO
4,UBERON:0000002,uterine cervix,EMAPA:29927,EMAPA


In [37]:
temp["src"].value_counts().head()

http    1016
UMLS     447
MESH     403
FMA      395
MA       383
Name: src, dtype: int64

In [38]:
temp.query("src == 'UMLS'")["uberon_id"].nunique()

365

In [39]:
temp.query("src == 'MESH'")["uberon_id"].nunique()

402

---

In [40]:
# MeSH and UMLS xrefs only, with the xref string length recorded in `num`
# so that odd-looking identifiers stand out.
xrefs = (temp
    .loc[lambda df: df["src"].isin(["MESH", "UMLS"])]
    .assign(num = lambda df: df["xref"].str.len())
)

In [41]:
xrefs.head()

Unnamed: 0,uberon_id,uberon_name,xref,src,num
14,UBERON:0000002,uterine cervix,MESH:D002584,MESH,12
17,UBERON:0000002,uterine cervix,"UMLS:C0007874 {source=""ncithesaurus:Cervix""}",UMLS,44
34,UBERON:0000004,nose,MESH:D009666,MESH,12
39,UBERON:0000004,nose,"UMLS:C0028429 {source=""ncithesaurus:Nose""}",UMLS,42
53,UBERON:0000006,islet of Langerhans,MESH:D007515,MESH,12


### one MESH xref is longer than the rest: it is a tree number (`MESH:A01.378.800`) rather than a descriptor ID, so we drop it

In [42]:
xrefs.query("src == 'MESH'")["num"].value_counts()

12    402
16      1
Name: num, dtype: int64

In [43]:
xrefs.query("src == 'MESH'").groupby("uberon_id").filter(lambda df: len(df) > 1)

Unnamed: 0,uberon_id,uberon_name,xref,src,num
2304,UBERON:0001460,arm,MESH:A01.378.800,MESH,16
2305,UBERON:0001460,arm,MESH:D001132,MESH,12


In [44]:
good = xrefs.query("xref != 'MESH:A01.378.800'")

In [45]:
good.shape

(849, 5)

---

### daniel's mesh mappings match uberon provided mappings

In [46]:
uberon_df.head(2)

Unnamed: 0,uberon_id,uberon_name,mesh_id,mesh_name,bto_id
0,UBERON:0000002,uterine cervix,D002584,Cervix Uteri,BTO:0001421
1,UBERON:0000004,nose,D009666,Nose,BTO:0000840


In [47]:
# Uberon-provided MeSH mappings, shaped as (uberon_id, mesh_id) with a fresh
# 0..n-1 index so the frame can be compared with the hetio-slim one.
a = (good
    .loc[good["src"] == "MESH", ["uberon_id", "xref"]]
    .rename(columns={"xref": "mesh_id"})
    .reset_index(drop=True)
)

In [48]:
a.head()

Unnamed: 0,uberon_id,mesh_id
0,UBERON:0000002,MESH:D002584
1,UBERON:0000004,MESH:D009666
2,UBERON:0000006,MESH:D007515
3,UBERON:0000007,MESH:D010902
4,UBERON:0000010,MESH:D017933


In [49]:
# MeSH mappings from uberon_df, with the "MESH:" prefix added so the ids
# are written the same way as the Uberon xrefs.
b = (uberon_df
    [["uberon_id", "mesh_id"]]
    .assign(mesh_id = lambda df: "MESH:" + df["mesh_id"].astype(str))
)

In [50]:
b.head(2)

Unnamed: 0,uberon_id,mesh_id
0,UBERON:0000002,MESH:D002584
1,UBERON:0000004,MESH:D009666


In [51]:
a.equals(b)

True

---

In [52]:
# CUI <-> MeSH table from UMLS, with the MeSH code written as "MESH:<code>".
mesh = info[["cui"]].assign(mesh_id = "MESH:" + info["code"].astype(str))

In [53]:
mesh.shape

(370689, 2)

In [54]:
mesh.head()

Unnamed: 0,cui,mesh_id
0,C0000005,MESH:D012711
2,C0000039,MESH:D015060
16,C0000052,MESH:D015061
30,C0000074,MESH:D010742
32,C0000084,MESH:D015055


In [55]:
indirect = a.merge(mesh, how="inner", on="mesh_id")

In [56]:
indirect.shape

(544, 3)

In [57]:
indirect.head()

Unnamed: 0,uberon_id,mesh_id,cui
0,UBERON:0000002,MESH:D002584,C0007874
1,UBERON:0000004,MESH:D009666,C0028429
2,UBERON:0000004,MESH:D009666,C0458561
3,UBERON:0000006,MESH:D007515,C0022131
4,UBERON:0000007,MESH:D010902,C0032005


---

In [58]:
# Direct map given by Uberon to CUIs: the CUI is the first space-delimited
# token of each UMLS xref (any trailing modifier is discarded).

direct = (good
    .loc[good["src"] == "UMLS", ["uberon_id", "xref"]]
    .assign(cui = lambda df: df["xref"].str.partition(" ")[0])
    .drop("xref", axis=1)
    .drop_duplicates()
    .sort_values(["uberon_id", "cui"])
    .reset_index(drop=True)
)

In [59]:
direct.head()

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429
2,UBERON:0000006,UMLS:C0022131
3,UBERON:0000007,UMLS:C0032005
4,UBERON:0000010,UMLS:C0206417


In [60]:
direct["uberon_id"].nunique()

365

In [61]:
direct.shape

(398, 2)

---

In [62]:
indirect.head(2)

Unnamed: 0,uberon_id,mesh_id,cui
0,UBERON:0000002,MESH:D002584,C0007874
1,UBERON:0000004,MESH:D009666,C0028429


In [63]:
indirect["uberon_id"].nunique()

402

In [64]:
# Uberon ids that have at least one direct UMLS mapping.
kek = set(direct["uberon_id"])

# Indirect (via MeSH) CUIs restricted to those ids, prefixed with "UMLS:"
# so they can be compared with the direct CUIs.
ti = (indirect
    .loc[lambda df: df["uberon_id"].isin(kek)]
    .assign(ncui = lambda df: "UMLS:" + df["cui"].astype(str))
    [["uberon_id", "ncui"]]
    .sort_values(["uberon_id", "ncui"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [65]:
ti.shape

(498, 2)

In [66]:
ti.head(2)

Unnamed: 0,uberon_id,ncui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429


---

In [67]:
direct.head(2)

Unnamed: 0,uberon_id,cui
0,UBERON:0000002,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429


In [68]:
jej = direct.merge(ti, how="inner", on="uberon_id")

In [69]:
jej.shape

(543, 3)

In [70]:
jej.head()

Unnamed: 0,uberon_id,cui,ncui
0,UBERON:0000002,UMLS:C0007874,UMLS:C0007874
1,UBERON:0000004,UMLS:C0028429,UMLS:C0028429
2,UBERON:0000004,UMLS:C0028429,UMLS:C0458561
3,UBERON:0000006,UMLS:C0022131,UMLS:C0022131
4,UBERON:0000007,UMLS:C0032005,UMLS:C0032005


In [71]:
aa = jej.assign(same = lambda df: df["cui"] == df["ncui"])

In [72]:
aa.head()

Unnamed: 0,uberon_id,cui,ncui,same
0,UBERON:0000002,UMLS:C0007874,UMLS:C0007874,True
1,UBERON:0000004,UMLS:C0028429,UMLS:C0028429,True
2,UBERON:0000004,UMLS:C0028429,UMLS:C0458561,False
3,UBERON:0000006,UMLS:C0022131,UMLS:C0022131,True
4,UBERON:0000007,UMLS:C0032005,UMLS:C0032005,True


In [73]:
aa.shape

(543, 4)

In [74]:
aa["same"].value_counts()

True     346
False    197
Name: same, dtype: int64

In [75]:
# Fixed seed so the displayed sample is the same on every Restart & Run All.
aa.query("~same").sample(10, random_state=0)

Unnamed: 0,uberon_id,cui,ncui,same
104,UBERON:0001093,UMLS:C0004457,UMLS:C2919848,False
281,UBERON:0001756,UMLS:C1268972,UMLS:C0242255,False
164,UBERON:0001437,UMLS:C1708734,UMLS:C0014570,False
496,UBERON:0002410,UMLS:C1305381,UMLS:C0004388,False
387,UBERON:0002061,UMLS:C1519589,UMLS:C0041206,False
273,UBERON:0001750,UMLS:C0022903,UMLS:C0229246,False
240,UBERON:0001690,UMLS:C0521421,UMLS:C0013443,False
422,UBERON:0002185,UMLS:C0006255,UMLS:C0024496,False
93,UBERON:0001064,UMLS:C0030288,UMLS:C0013294,False
478,UBERON:0002385,UMLS:C2328219,UMLS:C0026845,False


OK, it seems there is no clear winner between the direct and indirect mappings,
so we just merge both of them.

maybe need to help them update mappings lol (crowdsource)

ncui from indirect

indirect correct: 1
direct correct: 2