# Harmonize SemMedDB with Hetionet using UMLS CUIs

In [1]:
import pandas as pd
import re

## Read Hetionet nodes

In [2]:
# Load the Hetionet node table and rename its columns to the names used in
# the rest of this notebook.
#
# The two identifier columns are read explicitly as text. They mix purely
# numeric ids (e.g. gene ids such as 100996420) with alphanumeric ones
# (e.g. "DB00795", "UMLS:C0036078"); leaving the dtype to inference can yield
# a column that holds both ints and strings, which would silently break the
# string-keyed merge on "cui" and the grouping on "hetio_id".
hnodes = (pd
    .read_csv(
        "hetionet/hetio_nodes.tsv",
        sep='\t',
        dtype={"uid": str, "fuid": str},
    )
    .rename(columns={
        "node_type": "het_type",
        "uid": "hetio_id",
        "fuid": "cui"
    })
)

In [3]:
# Dimensions of the Hetionet node table as (rows, columns).
hnodes.shape

(58700, 4)

In [4]:
# Preview the first rows to confirm the renamed columns.
hnodes.head()

Unnamed: 0,hetio_id,name,het_type,cui
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547
2,DB00795,Sulfasalazine,Compound,UMLS:C4255898
3,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class,UMLS:C2757005
4,100996420,DNM1P50,Gene,100996420


---

## Read Semmed nodes

In [5]:
def is_cui(s):
    """Return True when ``s`` is exactly a UMLS CUI: "C" followed by seven digits.

    Parameters
    ----------
    s : object
        Candidate identifier. Anything that is not a ``str`` (for example the
        NaN pandas uses for an empty cell) is reported as not-a-CUI rather
        than raising ``TypeError``.

    Returns
    -------
    bool
    """
    if not isinstance(s, str):
        return False
    # fullmatch instead of match with "^...$": "$" also matches just before a
    # trailing newline, so "C0000039\n" would otherwise be accepted as a CUI.
    return re.fullmatch(r'C[0-9]{7}', s) is not None

In [6]:
# Load the SemMedDB node map and express its identifiers the same way the
# Hetionet table does: bare CUIs gain a "UMLS:" prefix, anything else is kept
# verbatim. The raw id columns are dropped once the new "cui" column exists.
snodes = (pd
    .read_csv("../semmed/data/node_map.tsv", sep='\t')
    .assign(
        cui = lambda raw: [
            "UMLS:" + node_id if is_cui(node_id) else node_id
            for node_id in raw["node_id"]
        ]
    )
    .drop(columns=["node_uid", "node_id"])
    .rename(columns={"ntype": "sem_type"})
)

In [7]:
# Preview the SemMedDB nodes after the identifier rewrite.
snodes.head()

Unnamed: 0,name,sem_type,cui
0,"1,2-Dipalmitoylphosphatidylcholine",Chemicals & Drugs,UMLS:C0000039
1,"1,4-alpha-Glucan Branching Enzyme",Chemicals & Drugs,UMLS:C0000052
2,1-Carboxyglutamic Acid,Chemicals & Drugs,UMLS:C0000084
3,1-Methyl-3-isobutylxanthine,Chemicals & Drugs,UMLS:C0000096
4,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Chemicals & Drugs,UMLS:C0000097


In [8]:
# Tally how many SemMedDB identifiers carry the "UMLS" prefix and how many do not.
(snodes
    ["cui"]
    .str.startswith("UMLS")
    .value_counts()
)

True     210382
False         7
Name: cui, dtype: int64

---

### CUIs missing in semmeddb

In [9]:
# Number of rows (one per mapped CUI) for each Hetio id.
ncuis = hnodes.groupby("hetio_id").size().reset_index(name="ncuis")

In [10]:
# Left-join so every Hetionet row survives; rows whose CUI has no SemMedDB
# counterpart come back with a null sem_type.
nmerge = pd.merge(hnodes, snodes, on="cui", how="left")

# Per Hetio id, count the rows that found no SemMedDB match.
missing = (nmerge
    .loc[nmerge["sem_type"].isna()]
    .groupby("hetio_id")
    .size()
    .reset_index(name="missing")
)

In [11]:
# Combine total and unmatched CUI counts per Hetio id. Ids absent from
# `missing` had every CUI matched, so their null count becomes 0 before the
# column is cast back to integers.
res = (
    pd.merge(ncuis, missing, on="hetio_id", how="left")
    .fillna(0)
    .astype({"missing": int})
)

In [12]:
# Preview the per-id CUI totals alongside the number of unmatched CUIs.
res.head()

Unnamed: 0,hetio_id,ncuis,missing
0,1,1,0
1,10,1,0
2,100,1,0
3,1000,1,0
4,10000,1,0


### How many CUIs exist for each Hetio id?

In [13]:
# Percentage of Hetio ids having each CUI count, ordered by that count.
res["ncuis"].value_counts(normalize=True).sort_index() * 100

1      88.513959
2       6.529736
3       2.895962
4       0.790968
5       0.408241
6       0.208373
7       0.174353
8       0.127575
9       0.063788
10      0.040399
11      0.042525
12      0.042525
13      0.019136
14      0.021263
15      0.014884
16      0.008505
17      0.012758
18      0.004253
19      0.006379
21      0.019136
22      0.004253
23      0.006379
24      0.002126
25      0.004253
26      0.008505
29      0.006379
30      0.002126
31      0.004253
37      0.002126
39      0.002126
43      0.002126
57      0.002126
67      0.002126
70      0.002126
101     0.002126
139     0.002126
Name: ncuis, dtype: float64

The vast majority of Hetio ids (88.5%) have exactly one equivalent CUI. These are the best case and are easily reconciled with other resources. The rest form a long tail of ids with many mappings, going all the way up to 139 CUIs for a single Hetio id. We will examine this long tail.

### Number of unique Hetio nodes which do not have a single CUI mapping in semmeddb

In [14]:
# Hetio ids for which every CUI is unmatched (total count equals missing count).
res.loc[res["ncuis"].eq(res["missing"])].shape

(21154, 3)

There are 21,154 unique Hetionet nodes for which none of the CUI mappings exist in SemMedDB.

### node types of missing nodes (no cuis in semmeddb)

In [15]:
# Node-type breakdown of the Hetio ids with no matched CUI at all. Each
# (hetio_id, het_type) pair is counted once, however many CUIs the id has.
(res
    .loc[lambda df: df["ncuis"].eq(df["missing"]), ["hetio_id"]]
    .merge(hnodes[["hetio_id", "het_type"]], on="hetio_id", how="left")
    .drop_duplicates()
    ["het_type"]
    .value_counts()
)

Biological Process     9647
Gene                   4914
Molecular Function     1934
Pathway                1822
Side Effect            1457
Cellular Component     1030
Pharmacologic Class     216
Compound                101
Symptom                  20
Disease                   8
Anatomy                   5
Name: het_type, dtype: int64

For the most part, the missing nodes come from the Gene Ontology (Biological Process, Molecular Function, and Cellular Component).

### hetio missing node types as a fraction of the total hetio nodes of that type

In [16]:
# Share (in percent) of each node type's distinct Hetio ids that have no
# matched CUI: fully-missing ids per type divided by all ids of that type.
(res
    .loc[lambda df: df["ncuis"].eq(df["missing"]), ["hetio_id"]]
    .merge(hnodes[["hetio_id", "het_type"]], on="hetio_id", how="left")
    .drop_duplicates()
    ["het_type"]
    .value_counts()
    .div(
        # Denominator: distinct (hetio_id, het_type) pairs per node type.
        hnodes
            .drop_duplicates(subset=["hetio_id", "het_type"])
            ["het_type"]
            .value_counts()
    )
    .mul(100)
    .sort_values(ascending=False)
)

Pathway                100.000000
Biological Process      84.764080
Cellular Component      74.047448
Molecular Function      67.059639
Pharmacologic Class     62.608696
Side Effect             25.409836
Gene                    23.461447
Compound                 6.507732
Disease                  5.839416
Symptom                  4.566210
Anatomy                  1.243781
Name: het_type, dtype: float64

From the above we can see that the Pathway type and the three Gene Ontology subsets (Biological Process, Cellular Component, Molecular Function) have the vast majority of their nodes missing from SemMedDB, meaning that none of their CUIs has a match in SemMedDB.

Conclusion: a lot is missing, but that seems to be because SemRep did not extract any Gene Ontology terms. There is not much we can do about the missing ids. The real question is whether we stay in the original single-id space or expand each term out to its (possibly 100 or more) CUIs, which would make the network a lot more complicated.

---

## Try to figure out how to merge the different semantic types, especially when the two sources disagree about what an individual thing is

In [17]:
# Keep only the rows whose CUI is present in both Hetionet and SemMedDB.
temp = pd.merge(hnodes, snodes, on="cui", how="inner")

In [18]:
# Preview the inner join; name_x / het_type come from hnodes, name_y / sem_type from snodes.
temp.head()

Unnamed: 0,hetio_id,name_x,het_type,cui,name_y,sem_type
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078,Sulfasalazine,Chemicals & Drugs
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547,Azulfidine,Chemicals & Drugs
2,DB04898,Ximelagatran,Compound,UMLS:C0966370,ximelagatran,Chemicals & Drugs
3,DB04898,Ximelagatran,Compound,UMLS:C1174793,Exanta,Chemicals & Drugs
4,C0278151,Facial spasm,Side Effect,UMLS:C0278151,Facial spasm,Disorders


In [19]:
# Row counts for every (Hetionet type, SemMedDB type) pairing, Hetionet type outermost.
(temp
    .groupby(["het_type", "sem_type"])
    .size()
    .rename("rows")
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,rows
het_type,sem_type,Unnamed: 2_level_1
Anatomy,Anatomy,517
Anatomy,Disorders,2
Anatomy,Physiology,1
Biological Process,Anatomy,4
Biological Process,Disorders,16
Biological Process,Phenomena,24
Biological Process,Physiology,1725
Cellular Component,Anatomy,378
Cellular Component,Living Beings,1
Cellular Component,Physiology,1


In [20]:
# Same pairing counts, grouped with the SemMedDB type outermost.
(temp
    .groupby(["sem_type", "het_type"])
    .size()
    .rename("rows")
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,rows
sem_type,het_type,Unnamed: 2_level_1
Anatomy,Anatomy,517
Anatomy,Biological Process,4
Anatomy,Cellular Component,378
Anatomy,Side Effect,3
Chemicals & Drugs,Compound,1943
Chemicals & Drugs,Molecular Function,1
Chemicals & Drugs,Pharmacologic Class,87
Disorders,Anatomy,2
Disorders,Biological Process,16
Disorders,Disease,129


We will just accept mappings where they exist, and use the most common semantic type mapping when no CUI exists for a particular Hetio id.