# Harmonize SemmedDB with Hetionet using UMLS CUIs

Executive decision after discussion: overwrite Hetionet node types with SemmedDB semantic types. Any Hetionet node whose CUIs are not found in SemmedDB is assigned the SemmedDB semantic type that is most common among nodes of its Hetionet type.

Hetionet nodes that have no CUI in SemmedDB keep their original Hetionet identifier.

In [1]:
import pandas as pd
import re

## Read Hetionet nodes

In [2]:
# Load the tab-separated Hetionet node table. A hetio_id can appear on
# several rows, one per CUI (see the preview a few cells below).
hnodes = pd.read_csv(
    "hetionet/hetio_nodes.tsv",
    sep="\t",
)

In [3]:
hnodes.shape

(58700, 4)

In [4]:
hnodes.head()

Unnamed: 0,hetio_id,name,het_type,cui
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547
2,DB00795,Sulfasalazine,Compound,UMLS:C4255898
3,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class,UMLS:C2757005
4,100996420,DNM1P50,Gene,100996420


---

## Read Semmed nodes

In [5]:
def is_cui(s):
    """Return True if `s` is a bare UMLS concept unique identifier (CUI).

    A CUI here is the letter ``C`` followed by exactly seven decimal digits,
    e.g. ``"C0000039"``. Anything else returns False. That includes
    non-string values (such as the NaN pandas uses for empty cells) and
    strings with surrounding whitespace or a trailing newline.
    """
    # fullmatch is used instead of match with `^...$` because `$` also
    # matches just before a trailing newline, which would let "C0000039\n"
    # through. The isinstance guard keeps non-string cells from raising.
    return isinstance(s, str) and re.fullmatch(r'C[0-9]{7}', s) is not None

In [6]:
# SemmedDB node table. Ids that look like bare CUIs get a "UMLS:" prefix and
# are stored in `cui`; any other id is carried over unchanged.
snodes = (
    pd.read_csv("../semmed/data/node_map.tsv", sep="\t")
    .rename(columns={"ntype": "sem_type"})
    .assign(
        cui=lambda nodes: nodes["node_id"].map(
            lambda node_id: "UMLS:{}".format(node_id) if is_cui(node_id) else node_id
        )
    )
    # The raw id columns are no longer needed once `cui` exists.
    .drop(["node_uid", "node_id"], axis="columns")
)

In [7]:
snodes.head()

Unnamed: 0,name,sem_type,cui
0,"1,2-Dipalmitoylphosphatidylcholine",Chemicals & Drugs,UMLS:C0000039
1,"1,4-alpha-Glucan Branching Enzyme",Chemicals & Drugs,UMLS:C0000052
2,1-Carboxyglutamic Acid,Chemicals & Drugs,UMLS:C0000084
3,1-Methyl-3-isobutylxanthine,Chemicals & Drugs,UMLS:C0000096
4,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Chemicals & Drugs,UMLS:C0000097


In [8]:
snodes["cui"].str.startswith("UMLS").value_counts()

True     210382
False         7
Name: cui, dtype: int64

---

## Determine the most common SemmedDB node type for each Hetionet node type

In [9]:
# Map every Hetionet node type to the SemmedDB semantic type that occurs most
# often among the Hetionet nodes sharing a CUI with SemmedDB.
#
# pd.concat is used because DataFrame.append was removed in pandas 2.0, and
# the aggregation is restricted to the `sem_type` column so it does not rely
# on groupby.apply passing the grouping column to the function.
ntype_map = (pd
    .concat(
        [
            (hnodes
                .merge(snodes, how="inner", on="cui")
                .groupby("het_type")["sem_type"]
                # value_counts() orders by descending frequency, so the first
                # index label is the most common semantic type of the group.
                .agg(lambda types: types.value_counts().index[0])
                .reset_index()
            ),
            # "Pathway" is added by hand and mapped to itself (presumably
            # because no Pathway node shares a CUI with SemmedDB; confirm).
            pd.DataFrame(
                [("Pathway", "Pathway")],
                columns=["het_type", "sem_type"]
            ),
        ],
        ignore_index=True,
    )
    # If the data ever yields its own "Pathway" row, keep that one instead of
    # ending up with two rows for the same het_type, which would duplicate
    # nodes in the later merge on het_type.
    .drop_duplicates(subset="het_type", keep="first")
    .reset_index(drop=True)
)

In [10]:
ntype_map

Unnamed: 0,het_type,sem_type
0,Anatomy,Anatomy
1,Biological Process,Physiology
2,Cellular Component,Anatomy
3,Compound,Chemicals & Drugs
4,Disease,Disorders
5,Gene,Genes & Molecular Sequences
6,Molecular Function,Physiology
7,Pharmacologic Class,Chemicals & Drugs
8,Side Effect,Disorders
9,Symptom,Disorders


---

## Determine how Hetionet nodes will be identified in the merged network

Rules:
* Use all CUIs which are in SemmedDB if they exist
* Otherwise use the original Hetionet id

In [11]:
all_scuis = set(snodes["cui"])

# Decide the final identifier(s) of every Hetionet node:
#   * keep every row whose CUI exists in SemmedDB;
#   * if none of a node's CUIs is in SemmedDB, keep only its first row and
#     fall back to the Hetionet id.
#
# This is done with vectorized operations rather than groupby.apply. The
# apply version depended on the grouping column being handed to the function
# (deprecated in pandas 2.2) and ran a Python lambda per group and per row.
hnode_map = (hnodes
    # groupby would silently skip rows without an id; make that explicit.
    .dropna(subset=["hetio_id"])
    .assign(in_sem = lambda df: df["cui"].isin(all_scuis))
    .assign(
        # True on every row of a node that has at least one CUI in SemmedDB.
        grp_in_sem = lambda df: df.groupby("hetio_id")["in_sem"].transform("any"),
        # True on the first row of each node (the fallback row).
        grp_first = lambda df: ~df.duplicated("hetio_id"),
    )
    .loc[lambda df: df["in_sem"] | (~df["grp_in_sem"] & df["grp_first"])]
    .drop(["grp_in_sem", "grp_first"], axis=1)
    # Stable sort reproduces the id-ordered layout groupby used to give while
    # keeping the original row order within each node.
    .sort_values("hetio_id", kind="mergesort")
    .reset_index(drop=True)
    # CUI when it is known to SemmedDB, otherwise the Hetionet id.
    .assign(final_id = lambda df: df["cui"].where(df["in_sem"], df["hetio_id"]))
    .drop("cui", axis=1)
)

In [12]:
hnode_map.shape

(48277, 5)

In [13]:
hnode_map.head()

Unnamed: 0,hetio_id,name,het_type,in_sem,final_id
0,1,A1BG,Gene,True,UMLS:C1412045
1,10,NAT2,Gene,True,UMLS:C0796518
2,100,ADA,Gene,True,UMLS:C1412179
3,1000,CDH2,Gene,True,UMLS:C1413277
4,10000,AKT3,Gene,True,UMLS:C1332074


---

In [14]:
hnode_map["in_sem"].value_counts()

True     27123
False    21154
Name: in_sem, dtype: int64

In [15]:
hnode_map["hetio_id"].nunique()

47031

### How many identifiers per hetio id (as a %)

In [16]:
# Percentage of Hetionet ids that ended up with 1, 2, 3, ... rows in
# hnode_map, ordered by the number of rows.
(
    hnode_map.groupby("hetio_id").size()
    .value_counts(normalize=True)
    .sort_index()
    * 100
)

1     98.394676
2      1.158810
3      0.236015
4      0.087177
5      0.048904
6      0.019136
7      0.021263
8      0.006379
9      0.010631
10     0.004253
12     0.004253
14     0.004253
16     0.002126
18     0.002126
dtype: float64

---

## Assign semantic types to the node map

In [17]:
# Attach the final semantic type to every mapped Hetionet node: take the
# SemmedDB type of the node's final id when there is one, otherwise fall back
# to the type that ntype_map assigns to the node's Hetionet type.
fin_hnode_map = (
    hnode_map
    .merge(
        snodes
            .drop("name", axis="columns")
            .rename(columns={"sem_type": "orig_stype"}),
        left_on="final_id", right_on="cui", how="left",
    )
    .drop("cui", axis="columns")
    .merge(ntype_map, on="het_type", how="left")
    .assign(
        fin_type=lambda nodes: nodes["orig_stype"].where(
            nodes["orig_stype"].notnull(), nodes["sem_type"]
        )
    )
    .drop(["orig_stype", "sem_type"], axis="columns")
)

In [18]:
fin_hnode_map.head()

Unnamed: 0,hetio_id,name,het_type,in_sem,final_id,fin_type
0,1,A1BG,Gene,True,UMLS:C1412045,Genes & Molecular Sequences
1,10,NAT2,Gene,True,UMLS:C0796518,Genes & Molecular Sequences
2,100,ADA,Gene,True,UMLS:C1412179,Genes & Molecular Sequences
3,1000,CDH2,Gene,True,UMLS:C1413277,Genes & Molecular Sequences
4,10000,AKT3,Gene,True,UMLS:C1332074,Genes & Molecular Sequences


In [19]:
# Persist the node mapping as a tab-separated file without the row index.
fin_hnode_map.to_csv("hetionet/fin_hetio_nodes.tsv", sep="\t", index=False)

---

## Map Hetionet edges to SemmedDB edges

Adjust the semantics of the edges accordingly, and simplify where possible.

In [20]:
# Hetionet edge list; the ":START_ID"-style headers are replaced with plain
# column names.
hedges = pd.read_csv("data/hetionet/hetnet_edges.csv", sep=",")
hedges = hedges.rename(
    columns={
        ":START_ID": "start_id",
        ":END_ID": "end_id",
        ":TYPE": "het_etype",
    }
)

In [21]:
hedges.shape

(2249052, 3)

In [22]:
hedges.head()

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


---

### Convert Hetio nodes to their final node types

In [23]:
# Annotate each Hetionet edge with the original (htype) and final (ftype) node
# type of both endpoints. The inner joins drop edges whose endpoint is missing
# from fin_hnode_map, and exact duplicate rows are removed at the end.
het_etypes = (
    hedges
    .merge(
        fin_hnode_map[["hetio_id", "het_type", "fin_type"]].rename(columns={
            "hetio_id": "start_id",
            "het_type": "start_htype",
            "fin_type": "start_ftype",
        }),
        on="start_id", how="inner",
    )
    .merge(
        fin_hnode_map[["hetio_id", "het_type", "fin_type"]].rename(columns={
            "hetio_id": "end_id",
            "het_type": "end_htype",
            "fin_type": "end_ftype",
        }),
        on="end_id", how="inner",
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [24]:
het_etypes.head()

Unnamed: 0,start_id,end_id,het_etype,start_htype,start_ftype,end_htype,end_ftype
0,8568,GO:0042254,PARTICIPATES_GpBP,Gene,Genes & Molecular Sequences,Biological Process,Physiology
1,6201,GO:0042254,PARTICIPATES_GpBP,Gene,Genes & Molecular Sequences,Biological Process,Physiology
2,6223,GO:0042254,PARTICIPATES_GpBP,Gene,Genes & Molecular Sequences,Biological Process,Physiology
3,6202,GO:0042254,PARTICIPATES_GpBP,Gene,Genes & Molecular Sequences,Biological Process,Physiology
4,65003,GO:0042254,PARTICIPATES_GpBP,Gene,Genes & Molecular Sequences,Biological Process,Physiology


### Group hetio edges by node types and count edges

In [25]:
# Number of Hetionet edges per (start type, end type, edge type) combination.
h_etype_count = (
    het_etypes
    .groupby(["start_ftype", "end_ftype"])["het_etype"]
    .value_counts()
    .rename("count")
    .reset_index()
)

In [26]:
h_etype_count

Unnamed: 0,start_ftype,end_ftype,het_etype,count
0,Anatomy,Genes & Molecular Sequences,EXPRESSES_AeG,526407
1,Anatomy,Genes & Molecular Sequences,DOWNREGULATES_AdG,102240
2,Anatomy,Genes & Molecular Sequences,UPREGULATES_AuG,97848
3,Chemicals & Drugs,Anatomy,CAUSES_CcSE,36
4,Chemicals & Drugs,Chemicals & Drugs,RESEMBLES_CrC,6486
5,Chemicals & Drugs,Chemicals & Drugs,INCLUDES_PCiC,920
6,Chemicals & Drugs,Disorders,CAUSES_CcSE,137458
7,Chemicals & Drugs,Genes & Molecular Sequences,DOWNREGULATES_CdG,21102
8,Chemicals & Drugs,Genes & Molecular Sequences,UPREGULATES_CuG,18756
9,Chemicals & Drugs,Genes & Molecular Sequences,BINDS_CbG,11571


---

### Standardize SemmedDB edges by adding the `UMLS:` prefix to CUI node ids

In [27]:
# SemmedDB edge list with both endpoint ids converted to the "UMLS:"-prefixed
# form used for the nodes; ids that are not bare CUIs are left unchanged.
sedges = (
    pd.read_csv("../semmed/data/edges.tsv", sep="\t")
    .assign(
        source_cui=lambda edges: edges["source_id"].map(
            lambda node_id: "UMLS:{}".format(node_id) if is_cui(node_id) else node_id
        ),
        target_cui=lambda edges: edges["target_id"].map(
            lambda node_id: "UMLS:{}".format(node_id) if is_cui(node_id) else node_id
        ),
    )
    .drop(
        ["source_uid", "target_uid", "source_id", "target_id"],
        axis="columns",
    )
)

In [28]:
sedges.head()

Unnamed: 0,etype,source_cui,target_cui
0,REGULATES_CDreg>CD,UMLS:C0000039,UMLS:C0001026
1,REGULATES_CDreg>CD,UMLS:C0000039,UMLS:C0001041
2,REGULATES_CDreg>CD,UMLS:C0000039,UMLS:C0001492
3,REGULATES_CDreg>CD,UMLS:C0000039,UMLS:C0001962
4,RELATED_TO_CDrtCD,UMLS:C0000039,UMLS:C0001975


In [29]:
sedges.shape

(9646780, 3)

---

### Add in node types for semmed

In [30]:
# Annotate each SemmedDB edge with the semantic type of its source and target
# node; the inner joins keep only edges whose endpoints are both in snodes.
sem_etypes = (
    sedges
    .merge(
        snodes.drop("name", axis="columns").rename(columns={
            "cui": "source_cui",
            "sem_type": "source_stype",
        }),
        on="source_cui", how="inner",
    )
    .merge(
        snodes.drop("name", axis="columns").rename(columns={
            "cui": "target_cui",
            "sem_type": "target_stype",
        }),
        on="target_cui", how="inner",
    )
)

In [31]:
sem_etypes.head()

Unnamed: 0,etype,source_cui,target_cui,source_stype,target_stype
0,REGULATES_CDreg>CD,UMLS:C0000039,UMLS:C0001026,Chemicals & Drugs,Chemicals & Drugs
1,REGULATES_CDreg>CD,UMLS:C0000167,UMLS:C0001026,Chemicals & Drugs,Chemicals & Drugs
2,PRODUCES_LBpdCD,UMLS:C0000901,UMLS:C0001026,Living Beings,Chemicals & Drugs
3,REGULATES_CDreg>CD,UMLS:C0000975,UMLS:C0001026,Chemicals & Drugs,Chemicals & Drugs
4,REGULATES_CDreg>CD,UMLS:C0000977,UMLS:C0001026,Chemicals & Drugs,Chemicals & Drugs


In [32]:
sem_etypes.shape

(9646780, 5)

---

### Group by node type and count edges

In [33]:
# Number of SemmedDB edges per (source type, target type, edge type) combination.
s_etype_count = (
    sem_etypes
    .groupby(["source_stype", "target_stype"])["etype"]
    .value_counts()
    .rename("count")
    .reset_index()
)

In [34]:
s_etype_count

Unnamed: 0,source_stype,target_stype,etype,count
0,Anatomy,Anatomy,LOCATION_OF_AloA,267214
1,Anatomy,Chemicals & Drugs,LOCATION_OF_AloCD,996348
2,Anatomy,Disorders,LOCATION_OF_AloDO,520885
3,Anatomy,Genes & Molecular Sequences,LOCATION_OF_AloG,534310
4,Anatomy,Living Beings,PART_OF_Apo>LB,133351
5,Anatomy,Living Beings,LOCATION_OF_AloLB,36757
6,Anatomy,Physiology,LOCATION_OF_AloPS,29184
7,Chemicals & Drugs,Anatomy,AFFECTS_CDafA,313572
8,Chemicals & Drugs,Chemicals & Drugs,REGULATES_CDreg>CD,814972
9,Chemicals & Drugs,Chemicals & Drugs,RELATED_TO_CDrtCD,216035


---

### Combine the edge-type counts from both networks and manually decide which edge types to retain

In [35]:
# Stack the per-edge-type counts of both networks into one table for manual
# curation. `orig` records which network a row came from ("h" = Hetionet).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
res = (pd
    .concat([
        (h_etype_count
            .rename(columns={
                "start_ftype": "source_stype",
                "end_ftype": "target_stype",
                "het_etype": "etype"
            })
            .assign(orig = "h")
        ),
        s_etype_count.assign(orig = "semmed"),
    ])
    # Group rows by node-type pair, most frequent edge type first.
    .sort_values(
        by=["source_stype", "target_stype", "count"],
        ascending=[True, True, False]
    )
)

In [36]:
res

Unnamed: 0,source_stype,target_stype,etype,count,orig
0,Anatomy,Anatomy,LOCATION_OF_AloA,267214,semmed
1,Anatomy,Chemicals & Drugs,LOCATION_OF_AloCD,996348,semmed
2,Anatomy,Disorders,LOCATION_OF_AloDO,520885,semmed
3,Anatomy,Genes & Molecular Sequences,LOCATION_OF_AloG,534310,semmed
0,Anatomy,Genes & Molecular Sequences,EXPRESSES_AeG,526407,h
1,Anatomy,Genes & Molecular Sequences,DOWNREGULATES_AdG,102240,h
2,Anatomy,Genes & Molecular Sequences,UPREGULATES_AuG,97848,h
4,Anatomy,Living Beings,PART_OF_Apo>LB,133351,semmed
5,Anatomy,Living Beings,LOCATION_OF_AloLB,36757,semmed
6,Anatomy,Physiology,LOCATION_OF_AloPS,29184,semmed


In [37]:
res.to_csv("edge_merge.tsv", sep='\t', index=False)

---

## Edge merge (load in manual curation results)

Edge merging was performed manually in Excel, starting from the `edge_merge.tsv` file; the curated result is loaded below from `edge_merge_final.tsv`.

In [38]:
# Load the manually curated edge-type table and flag the rows whose edge type
# differs from the curated `final_etype`.
res = pd.read_csv("edge_merge_final.tsv", sep="\t")
res = res.assign(changed=res["etype"].ne(res["final_etype"]))

In [39]:
res.head()

Unnamed: 0,source_stype,target_stype,etype,count,orig,final_etype,changed
0,Anatomy,Anatomy,LOCATION_OF_AloA,267214,semmed,LOCATION_OF_AloA,False
1,Anatomy,Chemicals & Drugs,LOCATION_OF_AloCD,996348,semmed,LOCATION_OF_AloCD,False
2,Anatomy,Disorders,LOCATION_OF_AloDO,520885,semmed,LOCATION_OF_AloDO,False
3,Anatomy,Genes & Molecular Sequences,LOCATION_OF_AloG,534310,semmed,EXPRESSES_AeG,True
4,Anatomy,Genes & Molecular Sequences,EXPRESSES_AeG,526407,h,EXPRESSES_AeG,False


In [40]:
res.shape

(81, 7)

In [41]:
res["changed"].value_counts()

False    44
True     37
Name: changed, dtype: int64

In [42]:
res.groupby(["changed", "orig"]).size()

changed  orig  
False    h          4
         semmed    40
True     h         36
         semmed     1
dtype: int64

Just over half of the edge types (44 of 81) were left untouched, and almost all of those that were changed (36 of 37) were Hetionet edge types, since we are mapping Hetionet onto SemmedDB.

---

## Final concatenation of edges with provenance

In [43]:
# Rewrite every Hetionet edge in the merged id/type space:
#   1. replace both endpoint ids by their final ids and final node types,
#   2. translate the edge type via the curated table, skipping rows whose
#      final type is 'delete' (the inner join drops edges with no rule),
#   3. tag the rows as coming from Hetionet and remove exact duplicates.
fhedges = (
    hedges
    .rename(columns={"het_etype": "etype"})

    # source endpoint
    .merge(
        fin_hnode_map[["hetio_id", "final_id", "fin_type"]].rename(columns={
            "hetio_id": "start_id",
            "final_id": "source_fid",
            "fin_type": "source_stype",
        }),
        on="start_id", how="inner",
    )
    .drop("start_id", axis="columns")

    # target endpoint
    .merge(
        fin_hnode_map[["hetio_id", "final_id", "fin_type"]].rename(columns={
            "hetio_id": "end_id",
            "final_id": "target_fid",
            "fin_type": "target_stype",
        }),
        on="end_id", how="inner",
    )
    .drop("end_id", axis="columns")

    # curated edge type
    .merge(
        res.loc[
            res["final_etype"] != "delete",
            ["source_stype", "target_stype", "etype", "final_etype"],
        ],
        on=["source_stype", "target_stype", "etype"], how="inner",
    )
    .drop("etype", axis="columns")

    .assign(origin="hetionet")
    .drop_duplicates()
    .reset_index(drop=True)
)

In [44]:
fhedges.head()

Unnamed: 0,source_fid,source_stype,target_fid,target_stype,final_etype,origin
0,UMLS:C1857700,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
1,UMLS:C1419751,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
2,UMLS:C1335641,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
3,UMLS:C1419752,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
4,UMLS:C1422367,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet


In [45]:
fhedges.shape

(2647687, 6)

---

## Final results and write to file

In [46]:
# Final edge table: SemmedDB edges stacked on top of the converted Hetionet
# edges, each row tagged with its origin.
#
# * `sem_etypes` already holds the SemmedDB edges joined with the node types
#   of both endpoints, so it is reused instead of repeating the two merges.
# * pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# * The explicit column list pins the column order shown in the recorded
#   output, which is also the order written to the edge file.
merged_network = pd.concat(
    [
        (sem_etypes
            .rename(columns={
                "source_cui": "source_fid",
                "target_cui": "target_fid",
                "etype": "final_etype"
            })
            .assign(origin = "semmeddb")
        ),
        fhedges,
    ],
    ignore_index=True,
)[[
    "final_etype", "origin",
    "source_fid", "source_stype",
    "target_fid", "target_stype",
]]

In [47]:
merged_network.head()

Unnamed: 0,final_etype,origin,source_fid,source_stype,target_fid,target_stype
0,REGULATES_CDreg>CD,semmeddb,UMLS:C0000039,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
1,REGULATES_CDreg>CD,semmeddb,UMLS:C0000167,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
2,PRODUCES_LBpdCD,semmeddb,UMLS:C0000901,Living Beings,UMLS:C0001026,Chemicals & Drugs
3,REGULATES_CDreg>CD,semmeddb,UMLS:C0000975,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
4,REGULATES_CDreg>CD,semmeddb,UMLS:C0000977,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs


In [54]:
# Spot-check a few random edges; the fixed seed makes the displayed rows the
# same on every run of the notebook.
merged_network.sample(10, random_state=0)

Unnamed: 0,final_etype,origin,source_fid,source_stype,target_fid,target_stype
4291225,AUGMENTS_GagDO,semmeddb,UMLS:C0033713,Genes & Molecular Sequences,UMLS:C0205698,Disorders
11105071,UPREGULATES_AuG,hetionet,UMLS:C0027088,Anatomy,UMLS:C1539800,Genes & Molecular Sequences
11268139,DOWNREGULATES_AdG,hetionet,UMLS:C0042993,Anatomy,UMLS:C1538353,Genes & Molecular Sequences
10295667,EXPRESSES_AeG,hetionet,UMLS:C0935625,Anatomy,UMLS:C1538580,Genes & Molecular Sequences
233522,INHIBITS_CDinG,semmeddb,UMLS:C1120843,Chemicals & Drugs,UMLS:C1366557,Genes & Molecular Sequences
3353937,INTERACTS_WITH_CDiwG,semmeddb,UMLS:C0017950,Chemicals & Drugs,UMLS:C1442864,Genes & Molecular Sequences
3043797,STIMULATES_CDstG,semmeddb,UMLS:C0085301,Chemicals & Drugs,UMLS:C1419038,Genes & Molecular Sequences
8036146,INHIBITS_GinDO,semmeddb,UMLS:C1335265,Genes & Molecular Sequences,UMLS:C0339467,Disorders
10139357,AFFECTS_GafPS,hetionet,UMLS:C1857782,Genes & Molecular Sequences,GO:1900246,Physiology
2163903,AFFECTS_CDafA,semmeddb,UMLS:C0065264,Chemicals & Drugs,UMLS:C0014597,Anatomy


In [48]:
merged_network.shape

(12294467, 6)

In [49]:
merged_network["origin"].value_counts()

semmeddb    9646780
hetionet    2647687
Name: origin, dtype: int64

In [50]:
merged_network["source_fid"].nunique()

187319

In [51]:
merged_network["target_fid"].nunique()

178642

### Number of edges shared by both networks

In [52]:
# Distinct (source, target, edge type) triples present in both the SemmedDB
# and the Hetionet part of the merged network.
(
    merged_network
    .loc[
        merged_network["origin"] == "semmeddb",
        ["source_fid", "target_fid", "final_etype"],
    ]
    .merge(
        merged_network.loc[
            merged_network["origin"] == "hetionet",
            ["source_fid", "target_fid", "final_etype"],
        ],
        on=["source_fid", "target_fid", "final_etype"], how="inner",
    )
    .drop_duplicates()
    .shape
)

(27725, 3)

---

In [52]:
# Final node table: all SemmedDB nodes plus the Hetionet nodes that could not
# be mapped onto a SemmedDB CUI (these keep their Hetionet id as node_id).
#
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# explicit column list pins the column order shown in the recorded output,
# which is also the order written to the node file.
fin_nodes = pd.concat(
    [
        (fin_hnode_map
            .query("~in_sem")
            [["final_id", "name", "fin_type"]]
            .assign(origin = "hetionet")
            .rename(columns={
                "final_id": "node_id",
                "fin_type": "sem_type"
            })
        ),
        (snodes
            .assign(origin = "semmeddb")
            .rename(columns={"cui": "node_id"})
        ),
    ],
    ignore_index=True,
)[["name", "node_id", "origin", "sem_type"]]

In [53]:
fin_nodes.shape

(231543, 4)

In [54]:
fin_nodes.head()

Unnamed: 0,name,node_id,origin,sem_type
0,GAGE12F,100008586,hetionet,Genes & Molecular Sequences
1,DUXB,100033411,hetionet,Genes & Molecular Sequences
2,DDTL,100037417,hetionet,Genes & Molecular Sequences
3,OCTN3,100049579,hetionet,Genes & Molecular Sequences
4,POM121C,100101267,hetionet,Genes & Molecular Sequences


In [55]:
fin_nodes["node_id"].nunique()

231543

In [56]:
fin_nodes["origin"].value_counts()

semmeddb    210389
hetionet     21154
Name: origin, dtype: int64

In [57]:
fin_nodes["sem_type"].value_counts()

Chemicals & Drugs              81308
Living Beings                  48191
Disorders                      38191
Genes & Molecular Sequences    25453
Physiology                     19308
Anatomy                        16135
Pathway                         1822
Phenomena                       1135
Name: sem_type, dtype: int64

---

In [58]:
fin_nodes.to_csv("merged_network/nodes.tsv", sep='\t', index=False)

In [59]:
merged_network.to_csv("merged_network/edges.tsv", sep='\t', index=False)