# Create final merged network containing SemmedDB and Hetionet

2018-12-14

In [1]:
import pandas as pd

## Load in manual curation results for merging edges

Edge merging was performed manually in Excel, starting from the `edge_merge.tsv` file; the curated result loaded below is `edge_merge_final.tsv`.

In [2]:
# Manually curated edge-type mapping; `changed` marks the rows whose
# final edge type differs from the original edge type.
res = (
    pd.read_csv(
        "../pipeline/manual_review/edge_merge_final.tsv",
        sep='\t',
    )
    .assign(
        changed=lambda curated: curated["edge_type"].ne(curated["final_edge_type"])
    )
)

In [3]:
res.shape

(81, 7)

In [4]:
res.head()

Unnamed: 0,source_stype,target_stype,edge_type,count,orig,final_edge_type,changed
0,Anatomy,Anatomy,LOCATION_OF_AloA,267214,semmed,LOCATION_OF_AloA,False
1,Anatomy,Chemicals & Drugs,LOCATION_OF_AloCD,996348,semmed,LOCATION_OF_AloCD,False
2,Anatomy,Disorders,LOCATION_OF_AloDO,520885,semmed,LOCATION_OF_AloDO,False
3,Anatomy,Genes & Molecular Sequences,LOCATION_OF_AloG,534310,semmed,EXPRESSES_AeG,True
4,Anatomy,Genes & Molecular Sequences,EXPRESSES_AeG,526407,h,EXPRESSES_AeG,False


### How many edge types were modified?

In [5]:
# Tally curated edge types by whether their final type differs from the original.
res["changed"].value_counts()

False    44
True     37
Name: changed, dtype: int64

In [6]:
# Break the changed/unchanged tally down by the `orig` column (source network of the edge type).
res.groupby(["changed", "orig"]).size()

changed  orig  
False    h          4
         semmed    40
True     h         36
         semmed     1
dtype: int64

Around half of the edge types (44 of 81) were left untouched, and those that were changed were almost all Hetionet edge types (36 of 37), since we are mapping Hetionet onto the SemmedDB edge types.

---

## Convert Hetionet edges to their final types

### Read Hetionet nodes and edges

In [7]:
# Hetionet node table carrying each node's final (merged-network) ID and type.
# `hetio_id` is the join key against the Hetionet edge list and holds bare
# numeric IDs alongside prefixed ones, so read it as text rather than relying
# on type inference: an int 8568 would never match the string "8568".
hnodes = pd.read_csv(
    "../pipeline/hetionet/final_hetionet_nodes_umls.tsv",
    sep='\t',
    dtype={"hetio_id": str},
)

In [8]:
hnodes.shape

(48276, 6)

In [9]:
hnodes.head()

Unnamed: 0,hetio_id,name,het_type,in_sem,final_id,fin_type
0,1,A1BG,Gene,True,UMLS:C1412045,Genes & Molecular Sequences
1,10,NAT2,Gene,True,UMLS:C0796518,Genes & Molecular Sequences
2,100,ADA,Gene,True,UMLS:C1412179,Genes & Molecular Sequences
3,1000,CDH2,Gene,True,UMLS:C1413277,Genes & Molecular Sequences
4,10000,AKT3,Gene,True,UMLS:C1332074,Genes & Molecular Sequences


In [10]:
# Raw Hetionet edge list, with the Neo4j-style headers renamed to plain names.
# The ID columns mix bare numeric IDs (e.g. 8568) with prefixed IDs (e.g.
# "GO:0042254"). They are forced to text so that chunked type inference can
# never parse part of a column as integers, which would make those rows
# silently drop out of the inner joins against the node table.
hedges = (pd
    .read_csv(
        "../data/hetionet/raw/hetionet_edges.csv", sep=',',
        dtype={":START_ID": str, ":END_ID": str},
    )
    .rename(columns={
        ":START_ID": "source_id",
        ":END_ID": "target_id",
        ":TYPE": "het_etype"
    })
)

In [11]:
hedges.shape

(2249052, 3)

In [12]:
hedges.head()

Unnamed: 0,source_id,target_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


## Convert Hetionet edges

In [13]:
# Hetionet node ID -> final node ID and semantic type.
hnode_lookup = hnodes[["hetio_id", "final_id", "fin_type"]]

# Curated (source type, target type, edge type) -> final edge type,
# without the edge types that were marked for deletion.
hetionet_edge_type_map = res.loc[
    res["final_edge_type"] != "delete",
    ["source_stype", "target_stype", "edge_type", "final_edge_type"],
]

fhedges = (
    hedges
    # swap the source node ID for its final ID and type
    .merge(
        hnode_lookup.rename(columns={
            "hetio_id": "source_id",
            "final_id": "source_fid",
            "fin_type": "source_stype",
        }),
        how="inner", on="source_id",
    )
    .drop(columns="source_id")
    # swap the target node ID for its final ID and type
    .merge(
        hnode_lookup.rename(columns={
            "hetio_id": "target_id",
            "final_id": "target_fid",
            "fin_type": "target_stype",
        }),
        how="inner", on="target_id",
    )
    .drop(columns="target_id")
    # translate the Hetionet edge type into the curated final edge type
    .rename(columns={"het_etype": "edge_type"})
    .merge(
        hetionet_edge_type_map,
        how="inner", on=["source_stype", "target_stype", "edge_type"],
    )
    .drop(columns="edge_type")
    .assign(origin="hetionet")
    .drop_duplicates()
    .reset_index(drop=True)
)

In [14]:
fhedges.shape

(2647660, 6)

In [15]:
fhedges.head()

Unnamed: 0,source_fid,source_stype,target_fid,target_stype,final_edge_type,origin
0,UMLS:C1857700,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
1,UMLS:C1419751,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
2,UMLS:C1335641,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
3,UMLS:C1419752,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet
4,UMLS:C1422367,Genes & Molecular Sequences,GO:0042254,Physiology,AFFECTS_GafPS,hetionet


---

## Convert SemmedDB edges to their final types

### Read SemmedDB nodes

In [16]:
# SemmedDB node table (node_id, node_name, node_type).
snodes = pd.read_csv(
    "../data/semmeddb/no_treats/semmeddb_no_treats_nodes.tsv",
    sep='\t',
)

In [17]:
snodes.shape

(210375, 3)

In [18]:
snodes.head()

Unnamed: 0,node_id,node_name,node_type
0,DOID:0050545,visceral heterotaxy,Disorders
1,DOID:0050591,tooth agenesis,Disorders
2,DOID:0060061,cutaneous T cell lymphoma,Disorders
3,DOID:0060073,lymphatic system cancer,Disorders
4,DOID:0060119,pharynx cancer,Disorders


### Read SemmedDB edges

In [19]:
# SemmedDB edge list; the PubMed ID columns are not needed for the merge.
sedges = (
    pd.read_csv(
        "../data/semmeddb/no_treats/semmeddb_no_treats_edges.tsv",
        sep='\t',
    )
    .drop(columns=["pmids", "n_pmids"])
)

In [20]:
sedges.shape

(9646780, 3)

In [21]:
sedges.head()

Unnamed: 0,source_id,target_id,edge_type
0,DOID:0050545,UMLS:C0000768,ASSOCIATED_WITH_DOawDO
1,DOID:0050545,UMLS:C0000772,ASSOCIATED_WITH_DOawDO
2,DOID:0050545,UMLS:C0001792,AFFECTS_DOafLB
3,DOID:0050545,UMLS:C0002871,ASSOCIATED_WITH_DOawDO
4,DOID:0050545,UMLS:C0004245,ASSOCIATED_WITH_DOawDO


## Convert SemmedDB edges to their final types

In [22]:
# SemmedDB node ID -> semantic type (every node column except the name).
snode_types = snodes.drop(columns="node_name")

# Curated (source type, target type, edge type) -> final edge type,
# without the edge types that were marked for deletion.
semmeddb_edge_type_map = res.loc[
    res["final_edge_type"] != "delete",
    ["source_stype", "target_stype", "edge_type", "final_edge_type"],
]

fsedges = (
    sedges
    # attach the source node type
    .merge(
        snode_types.rename(columns={
            "node_id": "source_id",
            "node_type": "source_stype",
        }),
        how="inner", on="source_id",
    )
    # attach the target node type
    .merge(
        snode_types.rename(columns={
            "node_id": "target_id",
            "node_type": "target_stype",
        }),
        how="inner", on="target_id",
    )
    .rename(columns={
        "source_id": "source_fid",
        "target_id": "target_fid",
    })
    # translate the SemmedDB edge type into the curated final edge type
    .merge(
        semmeddb_edge_type_map,
        how="inner", on=["source_stype", "target_stype", "edge_type"],
    )
    .drop(columns="edge_type")
    .assign(origin="semmeddb")
    .drop_duplicates()
    .reset_index(drop=True)
)

In [23]:
fsedges.shape

(9646780, 6)

In [24]:
fsedges.head()

Unnamed: 0,source_fid,target_fid,source_stype,target_stype,final_edge_type,origin
0,DOID:0050545,UMLS:C0000768,Disorders,Disorders,ASSOCIATED_WITH_DOawDO,semmeddb
1,DOID:0050591,UMLS:C0000768,Disorders,Disorders,ASSOCIATED_WITH_DOawDO,semmeddb
2,DOID:0060668,UMLS:C0000768,Disorders,Disorders,ASSOCIATED_WITH_DOawDO,semmeddb
3,UMLS:C0000727,UMLS:C0000768,Disorders,Disorders,ASSOCIATED_WITH_DOawDO,semmeddb
4,UMLS:C0000737,UMLS:C0000768,Disorders,Disorders,ASSOCIATED_WITH_DOawDO,semmeddb


---

## Final results and write to file

In [25]:
# Outer-join the converted SemmedDB and Hetionet edge lists on
# (source, target, final edge type) so every edge appears once, flagged with
# the network(s) it came from.
#
# The membership flags are computed directly from whether each side of the
# join contributed an `origin` value. A frame-wide fillna(False) / replace()
# would rewrite every column: a missing ID or type would become the literal
# False, and any cell equal to "semmeddb" / "hetionet" would become True.
edges_final = (fsedges
    .merge(
        fhedges, how="outer",
        on=["source_fid", "target_fid", "final_edge_type"],
        suffixes=("_semmeddb", "_hetionet"),
    )
    .assign(
        in_semmeddb = lambda df: df["origin_semmeddb"].notna(),
        in_hetionet = lambda df: df["origin_hetionet"].notna(),
        # take the SemmedDB-side node types, falling back to the Hetionet side
        source_stype = lambda df: df["source_stype_semmeddb"].fillna(df["source_stype_hetionet"]),
        target_stype = lambda df: df["target_stype_semmeddb"].fillna(df["target_stype_hetionet"]),
    )
    .drop(columns=[
        "origin_semmeddb", "origin_hetionet",
        "source_stype_semmeddb", "source_stype_hetionet",
        "target_stype_semmeddb", "target_stype_hetionet",
    ])
    .reset_index(drop=True)
)

In [26]:
edges_final.shape

(12232901, 7)

In [27]:
edges_final.head()

Unnamed: 0,source_fid,target_fid,final_edge_type,in_semmeddb,in_hetionet,source_stype,target_stype
0,DOID:0050545,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,True,False,Disorders,Disorders
1,DOID:0050591,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,True,False,Disorders,Disorders
2,DOID:0060668,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,True,False,Disorders,Disorders
3,UMLS:C0000727,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,True,False,Disorders,Disorders
4,UMLS:C0000737,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,True,False,Disorders,Disorders


## Merged network information

In [28]:
# Number of distinct nodes that appear as an edge source in the merged network.
edges_final["source_fid"].nunique()

187318

In [29]:
# Number of distinct nodes that appear as an edge target in the merged network.
edges_final["target_fid"].nunique()

178641

### Number of edges shared by both networks

In [30]:
# Edge counts by network membership; the (True, True) group is the overlap.
edges_final.groupby(["in_semmeddb", "in_hetionet"]).size()

in_semmeddb  in_hetionet
False        True           2586121
True         False          9585241
             True             61539
dtype: int64

---

## Merged network node list

In [31]:
# Build the merged node list: outer-join Hetionet nodes (keyed by their final
# ID) with SemmedDB nodes on node_id, and flag which network(s) each node
# belongs to.
#
# The membership flags are computed directly from whether each side of the
# join contributed an `origin` value. A frame-wide fillna(False) / replace()
# would also rewrite the data columns: a missing node name or type would be
# written out as the literal False, and a name equal to "semmeddb" /
# "hetionet" would become True.
fin_nodes = (hnodes
    [["name", "final_id", "fin_type"]]
    .rename(columns={
        "name": "node_name",
        "final_id": "node_id",
        "fin_type": "node_type"
    })
    .assign(origin = "hetionet")

    .merge(
        snodes.assign(origin = "semmeddb"),
        how="outer", on="node_id",
        suffixes=("_hetionet", "_semmeddb"),
    )

    .assign(
        in_hetionet = lambda df: df["origin_hetionet"].notna(),
        in_semmeddb = lambda df: df["origin_semmeddb"].notna(),
        # take the SemmedDB type and name, falling back to the Hetionet values
        node_type = lambda df: df["node_type_semmeddb"].fillna(df["node_type_hetionet"]),
        node_name = lambda df: df["node_name_semmeddb"].fillna(df["node_name_hetionet"]),
    )
    .drop(columns=[
        "origin_hetionet", "origin_semmeddb",
        "node_name_hetionet", "node_name_semmeddb",
        "node_type_hetionet", "node_type_semmeddb",
    ])
    .drop_duplicates()
    .sort_values(["node_id", "node_type"])
    .reset_index(drop=True)
)

In [32]:
fin_nodes.shape

(231529, 5)

In [33]:
fin_nodes.head()

Unnamed: 0,node_id,in_hetionet,in_semmeddb,node_type,node_name
0,100008586,True,False,Genes & Molecular Sequences,GAGE12F
1,100033411,True,False,Genes & Molecular Sequences,DUXB
2,100037417,True,False,Genes & Molecular Sequences,DDTL
3,100049579,True,False,Genes & Molecular Sequences,OCTN3
4,100101267,True,False,Genes & Molecular Sequences,POM121C


### Merged network node information

In [34]:
# Distinct node IDs; equal to the row count when every node appears exactly once.
fin_nodes["node_id"].nunique()

231529

In [35]:
# Node counts by network membership; the (True, True) group is the overlap.
fin_nodes.groupby(["in_hetionet", "in_semmeddb"]).size()

in_hetionet  in_semmeddb
False        True           183837
True         False           21154
             True            26538
dtype: int64

In [36]:
# Number of merged-network nodes per node type.
fin_nodes["node_type"].value_counts()

Chemicals & Drugs              81297
Living Beings                  48191
Disorders                      38188
Genes & Molecular Sequences    25453
Physiology                     19308
Anatomy                        16135
Pathway                         1822
Phenomena                       1135
Name: node_type, dtype: int64

---

## Save merged network to disk

In [37]:
# Write the merged node list as a tab-separated file, without the row index.
fin_nodes.to_csv(
    "../pipeline/merged_network/merged_hetionet_and_semmeddb_nodes.tsv",
    sep='\t',
    index=False,
)

In [38]:
# Write the merged edge list as a tab-separated file, without the row index.
edges_final.to_csv(
    "../pipeline/merged_network/merged_hetionet_and_semmeddb_edges.tsv",
    sep='\t',
    index=False,
)