# Check that the gold standard edges are not in the network when we prepare the network

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

import numpy as np
from itertools import product

## Read hetionet

Using hetionet for now because it should be faster to run than SemMedDB.

In [2]:
# Load the hetionet node table, discard the "identifier:string" column and
# give the remaining raw CSV headers plain snake_case names.
hnodes = (
    pd.read_csv("../../merge/data/hetionet/hetnet_nodes.csv", sep=",")
    .drop(columns="identifier:string")
    .rename(columns={":ID": "node_id", "name:string": "name", ":LABEL": "het_type"})
)

In [3]:
# Load the hetionet edge list and rename the raw CSV headers to the
# start_id / end_id / het_etype names used throughout this notebook.
hedges = (
    pd.read_csv("../../merge/data/hetionet/hetnet_edges.csv", sep=",")
    .rename(columns={":START_ID": "start_id", ":END_ID": "end_id", ":TYPE": "het_etype"})
)

In [4]:
# Preview the first five rows of the node table.
hnodes.head(n=5)

Unnamed: 0,node_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


In [5]:
# Preview the first five rows of the edge list.
hedges.head(n=5)

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


In [6]:
# Load the gold standard table. The original *_id columns are relabelled as
# *_cui, and the *_hetid columns take over the plain *_id names so that they
# line up with the ids used for the hetionet edges below.
# All four renames happen in a single call, so the swapped names do not clash.
gold = (
    pd.read_csv("../hetio_dw/filtered_semmed_gold_for_hetionet.tsv", sep="\t")
    .rename(
        columns={
            "chemical_id": "chemical_cui",
            "disease_id": "disease_cui",
            "chemical_hetid": "chemical_id",
            "disease_hetid": "disease_id",
        }
    )
)

In [7]:
# Size of the gold standard table as (rows, columns).
gold.shape

(2924, 9)

In [8]:
# Preview the first five gold standard rows.
gold.head(n=5)

Unnamed: 0,chemical_cui,chemical_name,disease_cui,disease_name,etype,chemical_id,chemical_htype,disease_id,disease_htype
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect


## Add node types to hetionet edges

In [9]:
# Attach the node type of both endpoints to every hetionet edge.
# Both joins are inner joins, so an edge is kept only when its start id and
# its end id are both present in `hnodes`.
edges = (
    hedges
    # look up the type of the start node
    .merge(hnodes, left_on="start_id", right_on="node_id", how="inner")
    .drop(columns=["node_id", "name"])
    .rename(columns={"het_type": "start_htype"})
    # look up the type of the end node
    .merge(hnodes, left_on="end_id", right_on="node_id", how="inner")
    .drop(columns=["node_id", "name"])
    .rename(columns={"het_type": "end_htype"})
)

In [10]:
# Preview the first five edges with their start/end node types.
edges.head(n=5)

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,8568,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
1,6201,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
2,6223,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
3,6202,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
4,65003,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process


In [11]:
# Number of edges for every (start node type, end node type) combination.
(
    edges
    .groupby(by=["start_htype", "end_htype"])
    .size()
)

start_htype          end_htype         
Anatomy              Gene                  726495
Compound             Compound                6486
                     Gene                   51429
                     Side Effect           138944
Disease              Anatomy                 3602
                     Disease                  543
                     Gene                   27977
                     Symptom                 3357
Gene                 Biological Process    559504
                     Cellular Component     73566
                     Gene                  474526
                     Molecular Function     97222
                     Pathway                84372
Pharmacologic Class  Compound                1029
dtype: int64

There are 14 unique pairs of node types. Since deepwalk collapses the edge semantics, let's subsample edges based on the pair of node types they link, instead of which semantic edge type links the nodes.

---

## Check which node types the gold standard relationships link

In [12]:
# Number of gold standard rows for every (chemical node type, disease node type) combination.
(
    gold
    .groupby(by=["chemical_htype", "disease_htype"])
    .size()
)

chemical_htype       disease_htype
Compound             Disease           117
                     Side Effect      2789
                     Symptom            11
Pharmacologic Class  Side Effect         7
dtype: int64

In [13]:
# Find gold standard pairs that already exist as network edges in the
# chemical -> disease direction (edge start = chemical, edge end = disease).
test = pd.merge(
    gold[["chemical_id", "disease_id"]],
    edges.rename(columns={"start_id": "chemical_id", "end_id": "disease_id"}),
    on=["chemical_id", "disease_id"],
    how="inner",
)

In [14]:
# Rows here are gold standard pairs matched to a chemical -> disease edge.
test.shape

(303, 5)

So only about 300 of the 2,924 gold standard relationships (303 rows in the merge result) are already in the network. Not so bad.

In [15]:
# Preview the first five gold standard pairs found in the network.
test.head(n=5)

Unnamed: 0,chemical_id,disease_id,het_etype,start_htype,end_htype
0,DB00316,C0015967,CAUSES_CcSE,Compound,Side Effect
1,DB00787,C0019348,CAUSES_CcSE,Compound,Side Effect
2,DB01001,C0006266,CAUSES_CcSE,Compound,Side Effect
3,DB00437,C0018099,CAUSES_CcSE,Compound,Side Effect
4,DB01118,C0042510,CAUSES_CcSE,Compound,Side Effect


In [16]:
# Same check with the edges reversed: look for gold standard pairs stored as
# disease -> chemical edges (edge start = disease, edge end = chemical).
test2 = pd.merge(
    gold[["chemical_id", "disease_id"]],
    edges.rename(columns={"start_id": "disease_id", "end_id": "chemical_id"}),
    on=["chemical_id", "disease_id"],
    how="inner",
)

In [17]:
# Rows here are gold standard pairs matched to a reversed (disease -> chemical) edge.
test2.shape

(0, 5)

In [18]:
# Preview any gold standard pairs found as reversed edges.
test2.head(n=5)

Unnamed: 0,chemical_id,disease_id,het_etype,start_htype,end_htype


The gold standard edges only appear in hetionet in one direction (chemical → disease); the reversed merge returns no rows.