# Generate a bare minimum Hetionet for use with Deepwalk

We are not getting the kind of results that we expect out of the edge removal experiments. Need to make sure our baseline makes sense, so let's retry on a minimal hetionet. We will only include edges which have no relation to our chemicals and diseases, and see if we are still getting 0.8 ROC as a result.

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

import numpy as np
from itertools import product

In [2]:
np.random.seed(20180329)

## Read hetionet

using hetionet for now because it should be faster to run than semmeddb

In [3]:
hnodes = (pd
    .read_csv("../../merge/data/hetionet/hetnet_nodes.csv", sep=',')
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "node_id",
        "name:string": "name",
        ":LABEL": "het_type"
    })
)

In [4]:
hedges = (pd
    .read_csv("../../merge/data/hetionet/hetnet_edges.csv", sep=',')
    .rename(columns={
        ":START_ID": "start_id",
        ":END_ID": "end_id",
        ":TYPE": "het_etype"
    })
)

In [5]:
hnodes.head()

Unnamed: 0,node_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


In [6]:
hedges.head()

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


In [7]:
gold = (pd
    .read_csv("../hetio_dw/filtered_semmed_gold_for_hetionet.tsv", sep='\t')
    .rename(columns={
        "chemical_id": "chemical_cui",
        "disease_id": "disease_cui",
        "chemical_hetid": "chemical_id",
        "disease_hetid": "disease_id"
    })
)

In [8]:
gold.shape

(2924, 9)

In [9]:
gold.head()

Unnamed: 0,chemical_cui,chemical_name,disease_cui,disease_name,etype,chemical_id,chemical_htype,disease_id,disease_htype
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect


---

## Add edge types to hetionet edges

In [10]:
edges = (hedges
    .merge(
        hnodes, how="inner",
        left_on="start_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "start_htype"})

    .merge(
        hnodes, how="inner",
        left_on="end_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "end_htype"})
)

In [11]:
edges.head()

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,8568,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
1,6201,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
2,6223,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
3,6202,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
4,65003,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process


In [12]:
edges.groupby(["start_htype", "end_htype"]).size()

start_htype          end_htype         
Anatomy              Gene                  726495
Compound             Compound                6486
                     Gene                   51429
                     Side Effect           138944
Disease              Anatomy                 3602
                     Disease                  543
                     Gene                   27977
                     Symptom                 3357
Gene                 Biological Process    559504
                     Cellular Component     73566
                     Gene                  474526
                     Molecular Function     97222
                     Pathway                84372
Pharmacologic Class  Compound                1029
dtype: int64

---

## Create minimal network

In [13]:
gold.groupby(["chemical_htype", "disease_htype"]).size()

chemical_htype       disease_htype
Compound             Disease           117
                     Side Effect      2789
                     Symptom            11
Pharmacologic Class  Side Effect         7
dtype: int64

In [14]:
smol = pd.concat([
#     edges.query("start_htype == 'Anatomy' and end_htype == 'Gene'"),
    edges.query("start_htype == 'Compound' and end_htype == 'Compound'"),
])

In [15]:
smol.shape

(6486, 5)

## Make up some gene-side effect interactions

In [16]:
sideeff = hnodes.query("het_type == 'Side Effect'")

In [17]:
sideeff.shape

(5734, 3)

In [18]:
genes = hnodes.query("het_type == 'Gene'")

In [19]:
genes.head()

Unnamed: 0,node_id,name,het_type
2,100996420,DNM1P50,Gene
5,56136,PCDHA13,Gene
8,5518,PPP2R1A,Gene
9,128859,BPIFB6,Gene
14,23519,ANP32D,Gene


In [20]:
temp = []
for node in tqdm(sideeff["node_id"]):
    rgenes = genes.sample(4).assign(disease_id = node)
    
    temp.append(rgenes)
    
temp = pd.concat(temp)

100%|██████████| 5734/5734 [00:06<00:00, 895.86it/s]


In [21]:
temp.head()

Unnamed: 0,node_id,name,het_type,disease_id
25428,81704,DOCK8,Gene,C0278151
7423,9351,SLC9A3R2,Gene,C0278151
29190,9189,ZBED1,Gene,C0278151
35718,90075,ZNF30,Gene,C0278151
33246,6709,SPTAN1,Gene,C0013928


In [22]:
fakes = (temp
    .rename(columns={
        "node_id": "start_id",
        "disease_id": "end_id",
        "het_type": "start_htype"
    })
    .assign(end_htype = "Side Effect", het_etype = "TEST_ABC")
    .reset_index(drop=True)
    .drop("name", axis=1)
)

In [23]:
fakes.head()

Unnamed: 0,start_id,start_htype,end_id,end_htype,het_etype
0,81704,Gene,C0278151,Side Effect,TEST_ABC
1,9351,Gene,C0278151,Side Effect,TEST_ABC
2,9189,Gene,C0278151,Side Effect,TEST_ABC
3,90075,Gene,C0278151,Side Effect,TEST_ABC
4,6709,Gene,C0013928,Side Effect,TEST_ABC


In [24]:
fakes.shape

(22936, 5)

In [25]:
fake_network = fakes.append(smol).reset_index(drop=True)

In [26]:
fake_network.head()

Unnamed: 0,end_htype,end_id,het_etype,start_htype,start_id
0,Side Effect,C0278151,TEST_ABC,Gene,81704
1,Side Effect,C0278151,TEST_ABC,Gene,9351
2,Side Effect,C0278151,TEST_ABC,Gene,9189
3,Side Effect,C0278151,TEST_ABC,Gene,90075
4,Side Effect,C0013928,TEST_ABC,Gene,6709


In [27]:
fake_network.shape

(29422, 5)

## Assign deepwalk metadata

In [28]:
fhnodes = (hnodes
    .reset_index()
    .rename(columns={
        "index": "node_uid"
    })
)

In [29]:
fhnodes.head()

Unnamed: 0,node_uid,node_id,name,het_type
0,0,DB00795,Sulfasalazine,Compound
1,1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,2,100996420,DNM1P50,Gene
3,3,DB04898,Ximelagatran,Compound
4,4,C0278151,Facial spasm,Side Effect


In [30]:
fhnodes.shape

(47031, 4)

### Add deepwalk ids to edges

In [31]:
fedges = (fake_network
    .merge(
        fhnodes[["node_uid", "node_id"]],
        how="inner", left_on="start_id", right_on="node_id"
    )
    .drop("node_id", axis=1)
    .rename(columns={"node_uid": "source_uid"})

    .merge(
        fhnodes[["node_uid", "node_id"]],
        how="inner", left_on="end_id", right_on="node_id"
    )
    .drop("node_id", axis=1)
    .rename(columns={"node_uid": "target_uid"})
)

In [32]:
fedges.head()

Unnamed: 0,end_htype,end_id,het_etype,start_htype,start_id,source_uid,target_uid
0,Side Effect,C0278151,TEST_ABC,Gene,81704,25428,4
1,Side Effect,C0278151,TEST_ABC,Gene,9351,7423,4
2,Side Effect,C0278151,TEST_ABC,Gene,9189,29190,4
3,Side Effect,C0278151,TEST_ABC,Gene,90075,35718,4
4,Side Effect,C0013295,TEST_ABC,Gene,81704,25428,14835


In [33]:
fedges.shape

(29422, 7)

## Remove gold standard relations from the network

In [34]:
gpairs = set(zip(gold["chemical_id"], gold["disease_id"]))

In [35]:
epairs = set(zip(fedges["start_id"], fedges["end_id"]))

In [36]:
fepairs = epairs - gpairs

In [37]:
len(epairs) - len(fepairs)

0

---

In [38]:
fedges = (pd
    .DataFrame(list(fepairs), columns=["start_id", "end_id"])
    .merge(fedges, how="left", on=["start_id", "end_id"])
)

In [39]:
fedges.head()

Unnamed: 0,start_id,end_id,end_htype,het_etype,start_htype,source_uid,target_uid
0,28974,C0030587,Side Effect,TEST_ABC,Gene,41965,14438
1,DB01419,DB08984,Compound,RESEMBLES_CrC,Compound,5596,27850
2,386685,C0020626,Side Effect,TEST_ABC,Gene,44422,22841
3,442425,C0149741,Side Effect,TEST_ABC,Gene,12056,18504
4,DB08994,DB01616,Compound,RESEMBLES_CrC,Compound,32520,17755


In [40]:
fedges.shape

(29422, 7)

In [41]:
fedges["start_htype"].value_counts()

Gene        22936
Compound     6486
Name: start_htype, dtype: int64

In [42]:
fedges["end_htype"].value_counts()

Side Effect    22936
Compound        6486
Name: end_htype, dtype: int64

In [43]:
fedges["source_uid"].nunique()

14978

In [44]:
fedges["target_uid"].nunique()

6788

In [45]:
fedges.head()

Unnamed: 0,start_id,end_id,end_htype,het_etype,start_htype,source_uid,target_uid
0,28974,C0030587,Side Effect,TEST_ABC,Gene,41965,14438
1,DB01419,DB08984,Compound,RESEMBLES_CrC,Compound,5596,27850
2,386685,C0020626,Side Effect,TEST_ABC,Gene,44422,22841
3,442425,C0149741,Side Effect,TEST_ABC,Gene,12056,18504
4,DB08994,DB01616,Compound,RESEMBLES_CrC,Compound,32520,17755


---

## Subsample edges

This new version allows removal of gold standard nodes if the edge was the only thing linking the node into the network.

In [46]:
fhnodes.to_csv("tmp/hetionet_nodes.tsv", sep='\t', index=False)

In [47]:
gold.to_csv("tmp/hetionet_gold.tsv", sep='\t', index=False)

---

## Remove compound/gene and compound/side effect edges

In [48]:
fedges.to_csv("tmp/edges/fake_network.tsv", sep='\t', index=False)