# Try running deepwalk on hetionet by itself

In [1]:
import os

import pandas as pd

## Read hetionet

In [2]:
# Show the current working directory (IPython automagic); the relative paths
# passed to read_csv / to_csv in this notebook are resolved against it.
pwd

'/home/tongli/walkpred/experiment/hetio_dw'

In [3]:
# Load the hetionet node table, drop the unused "identifier:string" column and
# rename the raw headers to short snake_case names.
#
# ":ID" mixes purely numeric identifiers (e.g. 100996420 for a Gene) with
# alphanumeric ones (e.g. DB00795 for a Compound). Reading it explicitly as
# text keeps every identifier a string, so dtype inference can never turn
# some of them into numbers that would fail to match the edge endpoints later.
nodes = (pd
    .read_csv(
        "../../merge/data/hetionet/hetnet_nodes.csv", sep=',',
        dtype={":ID": str}
    )
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "node_id",
        "name:string": "name",
        ":LABEL": "het_type"
    })
)

In [4]:
# Preview the first rows to confirm the renamed node columns.
nodes.head(n=5)

Unnamed: 0,node_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


In [5]:
# Load the hetionet edge table and rename the raw headers to short
# snake_case names.
#
# Both endpoint columns mix purely numeric identifiers (e.g. 8568) with
# alphanumeric ones (e.g. GO:0042254, UBERON:0002450). They are read
# explicitly as text so that every endpoint is a string; an endpoint parsed
# as a number would not equal its string node_id and the inner joins that
# attach node uids would silently drop that edge.
hedges = (pd
    .read_csv(
        "../../merge/data/hetionet/hetnet_edges.csv", sep=',',
        dtype={":START_ID": str, ":END_ID": str}
    )
    .rename(columns={
        ":START_ID": "start_id",
        ":END_ID": "end_id",
        ":TYPE": "het_etype"
    })
)

In [6]:
# Preview the first rows to confirm the renamed edge columns.
hedges.head(n=5)

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


---

## Assign deepwalk ids

In [7]:
# Turn the row index of `nodes` into an explicit "node_uid" column; this
# integer is the id each node is given for deepwalk.
fnodes = nodes.reset_index().rename(columns={"index": "node_uid"})

In [8]:
# Preview the first rows to check the new node_uid column.
fnodes.head(n=5)

Unnamed: 0,node_uid,node_id,name,het_type
0,0,DB00795,Sulfasalazine,Compound
1,1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,2,100996420,DNM1P50,Gene
3,3,DB04898,Ximelagatran,Compound
4,4,C0278151,Facial spasm,Side Effect


## Add deepwalk ids to edges

In [9]:
def _attach_node_uid(edges, node_lookup, id_col, uid_col):
    """Add the node_uid of the node referenced by ``edges[id_col]``.

    Parameters
    ----------
    edges : DataFrame
        Edge table containing the column ``id_col``.
    node_lookup : DataFrame
        Two-column table with "node_uid" and "node_id".
    id_col : str
        Name of the edge column holding node ids.
    uid_col : str
        Name given to the attached node_uid column.

    Returns
    -------
    DataFrame
        ``edges`` with the extra column ``uid_col``. The join is an inner
        join, so edges whose ``id_col`` has no match in ``node_lookup`` are
        not part of the result.

    Raises
    ------
    pandas.errors.MergeError
        If a node_id occurs more than once in ``node_lookup``; without this
        check such duplicates would silently multiply the matching edges.
    """
    return (edges
        .merge(
            node_lookup,
            how="inner", left_on=id_col, right_on="node_id",
            validate="many_to_one"
        )
        .drop("node_id", axis=1)
        .rename(columns={"node_uid": uid_col})
    )


# Attach the uid of both endpoints: start_id -> source_uid, end_id -> target_uid.
fedges = (hedges
    .pipe(_attach_node_uid, fnodes[["node_uid", "node_id"]], "start_id", "source_uid")
    .pipe(_attach_node_uid, fnodes[["node_uid", "node_id"]], "end_id", "target_uid")
)

In [10]:
# Preview the first rows: each edge should now carry source_uid and target_uid.
fedges.head(n=5)

Unnamed: 0,start_id,end_id,het_etype,source_uid,target_uid
0,8568,GO:0042254,PARTICIPATES_GpBP,26311,5975
1,6201,GO:0042254,PARTICIPATES_GpBP,9449,5975
2,6223,GO:0042254,PARTICIPATES_GpBP,32586,5975
3,6202,GO:0042254,PARTICIPATES_GpBP,21669,5975
4,65003,GO:0042254,PARTICIPATES_GpBP,20653,5975


---

In [11]:
# Read the gold table (tab-separated) from the working directory.
gold = pd.read_csv(
    "filtered_semmed_gold_for_hetionet.tsv",
    sep="\t",
)

In [12]:
# Preview the first rows of the gold table.
# NOTE(review): in this preview the diseases have disease_htype "Side Effect"
# (e.g. multiple sclerosis) -- confirm that mapping is intended.
gold.head(n=5)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect


In [13]:
# Rename so that "*_id" holds the hetionet identifier (matching node_id in the
# node table) while the original UMLS identifier moves to "*_cui".
fgold = gold.rename(columns={
    # chemical side
    "chemical_id": "chemical_cui",
    "chemical_hetid": "chemical_id",
    # disease side
    "disease_id": "disease_cui",
    "disease_hetid": "disease_id",
})

In [14]:
# Preview the first rows to confirm the renamed id / cui columns.
fgold.head(n=5)

Unnamed: 0,chemical_cui,chemical_name,disease_cui,disease_name,etype,chemical_id,chemical_htype,disease_id,disease_htype
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect


---

In [15]:
# This is the first write into "data/"; create the directory if it is missing
# so the notebook also runs end to end on a fresh checkout.
os.makedirs("data", exist_ok=True)

# Write the node table (with node_uid) as TSV, without the row index.
fnodes.to_csv("data/hetio_nodes.tsv", sep='\t', index=False)

In [16]:
# Write the edge table (with source_uid / target_uid) as TSV, without the row index.
fedges.to_csv(
    "data/hetio_edges.tsv",
    sep="\t",
    index=False,
)

In [17]:
# Write the renamed gold table as TSV, without the row index.
fgold.to_csv(
    "data/hetio_gold.tsv",
    sep="\t",
    index=False,
)