# Generate the master id conversion list for deepwalk

Currently this covers only Hetionet.

In [1]:
import pandas as pd

## Hetionet nodes

In [2]:
# Load the Hetionet node table and give the Neo4j-style headers plain names.
#
# The ":ID" column mixes purely numeric identifiers (e.g. 100996420) with
# prefixed ones (e.g. DB00795). It is read as text explicitly so that every
# node id is a string; left to dtype inference, numeric-looking values can be
# parsed as integers, and an integer id never compares equal to the same id
# held as a string in another table.
nodes = (pd
    .read_csv(
        "data/hetionet/hetionet_nodes.csv",
        sep=',',
        dtype={":ID": str},
    )
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "node_id",
        "name:string": "name",
        ":LABEL": "het_type"
    })
)

In [3]:
# Preview the first rows to confirm the renamed columns (node_id, name, het_type).
nodes.head()

Unnamed: 0,node_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


## Hetionet edges

In [4]:
# Load the Hetionet edge table and give the Neo4j-style headers plain names.
#
# Both endpoint columns mix purely numeric identifiers (e.g. 8568) with
# prefixed ones (e.g. UBERON:0002450, GO:0042254). They are read as text
# explicitly so that every endpoint id is a string; left to dtype inference,
# numeric-looking values can be parsed as integers, which would not match the
# same id stored as a string elsewhere.
edges = (pd
    .read_csv(
        "data/hetionet/hetionet_edges.csv",
        sep=',',
        dtype={":START_ID": str, ":END_ID": str},
    )
    .rename(columns={
        ":START_ID": "start_id",
        ":END_ID": "end_id",
        ":TYPE": "het_etype"
    })
)

In [5]:
# Preview the first rows to confirm the renamed columns (start_id, end_id, het_etype).
edges.head()

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


## PharmacotherapyDB gold standard

In [6]:
# Load the gold-standard indications table (tab-separated) and discard the
# two count columns, n_curators and n_resources.
gold = (
    pd.read_csv("data/hetionet/indications.tsv", sep="\t")
    .drop(columns=["n_curators", "n_resources"])
)

In [7]:
# Preview the first rows of the gold standard after dropping the count columns.
gold.head()

Unnamed: 0,doid_id,drugbank_id,disease,drug,category
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM


---

## Are all edge nodes in the node list?

In [8]:
# Distinct node ids present in the node table.
node_set = {*nodes["node_id"]}

In [9]:
# Number of distinct node ids in the node table.
len(node_set)

47031

In [10]:
# Every id that appears as either endpoint of an edge.
edge_node_set = set(edges["start_id"]).union(edges["end_id"])

In [11]:
# Number of distinct ids that take part in at least one edge.
len(edge_node_set)

45158

In [12]:
# True when every edge endpoint id also appears in the node table.
edge_node_set.issubset(node_set)

True

---

## Are all gold standard nodes in the node set?

In [13]:
# Every disease (doid_id) and drug (drugbank_id) id used by the gold standard.
gold_node_set = set(gold["doid_id"]).union(gold["drugbank_id"])

In [14]:
# True when every gold-standard id also appears in the node table.
gold_node_set.issubset(node_set)

True

---

## Generate master hetionet deepwalk ids

In [15]:
# Give each node a deepwalk id equal to its 0-based row position: replace the
# index with a fresh 0..n-1 range, name that index "deepwalk_id", then move it
# into the frame as the leading column.
dw_ids = (
    nodes
    .reset_index(drop=True)
    .rename_axis("deepwalk_id")
    .reset_index()
)

In [16]:
# Preview the mapping: deepwalk_id should lead, followed by the node columns.
dw_ids.head()

Unnamed: 0,deepwalk_id,node_id,name,het_type
0,0,DB00795,Sulfasalazine,Compound
1,1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,2,100996420,DNM1P50,Gene
3,3,DB04898,Ximelagatran,Compound
4,4,C0278151,Facial spasm,Side Effect


In [17]:
# True when the assigned deepwalk ids are exactly 0 .. len(nodes) - 1.
set(dw_ids["deepwalk_id"]) == set(range(len(nodes)))

True

## Save master id list to file

In [18]:
# Write the id mapping as a tab-separated file; index=False leaves out the
# dataframe's row index so only the dataframe's own columns are written.
dw_ids.to_csv(
    "data/hetionet/deepwalk_ids.tsv",
    index=False,
    sep="\t",
)