# Prepare data for downstream use

1. Ensure node space is consistent.
2. Ensure no overlap of edges.
3. Assign deepwalk ids.

In [1]:
from pathlib import Path

import pandas as pd

## Read nodes

In [2]:
# Load the node table and give the ":ID" / ":LABEL" header columns plain names.
nodes = pd.read_csv("data/nodes_7_metanode_slim.csv").rename(
    columns={":ID": "node_id", ":LABEL": "ntype"}
)

In [3]:
nodes.head()

Unnamed: 0,node_id,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


## Read edges

In [4]:
# Load the tab-separated edge list (one row per source/target/etype triple).
edges = pd.read_csv("data/edges_no_treats.tsv", delimiter="\t")

In [5]:
edges.shape

(9646780, 3)

In [6]:
edges.head()

Unnamed: 0,source_id,target_id,etype
0,C0000039,C0001026,REGULATES_CDreg>CD
1,C0000039,C0001041,REGULATES_CDreg>CD
2,C0000039,C0001492,REGULATES_CDreg>CD
3,C0000039,C0001962,REGULATES_CDreg>CD
4,C0000039,C0001975,RELATED_TO_CDrtCD


---

## Check list of nodes includes all nodes used in edges

In [7]:
# Every node id that appears as either endpoint of an edge.
enodes = set(edges["source_id"]).union(edges["target_id"])

In [8]:
# True when every edge endpoint is present in the node table.
enodes.issubset(set(nodes["node_id"]))

True

In [9]:
# Number of listed nodes that are not an endpoint of any edge.
len(set(nodes["node_id"]).difference(enodes))

4215

4215 of the listed nodes do not appear in any edge.

---

## Read gold standard

In [10]:
# Load the tab-separated gold standard table.
gold = pd.read_csv("data/filtered_goldstd.tsv", delimiter="\t")

In [11]:
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
1,C0033497,propranolol,C0004238,Atrial fibrillation,TREATS_CDtDO
2,C0012265,digoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0084273,quinidine,C0004238,Atrial fibrillation,TREATS_CDtDO
4,C0766326,dronedarone,C0004238,Atrial fibrillation,TREATS_CDtDO


In [12]:
gold.shape

(6329, 5)

---

## Check gold standard nodes are in full node list

In [13]:
# Every node id referenced by the gold standard, chemical or disease side.
gnodes = set(gold["chemical_id"]).union(gold["disease_id"])

In [14]:
# True when every gold standard node is present in the node table.
set(nodes["node_id"]).issuperset(gnodes)

True

In [15]:
# True when every gold standard node is also an endpoint of some edge.
gnodes.issubset(enodes)

False

In [16]:
# Number of gold standard nodes that are not an endpoint of any edge.
len(gnodes.difference(enodes))

14

14 of the gold standard nodes do not appear in any of the edges loaded above.

### Number of orphan nodes (no edges)

In [17]:
# Nodes that appear in neither the gold standard nor the edge list.
len(set(nodes["node_id"]).difference(gnodes, enodes))

4201

The remaining 4201 nodes appear in neither the edge list nor the gold standard, so they have no connection to the rest of the graph at all. We filter these nodes out: without a connecting edge we cannot create embeddings for them, and therefore cannot make any predictions for them either.

---

## Remove orphan nodes

In [18]:
# Nodes to keep: anything used by the gold standard or by an edge.
fnodes = gnodes.union(enodes)

In [19]:
len(fnodes)

210389

In [20]:
# Keep only the rows whose node_id is in fnodes (drops the orphan nodes).
nodes = nodes[nodes["node_id"].isin(fnodes)]

---

## Assign deepwalk ids

In [21]:
# Sort the nodes by node_id and number them 0..N-1; that position is exposed
# as the leading "node_uid" column.
node_map = (nodes
    .sort_values("node_id")
    .reset_index(drop=True)
    .rename_axis("node_uid")
    .reset_index()
)

In [22]:
node_map.shape

(210389, 4)

In [23]:
node_map.head()

Unnamed: 0,node_uid,node_id,name,ntype
0,0,C0000039,"1,2-Dipalmitoylphosphatidylcholine",Chemicals & Drugs
1,1,C0000052,"1,4-alpha-Glucan Branching Enzyme",Chemicals & Drugs
2,2,C0000084,1-Carboxyglutamic Acid,Chemicals & Drugs
3,3,C0000096,1-Methyl-3-isobutylxanthine,Chemicals & Drugs
4,4,C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Chemicals & Drugs


### Edges

In [24]:
# Attach the integer node_uid of both endpoints to every edge.
#
# The lookup is renamed before each merge so the new columns arrive with their
# final names (no "_x"/"_y" suffix juggling). validate="many_to_one" makes
# pandas raise if node_map ever contains a duplicated node_id, which would
# otherwise silently multiply edges.
uid_lookup = node_map[["node_id", "node_uid"]]

fedges = (edges
    .merge(
        uid_lookup.rename(columns={
            "node_id": "source_id",
            "node_uid": "source_uid"
        }),
        how="inner", on="source_id", validate="many_to_one"
    )
    .merge(
        uid_lookup.rename(columns={
            "node_id": "target_id",
            "node_uid": "target_uid"
        }),
        how="inner", on="target_id", validate="many_to_one"
    )
    .sort_values(["source_uid", "target_uid", "etype"])
    .reset_index(drop=True)
)

# An inner merge drops rows without a match, so an edge whose endpoint is
# missing from node_map would vanish without notice. Fail loudly instead.
assert len(fedges) == len(edges), (
    f"{len(edges) - len(fedges)} edges were lost while assigning node uids"
)

In [25]:
fedges.shape

(9646780, 5)

In [26]:
fedges.head()

Unnamed: 0,source_id,target_id,etype,source_uid,target_uid
0,C0000039,C0001026,REGULATES_CDreg>CD,0,202
1,C0000039,C0001041,REGULATES_CDreg>CD,0,207
2,C0000039,C0001492,REGULATES_CDreg>CD,0,386
3,C0000039,C0001962,REGULATES_CDreg>CD,0,528
4,C0000039,C0001975,RELATED_TO_CDrtCD,0,534


## Save to file

In [27]:
# Write the cleaned tables as TSV files under data/clean/.
# to_csv does not create missing directories, so make sure the target exists
# first; otherwise a fresh checkout fails here after all the work above.
clean_dir = Path("data/clean")
clean_dir.mkdir(parents=True, exist_ok=True)

node_map.to_csv(clean_dir / "nodes.tsv", sep='\t', index=False)
fedges.to_csv(clean_dir / "edges.tsv", sep='\t', index=False)
gold.to_csv(clean_dir / "gold.tsv", sep='\t', index=False)