# Preprocessing SemmedDB for use with Deepwalk

We need to clean up the version of SemmedDB that Mike gave us before we can use it with Deepwalk to generate drug indication predictions.

In [1]:
from tqdm import tqdm
import pandas as pd

---

## Read edges and remove PMIDs associated with each edge

We don't currently need the papers (PMIDs) from which individual assertions were mined, so we drop that information when reading the edges.

In [2]:
# Load the edge list, keeping only the first three comma-separated fields of
# each row (stored as source_id, target_id and etype). Everything after the
# third field is discarded.
edges = []
with open("data/edges_7_metanode_slim.csv", "r") as fin:
    # line count determined using "wc -l"
    lines = iter(tqdm(fin, total=10230595))

    # Consume the first line (presumably the header) here instead of slicing it
    # off afterwards with `edges[1:]`, which copied the entire multi-million
    # row list just to drop one element.
    next(lines, None)

    for line in lines:
        # maxsplit=3 stops after the third field, so the discarded tail of the
        # row is never split into pieces; the first three fields are unchanged.
        edges.append(line.rstrip("\n").split(",", 3)[:3])

edges = pd.DataFrame(
    edges, columns=["source_id", "target_id", "etype"]
)

100%|██████████| 10230595/10230595 [00:31<00:00, 328720.16it/s]


In [3]:
# (rows, columns) of the loaded edge table.
edges.shape

(10230594, 3)

In [4]:
# Preview the first rows of the edge table.
edges.head()

Unnamed: 0,source_id,target_id,etype
0,C0000039,C0001026,REGULATES_CDreg>CD
1,C0000039,C0001041,REGULATES_CDreg>CD
2,C0000039,C0001492,REGULATES_CDreg>CD
3,C0000039,C0001511,TREATS_CDtDO
4,C0000039,C0001962,REGULATES_CDreg>CD


In [5]:
# Row counts for the six most frequent edge types.
edges["etype"].value_counts().head(6)

LOCATION_OF_AloCD         996348
REGULATES_CDreg>CD        814972
INTERACTS_WITH_CDiwG      705394
ASSOCIATED_WITH_DOawDO    690653
TREATS_CDtDO              583814
LOCATION_OF_AloG          534310
Name: etype, dtype: int64

### Remove "TREATS_CDtDO" edges from the network

These treatment edges were extracted automatically by SemRep and are not very trustworthy. We remove them so that the real, human-curated "treats" edges are not overwhelmed.

In [6]:
# Keep every edge whose type is anything other than "TREATS_CDtDO".
edges = edges[edges["etype"] != "TREATS_CDtDO"]

In [7]:
# (rows, columns) of the edge table after the TREATS_CDtDO rows were removed.
edges.shape

(9646780, 3)

---

## Read gold standard

In [8]:
# Load the gold standard indications, keep only the id, name and edge type
# columns, then rename the compound columns to "chemical_*" and the disease id
# column to "disease_id" ("disease_name" and "etype" keep their names).
gold = pd.read_csv("data/indications_slim.csv", sep=',')
gold = gold[
    ["compound_umlscui", "compound_name", "disease_umlscui", "disease_name", "etype"]
]
gold = gold.rename(columns={
    "compound_umlscui": "chemical_id",
    "compound_name": "chemical_name",
    "disease_umlscui": "disease_id",
})

In [9]:
# (rows, columns) of the gold standard table.
gold.shape

(8176, 5)

In [10]:
# Preview the first rows of the gold standard table.
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C3661315,rucaparib,C1299247,ovarian cancer,TREATS_CDtDO
1,C4044947,baricitinib,C0003873,rheumatoid arthritis,TREATS_CDtDO
2,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
4,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO


### Read all nodes

In [11]:
# Load the node table and give the ":ID" and ":LABEL" columns plain names.
nodes = pd.read_csv("data/nodes_7_metanode_slim.csv", sep=',')
nodes = nodes.rename(columns={":LABEL": "ntype", ":ID": "node_id"})

In [12]:
# (rows, columns) of the full node table.
nodes.shape

(214590, 3)

In [13]:
# Preview the first rows of the node table.
nodes.head()

Unnamed: 0,node_id,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


### Set nodes to those which appear in the gold standard or the edge list

In [14]:
# Ids used as either endpoint of an edge.
enodes = set(edges["source_id"]).union(edges["target_id"])
# Ids used on either side of a gold standard pair.
gnodes = set(gold["chemical_id"]).union(gold["disease_id"])
# Of all referenced ids, keep only those that also exist in the node table.
fnodes = enodes.union(gnodes).intersection(nodes["node_id"])

In [15]:
# Number of referenced node ids that are also present in the node table.
len(fnodes)

210389

In [16]:
# Restrict the node table to the ids collected in `fnodes`.
nodes = nodes[nodes["node_id"].isin(fnodes)]

### Filter gold standard to nodes within node list

In [17]:
# Keep a gold standard pair only when both its chemical and its disease are in
# `fnodes`.
subg = gold[
    gold["chemical_id"].isin(fnodes) & gold["disease_id"].isin(fnodes)
]

In [18]:
# (rows, columns) of the gold standard after filtering to known nodes.
subg.shape

(6329, 5)

In [19]:
# Preview the filtered gold standard (original row labels are retained).
subg.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
2,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
4,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO
5,C0050558,acipimox,C0020476,Hyperlipoproteinemia,TREATS_CDtDO
8,C0021735,interferon alfa-2b,C0023434,chronic lymphocytic leukemia,TREATS_CDtDO


In [20]:
# Number of distinct chemicals in the filtered gold standard.
subg["chemical_id"].nunique()

1618

In [21]:
# Number of distinct diseases in the filtered gold standard.
subg["disease_id"].nunique()

963

---

## Are any gold standard treats pairs already linked by other edges in the network?

In [22]:
# Edges whose source is a gold standard chemical and whose target is the
# disease paired with it (chemical -> disease direction).
pd.merge(
    edges,
    subg[["chemical_id", "disease_id"]].rename(
        columns={"chemical_id": "source_id", "disease_id": "target_id"}
    ),
    how="inner",
    on=["source_id", "target_id"],
)

Unnamed: 0,source_id,target_id,etype
0,C0011134,C0021572,REGULATES_CDreg>CD
1,C0011134,C0021572,RELATED_TO_CDrtCD
2,C0016410,C0032961,AUGMENTS_CDagPS
3,C0016410,C0032961,DISRUPTS_CDdsPS


In [23]:
# Edges whose source is a gold standard disease and whose target is the
# chemical paired with it (disease -> chemical direction).
pd.merge(
    edges,
    subg[["chemical_id", "disease_id"]].rename(
        columns={"chemical_id": "target_id", "disease_id": "source_id"}
    ),
    how="inner",
    on=["source_id", "target_id"],
)

Unnamed: 0,source_id,target_id,etype
0,C0021572,C0011134,RELATED_TO_CDrtCD
1,C1265175,C0663241,PRODUCES_LBpdCD


---

## Assign Deepwalk ids

In [24]:
# Order the nodes by node_id and number them 0..n-1; that running number is
# exposed as the leading "node_uid" column.
node_map = nodes.sort_values("node_id").reset_index(drop=True)
node_map = node_map.rename_axis("node_uid").reset_index()

In [25]:
# (rows, columns) of the node map, including the added node_uid column.
node_map.shape

(210389, 4)

In [26]:
# Preview the node map, which is sorted by node_id.
node_map.head()

Unnamed: 0,node_uid,node_id,name,ntype
0,0,C0000039,"1,2-Dipalmitoylphosphatidylcholine",Chemicals & Drugs
1,1,C0000052,"1,4-alpha-Glucan Branching Enzyme",Chemicals & Drugs
2,2,C0000084,1-Carboxyglutamic Acid,Chemicals & Drugs
3,3,C0000096,1-Methyl-3-isobutylxanthine,Chemicals & Drugs
4,4,C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Chemicals & Drugs


## Add Deepwalk ids to edges

In [27]:
# Attach the integer node_uid of each endpoint to every edge.
#
# The uid lookup is renamed before each merge so the result carries
# "source_uid" / "target_uid" directly, with no "_x"/"_y" suffixed columns to
# drop and rename afterwards. Resulting column order:
# source_id, target_id, etype, source_uid, target_uid.
#
# validate="many_to_one" makes pandas raise a MergeError if a node_id occurs
# more than once in node_map; without it a duplicated id would silently
# multiply the matching edges.
#
# The joins are inner joins, so an edge is dropped when either endpoint is
# missing from node_map.
fedges = (edges
    .merge(
        node_map[["node_id", "node_uid"]].rename(columns={
            "node_id": "source_id",
            "node_uid": "source_uid"
        }),
        how="inner", on="source_id", validate="many_to_one"
    )
    .merge(
        node_map[["node_id", "node_uid"]].rename(columns={
            "node_id": "target_id",
            "node_uid": "target_uid"
        }),
        how="inner", on="target_id", validate="many_to_one"
    )
    .sort_values(["source_uid", "target_uid", "etype"])
    .reset_index(drop=True)
)

In [28]:
# (rows, columns) of the uid-annotated edges; because the joins are inner
# joins, a row count below that of `edges` would mean edges were dropped.
fedges.shape

(9646780, 5)

In [29]:
# Preview the uid-annotated edges, sorted by source_uid, target_uid and etype.
fedges.head()

Unnamed: 0,source_id,target_id,etype,source_uid,target_uid
0,C0000039,C0001026,REGULATES_CDreg>CD,0,202
1,C0000039,C0001041,REGULATES_CDreg>CD,0,207
2,C0000039,C0001492,REGULATES_CDreg>CD,0,386
3,C0000039,C0001962,REGULATES_CDreg>CD,0,528
4,C0000039,C0001975,RELATED_TO_CDrtCD,0,534


## Save to file

In [30]:
# Write the filtered gold standard, the node map and the uid-annotated edges
# as tab-separated files, leaving out the DataFrame index.
for frame, path in (
    (subg, "data/filtered_goldstd.tsv"),
    (node_map, "data/node_map.tsv"),
    (fedges, "data/edges.tsv"),
):
    frame.to_csv(path, sep='\t', index=False)