# Try running DeepWalk on the expanded SemMedDB network containing Hetionet

In [1]:
import pandas as pd
import re

## Read full network

In [2]:
# Read node_id as text. Some ids look numeric (e.g. 100008586); if pandas
# inferred them as integers they would not match the string ids used as
# join keys later in this notebook.
nodes = pd.read_csv(
    "../merge/merged_network/nodes.tsv", sep='\t',
    dtype={"node_id": str}
)

In [3]:
# Preview the first rows to check which columns were loaded.
nodes.head()

Unnamed: 0,name,node_id,origin,sem_type
0,GAGE12F,100008586,hetionet,Genes & Molecular Sequences
1,DUXB,100033411,hetionet,Genes & Molecular Sequences
2,DDTL,100037417,hetionet,Genes & Molecular Sequences
3,OCTN3,100049579,hetionet,Genes & Molecular Sequences
4,POM121C,100101267,hetionet,Genes & Molecular Sequences


In [4]:
# (rows, columns) of the node table.
nodes.shape

(231543, 4)

In [5]:
# Read both endpoint id columns as text. They are join keys against
# node_id further down; without an explicit dtype, a stretch of the file
# containing only numeric-looking ids can be parsed as integers, and those
# rows would then silently fail to match in an inner join.
edges = pd.read_csv(
    "../merge/merged_network/edges.tsv", sep='\t',
    dtype={"source_fid": str, "target_fid": str}
)

In [6]:
# Preview the first rows to check which columns were loaded.
edges.head()

Unnamed: 0,final_etype,origin,source_fid,source_stype,target_fid,target_stype
0,REGULATES_CDreg>CD,semmeddb,UMLS:C0000039,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
1,REGULATES_CDreg>CD,semmeddb,UMLS:C0000167,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
2,PRODUCES_LBpdCD,semmeddb,UMLS:C0000901,Living Beings,UMLS:C0001026,Chemicals & Drugs
3,REGULATES_CDreg>CD,semmeddb,UMLS:C0000975,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs
4,REGULATES_CDreg>CD,semmeddb,UMLS:C0000977,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs


In [7]:
# (rows, columns) of the edge table.
edges.shape

(12294467, 6)

---

## Assign deepwalk ids

In [8]:
# Give every node an integer id equal to its row position, exposed as a
# leading "node_uid" column.
fnodes = nodes.rename_axis("node_uid").reset_index()

In [9]:
# Check that node_uid was added as the first column.
fnodes.head()

Unnamed: 0,node_uid,name,node_id,origin,sem_type
0,0,GAGE12F,100008586,hetionet,Genes & Molecular Sequences
1,1,DUXB,100033411,hetionet,Genes & Molecular Sequences
2,2,DDTL,100037417,hetionet,Genes & Molecular Sequences
3,3,OCTN3,100049579,hetionet,Genes & Molecular Sequences
4,4,POM121C,100101267,hetionet,Genes & Molecular Sequences


### Read gold standard

In [10]:
# Load the filtered gold standard table (tab-separated).
gold = pd.read_csv(
    filepath_or_buffer="../semmed/data/filtered_goldstd.tsv",
    delimiter='\t',
)

In [11]:
# Preview the first rows to check which columns were loaded.
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
1,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
2,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO
3,C0050558,acipimox,C0020476,Hyperlipoproteinemia,TREATS_CDtDO
4,C0021735,interferon alfa-2b,C0023434,chronic lymphocytic leukemia,TREATS_CDtDO


In [12]:
# (rows, columns) of the gold standard; the row count is the baseline to
# compare against after the joins below.
gold.shape

(6329, 5)

### Add the "UMLS:" prefix to gold standard CUIs

In [13]:
def is_cui(s):
    """Return True if `s` is a bare CUI: "C" followed by exactly seven digits.

    Args:
        s: Candidate identifier, e.g. "C0004238" or "DOID:0060061".

    Returns:
        bool: True only when the entire string is a CUI. False for any other
        string and for non-string input (e.g. NaN from a missing id).
    """
    # `re` raises TypeError on non-strings; treat them as "not a CUI" instead.
    if not isinstance(s, str):
        return False
    # fullmatch rejects a trailing newline, which `match` with `$` would accept.
    return re.fullmatch(r'C[0-9]{7}', s) is not None

In [14]:
# True only if every chemical id is a bare CUI ("C" + seven digits).
chemical_is_cui = gold["chemical_id"].str.match(r'^C[0-9]{7}$')
chemical_is_cui.all()

True

In [15]:
# Number of disease ids that are bare CUIs; compare with the row count of
# gold to see how many are not.
disease_is_cui = gold["disease_id"].str.match(r'^C[0-9]{7}$')
disease_is_cui.sum()

6287

In [16]:
# Distinct disease ids that do not have the bare-CUI form.
set(
    gold.loc[
        ~gold["disease_id"].str.match(r'^C[0-9]{7}$'),
        "disease_id",
    ]
)

{'DOID:0060061', 'DOID:0060073'}

In [17]:
fgold = (gold
    # Prefix CUIs with "UMLS:" so they can be joined against node_id.
    # Disease ids that are not CUIs (is_cui is False) are kept as they are.
    .assign(
        chemical_id = lambda df: df["chemical_id"].map("UMLS:{}".format),
        disease_id = lambda df: df["disease_id"].map(
            lambda did: "UMLS:{}".format(did) if is_cui(did) else did
        ),
    )

    # Attach the integer node_uid of the chemical. The lookup is renamed up
    # front so the join adds only the "chemical_uid" column.
    .merge(
        fnodes[["node_uid", "node_id"]].rename(
            columns={"node_id": "chemical_id", "node_uid": "chemical_uid"}
        ),
        how="inner", on="chemical_id"
    )

    # Same for the disease. Inner joins drop pairs whose id has no node.
    .merge(
        fnodes[["node_uid", "node_id"]].rename(
            columns={"node_id": "disease_id", "node_uid": "disease_uid"}
        ),
        how="inner", on="disease_id"
    )
)

In [18]:
# Check the prefixed ids and the two new uid columns.
fgold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_uid,disease_uid
0,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,108642,22585
1,UMLS:C0033497,propranolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,32398,22585
2,UMLS:C0012265,digoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,25280,22585
3,UMLS:C0084273,quinidine,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,55039,22585
4,UMLS:C0766326,dronedarone,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,134609,22585


In [19]:
# Compare with gold.shape: the same row count is consistent with the inner
# joins having neither dropped nor duplicated gold pairs.
fgold.shape

(6329, 7)

---

## Add deepwalk ids to edges

In [20]:
fedges = (edges
    # Attach the integer node_uid of the source node. The lookup is renamed
    # up front so the join adds only the "source_uid" column.
    .merge(
        fnodes[["node_uid", "node_id"]].rename(
            columns={"node_id": "source_fid", "node_uid": "source_uid"}
        ),
        how="inner", on="source_fid"
    )

    # Same for the target node. Inner joins drop edges whose endpoint id
    # has no matching node_id.
    .merge(
        fnodes[["node_uid", "node_id"]].rename(
            columns={"node_id": "target_fid", "node_uid": "target_uid"}
        ),
        how="inner", on="target_fid"
    )
)

In [21]:
# Check the two new uid columns on the edge table.
fedges.head()

Unnamed: 0,final_etype,origin,source_fid,source_stype,target_fid,target_stype,source_uid,target_uid
0,REGULATES_CDreg>CD,semmeddb,UMLS:C0000039,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs,21154,21356
1,REGULATES_CDreg>CD,semmeddb,UMLS:C0000167,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs,21168,21356
2,PRODUCES_LBpdCD,semmeddb,UMLS:C0000901,Living Beings,UMLS:C0001026,Chemicals & Drugs,21301,21356
3,REGULATES_CDreg>CD,semmeddb,UMLS:C0000975,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs,21332,21356
4,REGULATES_CDreg>CD,semmeddb,UMLS:C0000977,Chemicals & Drugs,UMLS:C0001026,Chemicals & Drugs,21333,21356


---

In [22]:
# Write the node table, including node_uid, as TSV without the row index.
fnodes.to_csv(
    path_or_buf="data/merged_nodes.tsv",
    index=False,
    sep='\t',
)

In [23]:
# Write the edge table, including source_uid/target_uid, as TSV without
# the row index.
fedges.to_csv(
    path_or_buf="data/merged_edges.tsv",
    index=False,
    sep='\t',
)

In [24]:
# Write the gold standard, including chemical_uid/disease_uid, as TSV
# without the row index.
fgold.to_csv(
    path_or_buf="data/merged_gold.tsv",
    index=False,
    sep='\t',
)