build a minimal hetionet with all the compounds and diseases represented, but nothing from the gold standard involved. should give us 0.5 roc when we run it, since the features should mean nothing

2018-07-05

Plan

- Read Hetionet information and gold standard
- Determine which edges gold standard uses
- Create a network which has chemical and disease nodes, but no linking edges
- Verify that the concepts are in the network
- Save network to file

In [1]:
import pandas as pd

## Read in Hetionet

### Nodes

In [2]:
nodes = (pd
    .read_csv("data/hetionet/hetionet_nodes.csv", sep=',')
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "node_id",
        "name:string": "name",
        ":LABEL": "het_type"
    })
)

In [3]:
nodes.head(3)

Unnamed: 0,node_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene


### Edges

In [4]:
edges = (pd
    .read_csv("data/hetionet/hetionet_edges.csv", sep=',')
    .rename(columns={
        ":START_ID": "start_id",
        ":END_ID": "end_id",
        ":TYPE": "het_etype"
    })
)

In [5]:
edges.head(3)

Unnamed: 0,start_id,end_id,het_etype
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW


## Label edges with edge types

In [6]:
ledges = (edges
    .merge(
        nodes, how="inner",
        left_on="start_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "start_htype"})

    .merge(
        nodes, how="inner",
        left_on="end_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "end_htype"})
)

In [7]:
ledges.head()

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,8568,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
1,6201,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
2,6223,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
3,6202,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process
4,65003,GO:0042254,PARTICIPATES_GpBP,Gene,Biological Process


In [8]:
ledges.groupby(["start_htype", "end_htype"]).size()

start_htype          end_htype         
Anatomy              Gene                  726495
Compound             Compound                6486
                     Gene                   51429
                     Side Effect           138944
Disease              Anatomy                 3602
                     Disease                  543
                     Gene                   27977
                     Symptom                 3357
Gene                 Biological Process    559504
                     Cellular Component     73566
                     Gene                  474526
                     Molecular Function     97222
                     Pathway                84372
Pharmacologic Class  Compound                1029
dtype: int64

### Read PharmacotherapyDB gold standard

In [9]:
gold = (pd
    .read_csv("data/hetionet/indications.tsv", sep='\t')
    .drop(["n_curators", "n_resources"], axis=1)
)

In [10]:
gold.head(3)

Unnamed: 0,doid_id,drugbank_id,disease,drug,category
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM


In [11]:
gold["category"].value_counts()

DM     755
SYM    390
NOT    243
Name: category, dtype: int64

### Add in edge types for gold standard

In [12]:
gold_type = (gold
    .merge(
        nodes, how="inner", left_on="doid_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "dise_type"})
             
    .merge(
        nodes, how="inner", left_on="drugbank_id", right_on="node_id"
    )
    .drop(["node_id", "name"], axis=1)
    .rename(columns={"het_type": "chem_type"})
)

In [13]:
gold_type.head()

Unnamed: 0,doid_id,drugbank_id,disease,drug,category,dise_type,chem_type
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,Disease,Compound
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,Disease,Compound
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM,Disease,Compound
3,DOID:14330,DB01043,Parkinson's disease,Memantine,SYM,Disease,Compound
4,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM,Disease,Compound


In [14]:
gold_type.groupby(["category", "dise_type", "chem_type"]).size()

category  dise_type  chem_type
DM        Disease    Compound     755
NOT       Disease    Compound     243
SYM       Disease    Compound     390
dtype: int64

In [15]:
goodgold = gold.query("category == 'DM'")

In [16]:
goodgold.shape

(755, 5)

All gold standard edges are between diseases and compounds. Thus if we completely sever the link we should be ok

---

## Select out edges for minimal network

In [17]:
min_edges = (ledges
    .query("\
    start_htype == 'Compound' and end_htype == 'Compound'\
    or start_htype == 'Compound' and end_htype == 'Gene'\
    or start_htype == 'Disease' and end_htype == 'Disease'\
    or start_htype == 'Disease' and end_htype == 'Anatomy'\
    ")
    .reset_index(drop=True)
)

In [18]:
min_edges.shape

(62060, 5)

In [19]:
min_edges.head(3)

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,DB00643,51547,UPREGULATES_CuG,Compound,Gene
1,DB08881,10450,UPREGULATES_CuG,Compound,Gene
2,DB01211,10450,DOWNREGULATES_CdG,Compound,Gene


In [20]:
min_edges.groupby(["start_htype", "end_htype"]).size()

start_htype  end_htype
Compound     Compound      6486
             Gene         51429
Disease      Anatomy       3602
             Disease        543
dtype: int64

### Verify that all or most chemicals and diseases in the gold standard are included as nodes in this minimal set of edges

In [21]:
min_nodes = (pd
    .DataFrame(
        list(set(min_edges["start_id"]) | set(min_edges["end_id"])),
        columns=["node_id"]
    )
    .merge(nodes, how="left", on="node_id")
    .assign(minnet=1)
)

In [22]:
min_nodes.shape

(7927, 4)

In [23]:
min_nodes.head(3)

Unnamed: 0,node_id,name,het_type,minnet
0,10015,PDCD6IP,Gene,1
1,54480,CHPF2,Gene,1
2,126969,SLC44A3,Gene,1


In [24]:
min_nodes["het_type"].value_counts()

Gene        5872
Compound    1524
Anatomy      398
Disease      133
Name: het_type, dtype: int64

### Gold standard nodes

In [25]:
gnodes = (pd
    .DataFrame(
        list(set(goodgold["doid_id"]) | set(goodgold["drugbank_id"])),
        columns=["node_id"]
    )
    .merge(nodes, how="left", on="node_id")
    .assign(gold=1)
)

In [26]:
gnodes.shape

(464, 4)

In [27]:
gnodes.head(3)

Unnamed: 0,node_id,name,het_type,gold
0,DB04868,Nilotinib,Compound,1
1,DB00967,Desloratadine,Compound,1
2,DB00519,Trandolapril,Compound,1


In [28]:
gnodes["het_type"].value_counts()

Compound    387
Disease      77
Name: het_type, dtype: int64

### Merge and check relative amounts

In [29]:
res = (min_nodes
    .query("het_type == 'Compound' or het_type == 'Disease'")
    [["node_id", "minnet"]]
    .merge(
        gnodes[["node_id", "gold"]], how="outer", on="node_id"
    )
    .fillna(0)
    .assign(minnet = lambda df: df["minnet"].astype(int))
    .assign(gold = lambda df: df["gold"].astype(int))
)

In [30]:
res.head(3)

Unnamed: 0,node_id,minnet,gold
0,DB00789,1,0
1,DOID:12361,1,1
2,DB01180,1,0


In [31]:
res.groupby(["minnet", "gold"]).size()

minnet  gold
0       1          1
1       0       1194
        1        463
dtype: int64

**Summary:** all nodes except one in the gold standard are represented in the minimal network. We should have no problems generating embeddings for these concepts.

---

## Save minimal network to disk

In [32]:
min_edges.head()

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,DB00643,51547,UPREGULATES_CuG,Compound,Gene
1,DB08881,10450,UPREGULATES_CuG,Compound,Gene
2,DB01211,10450,DOWNREGULATES_CdG,Compound,Gene
3,DB00374,10450,DOWNREGULATES_CdG,Compound,Gene
4,DB00398,10450,UPREGULATES_CuG,Compound,Gene


In [33]:
min_edges.shape

(62060, 5)

In [34]:
min_nodes.head()

Unnamed: 0,node_id,name,het_type,minnet
0,10015,PDCD6IP,Gene,1
1,54480,CHPF2,Gene,1
2,126969,SLC44A3,Gene,1
3,DB00789,Gadopentetate dimeglumine,Compound,1
4,DOID:12361,Graves' disease,Disease,1


In [35]:
min_nodes.shape

(7927, 4)

---

In [36]:
min_edges.to_csv("data/min_hetionet/minhet_edges.tsv", sep='\t', index=False)

In [37]:
min_nodes.to_csv("data/min_hetionet/minhet_nodes.tsv", sep='\t', index=False)