# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import bz2
import configparser

import numpy as np
import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

## Plan

We want to extract the useful metapaths for all chemical-disease relations in our training and holdout sets by querying Neo4j. To do so, we will read the training and holdout sets directly.

## Read training and holdout sets

In [2]:
# Load the index of the cross-validation fold that this notebook run operates on.
with open("../../crossval_idx.txt", "r") as idx_file:
    raw_idx = idx_file.read()
crossval_idx = int(raw_idx.strip())

In [3]:
# Both tables are tab-separated and their file names carry the current fold index.
training_path = "../../../crossval/data/training/training_piece{}.tsv".format(crossval_idx)
holdout_path = "../../../crossval/data/holdout/holdout_piece{}.tsv".format(crossval_idx)

training = pandas.read_csv(training_path, sep="\t")
holdout = pandas.read_csv(holdout_path, sep="\t")

In [4]:
training.shape

(38982, 6)

In [5]:
training.head()

Unnamed: 0,category,chemical_id,chemical_name,disease_id,disease_name,piece
0,,DB00014,,DOID:0050742,,
1,,DB00014,,DOID:0060073,,
2,DM,DB00014,Goserelin,DOID:10283,prostate cancer,2.0
3,,DB00014,,DOID:11714,,
4,,DB00014,,DOID:12995,,


In [6]:
holdout.shape

(16786, 6)

In [7]:
holdout.head()

Unnamed: 0,chemical_id,disease_id,disease_name,chemical_name,category,piece
0,DB00014,DOID:0050156,,,,
1,DB00014,DOID:0050425,,,,
2,DB00014,DOID:0050741,,,,
3,DB00014,DOID:1024,,,,
4,DB00014,DOID:10283,,,,


In [8]:
# Label distribution in the holdout set (value_counts skips rows whose category is NaN).
holdout["category"].value_counts()

DM     163
SYM     79
NOT     49
Name: category, dtype: int64

Since the holdout set contains the true labels for whether each relation is a chemical-treats-disease relationship, we will remove these true labels but add a flag denoting that the relations are from the holdout set, so that we can evaluate them separately later. However, they will be combined with the training pairs into the same dataframe for feature extraction from Neo4j, in order to avoid drastically modifying the workflow created by Daniel.

## Prepare compound disease pairs tsv

In [9]:
# Keep only the identifying columns plus the label, and tag every row as a training pair.
train_columns = ["chemical_id", "chemical_name", "disease_id", "disease_name", "category"]
train_pairs = training[train_columns].assign(origin="train")

In [10]:
train_pairs.shape

(38982, 6)

In [11]:
train_pairs.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin
0,DB00014,,DOID:0050742,,,train
1,DB00014,,DOID:0060073,,,train
2,DB00014,Goserelin,DOID:10283,prostate cancer,DM,train
3,DB00014,,DOID:11714,,,train
4,DB00014,,DOID:12995,,,train


In [12]:
train_pairs["category"].value_counts()

DM     592
SYM    311
NOT    194
Name: category, dtype: int64

### Prepare holdout data and remove true labels

In [13]:
holdout.head()

Unnamed: 0,chemical_id,disease_id,disease_name,chemical_name,category,piece
0,DB00014,DOID:0050156,,,,
1,DB00014,DOID:0050425,,,,
2,DB00014,DOID:0050741,,,,
3,DB00014,DOID:1024,,,,
4,DB00014,DOID:10283,,,,


In [14]:
# Same column selection as for the training pairs. The "category" column is kept
# in place but overwritten with NaN so no true holdout label is carried downstream.
holdout_columns = ["chemical_id", "chemical_name", "disease_id", "disease_name", "category"]
holdout_pairs = holdout[holdout_columns].assign(origin="holdout", category=np.nan)

In [15]:
holdout_pairs.shape

(16786, 6)

In [16]:
holdout_pairs.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin
0,DB00014,,DOID:0050156,,,holdout
1,DB00014,,DOID:0050425,,,holdout
2,DB00014,,DOID:0050741,,,holdout
3,DB00014,,DOID:1024,,,holdout
4,DB00014,,DOID:10283,,,holdout


In [17]:
# Sanity check: True only if every holdout pair has a null category, i.e. no label survived.
holdout_pairs["category"].isnull().all()

True

We have successfully sanitized the holdout pairs by removing their true labels. Their DWPC (degree-weighted path count) features will be extracted at the same time as those of the training pairs.

## Save pairs to file

In [18]:
# Stack training and holdout pairs into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both inputs contribute their own
# 0-based labels and the result carries duplicate index values, which makes any
# later label-based access (.loc, alignment) ambiguous.
pair_df = pandas.concat([train_pairs, holdout_pairs], ignore_index=True)

# Binary label: 1 only where category == "DM", 0 otherwise. Holdout rows had their
# category replaced with NaN, so they all receive status 0 here -- the status
# column is therefore only meaningful for rows with origin == "train".
pair_df["status"] = (pair_df["category"] == "DM").astype(int)

In [19]:
pair_df.shape

(55768, 7)

In [20]:
pair_df.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status
0,DB00014,,DOID:0050742,,,train,0
1,DB00014,,DOID:0060073,,,train,0
2,DB00014,Goserelin,DOID:10283,prostate cancer,DM,train,1
3,DB00014,,DOID:11714,,,train,0
4,DB00014,,DOID:12995,,,train,0


In [21]:
pair_df["origin"].value_counts()

train      38982
holdout    16786
Name: origin, dtype: int64

In [22]:
# Category counts per origin; rows with a NaN category (all holdout rows) are not counted.
pair_df.groupby("origin")["category"].value_counts()

origin  category
train   DM          592
        SYM         311
        NOT         194
Name: category, dtype: int64

In [23]:
# Persist the combined pair table as a bzip2-compressed, tab-separated file.
# Opening in text mode ('wt') lets pandas write plain text through the compressor.
with bz2.open('features/compound-disease-pairs.tsv.bz2', 'wt') as pairs_file:
    pair_df.to_csv(pairs_file, index=False, sep='\t')

## Select metapaths

In [24]:
# Per-metapath performance table from the all-features analysis.
auroc_df = pandas.read_table('../all-features/data/feature-performance/auroc.tsv')

# Adjust the delta-AUROC p-values for multiple testing with method 'fdr_bh'.
# Only the corrected p-values are used below; the other return values are unused.
reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
    auroc_df["pval_delta_auroc"],
    method="fdr_bh",
)
auroc_df["fdr_delta_auroc"] = pvals_corrected
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc
0,CbGaD,0.71771,0.61197,0.68765,0.33311,0.56415,0.15356,2.1345e-07,4.2e-05
1,CbGaDaGaD,0.70034,0.64794,0.62508,0.98446,0.61936,0.080982,2.5622e-05,0.000458


In [25]:
# A metapath is whitelisted only if it satisfies every one of these criteria.
whitelist_criteria = [
    "rdwpc_auroc > 0.55",
    "delta_auroc > 0",
    "fdr_delta_auroc < 0.05",
    "pdwpc_primary_auroc > 0.5",
]
whitelist_df = (
    auroc_df
    .query(" and ".join(whitelist_criteria))
    # Feature names are the metapath abbreviation prefixed with 'dwpc_'.
    .assign(feature=lambda df: 'dwpc_' + df['metapath'])
)
whitelist_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc,feature
0,CbGaD,0.71771,0.61197,0.68765,0.33311,0.56415,0.15356,2.1345e-07,4.2e-05,dwpc_CbGaD
1,CbGaDaGaD,0.70034,0.64794,0.62508,0.98446,0.61936,0.080982,2.5622e-05,0.000458,dwpc_CbGaDaGaD


In [26]:
# Unique abbreviations of the metapaths that passed the whitelist filter.
metapaths = set(whitelist_df["metapath"])
len(metapaths)

118

## Metaedges in chosen metapaths

In [27]:
# Load the metaedge-in-metapath table and keep only the rows of selected metapaths.
m2m_df = (
    pandas.read_table('../all-features/data/metaedge-in-metapath.tsv')
    .query("metapath in @metapaths")
)

In [28]:
# Tally metaedges over every row, so a metaedge repeated within one metapath counts each time
m2m_df["metaedge"].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - binds - Gene,67
1,Disease - associates - Gene,67
2,Compound - treats - Disease,50
3,Compound - resembles - Compound,39
4,Compound - downregulates - Gene,31
5,Compound - upregulates - Gene,28
6,Disease - resembles - Disease,18
7,Disease - upregulates - Gene,16
8,Gene - interacts - Gene,15
9,Disease - localizes - Anatomy,14


In [29]:
# Tally metaedges after dropping exact duplicate rows, so each metaedge counts once per metapath
m2m_df.drop_duplicates()["metaedge"].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - binds - Gene,67
1,Disease - associates - Gene,67
2,Compound - treats - Disease,50
3,Compound - resembles - Compound,39
4,Compound - downregulates - Gene,31
5,Compound - upregulates - Gene,28
6,Disease - resembles - Disease,18
7,Disease - upregulates - Gene,16
8,Gene - interacts - Gene,15
9,Disease - localizes - Anatomy,14


In [30]:
# How many distinct metaedges appear across the selected metapaths
m2m_df["metaedge"].nunique()

21

In [31]:
# Load the full list of metapath definitions produced by the all-features analysis.
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)

# Keep only the entries whose 'abbreviation' is in the whitelisted set.
metapath_obj = [x for x in metapath_obj if x['abbreviation'] in metapaths]

with open('features/metapaths.json', 'wt') as write_file:
    # json.dump returns None, so its result must not be assigned back:
    # doing so would replace the filtered list in metapath_obj with None.
    json.dump(metapath_obj, write_file, indent=2, sort_keys=True)