# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import bz2
import configparser

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

## Read node info

## Open question: why are the chemicals and diseases taken from the PharmacotherapyDB subset rather than from the entire ontology of all diseases and drugs?

In [2]:
# Load the compound and disease tables from the summary directory.
# The final expression displays the row count of each table.
compound_path = '../summary/compounds.tsv'
disease_path = '../summary/diseases.tsv'
compound_df = pandas.read_table(compound_path)
disease_df = pandas.read_table(disease_path)
len(compound_df), len(disease_df)

(1393, 134)

## TODO: should this be changed to use only a subset of the data?

In [3]:
# Look up the pinned PharmacotherapyDB commit in the project config and
# download the indications catalog as it exists at that commit.
config = configparser.ConfigParser()
# NOTE: ConfigParser.read skips files it cannot open, so a missing
# ../config.ini shows up as a KeyError on the lookup below.
config.read('../config.ini')
commit = config['hetnet']['pharmacotherapydb_commit']
url = 'https://github.com/dhimmel/indications/blob/{}/catalog/indications.tsv?raw=true'.format(commit)
indication_df = pandas.read_table(url)

# Switch to the identifier column names used by the pair table, then keep
# only the three columns needed downstream.
column_renames = {'doid_id': 'disease_id', 'drugbank_id': 'compound_id'}
indication_df = indication_df.rename(columns=column_renames)
indication_df = indication_df[['compound_id', 'disease_id', 'category']]
indication_df.head(2)

Unnamed: 0,compound_id,disease_id,category
0,DB00843,DOID:10652,DM
1,DB00674,DOID:10652,DM


In [4]:
# Number of indications in each category
indication_df.category.value_counts()

DM     755
SYM    390
NOT    243
Name: category, dtype: int64

In [5]:
# Enumerate every compound-disease combination (Cartesian product of the two
# tables), compounds varying slowest.
rows = [
    (compound.compound_id, compound.compound_name, disease.disease_id, disease.disease_name)
    for compound, disease in itertools.product(compound_df.itertuples(), disease_df.itertuples())
]
pair_columns = ['compound_id', 'compound_name', 'disease_id', 'disease_name']
cartesian_df = pandas.DataFrame(rows, columns=pair_columns)

# Attach the indication category to each pair. This is an outer join (changed
# from a left join), so indications without a matching generated pair are
# kept as well.
pair_df = cartesian_df.merge(indication_df, how="outer", on=["compound_id", "disease_id"])

# status is 1 only where the category is 'DM'; all other pairs, including
# those with no category, get 0.
pair_df['status'] = (pair_df.category == 'DM').astype(int)
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [6]:
# Dimensions of the pair table as (rows, columns)
pair_df.shape

(186665, 6)

In [7]:
# Category counts in the merged pair table (pairs without a category are not counted)
pair_df.category.value_counts()

DM     755
SYM    390
NOT    243
Name: category, dtype: int64

In [8]:
# Number of pairs with status 1 (category 'DM') versus status 0 (everything else)
pair_df.status.value_counts()

0    185910
1       755
Name: status, dtype: int64

In [9]:
# Number of rows in the pair table
len(pair_df)

186665

In [10]:
# Save the pair table as a bz2-compressed, tab-separated file without the index column
pair_path = 'features/compound-disease-pairs.tsv.bz2'
with bz2.open(pair_path, mode='wt') as pair_file:
    pair_df.to_csv(pair_file, sep='\t', index=False)

## Select metapaths

In [11]:
# Load the per-metapath feature performance table
performance_path = '../all-features/data/feature-performance/auroc.tsv'
auroc_df = pandas.read_table(performance_path)

# Correct the delta-AUROC p-values for multiple testing (method 'fdr_bh').
# Of the four values returned, only the corrected p-values are stored.
reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
    auroc_df['pval_delta_auroc'],
    method='fdr_bh',
)
auroc_df['fdr_delta_auroc'] = pvals_corrected
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc
0,CbGaD,0.71649,0.60246,0.70178,0.3265,0.56621,0.15028,7.5613e-07,4e-06
1,CbGaDaGaD,0.68749,0.63235,0.63286,0.97854,0.60799,0.079507,8.2092e-06,1.5e-05


In [12]:
# A metapath is selected only if it satisfies every criterion below
selection_criteria = [
    "rdwpc_auroc > 0.55",
    "delta_auroc > 0",
    "fdr_delta_auroc < 0.05",
    "pdwpc_primary_auroc > 0.5",
]
# Copy so that adding a column does not write into a view of auroc_df
whitelist_df = auroc_df.query(" and ".join(selection_criteria)).copy()

# Feature name is the metapath abbreviation prefixed with 'dwpc_'
whitelist_df['feature'] = 'dwpc_' + whitelist_df.metapath
whitelist_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc,feature
0,CbGaD,0.71649,0.60246,0.70178,0.3265,0.56621,0.15028,7.5613e-07,4e-06,dwpc_CbGaD
1,CbGaDaGaD,0.68749,0.63235,0.63286,0.97854,0.60799,0.079507,8.2092e-06,1.5e-05,dwpc_CbGaDaGaD


In [13]:
# Distinct abbreviations of the selected metapaths; the cell displays how many there are
metapaths = set(whitelist_df['metapath'])
len(metapaths)

6

## Metaedges in chosen metapaths

In [14]:
# Load the metaedge-in-metapath table and keep only rows for the selected metapaths
m2m_df = (
    pandas
    .read_table('../all-features/data/metaedge-in-metapath.tsv')
    .query("metapath in @metapaths")
)

In [15]:
# Rows per metaedge; a metaedge that appears several times in one metapath is counted each time
m2m_df['metaedge'].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - binds - Gene,8
1,Disease - associates - Gene,6
2,Compound - treats - Disease,6


In [16]:
# Counting only one metaedge occurrence per metapath.
# Deduplicate on the (metapath, metaedge) pair explicitly: a bare
# drop_duplicates() compares every column, so any additional column in the
# table would keep repeated occurrences within a metapath apart.
(
    m2m_df
    .drop_duplicates(subset=['metapath', 'metaedge'])
    .metaedge
    .value_counts()
    .reset_index()
)

Unnamed: 0,index,metaedge
0,Compound - binds - Gene,8
1,Disease - associates - Gene,6
2,Compound - treats - Disease,6


In [17]:
# How many distinct metaedges occur in the selected metapaths
m2m_df['metaedge'].nunique()

3

In [18]:
# Copy the definitions of the selected metapaths into features/metapaths.json.
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)

# Keep only the entries whose abbreviation is in the selected set
metapath_obj = [x for x in metapath_obj if x['abbreviation'] in metapaths]

with open('features/metapaths.json', 'wt') as write_file:
    # json.dump returns None, so its result is deliberately not assigned:
    # metapath_obj stays bound to the filtered list after this cell runs.
    json.dump(metapath_obj, write_file, indent=2, sort_keys=True)