# Transform and split DWPCs, assess performance

In [1]:
import itertools
import bz2

import pandas
import numpy
import sklearn.metrics
from scipy.special import logit

In [2]:
# Value of the `hetnet` column used later to select rows of the feature matrix
# (presumably the unpermuted network, going by the variable name).
unperm_name = 'rephetio-v2.0'

In [3]:
# Load the tab-separated feature matrix (compression is inferred from the
# .bz2 extension) and the per-feature metadata table.
feature_df = pandas.read_csv('data/matrix/features.tsv.bz2', sep='\t')
feature_type_df = pandas.read_csv('data/matrix/feature-type.tsv', sep='\t')

In [4]:
# Preview the first two rows of the feature matrix.
feature_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,status,primary,disease_name,chemical_name,prior_prob,CbG,CcSE,...,CuGuDdGdD,CuGuDdGuD,CuGuDlAlD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,249,...,0.000627,0.000651,0.00049,0.0,0.0,0.000496,0.0,0.000218,0.001182,0.001201
1,rephetio-v2.0_perm-2,DB00091,DOID:0050742,0,1,nicotine dependence,Cyclosporine,0.004893,30,344,...,0.016569,0.017337,0.004231,0.008249,0.004638,0.005157,0.002165,0.003623,0.016645,0.017121


In [5]:
# Preview the per-feature metadata table.
feature_type_df.head()

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.033131,0.061393
1,CbG,degree,11.777,12.38
2,CcSE,degree,155.33,126.55
3,CdG,degree,29.332,63.599
4,CiPC,degree,0.90372,1.031


In [6]:
def transform_dwpcs(x, scaler):
    """Rescale values by `scaler` and apply the inverse hyperbolic sine.

    Parameters
    ----------
    x : array-like
        Values to transform; converted to a numpy array first.
    scaler : scalar or array-like
        Divisor applied to `x` before the arcsinh.

    Returns
    -------
    numpy.ndarray
        `arcsinh(x / scaler)`, computed elementwise.
    """
    scaled = numpy.array(x) / scaler
    return numpy.arcsinh(scaled)

# Feature names grouped by their declared type in the metadata table.
dwpc_features = feature_type_df.loc[feature_type_df.feature_type == 'dwpc', 'feature']
degree_features = feature_type_df.loc[feature_type_df.feature_type == 'degree', 'feature']

# Each feature is scaled by its `unperm_mean` from the metadata table.
feature_to_scaler = dict(zip(feature_type_df.feature, feature_type_df.unperm_mean))

# Work on a copy so the raw feature matrix stays available as `feature_df`.
transformed_df = feature_df.copy()
for metapath in dwpc_features:
    scaler = feature_to_scaler[metapath]
    transformed_df[metapath] = transform_dwpcs(transformed_df[metapath], scaler)

In [7]:
# Split every transformed DWPC into three features per (chemical_id, disease_id) pair:
#   dwpc  -- the value on the hetnet named by `unperm_name`
#   pdwpc -- the mean of the values on all other hetnets
#   rdwpc -- the residual, dwpc minus pdwpc
column_names = list()
columns = list()
for metapath in dwpc_features:
    # One row per (chemical_id, disease_id) pair, one column per hetnet.
    df = pandas.pivot_table(transformed_df, values=metapath, index=['chemical_id', 'disease_id'], columns='hetnet')
    # Keep only the pairs that have a value on the `unperm_name` hetnet.
    df = df[df[unperm_name].notnull()]
    # Select by name instead of by position: positional slicing silently
    # depends on `unperm_name` sorting before every other hetnet name.
    dwpc = df[unperm_name]
    pdwpc = df.drop(unperm_name, axis='columns').mean(axis='columns')
    rdwpc = dwpc - pdwpc
    for column in dwpc, pdwpc, rdwpc:
        columns.append(column)
    for feature_type in 'dwpc', 'pdwpc', 'rdwpc':
        column_names.append('{}_{}'.format(feature_type, metapath))

split_df = pandas.concat(columns, axis=1)
split_df.columns = column_names
# Turn the (chemical_id, disease_id) index back into ordinary columns.
split_df = split_df.reset_index()

In [8]:
# Preview the first two rows of the split dwpc/pdwpc/rdwpc features.
split_df.head(2)

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,pdwpc_CbGaD,rdwpc_CbGaD,dwpc_CbGaDaGaD,pdwpc_CbGaDaGaD,rdwpc_CbGaDaGaD,dwpc_CbGaDaGdD,pdwpc_CbGaDaGdD,...,rdwpc_CuGuDtCtD,dwpc_CuGuDuGaD,pdwpc_CuGuDuGaD,rdwpc_CuGuDuGaD,dwpc_CuGuDuGdD,pdwpc_CuGuDuGdD,rdwpc_CuGuDuGdD,dwpc_CuGuDuGuD,pdwpc_CuGuDuGuD,rdwpc_CuGuDuGuD
0,DB00014,DOID:10283,0.0,0.215946,-0.215946,0.95365,0.843652,0.109999,1.113291,0.363054,...,0.198711,0.673295,0.359476,0.313819,0.0,0.098787,-0.098787,0.116955,0.095147,0.021808
1,DB00091,DOID:1312,0.0,0.300358,-0.300358,0.559519,0.808109,-0.24859,0.0,0.0,...,-1.31849,2.508972,1.753617,0.755355,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Assemble the final feature table from the rows of the `unperm_name` hetnet.
base_df = feature_df.query("hetnet == @unperm_name").copy()

# Put the logit-transformed prior directly after `prior_prob`, locating the
# position by name rather than by a hard-coded column index.
prior_position = base_df.columns.get_loc('prior_prob') + 1
base_df.insert(prior_position, 'prior_logit', logit(base_df['prior_prob']))

# Degree features are arcsinh-transformed into new `degree_`-prefixed columns.
for metaedge in degree_features:
    base_df['degree_{}'.format(metaedge)] = numpy.arcsinh(base_df[metaedge])

# Drop the raw columns that were replaced by transformed versions.
base_df = base_df.drop(
    ['hetnet', 'primary', 'prior_prob'] + list(degree_features) + list(dwpc_features),
    axis='columns')

# NOTE: this rebinds `transformed_df` to a table without the `hetnet` column,
# so the pivot cell that builds `split_df` cannot be re-run afterwards without
# first re-running the cell that applies `transform_dwpcs`.
transformed_df = base_df.merge(split_df, on=['chemical_id', 'disease_id'])
transformed_df.head(2)

Unnamed: 0,chemical_id,disease_id,status,disease_name,chemical_name,prior_logit,degree_CbG,degree_CcSE,degree_CdG,degree_CiPC,...,rdwpc_CuGuDtCtD,dwpc_CuGuDuGaD,pdwpc_CuGuDuGaD,rdwpc_CuGuDuGaD,dwpc_CuGuDuGdD,pdwpc_CuGuDuGdD,rdwpc_CuGuDuGdD,dwpc_CuGuDuGuD,pdwpc_CuGuDuGuD,rdwpc_CuGuDuGuD
0,DB00169,DOID:0050742,0,nicotine dependence,Cholecalciferol,-5.315024,3.333478,0.0,0.0,0.881374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00195,DOID:0050742,0,nicotine dependence,Betaxolol,-6.48937,2.094713,6.265305,0.881374,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Write the transformed features as a bzip2-compressed TSV. The directory is
# derived from `unperm_name` so it stays in step with the rows selected above.
path = 'data/matrix/{}/transformed-features.tsv.bz2'.format(unperm_name)
with bz2.open(path, 'wt') as write_file:
    transformed_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

### Compute performance

In [11]:
# Score every column whose name contains 'dwpc' by its AUROC against `status`.
is_dwpc_column = transformed_df.columns.str.contains('dwpc')
rows = list()
for column in transformed_df.columns[is_dwpc_column]:
    # Column names look like '<feature_type>_<metapath>'; split on the first underscore only.
    feature_type, metapath = column.split('_', 1)
    rows.append({
        'feature_type': feature_type + '_auroc',
        'metapath': metapath,
        'auroc': sklearn.metrics.roc_auc_score(transformed_df.status, transformed_df[column]),
    })

# Reshape to one row per metapath with one AUROC column per feature type.
auroc_df = pandas.DataFrame(rows, columns=['feature_type', 'metapath', 'auroc'])
auroc_df = auroc_df.pivot_table(
    values='auroc',
    index='metapath',
    columns='feature_type',
).reset_index()
auroc_df.head(2)

feature_type,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc
0,CbGaD,0.71771,0.611966,0.687649
1,CbGaDaGaD,0.700339,0.64794,0.625081


In [12]:
# Read the primary AUROC table, rename its columns to this notebook's naming,
# and keep only the columns that get attached to `auroc_df`.
renamed_columns = {
    'feature': 'metapath',
    'auroc_permuted': 'pdwpc_primary_auroc',
    'pval_auroc': 'pval_delta_auroc',
}
kept_columns = ['metapath', 'nonzero', 'pdwpc_primary_auroc', 'delta_auroc', 'pval_delta_auroc']
primary_auroc_df = (
    pandas.read_table('data/feature-performance/primary-aurocs.tsv')
    .rename(columns=renamed_columns)
    [kept_columns]
)
# The merge joins on the columns the two tables share.
auroc_df = auroc_df.merge(primary_auroc_df)
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.71771,0.611966,0.687649,0.33311,0.56415,0.15356,2.1345e-07
1,CbGaDaGaD,0.700339,0.64794,0.625081,0.98446,0.61936,0.080982,2.5622e-05


In [13]:
# Save the combined AUROC table as TSV, formatting floats to 5 significant digits.
auroc_df.to_csv('data/feature-performance/auroc.tsv', sep='\t', index=False, float_format='%.5g')

In [14]:
# Show the top metapaths by rdwpc AUROC, excluding those whose name contains 'CtD'.
# `~` is the boolean-negation operator for masks; numpy rejects unary `-` on
# boolean arrays, so `-mask` should not be relied on.
auroc_df[~auroc_df.metapath.str.contains('CtD')].sort_values('rdwpc_auroc', ascending=False).head()

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.71771,0.611966,0.687649,0.33311,0.56415,0.15356,2.1345e-07
390,CiPCiCbGaD,0.65831,0.616203,0.647833,0.33986,0.57671,0.081605,9.2239e-05
127,CbGpPWpGaD,0.694207,0.649417,0.645272,0.98345,0.6276,0.066604,4.9797e-06
9,CbGaDrD,0.631121,0.491673,0.644762,0.61047,0.49599,0.13513,5.9212e-05
402,CrCbGaD,0.636029,0.580497,0.638874,0.45574,0.56265,0.073381,2.4222e-05
