# Transform and split DWPCs, assess performance

In [1]:
import itertools
import bz2

import pandas
import numpy
import sklearn.metrics
from scipy.special import logit

In [2]:
# Name of the unpermuted hetnet. Used further down to select the rows of the
# feature table whose `hetnet` column equals this value.
unperm_name = 'rephetio-v2.0'

In [3]:
# Tab-separated inputs: the feature matrix (one row per hetnet / compound /
# disease combination) and a table describing each feature column.
feature_df = pandas.read_table('data/matrix/features.tsv.bz2')
feature_type_df = pandas.read_table('data/matrix/feature-type.tsv')

In [4]:
# Preview the first two rows of the feature matrix.
feature_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0_perm-2,DB00014,DOID:1024,0,1,leprosy,Goserelin,0.006101,2,2,...,2,0.0,0.000545,0.000299,0.001267,0.0,0.001198,0.003134,0.0,0.014391
1,rephetio-v2.0_perm-1,DB00136,DOID:1024,0,1,leprosy,Calcitriol,0.002948,4,1,...,2,0.0,0.00251,8.1e-05,0.000907,0.0,0.004485,0.002194,0.000736,0.0


In [5]:
# Preview the feature-type table (head() shows the first five rows by default).
feature_type_df.head()

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.037868,0.066775
1,CbG,degree,11.544,12.697
2,CtD,degree,2.2322,2.5949
3,DaG,degree,153.61,152.49
4,DtC,degree,10.246,11.685


In [6]:
def transform_dwpcs(x, scaler):
    """
    Rescale values and compress them with the inverse hyperbolic sine.

    x: array-like of values; converted to a numpy array first.
    scaler: divisor applied to every element of x before the transform.

    Returns a numpy array containing arcsinh(x / scaler).
    """
    scaled = numpy.array(x) / scaler
    return numpy.arcsinh(scaled)

# Feature names grouped by kind, as labelled in the feature-type table.
dwpc_features = feature_type_df.loc[feature_type_df.feature_type == 'dwpc', 'feature']
degree_features = feature_type_df.loc[feature_type_df.feature_type == 'degree', 'feature']

# Map every feature to its `unperm_mean` value, which serves as the divisor
# passed to transform_dwpcs.
feature_to_scaler = {
    feature: unperm_mean
    for feature, unperm_mean in zip(feature_type_df.feature, feature_type_df.unperm_mean)
}

# Transform the DWPC columns on a copy so the loaded feature table is untouched.
transformed_df = feature_df.copy()
for column in dwpc_features:
    scaler = feature_to_scaler[column]
    transformed_df[column] = transform_dwpcs(transformed_df[column], scaler)

In [7]:
# For every DWPC metapath, derive three features per compound-disease pair:
#   dwpc  - the transformed value on the unpermuted hetnet (`unperm_name`)
#   pdwpc - the mean transformed value across all other hetnets
#   rdwpc - the residual, dwpc - pdwpc
pair_index = ['compound_id', 'disease_id']
column_names = list()
columns = list()
for metapath in dwpc_features:
    # One row per compound-disease pair, one column per hetnet.
    df = pandas.pivot_table(transformed_df, values=metapath, index=pair_index, columns='hetnet')
    # Keep only pairs that have a value on the unpermuted hetnet.
    df = df[df[unperm_name].notnull()]
    # Select the unpermuted column by name instead of by position, so the
    # split does not depend on how the pivoted hetnet columns happen to sort.
    dwpc = df[unperm_name]
    pdwpc = df.drop(unperm_name, axis='columns').mean(axis='columns')
    rdwpc = dwpc - pdwpc
    columns.extend([dwpc, pdwpc, rdwpc])
    # Names are appended in the same order as the series above.
    column_names.extend(
        '{}_{}'.format(feature_type, metapath)
        for feature_type in ('dwpc', 'pdwpc', 'rdwpc')
    )

split_df = pandas.concat(columns, axis=1)
split_df.columns = column_names
# Turn the (compound_id, disease_id) index back into ordinary columns.
split_df = split_df.reset_index()

In [8]:
# Preview the first two rows of the split dwpc / pdwpc / rdwpc features.
split_df.head(2)

Unnamed: 0,compound_id,disease_id,dwpc_CbGaD,pdwpc_CbGaD,rdwpc_CbGaD,dwpc_CbGaDaGaD,pdwpc_CbGaDaGaD,rdwpc_CbGaDaGaD,dwpc_CbGaDtCtD,pdwpc_CbGaDtCtD,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00014,DOID:10283,0.0,0.10931,-0.10931,0.903603,0.62067,0.282934,0.0,0.489436,...,0.162255,1.821218,1.241336,0.579881,1.60291,1.450227,0.152683,0.710316,0.810733,-0.100417
1,DB00014,DOID:11934,0.0,0.0,0.0,0.239307,0.204612,0.034695,0.0,0.189799,...,0.755596,1.112478,0.683351,0.429127,1.060563,0.459331,0.601232,1.505999,0.554617,0.951381


In [9]:
# Assemble the final feature matrix for the unpermuted hetnet: logit of the
# prior, arcsinh-transformed degrees, and the split DWPC features.
# NOTE: this rebinds `transformed_df`, which the cells above use for the
# per-hetnet transformed table; re-run those cells before re-running this one.
base_df = feature_df.query("hetnet == @unperm_name").copy()

# Put prior_logit directly after prior_prob (which is dropped below), locating
# the position by column name rather than a hard-coded index.
prior_position = base_df.columns.get_loc('prior_prob') + 1
base_df.insert(prior_position, 'prior_logit', logit(base_df['prior_prob']))

for metaedge in degree_features:
    base_df['degree_{}'.format(metaedge)] = numpy.arcsinh(base_df[metaedge])

# Remove the raw columns that have been replaced by transformed versions,
# along with bookkeeping columns that are not wanted in the output.
raw_columns = ['hetnet', 'primary', 'prior_prob'] + list(degree_features) + list(dwpc_features)
base_df = base_df.drop(raw_columns, axis='columns')

# Join on the compound-disease pair explicitly instead of relying on whatever
# column names the two frames happen to share.
transformed_df = base_df.merge(split_df, on=['compound_id', 'disease_id'])
transformed_df.head(2)

Unnamed: 0,compound_id,disease_id,status,disease_name,compound_name,prior_logit,degree_CbG,degree_CtD,degree_DaG,degree_DtC,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00187,DOID:1024,0,leprosy,Esmolol,-5.82371,1.443635,0.881374,3.829114,1.443635,...,-0.150761,0.680314,0.066536,0.613779,0.0,0.266034,-0.266034,0.0,0.064124,-0.064124
1,DB00195,DOID:1024,0,leprosy,Betaxolol,-5.82371,2.094713,0.881374,3.829114,1.443635,...,-0.130838,0.680314,0.123345,0.556969,0.0,0.219626,-0.219626,0.0,0.064124,-0.064124


In [10]:
# Write the transformed feature matrix as a bzip2-compressed TSV (no index,
# floats formatted to 5 significant digits). The rows come only from the
# `unperm_name` hetnet, so the output directory is derived from that name
# rather than repeating the literal.
path = 'data/matrix/{}/transformed-features.tsv.bz2'.format(unperm_name)
with bz2.open(path, 'wt') as write_file:
    transformed_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

### Compute performance

In [11]:
# AUROC of every individual dwpc / pdwpc / rdwpc column as a predictor of `status`.
is_dwpc_column = transformed_df.columns.str.contains('dwpc')
rows = list()
for column in transformed_df.columns[is_dwpc_column]:
    # Column names have the form '<feature_type>_<metapath>'; split on the
    # first underscore only.
    feature_type, metapath = column.split('_', 1)
    rows.append([
        feature_type + '_auroc',
        metapath,
        sklearn.metrics.roc_auc_score(transformed_df.status, transformed_df[column]),
    ])
long_auroc_df = pandas.DataFrame(rows, columns=['feature_type', 'metapath', 'auroc'])

# Reshape to one row per metapath with one AUROC column per feature type.
auroc_df = long_auroc_df.pivot_table(
    values='auroc', index='metapath', columns='feature_type',
).reset_index()
auroc_df.head(2)

feature_type,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc
0,CbGaD,0.716491,0.602462,0.701777
1,CbGaDaGaD,0.687493,0.632354,0.63286


In [12]:
# Load the primary AUROC table, align its column names with auroc_df, keep
# the columns of interest, and join it onto the per-metapath AUROCs.
column_renames = {
    'feature': 'metapath',
    'auroc_permuted': 'pdwpc_primary_auroc',
    'pval_auroc': 'pval_delta_auroc',
}
kept_columns = ['metapath', 'nonzero', 'pdwpc_primary_auroc', 'delta_auroc', 'pval_delta_auroc']

primary_auroc_df = pandas.read_table('data/feature-performance/primary-aurocs.tsv').rename(columns=column_renames)
primary_auroc_df = primary_auroc_df[kept_columns]

# No explicit key: the merge uses the column names shared by both frames.
auroc_df = auroc_df.merge(primary_auroc_df)
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.716491,0.602462,0.701777,0.3265,0.56621,0.15028,7.5613e-07
1,CbGaDaGaD,0.687493,0.632354,0.63286,0.97854,0.60799,0.079507,8.2092e-06


In [13]:
# Save the combined AUROC table as a TSV (no index, floats formatted to 5 significant digits).
auroc_df.to_csv('data/feature-performance/auroc.tsv', sep='\t', index=False, float_format='%.5g')

In [14]:
# Metapaths whose abbreviation does not contain 'CtD', ranked by rdwpc AUROC
# (highest first). The mask is negated with `~`, the boolean-inversion
# operator; unary `-` is rejected by NumPy for boolean arrays.
auroc_df[~auroc_df.metapath.str.contains('CtD')].sort_values('rdwpc_auroc', ascending=False).head()

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.716491,0.602462,0.701777,0.3265,0.56621,0.15028,7.5613e-07
3,CbGbCbGaD,0.690508,0.648705,0.649169,0.91089,0.62966,0.060851,0.00029706
1,CbGaDaGaD,0.687493,0.632354,0.63286,0.97854,0.60799,0.079507,8.2092e-06
