# Transform and split DWPCs, assess performance

In [1]:
import itertools
import bz2

import pandas
import numpy
import sklearn.metrics
from scipy.special import logit

In [2]:
# Label of the unpermuted hetnet. Rows of `feature_df` whose `hetnet` value equals
# this string form the base dataset; other hetnets carry a `_perm-N` suffix.
unperm_name = "rephetio-v2.0"

In [3]:
# Feature matrix: `hetnet`, `chemical_id` and `disease_id` identifiers plus the
# prior, degree and DWPC columns (tab-separated, bzip2-compressed).
feature_df = pandas.read_csv('data/matrix/features.tsv.bz2', sep='\t')
# Per-feature metadata: feature type together with `unperm_mean` and `unperm_sd`.
feature_type_df = pandas.read_csv('data/matrix/feature-type.tsv', sep='\t')

In [4]:
# Preview the first two rows of the raw feature matrix.
feature_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,status,primary,disease_name,chemical_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0_perm-1,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.001564,0.001338,0.001015,0.0,0.003219,0.0,0.000291,0.0
1,rephetio-v2.0_perm-2,DB00014,DOID:0050742,1,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.003091,0.0,0.000554,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the feature metadata table (first five rows by default).
feature_type_df.head()

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.033641,0.061468
1,CbG,degree,11.866,13.054
2,CtD,degree,1.9635,2.3398
3,DaG,degree,151.01,155.03
4,DtC,degree,9.9182,11.845


In [6]:
def transform_dwpcs(x, scaler):
    """
    Rescale values by `scaler` and apply the inverse hyperbolic sine.

    Parameters
    ----------
    x : array-like
        Values to transform. They are converted to a numpy array first, so the
        input object itself is never modified.
    scaler : scalar
        Divisor applied to `x` before the transform (this notebook passes a
        single number per feature).

    Returns
    -------
    numpy.ndarray
        `arcsinh(x / scaler)`, evaluated element-wise.
    """
    scaled = numpy.asarray(x) / scaler
    return numpy.arcsinh(scaled)

# Work on a copy so the raw `feature_df` is left untouched.
transformed_df = feature_df.copy()

# Feature names grouped by the type recorded in `feature_type_df`.
dwpc_features = feature_type_df.loc[feature_type_df['feature_type'] == 'dwpc', 'feature']
degree_features = feature_type_df.loc[feature_type_df['feature_type'] == 'degree', 'feature']

# Each feature is scaled by its `unperm_mean` value
# (presumably its mean over the unpermuted hetnet -- confirm against the producer of feature-type.tsv).
feature_to_scaler = dict(zip(feature_type_df['feature'], feature_type_df['unperm_mean']))
for metapath in dwpc_features:
    scaler = feature_to_scaler[metapath]
    transformed_df[metapath] = transform_dwpcs(transformed_df[metapath], scaler)

In [7]:
# Split every DWPC feature into three columns per (chemical_id, disease_id) pair:
#   dwpc  -- value in the unpermuted hetnet (`unperm_name`)
#   pdwpc -- mean of the values across all other hetnets
#   rdwpc -- residual, dwpc minus pdwpc
column_names = list()
columns = list()
for metapath in dwpc_features:
    # Wide layout: one row per (chemical_id, disease_id), one column per hetnet.
    pivot_df = pandas.pivot_table(
        transformed_df, values=metapath,
        index=['chemical_id', 'disease_id'], columns='hetnet')
    # Keep only the pairs that have a value in the unpermuted hetnet.
    pivot_df = pivot_df[pivot_df[unperm_name].notnull()]
    # Select the unpermuted column by label rather than by position, so the
    # result does not depend on how the hetnet labels happen to sort.
    dwpc = pivot_df[unperm_name]
    pdwpc = pivot_df.drop(unperm_name, axis='columns').mean(axis='columns')
    rdwpc = dwpc - pdwpc
    for feature_type, column in ('dwpc', dwpc), ('pdwpc', pdwpc), ('rdwpc', rdwpc):
        column_names.append('{}_{}'.format(feature_type, metapath))
        columns.append(column)

# Assemble the per-metapath Series side by side; the shared
# (chemical_id, disease_id) index becomes ordinary columns again.
split_df = pandas.concat(columns, axis=1)
split_df.columns = column_names
split_df = split_df.reset_index()

In [8]:
# Preview the first two rows of the dwpc / pdwpc / rdwpc split.
split_df.head(2)

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,pdwpc_CbGaD,rdwpc_CbGaD,dwpc_CbGaDaGaD,pdwpc_CbGaDaGaD,rdwpc_CbGaDaGaD,dwpc_CbGaDtCtD,pdwpc_CbGaDtCtD,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00014,DOID:10283,0.0,0.402967,-0.402967,0.958915,0.869622,0.089293,0.0,0.503217,...,-0.758871,0.0,0.693193,-0.693193,0.0,1.164831,-1.164831,0.0,0.807504,-0.807504
1,DB00014,DOID:3277,0.0,0.0,0.0,0.105336,0.17117,-0.065835,0.0,0.177952,...,0.0338,0.334546,0.029422,0.305124,0.0,0.244952,-0.244952,0.0,0.0,0.0


In [9]:
# Start from the rows of the unpermuted hetnet only.
base_df = feature_df.query("hetnet == @unperm_name").copy()

# Put the logit-transformed prior directly after `prior_prob`. The position is
# looked up rather than hard-coded so it survives changes in column layout.
prior_position = base_df.columns.get_loc('prior_prob') + 1
base_df.insert(prior_position, 'prior_logit', logit(base_df['prior_prob']))

# Degree features get the same arcsinh transform, without rescaling.
for metaedge in degree_features:
    base_df['degree_{}'.format(metaedge)] = numpy.arcsinh(base_df[metaedge])

# Drop the raw columns that the transformed versions replace.
raw_columns = ['hetnet', 'primary', 'prior_prob'] + list(degree_features) + list(dwpc_features)
base_df = base_df.drop(raw_columns, axis='columns')

# NOTE: this rebinds `transformed_df`, which earlier held the transformed
# features for every hetnet; from here on it holds one row per
# (chemical_id, disease_id) pair present in both frames.
transformed_df = base_df.merge(split_df, on=['chemical_id', 'disease_id'])
transformed_df.head(2)

Unnamed: 0,chemical_id,disease_id,status,disease_name,chemical_name,prior_logit,degree_CbG,degree_CtD,degree_DaG,degree_DtC,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00104,DOID:0050742,0,nicotine dependence,Octreotide,-6.48937,2.49178,0.881374,3.638278,0.881374,...,-0.174628,0.0,0.041041,-0.041041,0.0,0.25241,-0.25241,0.0,0.0,0.0
1,DB00178,DOID:0050742,0,nicotine dependence,Ramipril,-4.996524,2.094713,2.094713,3.638278,0.881374,...,-0.491327,0.0,0.163237,-0.163237,0.658845,0.512217,0.146628,0.0,0.0,0.0


In [10]:
# Write the transformed features as a bzip2-compressed TSV. The directory is
# derived from `unperm_name` so it always matches the hetnet the rows were
# selected from, instead of repeating the literal.
path = 'data/matrix/{}/transformed-features.tsv.bz2'.format(unperm_name)
with bz2.open(path, 'wt') as write_file:
    # '%.5g' writes floats with five significant digits.
    transformed_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

### Compute performance

In [11]:
# Compute one AUROC per DWPC-derived column (dwpc_*, pdwpc_*, rdwpc_*), using
# `status` as the label and the column's values as the score.
dwpc_columns = [name for name in transformed_df.columns if 'dwpc' in name]
rows = list()
for name in dwpc_columns:
    # Column names look like '<feature_type>_<metapath>'; split on the first underscore only.
    kind, path_abbrev = name.split('_', 1)
    score = sklearn.metrics.roc_auc_score(transformed_df['status'], transformed_df[name])
    rows.append({'feature_type': kind + '_auroc', 'metapath': path_abbrev, 'auroc': score})
auroc_df = pandas.DataFrame(rows, columns=['feature_type', 'metapath', 'auroc'])
# Reshape to one row per metapath with one AUROC column per feature type.
auroc_df = auroc_df.pivot_table(index='metapath', columns='feature_type', values='auroc').reset_index()
auroc_df.head(2)

feature_type,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc
0,CbGaD,0.724445,0.629454,0.683047
1,CbGaDaGaD,0.705098,0.65118,0.635442


In [12]:
# Load the AUROC statistics stored in primary-aurocs.tsv. `feature` is renamed
# to `metapath` so it can act as the shared key with `auroc_df`; the other
# renames give the columns distinct, descriptive names.
primary_renames = {
    'feature': 'metapath',
    'auroc_permuted': 'pdwpc_primary_auroc',
    'pval_auroc': 'pval_delta_auroc',
}
primary_columns = ['metapath', 'nonzero', 'pdwpc_primary_auroc', 'delta_auroc', 'pval_delta_auroc']
primary_auroc_df = pandas.read_table('data/feature-performance/primary-aurocs.tsv')
primary_auroc_df = primary_auroc_df.rename(columns=primary_renames)[primary_columns]
# No explicit key: merge joins on every column name the two frames share.
auroc_df = auroc_df.merge(primary_auroc_df)
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.724445,0.629454,0.683047,0.32196,0.57265,0.15179,7e-06
1,CbGaDaGaD,0.705098,0.65118,0.635442,0.98209,0.61864,0.08646,0.000358


In [13]:
# Save the merged per-metapath AUROC table as TSV, without the index and with
# floats written to five significant digits.
auroc_df.to_csv('data/feature-performance/auroc.tsv', sep='\t', index=False, float_format='%.5g')

In [14]:
# Show the top metapaths by rdwpc AUROC among those whose abbreviation does not contain 'CtD'.
# `~` is the operator pandas documents for negating a boolean mask; numpy
# rejects unary `-` on boolean arrays.
auroc_df[~auroc_df.metapath.str.contains('CtD')].sort_values('rdwpc_auroc', ascending=False).head()

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.724445,0.629454,0.683047,0.32196,0.57265,0.15179,7e-06
1,CbGaDaGaD,0.705098,0.65118,0.635442,0.98209,0.61864,0.08646,0.000358
3,CbGbCbGaD,0.678146,0.661192,0.608608,0.91824,0.63764,0.040506,7.2e-05
