# Transform and split DWPCs, assess performance

In [1]:
import itertools
import bz2

import pandas
import numpy
import sklearn.metrics
from scipy.special import logit

In [2]:
# Label of the unpermuted hetnet; later cells compare it against the `hetnet` column.
unperm_name = 'rephetio-v2.0'

In [3]:
# Load the feature matrix (bz2-compressed TSV) and the per-feature metadata table.
feature_df = pandas.read_csv('data/matrix/features.tsv.bz2', sep='\t')
feature_type_df = pandas.read_csv('data/matrix/feature-type.tsv', sep='\t')

In [4]:
# Preview the first two rows of the raw feature matrix.
feature_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.0,0.0,0.001016,0.0,0.0,0.0,0.00214,0.0
1,rephetio-v2.0_perm-5,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.001451,0.003834,0.000399,0.005393,0.002735,0.0,0.0,0.0


In [5]:
# Preview the per-feature metadata table.
feature_type_df.head()

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.033282,0.060884
1,CbG,degree,11.438,12.307
2,CtD,degree,1.9527,2.3228
3,DaG,degree,148.57,149.87
4,DtC,degree,9.7649,11.49


In [6]:
def transform_dwpcs(x, scaler):
    """
    Rescale values and compress them with the inverse hyperbolic sine.

    `x` is any array-like of numbers and `scaler` is the divisor applied
    before the transform. Returns a numpy array equal to
    `arcsinh(x / scaler)`.
    """
    scaled = numpy.array(x) / scaler
    return numpy.arcsinh(scaled)

# Feature names grouped by type, as listed in the metadata table.
dwpc_features = feature_type_df.loc[feature_type_df.feature_type == 'dwpc', 'feature']
degree_features = feature_type_df.loc[feature_type_df.feature_type == 'degree', 'feature']

# Each feature's unpermuted mean is used as its scaling divisor.
feature_to_scaler = {
    feature: unperm_mean
    for feature, unperm_mean in zip(feature_type_df.feature, feature_type_df.unperm_mean)
}

# Work on a copy of the raw features with every DWPC column replaced by its
# scaled, arcsinh-transformed values; all other columns are left untouched.
transformed_df = feature_df.assign(**{
    feature: transform_dwpcs(feature_df[feature], feature_to_scaler[feature])
    for feature in dwpc_features
})

In [7]:
# For every DWPC metapath, derive three columns per compound–disease pair:
#   dwpc  - the value in the unpermuted hetnet (`unperm_name`)
#   pdwpc - the mean of the values across every other hetnet column
#   rdwpc - dwpc minus pdwpc
column_names = list()
columns = list()
for metapath in dwpc_features:
    # Rows are compound–disease pairs; there is one column per hetnet.
    pivot_df = pandas.pivot_table(
        transformed_df, values=metapath,
        index=['compound_id', 'disease_id'], columns='hetnet')
    # Keep only the pairs that have a value in the unpermuted hetnet.
    pivot_df = pivot_df[pivot_df[unperm_name].notnull()]
    # Select the unpermuted column by name rather than by position, so the
    # result does not depend on how the hetnet labels happen to sort.
    dwpc = pivot_df[unperm_name]
    pdwpc = pivot_df.drop(unperm_name, axis='columns').mean(axis='columns')
    rdwpc = dwpc - pdwpc
    for feature_type, column in ('dwpc', dwpc), ('pdwpc', pdwpc), ('rdwpc', rdwpc):
        column_names.append('{}_{}'.format(feature_type, metapath))
        columns.append(column)

split_df = pandas.concat(columns, axis=1)
split_df.columns = column_names
# Turn the (compound_id, disease_id) index back into ordinary columns.
split_df = split_df.reset_index()

In [8]:
# Preview the first two rows of the dwpc_/pdwpc_/rdwpc_ table.
split_df.head(2)

Unnamed: 0,compound_id,disease_id,dwpc_CbGaD,pdwpc_CbGaD,rdwpc_CbGaD,dwpc_CbGaDaGaD,pdwpc_CbGaDaGaD,rdwpc_CbGaDaGaD,dwpc_CbGaDtCtD,pdwpc_CbGaDtCtD,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00014,DOID:10283,0.0,0.314799,-0.314799,0.965397,0.592134,0.373262,0.0,0.467725,...,-0.866692,0.0,0.64188,-0.64188,0.0,1.16984,-1.16984,0.0,0.774751,-0.774751
1,DB00014,DOID:1312,0.0,0.546056,-0.546056,0.04203,0.164178,-0.122148,0.0,0.072878,...,0.094959,0.649387,0.030717,0.61867,0.315708,0.277201,0.038507,0.0,0.289342,-0.289342


In [9]:
# Start from the rows of the unpermuted hetnet only.
base_df = feature_df.query("hetnet == @unperm_name").copy()

# Insert prior_logit directly after prior_prob, locating the column by name
# instead of relying on a hard-coded position.
prior_logit_position = base_df.columns.get_loc('prior_prob') + 1
base_df.insert(prior_logit_position, 'prior_logit', logit(base_df['prior_prob']))

# Add an arcsinh-transformed copy of every degree feature.
for metaedge in degree_features:
    base_df['degree_{}'.format(metaedge)] = numpy.arcsinh(base_df[metaedge])

# Remove the raw columns that now have transformed counterparts, plus the
# hetnet/primary columns.
base_df = base_df.drop(
    ['hetnet', 'primary', 'prior_prob'] + list(degree_features) + list(dwpc_features),
    axis='columns')

# Join on the pair identifiers explicitly so an accidental shared column name
# cannot silently become part of the merge key.
# NOTE: this rebinds `transformed_df`, which the earlier pivot cell reads in
# its previous form; re-run the DWPC transform cell before re-running that one.
transformed_df = base_df.merge(split_df, on=['compound_id', 'disease_id'])
transformed_df.head(2)

Unnamed: 0,compound_id,disease_id,status,disease_name,compound_name,prior_logit,degree_CbG,degree_CtD,degree_DaG,degree_DtC,...,rdwpc_CtDaGaD,dwpc_CtDaGbCtD,pdwpc_CtDaGbCtD,rdwpc_CtDaGbCtD,dwpc_CtDtCbGaD,pdwpc_CtDtCbGaD,rdwpc_CtDtCbGaD,dwpc_CtDtCtD,pdwpc_CtDtCtD,rdwpc_CtDtCtD
0,DB00226,DOID:0050742,0,nicotine dependence,Guanadrel,-6.48937,0.881374,0.881374,3.638278,0.881374,...,-0.191124,0.0,0.079111,-0.079111,0.928549,0.146498,0.782051,0.0,0.440984,-0.440984
1,DB00252,DOID:0050742,0,nicotine dependence,Phenytoin,-6.48937,3.738236,0.881374,3.638278,0.881374,...,0.216083,1.420996,0.067601,1.353396,1.136866,0.222538,0.914327,0.0,0.0,0.0


In [10]:
# Write the transformed features as a bz2-compressed TSV. The directory is
# derived from `unperm_name` so the hetnet label is defined in one place only.
path = 'data/matrix/{}/transformed-features.tsv.bz2'.format(unperm_name)
with bz2.open(path, 'wt') as write_file:
    transformed_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

### Compute performance

In [11]:
# Score every column whose name contains 'dwpc' by its AUROC against `status`.
scored_columns = transformed_df.columns[transformed_df.columns.str.contains('dwpc')]
auroc_records = list()
for scored_column in scored_columns:
    # Column names look like '<feature_type>_<metapath>'; split on the first underscore.
    prefix, metapath_name = scored_column.split('_', 1)
    auroc_records.append({
        'feature_type': prefix + '_auroc',
        'metapath': metapath_name,
        'auroc': sklearn.metrics.roc_auc_score(transformed_df.status, transformed_df[scored_column]),
    })

# Reshape to one row per metapath with one AUROC column per feature type.
auroc_df = pandas.DataFrame(auroc_records, columns=['feature_type', 'metapath', 'auroc'])
auroc_df = (
    auroc_df
    .pivot_table(values='auroc', index='metapath', columns='feature_type')
    .reset_index()
)
auroc_df.head(2)

feature_type,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc
0,CbGaD,0.725068,0.630214,0.686282
1,CbGaDaGaD,0.705271,0.662229,0.619878


In [12]:
# Load the primary AUROC table, rename its columns to match auroc_df's naming,
# and keep only the columns that are merged in below.
renamed_columns = {
    'feature': 'metapath',
    'auroc_permuted': 'pdwpc_primary_auroc',
    'pval_auroc': 'pval_delta_auroc',
}
kept_columns = ['metapath', 'nonzero', 'pdwpc_primary_auroc', 'delta_auroc', 'pval_delta_auroc']
primary_auroc_df = (
    pandas.read_table('data/feature-performance/primary-aurocs.tsv')
    .rename(columns=renamed_columns)
    .loc[:, kept_columns]
)

# Merge on whichever column names the two tables have in common.
auroc_df = auroc_df.merge(primary_auroc_df)
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.725068,0.630214,0.686282,0.3223,0.58046,0.14461,4.6216e-08
1,CbGaDaGaD,0.705271,0.662229,0.619878,0.98412,0.62328,0.081989,2.1266e-05


In [13]:
# Save the combined AUROC table as a TSV, formatting floats with '%.5g'.
auroc_path = 'data/feature-performance/auroc.tsv'
auroc_df.to_csv(
    auroc_path,
    sep='\t',
    index=False,
    float_format='%.5g',
)

In [14]:
# Show the metapaths that do not contain 'CtD', ranked by rdwpc AUROC (best first).
# `~` is the element-wise boolean negation; unary `-` on a boolean Series
# raises a TypeError with current numpy/pandas.
auroc_df[~auroc_df.metapath.str.contains('CtD')].sort_values('rdwpc_auroc', ascending=False).head()

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc
0,CbGaD,0.725068,0.630214,0.686282,0.3223,0.58046,0.14461,4.6216e-08
1,CbGaDaGaD,0.705271,0.662229,0.619878,0.98412,0.62328,0.081989,2.1266e-05
3,CbGbCbGaD,0.674394,0.667998,0.602468,0.92872,0.63521,0.039189,1.3274e-05
