# Assess the marginal performance of every feature for primary observations

In [1]:
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Feature names and their types; only these two columns are kept
feature_type_df = pandas.read_table('data/matrix/feature-type.tsv')[
    ['feature', 'feature_type']
]

# Feature matrix (bzip2-compressed, tab-separated)
feature_df = pandas.read_table('data/matrix/features.tsv.bz2')

# Names of the feature_df columns that are evaluated below
features = list(feature_type_df['feature'])

In [3]:
# Preview the first two rows of the loaded feature matrix
feature_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,status,primary,disease_name,chemical_name,prior_prob,CbG,CcSE,...,CuGuDdGdD,CuGuDdGuD,CuGuDlAlD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,249,...,0.000627,0.000651,0.00049,0.0,0.0,0.000496,0.0,0.000218,0.001182,0.001201
1,rephetio-v2.0_perm-2,DB00091,DOID:0050742,0,1,nicotine dependence,Cyclosporine,0.004893,30,344,...,0.016569,0.017337,0.004231,0.008249,0.004638,0.005157,0.002165,0.003623,0.016645,0.017121


In [4]:
def compute_metrics(y_true, y_score):
    """
    Summarize how well one score vector ranks the true labels.

    Parameters
    ----------
    y_true : array-like
        Labels handed to the scikit-learn scorers.
    y_score : pandas.Series or numpy.ndarray
        Scores to evaluate; must support elementwise `> 0` and `.mean()`.

    Returns
    -------
    pandas.Series
        float64 Series indexed by, in order:
        'nonzero' (fraction of scores greater than zero),
        'auroc' (sklearn.metrics.roc_auc_score) and
        'auprc' (sklearn.metrics.average_precision_score).
    """
    # Build the Series in one call with an explicit dtype. Growing an empty,
    # dtype-less `pandas.Series()` by assignment triggers a deprecation
    # warning on pandas 1.x and leaves the result dtype to version-specific
    # inference (the empty-Series default dtype changed in pandas 2.0).
    return pandas.Series(
        [
            (y_score > 0).mean(),
            sklearn.metrics.roc_auc_score(y_true, y_score),
            sklearn.metrics.average_precision_score(y_true, y_score),
        ],
        index=['nonzero', 'auroc', 'auprc'],
        dtype=float,
    )

def columnar_performance(df):
    """
    Evaluate every feature column of `df` against its 'status' column.

    Each column named in the notebook-level `features` list is scored with
    `compute_metrics`, using df['status'] as the labels. The result has one
    row per feature: a 'feature' column followed by one column per metric.
    """
    labels = df['status']

    def score_column(column):
        return compute_metrics(labels, column)

    # apply() returns metrics as rows and features as columns, so transpose
    by_feature = df[features].apply(score_column, axis='index').T
    by_feature.index.name = 'feature'
    return by_feature.reset_index()

# Score every feature separately within each hetnet, then attach feature types
perf_df = (
    feature_df
    .groupby('hetnet')
    .apply(columnar_performance)
    .reset_index(level='hetnet')
    .merge(feature_type_df)
)
# 1 when the hetnet name contains '_perm', 0 otherwise
perf_df['permuted'] = perf_df['hetnet'].str.contains('_perm').astype(int)

In [5]:
# perf_df already carries feature_type from the merge in the previous cell,
# so it is previewed directly rather than joined with feature_type_df again.
perf_df.head()

Unnamed: 0,hetnet,feature,nonzero,auroc,auprc,feature_type,permuted
0,rephetio-v2.0,prior_prob,1.0,0.874431,0.704884,prior,0
1,rephetio-v2.0_perm-1,prior_prob,1.0,0.824536,0.415491,prior,1
2,rephetio-v2.0_perm-2,prior_prob,1.0,0.827855,0.42355,prior,1
3,rephetio-v2.0_perm-3,prior_prob,1.0,0.822572,0.422721,prior,1
4,rephetio-v2.0_perm-4,prior_prob,1.0,0.814273,0.44027,prior,1


In [6]:
def compare_permutation(df):
    """
    Compare one feature's unpermuted AUROC with its AUROCs on permuted hetnets.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the columns 'permuted', 'nonzero' and 'auroc'.
        The first row with permuted == 0 is taken as the unpermuted result;
        all rows with permuted == 1 form the permuted sample.

    Returns
    -------
    pandas.Series
        float64 Series indexed by, in order:
        'nonzero' (the unpermuted row's value), 'auroc' (unpermuted AUROC),
        'auroc_permuted' (mean permuted AUROC), 'delta_auroc' (unpermuted
        minus mean permuted) and 'pval_auroc' (p-value of a one-sample t-test
        of the permuted AUROCs against the unpermuted AUROC).

    Raises
    ------
    IndexError
        If `df` contains no row with permuted == 0.
    """
    unperm = df.query("permuted == 0").iloc[0]
    perm_auroc = df.query("permuted == 1")['auroc']

    auroc = unperm['auroc']
    auroc_permuted = perm_auroc.mean()
    # The unpermuted AUROC plays the role of the hypothesized population mean
    ttest = scipy.stats.ttest_1samp(perm_auroc, auroc)

    # Build the Series in one call with an explicit dtype. Growing an empty,
    # dtype-less `pandas.Series()` by assignment triggers a deprecation
    # warning on pandas 1.x and leaves the result dtype to version-specific
    # inference (the empty-Series default dtype changed in pandas 2.0).
    return pandas.Series(
        [
            unperm['nonzero'],
            auroc,
            auroc_permuted,
            auroc - auroc_permuted,
            ttest.pvalue,
        ],
        index=['nonzero', 'auroc', 'auroc_permuted', 'delta_auroc', 'pval_auroc'],
        dtype=float,
    )

# One row per feature: unpermuted vs. permuted AUROC summary
compare_df = (
    perf_df
    .groupby(['feature_type', 'feature'])
    .apply(compare_permutation)
    .reset_index()
)

# Adjust the per-feature p-values with the 'fdr_bh' method of multipletests
reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
    pvals=compare_df.pval_auroc,
    method='fdr_bh',
)
compare_df['fdr_pval_auroc'] = pvals_corrected

# Merge onto feature_type_df so its columns lead the table
compare_df = feature_type_df.merge(compare_df)

In [7]:
# Preview the first three rows of the per-feature permutation comparison
compare_df.head(3)

Unnamed: 0,feature,feature_type,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
0,prior_prob,prior,1.0,0.874431,0.822811,0.05162,2.3e-05,0.00041
1,CbG,degree,0.991892,0.545148,0.545553,-0.000405,0.576708,0.614286
2,CcSE,degree,0.946622,0.582443,0.579357,0.003086,0.010432,0.021485


In [8]:
# Save datasets as tab-separated files, without the index and with floats
# written to five significant digits
outputs = {
    'data/feature-performance/primary-aucs.tsv': perf_df,
    'data/feature-performance/primary-aurocs.tsv': compare_df,
}
for path, frame in outputs.items():
    frame.to_csv(path, sep='\t', index=False, float_format='%.5g')