# Convert features into a matrix

In [1]:
import bz2

import pandas
import numpy
from scipy.special import logit

In [2]:
# Load the compound-disease pairs; these rows are the observations
pair_path = 'features/compound-disease-pairs.tsv.bz2'
pair_df = pandas.read_table(pair_path)
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [3]:
# Load the table of prior probabilities
prior_path = '../all-features/data/matrix/prior.tsv'
prior_df = pandas.read_table(prior_path)
prior_df.tail(2)

Unnamed: 0,compound_id,disease_id,prior_prob
186660,DB01624,DOID:12306,0
186661,DB01624,DOID:1245,0


In [4]:
# Load the per-compound and per-disease degree tables
compound_degree_df = pandas.read_table('../all-features/data/matrix/compound_degree.tsv')
disease_degree_df = pandas.read_table('../all-features/data/matrix/disease_degree.tsv')

# Keep the leading identifier column as is and prefix every other column
# with "degree_" so the degree features can be selected by name later on
for degree_df in (compound_degree_df, disease_degree_df):
    id_column, *feature_columns = degree_df.columns
    degree_df.columns = [id_column] + ['degree_' + name for name in feature_columns]

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DaG,degree_DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


In [5]:
# Load the DWPC table in long format: each row carries one DWPC value
# for a compound-disease pair and a metapath
dwpc_path = 'features/dwpc.tsv.bz2'
dwpc_df = pandas.read_table(dwpc_path)
dwpc_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
0,hetio-ind,DB01048,DOID:9206,CtDtCtD,0,0.4,0,0.01605
1,hetio-ind,DB01048,DOID:10652,CtDtCtD,0,0.4,0,0.01971


In [6]:
# Reshape the long DWPC table to wide form: rows are compound-disease pairs,
# columns are metapaths. pivot_table uses its default aggregation (mean) when
# a pair/metapath combination occurs more than once.
pair_keys = ['compound_id', 'disease_id']
dwpc_wide_df = dwpc_df.pivot_table(index=pair_keys, columns='metapath', values='DWPC')

# Prefix the metapath columns so they can be recognized as DWPC features
dwpc_wide_df.columns = 'dwpc_' + dwpc_wide_df.columns

# Move the pair keys out of the index and back into ordinary columns
dwpc_mat_df = dwpc_wide_df.reset_index()
dwpc_mat_df.head(2)

Unnamed: 0,compound_id,disease_id,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB00014,DOID:0050156,0,0.002188,0,0,0,0
1,DB00014,DOID:0050425,0,0.002232,0,0,0,0


### Combine all observation-by-feature matrices

In [7]:
# Join every feature table onto the compound-disease pairs (inner joins).
# The join keys are spelled out: merge() otherwise joins on *all* columns the
# two frames have in common, so a column accidentally shared between tables
# would silently become part of the join condition.
pair_keys = ['compound_id', 'disease_id']
feature_mat_df = (
    pair_df
    .merge(prior_df, on=pair_keys)               # prior_prob per pair
    .merge(compound_degree_df, on='compound_id')  # degree_* of the compound
    .merge(disease_degree_df, on='disease_id')    # degree_* of the disease
    .merge(dwpc_mat_df, on=pair_keys)             # dwpc_* per pair
)

In [8]:
# Preview the first two rows of the combined feature matrix
feature_mat_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,degree_CbG,degree_CtD,degree_DaG,degree_DtC,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004455,3,1,196,3,0.0,0.008574,0,0.002532,0.0,0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004455,8,1,196,3,0.002654,0.009692,0,0.011804,0.003682,0


In [9]:
# (rows, columns) of the combined feature matrix
feature_mat_df.shape

(186662, 17)

In [10]:
# Write the combined feature matrix as a bzip2-compressed TSV, with floats
# formatted to four significant digits.
# encoding is pinned so the output does not depend on the platform locale, and
# newline='' stops the text layer from translating the line endings that pandas
# writes (pandas asks for this when it is handed an open file object).
with bz2.open('features/features.tsv.bz2', 'wt', encoding='utf-8', newline='') as write_file:
    feature_mat_df.to_csv(write_file, sep='\t', index=False, float_format='%.4g')

## Transform

In [11]:
# Work on a copy so feature_mat_df keeps the untransformed values
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: add logit(prior_prob) directly after the prior_prob column.
# The position is looked up instead of hard-coded so that a change in the
# upstream column layout cannot silently misplace the new column.
# Note: logit(0) is -inf, so pairs whose prior is exactly 0 get -inf here.
prior_logit_loc = trans_df.columns.get_loc('prior_prob') + 1
trans_df.insert(prior_logit_loc, 'prior_logit', logit(trans_df.prior_prob))

# Transform degree features with the inverse hyperbolic sine
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: rescale by the feature's mean, then arcsinh.
# A feature whose mean is 0 produces NaN/inf values at this step.
for feature in dwpc_features:
    x = trans_df[feature]
    trans_df[feature] = numpy.arcsinh(x / x.mean())

# Standardize all features besides the prior to zero mean and unit
# (sample) standard deviation; a constant feature yields NaN here
for feature in degree_features + dwpc_features:
    x = trans_df[feature]
    trans_df[feature] = (x - x.mean()) / x.std()

trans_df.head(3)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prior_logit,degree_CbG,degree_CtD,degree_DaG,degree_DtC,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004455,-5.409286,-0.593931,1.049637,1.076089,0.376915,-0.379551,1.186524,-0.555518,-0.050625,-0.522748,-0.168527
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004455,-5.409286,0.423725,1.049637,1.076089,0.376915,1.077829,1.360487,-0.555518,1.544004,0.847445,-0.168527
2,DB00659,Acamprosate,DOID:10652,Alzheimer's disease,,0,0.004455,-5.409286,0.852142,1.049637,1.076089,0.376915,2.43631,2.602306,-0.555518,1.767558,4.114038,-0.168527


In [12]:
# Write the transformed feature matrix as a bzip2-compressed TSV, with floats
# formatted to four significant digits.
# encoding is pinned so the output does not depend on the platform locale, and
# newline='' stops the text layer from translating the line endings that pandas
# writes (pandas asks for this when it is handed an open file object).
with bz2.open('features/transformed-features.tsv.bz2', 'wt', encoding='utf-8', newline='') as write_file:
    trans_df.to_csv(write_file, sep='\t', index=False, float_format='%.4g')