# Convert features into a matrix

In [1]:
import bz2

import pandas
import numpy
from scipy.special import logit

In [2]:
# Load the observations: one row per compound-disease pair
pair_df = pandas.read_csv('features/compound-disease-pairs.tsv.bz2', sep='\t')
pair_df.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status
0,DB00014,,DOID:0050742,,,train,0
1,DB00014,,DOID:0060073,,,train,0


In [3]:
# Dimensions of the pair table as (rows, columns)
pair_df.shape

(55768, 7)

## Read prior probabilities

In [4]:
# Load the prior probability assigned to each (chemical_id, disease_id) pair
prior_df = pandas.read_csv('../all-features/data/matrix/prior.tsv', sep='\t')
prior_df.tail(2)

Unnamed: 0,chemical_id,disease_id,prior_prob
186660,DB01624,DOID:12306,0.0
186661,DB01624,DOID:1245,0.0


In [5]:
# Dimensions of the prior table as (rows, columns)
prior_df.shape

(186662, 3)

## Read degree features

In [6]:
compound_degree_df = pandas.read_csv('../all-features/data/matrix/compound_degree.tsv', sep='\t')
disease_degree_df = pandas.read_csv('../all-features/data/matrix/disease_degree.tsv', sep='\t')

# The leading column identifies the node; every later column is a degree and
# gets a `degree_` prefix. Both frames are relabelled in place.
for df in (compound_degree_df, disease_degree_df):
    df.columns = list(df.columns[:1]) + ['degree_' + name for name in df.columns[1:]]

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DaG,degree_DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


In [7]:
# Load the DWPC table in long form: one row per (pair, metapath)
dwpc_df = pandas.read_csv('features/dwpc.tsv.bz2', sep='\t')
dwpc_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,metapath,PC,w,DWPC,seconds
0,hetio-ind,DB00014,DOID:824,CtDtCtD,0,0.4,0.0,0.7727
1,hetio-ind,DB00091,DOID:14268,CtDtCtD,0,0.4,0.0,0.7202


In [8]:
# Dimensions of the long-form DWPC table as (rows, columns)
dwpc_df.shape

(334608, 8)

In [9]:
# Reshape the long DWPC table to wide form: one row per compound-disease pair
# and one `dwpc_`-prefixed column per metapath. Repeated (pair, metapath)
# entries are averaged, which is pivot_table's default aggregation.
dwpc_mat_df = (
    dwpc_df
    .pivot_table(index=['chemical_id', 'disease_id'], columns='metapath', values='DWPC')
    .add_prefix('dwpc_')
    .reset_index()
)
dwpc_mat_df.head(2)

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB00014,DOID:0050156,0.0,0.002188,0.0,0.0,0.0,0.0
1,DB00014,DOID:0050425,0.0,0.002232,0.0,0.0,0.0,0.0


---

### Combine all observation-by-feature matrices

In [10]:
# Join every feature table onto the observations.
# The keys are spelled out so that a column which merely happens to share a
# name across two tables can never be silently promoted to a join key.
# The join type stays `inner` (pandas' default, as before), so a pair that is
# absent from any one feature table is dropped from the combined matrix.
pair_keys = ['chemical_id', 'disease_id']
feature_mat_df = (pair_df
    .merge(prior_df, how='inner', on=pair_keys)
    .merge(compound_degree_df, how='inner', on='chemical_id')
    .merge(disease_degree_df, how='inner', on='disease_id')
    .merge(dwpc_mat_df, how='inner', on=pair_keys)
)

In [11]:
# Preview the combined matrix: pair columns, then prior, degree and DWPC features
feature_mat_df.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status,prior_prob,degree_CbG,degree_CtD,degree_DaG,degree_DtC,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB00014,,DOID:0050742,,,train,0,0.001517,2,1,19,1,0.0,0.0,0.0,0.0,0.0,0.0
1,DB00091,,DOID:0050742,,,train,0,0.004893,30,3,19,1,0.0,0.002658,0.0,0.008582,0.0,0.0


In [12]:
# Dimensions of the combined matrix as (rows, columns)
feature_mat_df.shape

(55598, 18)

In [13]:
# Number of merged observations for each value of `origin`
feature_mat_df["origin"].value_counts()

train      38889
holdout    16709
Name: origin, dtype: int64

In [14]:
# Save the untransformed feature matrix as bzip2-compressed TSV,
# without the index and with floats formatted as '%.4g'.
with bz2.open('features/features.tsv.bz2', mode='wt') as tsv_handle:
    feature_mat_df.to_csv(
        tsv_handle,
        sep='\t',
        index=False,
        float_format='%.4g',
    )

## Transform

In [15]:
# Build the transformed feature matrix on a copy, leaving feature_mat_df untouched.
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: add its log-odds immediately before `prior_prob`.
# The position is looked up by name rather than hard-coded, so the cell keeps
# working if columns are added to or removed from the pair table.
# NOTE: logit maps a prior of exactly 0 or 1 to -inf / +inf; those values are
# passed through unchanged here.
prior_position = trans_df.columns.get_loc('prior_prob')
trans_df.insert(prior_position, 'prior_logit', logit(trans_df['prior_prob']))

# Transform degree features with the inverse hyperbolic sine
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: rescale by the column mean, then arcsinh
for feature in dwpc_features:
    dwpc = trans_df[feature]
    scale = dwpc.mean()
    # An all-zero column has mean 0; dividing by it would turn the whole
    # column into NaN, so such a column is left unscaled.
    trans_df[feature] = numpy.arcsinh(dwpc / scale if scale != 0 else dwpc)

# Standardize all features besides the prior (zero mean, unit sample std)
for feature in degree_features + dwpc_features:
    centered = trans_df[feature] - trans_df[feature].mean()
    spread = trans_df[feature].std()
    # A constant column has zero spread; only centre it instead of producing
    # NaN through 0 / 0.
    trans_df[feature] = centered / spread if spread > 0 else centered

trans_df.head(3)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status,prior_logit,prior_prob,degree_CbG,degree_CtD,degree_DaG,degree_DtC,dwpc_CbGaD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD
0,DB00014,,DOID:0050742,,,train,0,-6.48937,0.001517,-1.420283,0.303021,-1.356887,-0.730715,-0.51409,-1.333397,-0.731737,-1.014264,-0.702052,-0.295374
1,DB00091,,DOID:0050742,,,train,0,-5.315024,0.004893,1.527618,1.59749,-1.356887,-0.730715,-0.51409,-0.627804,-0.731737,0.474427,-0.702052,-0.295374
2,DB00104,,DOID:0050742,,,train,0,-6.48937,0.001517,-0.254745,0.303021,-1.356887,-0.730715,-0.51409,-0.330297,-0.731737,-1.014264,-0.702052,-0.295374


In [16]:
# Save the transformed feature matrix as bzip2-compressed TSV,
# without the index and with floats formatted as '%.4g'.
with bz2.open('features/transformed-features.tsv.bz2', mode='wt') as tsv_handle:
    trans_df.to_csv(
        tsv_handle,
        sep='\t',
        index=False,
        float_format='%.4g',
    )