# New extraction step for prediction generation

In [1]:
import pandas as pd
import multiprocessing as mp
import json

import bz2
import numpy
from scipy.special import logit

import sys

sys.path.append("..")

In [2]:
from src.extractor import MatrixFormattedGraph

---

## Calculate DWPCs for informative features

In [3]:
# Load the metapath definitions and keep only their abbreviations; these
# abbreviations are what gets passed to the DWPC extraction below.
with open("features/metapaths.json", "r") as metapath_file:
    metapaths = json.load(metapath_file)

paths = [metapath["abbreviation"] for metapath in metapaths]

In [4]:
# Show the metapath abbreviations that will be used for DWPC extraction.
paths

['CbGaD', 'CbGbCtD', 'CtDtCtD', 'CbGaDaGaD', 'CbGaDtCtD', 'CbGbCbGaD']

---

In [5]:
# Build the matrix-formatted graph from the exported hetnet node and edge
# tables, with Compound as the start kind and Disease as the end kind.
# NOTE(review): max_length=4 presumably caps the metapath length considered
# (the longest requested metapath has four edges) -- confirm in src.extractor.
nodes_csv = "../../integrate/data/import_csvs/hetnet_nodes.csv"
edges_csv = "../../integrate/data/import_csvs/hetnet_edges.csv"

mg = MatrixFormattedGraph(
    nodes_csv,
    edges_csv,
    start_kind="Compound",
    end_kind="Disease",
    max_length=4,
)

Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:01<00:00,  1.84it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00,  4.94it/s]


In [6]:
# Extract the DWPC value of every requested metapath between Compound and
# Disease nodes, using one job per available CPU core.
worker_count = mp.cpu_count()

dwpcs = mg.extract_dwpc(
    metapaths=paths,
    start_nodes="Compound",
    end_nodes="Disease",
    n_jobs=worker_count,
)

Calculating DWPCs...


100%|██████████| 6/6 [00:00<00:00,  7.55it/s]



Reformating results...


100%|██████████| 6/6 [00:23<00:00,  3.71s/it]


In [7]:
# Give the extractor output its final column names in one pass:
#   * every metapath column gets a "dwpc_" prefix
#   * "compound_id" becomes "chemical_id" ("disease_id" keeps its name)
#
# A single mapping-based rename is used instead of blindly prefixing every
# column and then undoing the prefix on the ID columns. rename() ignores
# mapping keys that are not present, so re-running this cell leaves the
# already-renamed frame unchanged instead of producing "dwpc_dwpc_*" columns.
# The metapath columns are the abbreviations in `paths`, which is what was
# requested from extract_dwpc.
column_names = {path: "dwpc_" + path for path in paths}
column_names["compound_id"] = "chemical_id"

dwpcs = dwpcs.rename(columns=column_names)

In [8]:
# (rows, columns) of the full DWPC table after renaming.
dwpcs.shape

(212624, 8)

In [9]:
# Preview the first rows to check the identifier and dwpc_ column names.
dwpcs.head()

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD
0,DB00465,DOID:7148,0.003429,0.008544,0.0,0.010193,0.004,0.006288
1,DB00465,DOID:14330,0.0,0.0,0.0,0.006266,0.0,0.002362
2,DB00465,DOID:8577,0.012938,0.021249,0.0,0.00623,0.002391,0.011538
3,DB00465,DOID:10763,0.007359,0.009657,0.0,0.014443,0.001433,0.016377
4,DB00465,DOID:1312,0.0,0.0,0.0,0.004861,0.000527,0.001592


---

## Subset DWPCs

In [10]:
# Load the tab-separated compound-disease pairs (compression is inferred
# from the .bz2 extension).
pairs = pd.read_csv("features/compound-disease-pairs.tsv.bz2", sep="\t")

In [11]:
# (rows, columns) of the compound-disease pair table.
pairs.shape

(55768, 7)

In [12]:
# Preview the first two pairs and the columns that accompany them.
pairs.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status
0,DB00014,,DOID:0050742,,,train,0
1,DB00014,,DOID:0060073,,,train,0


In [13]:
# Restrict the DWPC table to the distinct compound-disease pairs of interest.
# NOTE(review): with how="right" every distinct pair is kept, so a pair with
# no row in `dwpcs` ends up with NaN in all dwpc_ columns -- confirm this is
# intended rather than an inner join.
pair_keys = ["chemical_id", "disease_id"]
unique_pairs = pairs[pair_keys].drop_duplicates()

dwpcs = pd.merge(dwpcs, unique_pairs, how="right", on=pair_keys)

In [14]:
# (rows, columns) of the DWPC table after restricting it to the pairs.
dwpcs.shape

(55305, 8)

In [15]:
# Preview the subset DWPC table.
dwpcs.head()

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD
0,DB01577,DOID:7148,0.0,0.000359,0.0,0.009033,0.001288,0.004466
1,DB01577,DOID:14330,0.030913,0.0,0.0,0.016834,0.0,0.041583
2,DB01577,DOID:8577,0.006942,0.00044,0.0,0.005156,0.001588,0.010714
3,DB01577,DOID:10763,0.019299,0.045888,0.0,0.018277,0.004652,0.047164
4,DB01577,DOID:1312,0.0,0.000611,0.0,0.003409,0.0,0.001127


---

In [16]:
# Load the per-pair prior probabilities and show the last two rows.
prior_path = '../all-features/data/matrix/prior.tsv'
prior_df = pd.read_table(prior_path)

prior_df.tail(2)

Unnamed: 0,chemical_id,disease_id,prior_prob
186660,DB01624,DOID:12306,0.0
186661,DB01624,DOID:1245,0.0


In [17]:
def _with_degree_prefix(table):
    """Return `table` with 'degree_' prepended to every column name except the first."""
    prefixed = {name: 'degree_' + name for name in table.columns[1:]}
    return table.rename(columns=prefixed)


# Load the compound and disease degree tables; the first column of each is
# left as-is and the remaining columns are marked as degree features.
compound_degree_df = _with_degree_prefix(
    pd.read_table('../all-features/data/matrix/compound_degree.tsv')
)
disease_degree_df = _with_degree_prefix(
    pd.read_table('../all-features/data/matrix/disease_degree.tsv')
)

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DaG,degree_DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


---

In [18]:
# Assemble the feature matrix: start from the pairs and join, in order, the
# prior, the compound degrees, the disease degrees and the DWPCs.
# Each merge uses pandas' defaults (inner join on the columns the two frames
# have in common), so rows without a match in a table are dropped.
feature_mat_df = pairs
for feature_table in (prior_df, compound_degree_df, disease_degree_df, dwpcs):
    feature_mat_df = feature_mat_df.merge(feature_table)

In [19]:
# (rows, columns) of the assembled feature matrix.
feature_mat_df.shape

(55598, 18)

In [20]:
# Number of feature-matrix rows for each value of the origin column.
feature_mat_df["origin"].value_counts()

train      38889
holdout    16709
Name: origin, dtype: int64

In [21]:
# Save the untransformed feature matrix as a bz2-compressed TSV, without the
# index and with floats written to four significant digits.
tsv_options = dict(sep='\t', index=False, float_format='%.4g')

with bz2.open('features/features.tsv.bz2', 'wt') as features_out:
    feature_mat_df.to_csv(features_out, **tsv_options)

---

In [22]:
# Work on a copy so the untransformed feature matrix stays available and
# this cell can be re-run safely.
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: add the log-odds of the prior probability directly in
# front of the prior_prob column. The position is looked up instead of
# hard-coded so prior_logit stays next to prior_prob even if the number of
# leading columns changes.
# NOTE: logit maps a prior of exactly 0 to -inf and a prior of exactly 1 to +inf.
prior_position = trans_df.columns.get_loc('prior_prob')
trans_df.insert(prior_position, 'prior_logit', logit(trans_df['prior_prob']))

# Transform degree features with the inverse hyperbolic sine
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: scale each by its own mean, then apply arcsinh.
# NOTE: a feature whose mean is 0 yields NaN here (0 / 0).
for feature in dwpc_features:
    x = trans_df[feature]
    trans_df[feature] = numpy.arcsinh(x / x.mean())

# Standardize all features besides the prior (zero mean, unit standard deviation).
# NOTE: a constant feature has a standard deviation of 0 and yields NaN here.
for feature in degree_features + dwpc_features:
    x = trans_df[feature]
    trans_df[feature] = (x - x.mean()) / x.std()

trans_df.head(3)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status,prior_logit,prior_prob,degree_CbG,degree_CtD,degree_DaG,degree_DtC,dwpc_CbGaD,dwpc_CbGbCtD,dwpc_CtDtCtD,dwpc_CbGaDaGaD,dwpc_CbGaDtCtD,dwpc_CbGbCbGaD
0,DB00014,,DOID:0050742,,,train,0,-6.48937,0.001517,-1.420283,0.303021,-1.356887,-0.730715,-0.51409,-0.702052,-0.29475,-1.32985,-0.731192,-1.008686
1,DB00091,,DOID:0050742,,,train,0,-5.315024,0.004893,1.527618,1.59749,-1.356887,-0.730715,-0.51409,-0.702052,-0.29475,-0.628026,-0.731192,0.455704
2,DB00104,,DOID:0050742,,,train,0,-6.48937,0.001517,-0.254745,0.303021,-1.356887,-0.730715,-0.51409,-0.702052,-0.29475,-0.331944,-0.731192,-1.008686


In [23]:
# Save the transformed feature matrix as a bz2-compressed TSV, without the
# index and with floats written to four significant digits.
tsv_options = dict(sep='\t', index=False, float_format='%.4g')

with bz2.open('features/transformed-features.tsv.bz2', 'wt') as transformed_out:
    trans_df.to_csv(transformed_out, **tsv_options)