# New extraction step for prediction generation

In [1]:
import pandas as pd
import multiprocessing as mp
import json

import bz2
import numpy
from scipy.special import logit

import sys

# Add the parent directory to the import search path; the next cell imports
# from the `src` package (presumably located there — confirm the repo layout).
sys.path.append("..")

In [2]:
from src.extractor import MatrixFormattedGraph

---

## Calculate DWPCs for informative features

In [3]:
# Load the metapath definitions and keep just the abbreviation of each one;
# these abbreviations are what the DWPC extraction below is asked for.
with open("features/metapaths.json", "r") as metapath_file:
    metapaths = json.load(metapath_file)

paths = [entry["abbreviation"] for entry in metapaths]

In [4]:
# Display the metapath abbreviations read from features/metapaths.json.
paths

['CbGaD',
 'CrCtD',
 'CtDrD',
 'CbGbCtD',
 'CbGeAlD',
 'CbGiGaD',
 'CbGuAlD',
 'CbGuDrD',
 'CcSEcCtD',
 'CdGbCtD',
 'CdGdCtD',
 'CdGiGaD',
 'CiPCiCtD',
 'CrCbGaD',
 'CrCdGaD',
 'CrCrCtD',
 'CrCtDrD',
 'CrCuGaD',
 'CtDtCtD',
 'CuGiGaD',
 'CuGuCtD',
 'CbGaDaGaD',
 'CbGaDlAlD',
 'CbGaDpSpD',
 'CbGaDrDrD',
 'CbGaDtCtD',
 'CbGbCbGaD',
 'CbGbCdGaD',
 'CbGbCdGuD',
 'CbGbCrCtD',
 'CbGbCuGaD',
 'CbGdAdGaD',
 'CbGdCdGaD',
 'CbGdCrCtD',
 'CbGdCtDrD',
 'CbGdCuGaD',
 'CbGdDaGaD',
 'CbGdDdGaD',
 'CbGdDlAlD',
 'CbGdDpSpD',
 'CbGdDuGaD',
 'CbGeAeGaD',
 'CbGeAlDrD',
 'CbGeAuGaD',
 'CbGiGaDrD',
 'CbGiGbCtD',
 'CbGiGiGaD',
 'CbGiGuAlD',
 'CbGpBPpGaD',
 'CbGpMFpGaD',
 'CbGpPWpGaD',
 'CbGuAeGaD',
 'CbGuAlDrD',
 'CbGuAuGaD',
 'CbGuCdGaD',
 'CbGuCtDrD',
 'CbGuCuGaD',
 'CbGuDaGaD',
 'CbGuDdGaD',
 'CbGuDlAlD',
 'CbGuDpSpD',
 'CbGuDrDrD',
 'CbGuDuGaD',
 'CcSEcCbGaD',
 'CcSEcCdGuD',
 'CcSEcCrCtD',
 'CcSEcCuGaD',
 'CdGaDaGaD',
 'CdGaDuGaD',
 'CdGcGiGaD',
 'CdGdCrCtD',
 'CdGdCtDrD',
 'CdGiGiGaD',
 'CdGuDaGaD',
 'C

---

In [5]:
# Build the matrix-backed graph from the node and edge CSV exports.
# Metapaths run from Compound to Disease and are capped at max_length=4.
nodes_csv = "../../integrate/data/import_csvs/hetnet_nodes.csv"
edges_csv = "../../integrate/data/import_csvs/hetnet_edges.csv"

mg = MatrixFormattedGraph(
    nodes_csv,
    edges_csv,
    start_kind="Compound",
    end_kind="Disease",
    max_length=4,
)

Reading file information...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 23/23 [01:09<00:00,  1.67it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 23/23 [00:34<00:00,  1.99s/it]


In [6]:
# Compute the DWPC of every selected metapath between Compound and Disease
# nodes; n_jobs is set to the number of CPUs reported by multiprocessing.
available_cpus = mp.cpu_count()

dwpcs = mg.extract_dwpc(
    metapaths=paths,
    start_nodes="Compound",
    end_nodes="Disease",
    n_jobs=available_cpus,
)

Calculating DWPCs...


100%|██████████| 110/110 [00:38<00:00,  5.17s/it]



Reformating results...


100%|██████████| 110/110 [01:10<00:00,  1.95it/s]


In [7]:
def _dwpc_column_name(column):
    """Map a column of the raw DWPC table to its name in the feature matrix.

    * ``compound_id`` becomes ``chemical_id``; ``disease_id`` is kept.
    * Every other column (a metapath abbreviation) gets a ``dwpc_`` prefix.
    * Names that are already converted (``chemical_id`` or anything starting
      with ``dwpc_``) are returned unchanged, so applying the mapping twice
      gives the same result as applying it once.
    """
    id_columns = {"compound_id": "chemical_id", "disease_id": "disease_id"}
    if column in id_columns:
        return id_columns[column]
    if column == "chemical_id" or column.startswith("dwpc_"):
        return column
    return "dwpc_" + column


# A single rename through an idempotent mapper replaces the earlier
# "prefix everything, then un-prefix the id columns" sequence. That sequence
# overwrote dwpcs.columns in place, so re-running the cell produced
# 'dwpc_dwpc_*' columns and left the id columns wrongly named.
dwpcs = dwpcs.rename(columns=_dwpc_column_name)

In [8]:
# (rows, columns) of the full DWPC table.
dwpcs.shape

(212624, 112)

In [9]:
# Preview the first rows of the DWPC table to check the column names.
dwpcs.head()

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,dwpc_CrCtD,dwpc_CtDrD,dwpc_CbGbCtD,dwpc_CbGeAlD,dwpc_CbGiGaD,dwpc_CbGuAlD,dwpc_CbGuDrD,...,dwpc_CtDrDtCtD,dwpc_CtDtCrCtD,dwpc_CuGaDuGaD,dwpc_CuGbCtDrD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGuCbGaD,dwpc_CuGuCrCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD
0,DB01595,DOID:11615,0.0,0.0,0.0,0.0,0.000498,0.0,0.000499,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DB01595,DOID:13241,0.0,0.0,0.0,0.0,0.005107,0.0,0.00328,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DB01595,DOID:12365,0.0,0.0,0.0,0.010235,0.001718,9.3e-05,0.00186,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DB01595,DOID:363,0.0,0.0,0.0,0.001738,0.001612,0.000329,0.00116,0.001119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DB01595,DOID:1319,0.0,0.0,0.0,0.001541,0.006177,0.000121,0.005345,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

## Subset DWPCs

In [10]:
# Compound–disease pairs (tab-separated, bz2-compressed) used to subset the DWPCs.
pairs = pd.read_csv("features/compound-disease-pairs.tsv.bz2", sep="\t")

In [11]:
# (rows, columns) of the compound–disease pair table.
pairs.shape

(55768, 7)

In [12]:
# Preview the first two pair rows to check the column layout.
pairs.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status
0,DB00014,,DOID:0050742,,,train,0
1,DB00014,,DOID:0060073,,,train,0


In [13]:
# Restrict the DWPC table to the distinct (chemical_id, disease_id) pairs of
# interest. With how="right" every distinct pair is kept, including any pair
# that has no row in the DWPC table.
pair_keys = pairs[["chemical_id", "disease_id"]].drop_duplicates()

dwpcs = dwpcs.merge(pair_keys, how="right", on=["chemical_id", "disease_id"])

In [14]:
# (rows, columns) of the DWPC table after subsetting to the pairs of interest.
dwpcs.shape

(55305, 112)

In [15]:
# Preview the first rows of the subsetted DWPC table.
dwpcs.head()

Unnamed: 0,chemical_id,disease_id,dwpc_CbGaD,dwpc_CrCtD,dwpc_CtDrD,dwpc_CbGbCtD,dwpc_CbGeAlD,dwpc_CbGiGaD,dwpc_CbGuAlD,dwpc_CbGuDrD,...,dwpc_CtDrDtCtD,dwpc_CtDtCrCtD,dwpc_CuGaDuGaD,dwpc_CuGbCtDrD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGuCbGaD,dwpc_CuGuCrCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD
0,DB00441,DOID:12365,0.0,0.0,0.0,0.000881,0.011484,0.000517,0.008128,0.0,...,0.0,0.0,0.004917,0.0,0.002366,0.025473,0.002792,0.005475,0.0,0.029277
1,DB00441,DOID:363,0.059557,0.0,0.0,0.003035,0.013855,0.00276,0.007495,0.0,...,0.041666,0.020541,0.010085,0.001845,0.008141,0.027562,0.022479,0.030058,0.019431,0.034918
2,DB00441,DOID:1319,0.0,0.0,0.0,0.039331,0.00564,0.005431,0.00128,0.0,...,0.01531,0.00747,0.007558,0.001579,0.012385,0.023547,0.015886,0.031982,0.016532,0.028792
3,DB00441,DOID:13189,0.0,0.0,0.0,0.002867,0.001202,0.0,0.0,0.003657,...,0.001307,0.0,0.003428,0.000701,0.001017,0.015974,0.018489,0.013264,0.012554,0.006245
4,DB00441,DOID:12361,0.0,0.0,0.0,0.0,0.005509,0.000598,0.002326,0.0,...,0.0,0.0,0.005013,0.003219,0.001689,0.020047,0.001863,0.0,0.007813,0.026349


---

In [16]:
# Prior probability table, tab-separated; show the last two rows as a check.
prior_df = pd.read_csv('../all-features/data/matrix/prior.tsv', sep='\t')
prior_df.tail(2)

Unnamed: 0,chemical_id,disease_id,prior_prob
209166,DB01624,DOID:12306,0.0
209167,DB01624,DOID:1245,0.0


In [17]:
def _prefix_degree_columns(degree_df):
    """Return ``degree_df`` with 'degree_' prepended to every column name except the first."""
    renamed = {name: 'degree_' + name for name in degree_df.columns[1:]}
    return degree_df.rename(columns=renamed)


# Read both degree tables and mark their value columns as degree features;
# the first column of each table is left unprefixed.
compound_degree_df = _prefix_degree_columns(
    pd.read_table('../all-features/data/matrix/compound_degree.tsv')
)
disease_degree_df = _prefix_degree_columns(
    pd.read_table('../all-features/data/matrix/disease_degree.tsv')
)

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DaG,degree_DdG,degree_DlA,degree_DpS,degree_DrD,degree_DtC,degree_DuG
0,DOID:0050156,18,250,4,8,2,0,250
1,DOID:0050425,12,0,16,21,6,0,0


---

In [18]:
# Assemble the feature matrix by merging each feature table onto the pairs,
# one after another. No `on=` is passed, so every merge joins on the columns
# the two frames have in common.
feature_mat_df = pairs
for feature_table in (prior_df, compound_degree_df, disease_degree_df, dwpcs):
    feature_mat_df = feature_mat_df.merge(feature_table)

In [19]:
# (rows, columns) of the assembled feature matrix.
feature_mat_df.shape

(55768, 132)

In [20]:
# Number of feature-matrix rows per value of the `origin` column.
feature_mat_df["origin"].value_counts()

train      38982
holdout    16786
Name: origin, dtype: int64

In [21]:
# Save the untransformed feature matrix as a bz2-compressed TSV, without the
# index and with floats formatted as '%.4g'.
with bz2.open('features/features.tsv.bz2', 'wt') as features_out:
    feature_mat_df.to_csv(
        features_out,
        sep='\t',
        index=False,
        float_format='%.4g',
    )

---

In [22]:
def _scale_by_mean(values):
    """Divide ``values`` by its mean, returning it unchanged when the mean is zero.

    Dividing by a zero mean (e.g. an all-zero DWPC column) would turn the
    column into NaN/inf, so such a column is passed through as is.
    """
    mean = values.mean()
    return values if mean == 0 else values / mean


def _standardize(values):
    """Center ``values`` on its mean and scale it by its standard deviation.

    A constant column has a standard deviation of zero; it is only centered,
    which yields zeros instead of the NaN that 0/0 would produce.
    """
    centered = values - values.mean()
    spread = values.std()
    return centered if spread == 0 else centered / spread


# Work on a copy so feature_mat_df keeps the untransformed values.
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: insert its logit directly before the prior_prob column.
# The position is looked up instead of hard-coded so the cell keeps working
# if the columns in front of prior_prob change.
# NOTE(review): logit maps a prior of exactly 0 or 1 to -inf/+inf — confirm
# that no such rows reach this point.
prior_position = trans_df.columns.get_loc('prior_prob')
trans_df.insert(prior_position, 'prior_logit', logit(trans_df.prior_prob))

# Transform degree features
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: rescale by the column mean, then apply arcsinh
for feature in dwpc_features:
    trans_df[feature] = numpy.arcsinh(_scale_by_mean(trans_df[feature]))

# Standardize all features besides the prior
for feature in degree_features + dwpc_features:
    trans_df[feature] = _standardize(trans_df[feature])

trans_df.head(3)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,category,origin,status,prior_logit,prior_prob,degree_CbG,...,dwpc_CtDrDtCtD,dwpc_CtDtCrCtD,dwpc_CuGaDuGaD,dwpc_CuGbCtDrD,dwpc_CuGcGiGaD,dwpc_CuGdDpSpD,dwpc_CuGuCbGaD,dwpc_CuGuCrCtD,dwpc_CuGuCtDrD,dwpc_CuGuCuGaD
0,DB00014,,DOID:0050742,,,train,0,-6.48937,0.001517,-1.3939,...,-0.327949,-0.325709,-0.639979,-0.345238,-0.582593,-0.625693,-0.383295,-0.510571,0.020766,-0.652319
1,DB00091,,DOID:0050742,,,train,0,-5.315024,0.004893,1.518152,...,-0.327949,-0.325709,-0.128367,-0.345238,1.670098,2.00133,1.315657,-0.510571,1.325222,-0.049884
2,DB00104,,DOID:0050742,,,train,0,-6.48937,0.001517,-0.242536,...,-0.327949,-0.325709,-0.639979,-0.345238,-0.582593,-0.625693,-0.721822,-0.510571,-0.643848,-0.678037


In [23]:
# Save the transformed feature matrix as a bz2-compressed TSV, without the
# index and with floats formatted as '%.4g'.
with bz2.open('features/transformed-features.tsv.bz2', 'wt') as transformed_out:
    trans_df.to_csv(
        transformed_out,
        sep='\t',
        index=False,
        float_format='%.4g',
    )