# New extract and matrixify in one notebook

Goal: reproduce the required output files in a single notebook.

This notebook skips `3-extract` and parts of `4-matrixify`, but produces the same output files.

In [1]:
import pandas as pd
import numpy as np

import os
import bz2
import itertools

import sys

sys.path.append("..")

In [2]:
from src.extractor import MatrixFormattedGraph

---

## Get the matrices for all permutations

In [3]:
def get_matrix(folder, index):
    """Build the Compound-to-Disease matrix graph for one hetnet.

    Index 0 selects the files without a permutation suffix
    (``hetnet_nodes.csv`` / ``hetnet_edges.csv``); any other index selects
    the ``hetnet_perm-<index>_*.csv`` pair. Both files are looked up
    inside ``folder``.

    Returns the ``MatrixFormattedGraph`` built from those two files.
    """
    if index == 0:
        suffix = ""
    else:
        suffix = "_perm-{}".format(index)

    nodes_path = "{}/hetnet{}_nodes.csv".format(folder, suffix)
    edges_path = "{}/hetnet{}_edges.csv".format(folder, suffix)

    graph = MatrixFormattedGraph(
        nodes_path,
        edges_path,
        start_kind="Compound",
        end_kind="Disease",
        max_length=4,
    )
    return graph

In [4]:
folder = "../../integrate/data/import_csvs"

# Index 0 loads the unpermuted hetnet files; indices 1-5 load the
# "_perm-<index>" files (see get_matrix).
hetnet_indices = range(6)

matricies = list(map(lambda index: get_matrix(folder, index), hetnet_indices))

Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  2.88it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00,  9.99it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:01<00:00,  1.92it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 13.07it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:01<00:00,  2.87it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 27.85it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:01<00:00,  2.86it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 11.06it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.28it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 14.64it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.67it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 10.46it/s]


---

## Extract out the DWPCs

In [5]:
# One DWPC frame per hetnet, in the same order as `matricies`.
dwpcs = []
for mg in matricies:
    extracted = mg.extract_dwpc(
        start_nodes="Compound",
        end_nodes="Disease",
        n_jobs=32,
    )
    # Rename the compound identifier so it matches the "chemical_id" key
    # used when merging with the partitions table.
    dwpcs.append(extracted.rename(columns={"compound_id": "chemical_id"}))

Calculating DWPCs...


100%|██████████| 9/9 [00:00<00:00, 11.43it/s]



Reformating results...


100%|██████████| 9/9 [00:13<00:00,  1.74s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:00<00:00,  9.11it/s]



Reformating results...


100%|██████████| 9/9 [00:12<00:00,  1.39s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:00<00:00, 10.70it/s]



Reformating results...


100%|██████████| 9/9 [00:13<00:00,  1.89s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:01<00:00,  3.79it/s]



Reformating results...


100%|██████████| 9/9 [00:13<00:00,  1.62s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:01<00:00,  2.26it/s]



Reformating results...


100%|██████████| 9/9 [00:12<00:00,  1.75s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:01<00:00,  5.67it/s]



Reformating results...


100%|██████████| 9/9 [00:12<00:00,  1.40s/it]


---

## Subset data

In [6]:
# Load the compound-disease partitions and derive `idx`, the position of each
# row's hetnet in the `matricies` / `dwpcs` lists.
#
# The index is parsed from a trailing "_perm-<n>" suffix of the hetnet name;
# names without that suffix (the unpermuted hetnet) map to 0. Taking only the
# last character of the name would break for ten or more permutations and
# would only yield 0 for the unpermuted hetnet by coincidence of its version
# string ending in "0".
partitions = (pd
    .read_csv("data/partitions.tsv", sep='\t')
    .assign(idx = lambda df: (df["hetnet"]
        .str.extract(r"_perm-(\d+)$", expand=False)
        .fillna("0")
        .astype(np.int64)
    ))
)

In [7]:
pair_columns = ["chemical_id", "disease_id"]

# For every hetnet, keep exactly the compound-disease pairs that the
# partitions table assigns to it (right join), tagged with the hetnet name.
per_hetnet_frames = []
for val, dwpc_df in enumerate(dwpcs):
    assigned_pairs = partitions.loc[partitions["idx"] == val, ["hetnet"] + pair_columns]
    per_hetnet_frames.append(
        dwpc_df.merge(assigned_pairs, how="right", on=pair_columns)
    )

dwpc_spread_df = pd.concat(per_hetnet_frames)

In [8]:
# Discard every column that holds at least one missing value after stacking
# the per-hetnet frames (presumably metapath columns that are not present
# for every hetnet -- confirm before relying on this).
dwpc_spread_df = dwpc_spread_df.dropna(axis="columns", how="any")

In [9]:
dwpc_spread_df.shape

(30136, 12)

In [10]:
dwpc_spread_df.head()

Unnamed: 0,chemical_id,disease_id,CbGaDaGaD,CbGbCtD,CtDtCbGaD,CtDtCtD,CtDaGbCtD,CbGbCbGaD,CtDaGaD,CbGaDtCtD,CbGaD,hetnet
0,DB00305,DOID:14221,0.001353,0.0,0.002845,0.0,0.020182,0.002868,0.006951,0.0,0.0,rephetio-v2.0
1,DB00305,DOID:10534,0.007244,0.006638,0.0,0.0,0.0,0.004051,0.0,0.006274,0.0,rephetio-v2.0
2,DB00641,DOID:1936,0.015329,0.070543,0.0,0.0,0.0,0.023724,0.0,0.000672,0.008678,rephetio-v2.0
3,DB01013,DOID:12236,0.004113,0.0,0.002913,0.0,0.0,0.003225,0.008587,0.0,0.0,rephetio-v2.0
4,DB01013,DOID:219,0.003064,0.000657,0.001257,0.005645,0.0,0.002802,0.00267,0.001998,0.0,rephetio-v2.0


In [11]:
dwpc_spread_df["hetnet"].value_counts()

rephetio-v2.0_perm-2    5463
rephetio-v2.0_perm-1    5449
rephetio-v2.0_perm-4    5445
rephetio-v2.0_perm-3    5415
rephetio-v2.0_perm-5    5404
rephetio-v2.0           2960
Name: hetnet, dtype: int64

## Write to file

In [12]:
path = 'data/matrix/dwpc.tsv.bz2'

# This is the first write into data/matrix; create the directory so the
# notebook also runs from a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

# bz2-compressed TSV, no index column, floats with 5 significant digits.
with bz2.open(path, 'wt') as wf:
    dwpc_spread_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')

---

## Calculate degree features

In [13]:
url = "../../integrate/data/summary/metaedge-styles.tsv"

# Lookup from full metaedge name to its abbreviation, used to rename the
# degree columns below.
metaedge_style_df = pd.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

url = "../../integrate/data/summary/degrees.xlsx"


def read_degree_sheet(path, sheet, id_column):
    """Read one sheet of the degree workbook as a per-node degree table.

    The ``node_id`` column is renamed to ``id_column``, ``node_name`` is
    dropped, and any column whose name is a known metaedge is renamed to
    that metaedge's abbreviation.

    ``sheet_name`` is used because the older ``sheetname`` spelling is no
    longer accepted by current pandas releases.
    """
    return (pd
        .read_excel(path, sheet_name=sheet)
        .rename(columns={'node_id': id_column})
        .drop('node_name', axis='columns')
        .rename(columns=metaedge_to_abbreviation)
    )


disease_degree_df = read_degree_sheet(url, 'Disease', 'disease_id')
compound_degree_df = read_degree_sheet(url, 'Compound', 'chemical_id')

In [14]:
compound_degree_df.head(2)

Unnamed: 0,chemical_id,CbG,CtD
0,DB00014,2,1
1,DB00035,5,0


In [15]:
disease_degree_df.head(2)

Unnamed: 0,disease_id,DaG,DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


In [16]:
# Persist both degree tables as tab-separated files without the index column.
compound_degree_df.to_csv('data/matrix/compound_degree.tsv', index=False, sep='\t')
disease_degree_df.to_csv('data/matrix/disease_degree.tsv', index=False, sep='\t')

---

## Compute prior dataset

In [17]:
# Read compound and disease degrees
compound_df = pd.read_table('../summary/compounds.tsv')
disease_df = pd.read_table('../summary/diseases.tsv')

total_pairs = len(compound_df) * len(disease_df)

nonzero_prior_pairs = sum(compound_df.treats > 0) * sum(disease_df.treats > 0)
total_pairs, nonzero_prior_pairs

(186662, 23579)

In [18]:
pair_keys = ['chemical_id', 'disease_id']

# Every compound paired with every disease
rows = list(itertools.product(compound_df.chemical_id, disease_df.disease_id))
all_pairs_df = pd.DataFrame(rows, columns=pair_keys)

# Observed priors; only the pair identifiers and `prior_perm` are needed
observation_prior_df = pd.read_table('../prior/data/observation-prior.tsv')
observed_priors = observation_prior_df[pair_keys + ['prior_perm']]

# Left join keeps every pair; pairs without an observed prior get 0
prior_df = (all_pairs_df
    .merge(observed_priors, how='left', on=pair_keys)
    .fillna(0)
    .rename(columns={'prior_perm': 'prior_prob'})
)

prior_df.head(2)

Unnamed: 0,chemical_id,disease_id,prior_prob
0,DB01048,DOID:10652,0.003058
1,DB01048,DOID:9206,0.003058


In [19]:
sum(prior_df.prior_prob)

591.99999235996574

In [20]:
(prior_df.prior_prob > 0).value_counts(True)

False    0.873681
True     0.126319
Name: prior_prob, dtype: float64

In [21]:
# Write the prior table as TSV without the index; '%.5g' renders floats with
# 5 significant digits.
prior_df.to_csv('data/matrix/prior.tsv', index=False, sep='\t', float_format='%.5g')

---

## Create a single matrix-like dataframe

In [22]:
# Tables attached to the partition rows, in order. Every merge relies on the
# pandas defaults: an inner join on the columns both frames have in common.
tables_to_attach = [
    disease_df.iloc[:, :2],    # first two columns of the disease summary
    compound_df.iloc[:, :2],   # first two columns of the compound summary
    prior_df,
    compound_degree_df,
    disease_degree_df,
    dwpc_spread_df,
]

# Start from the partitions without the helper `idx` column
matrix_df = partitions.drop("idx", axis=1)
for table in tables_to_attach:
    matrix_df = matrix_df.merge(table)

In [23]:
matrix_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,status,primary,disease_name,chemical_name,prior_prob,CbG,CtD,...,DtC,CbGaDaGaD,CbGbCtD,CtDtCbGaD,CtDtCtD,CtDaGbCtD,CbGbCbGaD,CtDaGaD,CbGaDtCtD,CbGaD
0,rephetio-v2.0_perm-3,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.000446,0.001792,0.0,0.0,0.009797,0.000909,0.00545,0.0,0.0
1,rephetio-v2.0_perm-1,DB00091,DOID:0050742,0,1,nicotine dependence,Cyclosporine,0.004893,30,3,...,1,0.003851,0.000636,0.001031,0.0,0.0,0.006292,0.005458,0.001843,0.0


In [24]:
# Columns of dwpc_spread_df that identify a row rather than hold a DWPC value.
# Selecting the DWPC features by name, instead of by position (`[2:-1]`),
# keeps this working if the column order of dwpc_spread_df ever changes.
dwpc_id_columns = {'chemical_id', 'disease_id', 'hetnet'}
dwpc_features = [
    column for column in dwpc_spread_df.columns
    if column not in dwpc_id_columns
]

# One entry per feature group; the scalar `feature_type` is broadcast over
# the group's feature names when each dict becomes a DataFrame.
df_creators = [
    {'feature_type': 'prior', 'feature': ['prior_prob']},
    # the first column of each degree table is the node identifier
    {'feature_type': 'degree', 'feature': compound_degree_df.columns[1:]},
    {'feature_type': 'degree', 'feature': disease_degree_df.columns[1:]},
    {'feature_type': 'dwpc', 'feature': dwpc_features},
]

# ignore_index gives the stacked table a unique 0..n-1 index instead of
# repeating each group's own 0-based labels.
feature_df = pd.concat(map(pd.DataFrame, df_creators), ignore_index=True)

In [25]:
# Value of the `hetnet` column that identifies the unpermuted hetnet.
unperm_name = 'rephetio-v2.0'

In [26]:
# Rows belonging to the unpermuted hetnet, without the now-constant column
is_unperm = matrix_df["hetnet"] == unperm_name
unperm_matrix_df = matrix_df[is_unperm].drop('hetnet', axis='columns')

# Feature columns in the same order as the rows of feature_df, so the
# statistics can be assigned positionally.
unperm_feature_values = unperm_matrix_df[feature_df.feature]

feature_df['unperm_mean'] = list(unperm_feature_values.mean())
feature_df['unperm_sd'] = list(unperm_feature_values.std())

feature_df.head(2)

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.033654,0.062149
0,CbG,degree,11.528716,13.092279


In [27]:
# Feature metadata table: TSV, no index column, floats with 5 significant digits.
feature_df.to_csv('data/matrix/feature-type.tsv', index=False, sep='\t', float_format='%.5g')

# Full feature matrix (rows for every hetnet), written as a bz2-compressed TSV.
path = 'data/matrix/features.tsv.bz2'
with bz2.open(path, 'wt') as wf:
    matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')

In [28]:
# Save hetnet specific feature files
directory = os.path.join('data', 'matrix', unperm_name)

# makedirs with exist_ok also creates missing parent directories and does not
# fail when the directory already exists, unlike an exists-check plus mkdir.
os.makedirs(directory, exist_ok=True)

# Features of the unpermuted hetnet only, as a bz2-compressed TSV.
path = os.path.join(directory, 'features.tsv.bz2')
with bz2.open(path, 'wt') as wf:
    unperm_matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')