# New extract and matrixify in one notebook

Goal: reproduce the files needed by the later steps.

This notebook skips `3-extract` and parts of `4-matrixify`,
but produces the same output files.

In [1]:
import pandas as pd
import numpy as np

import os
import bz2
import itertools

import sys

sys.path.append("..")

In [2]:
from src.extractor import MatrixFormattedGraph

---

## Get the matrices for all permutations

In [3]:
def get_matrix(folder, index):
    """Build the Compound-to-Disease matrix graph for one hetnet in `folder`.

    `index` 0 selects the `hetnet_nodes.csv` / `hetnet_edges.csv` pair; any
    other value selects the `hetnet_perm-<index>_nodes.csv` /
    `hetnet_perm-<index>_edges.csv` pair.

    Returns a `MatrixFormattedGraph` built from those two files with
    `max_length=4`.
    """
    suffix = "_perm-{}".format(index) if index != 0 else ""
    path_parts = (folder, suffix)

    node_file = "{}/hetnet{}_nodes.csv".format(*path_parts)
    edge_file = "{}/hetnet{}_edges.csv".format(*path_parts)

    graph_options = dict(start_kind="Compound", end_kind="Disease", max_length=4)
    return MatrixFormattedGraph(node_file, edge_file, **graph_options)

In [4]:
# Directory containing the hetnet node/edge CSV files read by get_matrix.
folder = "../../integrate/data/import_csvs"

# One graph per network: index 0 has no "_perm-N" suffix, 1-5 are permutations.
matricies = []
for index in range(6):
    matricies.append(get_matrix(folder, index))

Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.55it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00,  8.24it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.31it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 10.28it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.56it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 14.14it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.62it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 12.32it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.69it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00,  7.35it/s]


Reading file information...
Initializing metagraph...
Generating adjcency matrices...


100%|██████████| 3/3 [00:00<00:00,  3.22it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 3/3 [00:00<00:00, 11.07it/s]


---

## Extract out the DWPCs

In [5]:
# Extract Compound-to-Disease DWPC features from every network, renaming the
# compound identifier column to `chemical_id` (the key used in later merges).
dwpcs = []
for mg in matricies:
    raw_dwpc_df = mg.extract_dwpc(
        start_nodes="Compound",
        end_nodes="Disease",
        n_jobs=32,
    )
    dwpcs.append(raw_dwpc_df.rename(columns={"compound_id": "chemical_id"}))

Calculating DWPCs...


100%|██████████| 9/9 [00:02<00:00,  3.41it/s]



Reformating results...


100%|██████████| 9/9 [01:12<00:00, 16.05s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:04<00:00,  1.03it/s]



Reformating results...


100%|██████████| 9/9 [00:41<00:00,  5.41s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:03<00:00,  1.12it/s]



Reformating results...


100%|██████████| 9/9 [00:22<00:00,  2.30s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:02<00:00,  2.17it/s]



Reformating results...


100%|██████████| 9/9 [00:16<00:00,  1.42s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:02<00:00,  2.19it/s]



Reformating results...


100%|██████████| 9/9 [00:15<00:00,  1.49s/it]


Calculating DWPCs...


100%|██████████| 9/9 [00:01<00:00,  5.70it/s]



Reformating results...


100%|██████████| 9/9 [00:14<00:00,  1.32s/it]


---

## Subset data

In [6]:
# Load the compound-disease partitions and derive `idx`, the position of each
# row's network in `dwpcs` (0 = no "_perm-N" suffix, N = permutation N).
#
# The whole "_perm-<N>" suffix is parsed rather than only the last character
# of the name, so permutation numbers of 10 or more are not silently mapped
# onto the wrong network (e.g. "_perm-10" becoming index 0).
partitions = (pd
    .read_csv("data/partitions.tsv", sep='\t')
    .assign(idx = lambda df: (df["hetnet"]
        .str.extract(r"_perm-(\d+)$", expand=False)
        .fillna("0")  # names without a permutation suffix get index 0
        .astype(np.int64)
    ))
)

In [7]:
# For each network, keep exactly the compound-disease pairs that the
# partitions table assigns to it (right join), then stack all networks.
pair_columns = ["hetnet", "chemical_id", "disease_id"]

per_network_frames = []
for val, dwpc_df in enumerate(dwpcs):
    network_pairs = partitions.loc[partitions["idx"] == val, pair_columns]
    per_network_frames.append(
        dwpc_df.merge(network_pairs, how="right", on=["chemical_id", "disease_id"])
    )

dwpc_spread_df = pd.concat(per_network_frames)

In [8]:
# Discard every column that contains at least one missing value.
dwpc_spread_df = dwpc_spread_df.dropna(how="any", axis="columns")

In [9]:
# Sanity check: (rows, columns) of the combined DWPC table.
dwpc_spread_df.shape

(30198, 12)

In [10]:
# Preview the first rows of the combined DWPC table.
dwpc_spread_df.head()

Unnamed: 0,chemical_id,disease_id,CbGaD,CtDaGaD,CbGaDaGaD,CtDaGbCtD,CbGaDtCtD,CbGbCtD,CbGbCbGaD,CtDtCbGaD,CtDtCtD,hetnet
0,DB00468,DOID:635,0.0,0.025201,0.003199,0.0,0.0,0.051552,0.003482,0.0,0.0,rephetio-v2.0
1,DB00468,DOID:10534,0.0,0.001986,0.009869,0.001308,0.005191,0.006196,0.014856,0.003513,0.0,rephetio-v2.0
2,DB00468,DOID:1115,0.0,0.002964,0.005034,0.000589,0.003827,0.006297,0.00343,0.0,0.0,rephetio-v2.0
3,DB00468,DOID:3310,0.0,0.017528,0.005008,0.000449,0.002203,0.024342,0.002396,0.0,0.0,rephetio-v2.0
4,DB00468,DOID:12930,0.007265,0.006323,0.003083,0.0,0.000806,0.002914,0.016035,0.003013,0.0,rephetio-v2.0


In [11]:
# Number of rows in the combined DWPC table for each hetnet.
dwpc_spread_df["hetnet"].value_counts()

rephetio-v2.0_perm-3    5470
rephetio-v2.0_perm-2    5465
rephetio-v2.0_perm-5    5455
rephetio-v2.0_perm-4    5440
rephetio-v2.0_perm-1    5408
rephetio-v2.0           2960
Name: hetnet, dtype: int64

## Write to file

In [12]:
# Persist the combined DWPC table as a bz2-compressed, tab-separated file.
path = 'data/matrix/dwpc.tsv.bz2'
with bz2.open(path, mode='wt') as out_file:
    dwpc_spread_df.to_csv(
        out_file,
        sep='\t',
        index=False,
        float_format='%.5g',
    )

---

## Calculate degree features

In [13]:
def _read_degree_sheet(workbook, sheet, id_column, column_renames):
    """Read one sheet of the degree workbook as a per-node degree table.

    The `node_id` column is renamed to `id_column`, the `node_name` column is
    dropped, and the remaining columns are renamed through `column_renames`
    (metaedge name -> abbreviation).
    """
    return (pd
        # `sheet_name` replaces the deprecated `sheetname` keyword, which
        # current pandas releases no longer accept.
        .read_excel(workbook, sheet_name=sheet)
        .rename(columns={'node_id': id_column})
        .drop('node_name', axis='columns')
        .rename(columns=column_renames)
    )


# Mapping from full metaedge names to their abbreviations, used as column names.
url = "../../integrate/data/summary/metaedge-styles.tsv"

metaedge_style_df = pd.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

# Workbook with one sheet of node degrees per node type.
url = "../../integrate/data/summary/degrees.xlsx"

disease_degree_df = _read_degree_sheet(
    url, 'Disease', 'disease_id', metaedge_to_abbreviation
)

compound_degree_df = _read_degree_sheet(
    url, 'Compound', 'chemical_id', metaedge_to_abbreviation
)

In [14]:
# Preview the first two rows of the compound degree table.
compound_degree_df.head(2)

Unnamed: 0,chemical_id,CbG,CtD
0,DB00014,2,1
1,DB00035,5,0


In [15]:
# Preview the first two rows of the disease degree table.
disease_degree_df.head(2)

Unnamed: 0,disease_id,DaG,DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


In [16]:
# Write both degree tables as tab-separated files without the row index.
degree_outputs = (
    (compound_degree_df, 'data/matrix/compound_degree.tsv'),
    (disease_degree_df, 'data/matrix/disease_degree.tsv'),
)
for degree_df, degree_path in degree_outputs:
    degree_df.to_csv(degree_path, index=False, sep='\t')

---

## Compute prior dataset

In [17]:
# Read compound and disease degrees
compound_df = pd.read_table('../summary/compounds.tsv')
disease_df = pd.read_table('../summary/diseases.tsv')

total_pairs = len(compound_df) * len(disease_df)

nonzero_prior_pairs = sum(compound_df.treats > 0) * sum(disease_df.treats > 0)
total_pairs, nonzero_prior_pairs

(186662, 23579)

In [18]:
# All compound-disease combinations, in compound-major order.
rows = list(itertools.product(compound_df.chemical_id, disease_df.disease_id))
all_pairs_df = pd.DataFrame(rows, columns=['chemical_id', 'disease_id'])

# Priors read from the observation-prior table, limited to the key columns
# and `prior_perm`.
observed_prior_df = pd.read_table('../prior/data/observation-prior.tsv')[
    ['chemical_id', 'disease_id', 'prior_perm']
]

# Left join keeps every pair; pairs absent from the prior table get 0.
prior_df = (all_pairs_df
    .merge(observed_prior_df, how='left')
    .fillna(0)
    .rename(columns={'prior_perm': 'prior_prob'})
)

prior_df.head(2)

Unnamed: 0,chemical_id,disease_id,prior_prob
0,DB01048,DOID:10652,0.003058
1,DB01048,DOID:9206,0.003058


In [19]:
# Sanity check: total of `prior_prob` over all compound-disease pairs.
sum(prior_df.prior_prob)

591.99999235996574

In [20]:
# Fraction of pairs with a zero (False) versus positive (True) prior.
prior_df.prior_prob.gt(0).value_counts(normalize=True)

False    0.873681
True     0.126319
Name: prior_prob, dtype: float64

In [21]:
# Write the prior table as a tab-separated file without the row index.
prior_df.to_csv(
    'data/matrix/prior.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)

---

## Create a single matrix-like dataframe

In [22]:
# Start from the partitions (without the helper `idx` column) and inner-join
# each table in turn; every merge uses the columns the two frames share.
tables_to_join = (
    disease_df.iloc[:, :2],
    compound_df.iloc[:, :2],
    prior_df,
    compound_degree_df,
    disease_degree_df,
    dwpc_spread_df,
)

matrix_df = partitions.drop("idx", axis=1)
for table in tables_to_join:
    matrix_df = matrix_df.merge(table)

In [23]:
# Preview the first two rows of the merged feature matrix.
matrix_df.head(2)

Unnamed: 0,hetnet,chemical_id,disease_id,status,primary,disease_name,chemical_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CtDaGaD,CbGaDaGaD,CtDaGbCtD,CbGaDtCtD,CbGbCtD,CbGbCbGaD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0,DB00014,DOID:0050742,0,1,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,rephetio-v2.0_perm-1,DB00014,DOID:0050742,0,0,nicotine dependence,Goserelin,0.001517,2,1,...,1,0.0,0.004874,0.001362,0.000619,0.000854,0.0,0.001535,0.00117,0.0


In [24]:
# Column order of the feature table, pinned explicitly. Without it the order
# follows the pandas version (sorted dict keys in old releases, dict insertion
# order in newer ones), which would change the layout of feature-type.tsv.
feature_columns = ['feature', 'feature_type']

# DWPC features are whatever remains in dwpc_spread_df once the two pair
# identifier columns and the "hetnet" column are removed. This is fragile:
# any other non-feature column added to dwpc_spread_df would be treated as
# a DWPC feature.
dwpc_features = dwpc_spread_df.drop([
    "chemical_id", "disease_id", "hetnet"
], axis=1).columns

# One entry per group of features; the first column of each degree table is
# the node identifier and is skipped.
df_creators = [
    {'feature_type': 'prior', 'feature': ['prior_prob']},
    {'feature_type': 'degree', 'feature': compound_degree_df.columns[1:]},
    {'feature_type': 'degree', 'feature': disease_degree_df.columns[1:]},
    {'feature_type': 'dwpc', 'feature': dwpc_features},
]
feature_df = pd.concat([
    pd.DataFrame(creator, columns=feature_columns) for creator in df_creators
])

In [25]:
# `hetnet` value selected as the unpermuted network below; it is also used
# as the name of the hetnet-specific output directory.
unperm_name = 'rephetio-v2.0'

In [26]:
# Rows belonging to the unpermuted network, without the now-constant
# `hetnet` column.
is_unperm = matrix_df["hetnet"] == unperm_name
unperm_matrix_df = matrix_df.loc[is_unperm].drop('hetnet', axis=1)

In [27]:
# Mean of each feature over the unpermuted network, assigned positionally
# (feature_df's index contains repeated labels, so no index alignment is used).
unperm_feature_means = unperm_matrix_df[feature_df.feature].mean()
feature_df['unperm_mean'] = unperm_feature_means.tolist()

In [28]:
# Standard deviation of each feature over the unpermuted network, assigned
# positionally in the same feature order as feature_df.
unperm_feature_sds = unperm_matrix_df[feature_df.feature].std()
feature_df['unperm_sd'] = unperm_feature_sds.tolist()

feature_df.head(2)

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.033503,0.06185
0,CbG,degree,11.816554,13.42179


In [29]:
# Feature metadata (type plus unpermuted mean / standard deviation).
feature_df.to_csv(
    'data/matrix/feature-type.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)

# Full feature matrix across all networks, bz2-compressed.
path = 'data/matrix/features.tsv.bz2'
with bz2.open(path, mode='wt') as out_file:
    matrix_df.to_csv(out_file, sep='\t', index=False, float_format='%.5g')

In [30]:
# Save hetnet specific feature files
directory = os.path.join('data', 'matrix', unperm_name)
if not os.path.exists(directory):
    os.mkdir(directory)
path = os.path.join(directory, 'features.tsv.bz2')
with bz2.open(path, 'wt') as wf:
    unperm_matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')