In [1]:
from utility import *

from collections import Counter
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
import numpy as np

import random
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
# Load each dataset CSV with the loader helpers (presumably supplied by the
# wildcard import from `utility` -- they are not defined in this notebook).
combo2stitch, combo2se, se2name = load_combo_se(fname='../data/csv/bio-decagon-combo.csv')
net, node2idx = load_ppi('../data/csv/bio-decagon-ppi.csv')
stitch2se, se2name_mono = load_mono_se('../data/csv/bio-decagon-mono.csv')
stitch2proteins = load_targets(fname='../data/csv/bio-decagon-targets.csv')
se2class, se2name_class = load_categories('../data/csv/bio-decagon-effectcategories.csv')  # standard naming file

# Each '***' line reports (number of IDs, number of distinct names) for an
# ID -> name mapping; the key count of a mapping is simply its length.
print('***', len(se2name), len(set(se2name.values())))
print('***', len(se2name_mono), len(set(se2name_mono.values())))

# Fold the mono-file names and then the category-file names into the combo
# mapping (later sources add new IDs or overwrite existing names), reporting
# the merged sizes after every step.
for extra_names in (se2name_mono, se2name_class):
    se2name.update(extra_names)
    print('***', len(se2name), len(set(se2name.values())))

# Final summary of the merged ID -> name mapping.
print('unique side effect ID number', len(se2name))
print('unique side effect name number', len(set(se2name.values())))

****************************** Reading: ../data/csv/bio-decagon-combo.csv ******************************
Drug combinations number: 63473 Unique side effects number: 1317
Drug-drug interactions number: 4649441
Unique drug number: 645
****************************** Reading: ../data/csv/bio-decagon-ppi.csv ******************************
PPI Edges: 715612
PPI Nodes: 19081
****************************** Reading: ../data/csv/bio-decagon-mono.csv ******************************
Individual drug side effect total number: 174977
Unique drug number: 639
****************************** Reading: ../data/csv/bio-decagon-targets.csv ******************************
Drug-protein interaction: 18690
****************************** Reading: ../data/csv/bio-decagon-effectcategories.csv ******************************
unique side effect class number: 37
*** 1317 1317
*** 10184 9702
*** 11501 10989
*** 11501 10989
unique side effect ID number 11501
unique side effect name number 10989


In [3]:
# Some side effect IDs share a common name: the merged mapping above has
# 11501 unique IDs but only 10989 unique names.

In [6]:
# Sanity checks on the side effect vocabulary:
#   (i)  The side effects we predict over are true polypharmacy side effects,
#        i.e. a given polypharmacy side effect is associated only with a drug
#        pair and not with any individual drug in the pair.
#   (ii) No side effect type we predict over appears among the side features.
#        For example, nausea is one polypharmacy side effect, so all instances
#        of nausea as a side effect of individual drugs are removed.
# If the combo and mono datasets share no side effect, both (i) and (ii) hold.
#
# np.unique is used instead of np.array(list(set(...))): it yields the same
# unique elements but in sorted order, so the arrays are identical on every
# run rather than depending on set iteration order.

# --- comparison at the side effect ID level ---
se_combo_id = np.unique(np.concatenate([list(se) for se in combo2se.values()]))
print('unique combo side effect ID number', se_combo_id.shape)

se_mono_id = np.unique(np.concatenate([list(se) for se in stitch2se.values()]))
print('unique mono side effect ID number', se_mono_id.shape)

print('overlapped side effect ID', np.intersect1d(se_mono_id, se_combo_id))

# --- comparison at the side effect name level ---
# Several IDs may map to the same name, so the name arrays can be shorter
# than the corresponding ID arrays.
se_combo_name = np.unique([se2name[se_id] for se_id in se_combo_id])
print('unique combo side effect name number', se_combo_name.shape)

se_mono_name = np.unique([se2name[se_id] for se_id in se_mono_id])
print('unique mono side effect name number', se_mono_name.shape)

print('overlapped side effect name', np.intersect1d(se_mono_name, se_combo_name))

unique combo side effect ID number (1317,)
unique mono side effect ID number (10184,)
overlapped side effect ID []
unique combo side effect name number (1317,)
unique mono side effect name number (9702,)
overlapped side effect name ['Acidosis' 'Amnesia' 'Anxiety' 'Arrhythmia' 'Ataxia'
 'Blood calcium decreased' 'Breast cancer' 'Cardiomyopathy' 'Cough'
 'Deafness neurosensory' 'Diabetic neuropathy' 'Endometrial cancer'
 'Enuresis' 'Excoriation' 'Fatigue' 'Glossitis' 'Hepatic failure'
 'Incontinence' 'Leukaemia' 'Nail disorder' 'Nephrolithiasis'
 'Ovarian cancer' 'Pain' 'Paraesthesia' 'Pterygium' 'Rhinitis' 'Scar'
 'Scleroderma' 'Supraventricular tachycardia' 'Thrombophlebitis']


In [7]:
# For every name that occurs in both the mono and the combo name sets, print
# how many entries (IDs) of se2name carry that name.
# The array of all names does not change inside the loop, so it is built once
# here instead of being re-created from the dict on every iteration.
se_name_values = np.array(list(se2name.values()))
for overlap_name in np.intersect1d(se_mono_name, se_combo_name):
    print(overlap_name, np.where(se_name_values == overlap_name)[0].shape)

Acidosis (2,)
Amnesia (4,)
Anxiety (6,)
Arrhythmia (3,)
Ataxia (2,)
Blood calcium decreased (5,)
Breast cancer (3,)
Cardiomyopathy (3,)
Cough (5,)
Deafness neurosensory (2,)
Diabetic neuropathy (2,)
Endometrial cancer (2,)
Enuresis (2,)
Excoriation (2,)
Fatigue (4,)
Glossitis (2,)
Hepatic failure (2,)
Incontinence (2,)
Leukaemia (2,)
Nail disorder (4,)
Nephrolithiasis (2,)
Ovarian cancer (2,)
Pain (10,)
Paraesthesia (10,)
Pterygium (2,)
Rhinitis (3,)
Scar (2,)
Scleroderma (2,)
Supraventricular tachycardia (2,)
Thrombophlebitis (2,)


In [8]:
# At the side effect ID level, the combo and mono datasets do not overlap.
# At the side effect name level, some names appear in both datasets; each of
# these overlapping names is shared by multiple IDs.