In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations
import time
import os

import tensorflow as tf
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics

from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

  from pandas.core import datetools
  return f(*args, **kwds)


In [2]:
# CPU-only training: an empty CUDA_VISIBLE_DEVICES hides every GPU from the
# process (chosen because of memory constraints).
os.environ.update({'CUDA_VISIBLE_DEVICES': ""})

# GPU alternative (disabled):
# os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=0)

###########################################################
#
# Functions
#
###########################################################


def get_accuracy_scores(edges_pos, edges_neg, edge_type):
    """Evaluate link prediction for a single edge type.

    Uses the notebook-level globals ``feed_dict``, ``placeholders``,
    ``minibatch``, ``sess``, ``opt`` and ``adj_mats_orig``; ``feed_dict`` is
    modified in place.

    Args:
        edges_pos: container indexed as
            ``edges_pos[edge_type[:2]][edge_type[2]]`` that yields ``(u, v)``
            pairs; each pair must have value 1 in ``adj_mats_orig``.
        edges_neg: same layout as ``edges_pos``; each pair must have value 0
            in ``adj_mats_orig``.
        edge_type: sequence whose first two items select the row/column edge
            type and whose third item selects the entry within that pair.

    Returns:
        Tuple ``(roc_sc, aupr_sc, apk_sc)``: ROC AUC, average precision, and
        ``rank_metrics.apk`` computed with ``k=50``.

    Raises:
        AssertionError: if a positive edge is missing from, or a negative edge
            is present in, ``adj_mats_orig``.
    """
    # Point the graph at the requested edge type and switch dropout off.
    feed_dict[placeholders['dropout']] = 0
    feed_dict[placeholders['batch_edge_type_idx']] = minibatch.edge_type2idx[edge_type]
    feed_dict[placeholders['batch_row_edge_type']] = edge_type[0]
    feed_dict[placeholders['batch_col_edge_type']] = edge_type[1]
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def _logistic(x):
        return 1. / (1 + np.exp(-x))

    pair_key, rel_idx = edge_type[:2], edge_type[2]

    # Every scored edge gets a running id equal to its position in `ranked`;
    # ids of the positive edges form the ground-truth set for AP@k.
    pos_scores, relevant_ids, ranked = [], [], []
    for u, v in edges_pos[pair_key][rel_idx]:
        prob = _logistic(rec[u, v])
        pos_scores.append(prob)
        assert adj_mats_orig[pair_key][rel_idx][u,v] == 1, 'Problem 1'

        relevant_ids.append(len(ranked))
        ranked.append((prob, len(ranked)))

    neg_scores = []
    for u, v in edges_neg[pair_key][rel_idx]:
        prob = _logistic(rec[u, v])
        neg_scores.append(prob)
        assert adj_mats_orig[pair_key][rel_idx][u,v] == 0, 'Problem 0'

        ranked.append((prob, len(ranked)))

    scores_all = np.nan_to_num(np.hstack([pos_scores, neg_scores]))
    labels_all = np.hstack([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])

    # Highest score first; the sort is stable, so ties keep insertion order.
    ranked.sort(key=itemgetter(0), reverse=True)
    ranked_ids = list(zip(*ranked))[1]

    roc_sc = metrics.roc_auc_score(labels_all, scores_all)
    aupr_sc = metrics.average_precision_score(labels_all, scores_all)
    apk_sc = rank_metrics.apk(relevant_ids, ranked_ids, k=50)

    return roc_sc, aupr_sc, apk_sc


def construct_placeholders(edge_types):
    """Build the dictionary of TensorFlow placeholders used by the model.

    Args:
        edge_types: mapping keyed by ``(i, j)`` pairs; ``edge_types[i, j]`` is
            the number of ``adj_mats_i,j,k`` placeholders created for the pair.

    Returns:
        dict mapping placeholder names (``'batch'``, ``'batch_edge_type_idx'``,
        ``'batch_row_edge_type'``, ``'batch_col_edge_type'``, ``'degrees'``,
        ``'dropout'``, ``'adj_mats_<i>,<j>,<k>'``, ``'feat_<i>'``) to
        placeholder objects.
    """
    placeholders = {}
    placeholders['batch'] = tf.placeholder(tf.int32, name='batch')

    # Scalar int32 inputs that select the edge type of the current batch.
    for scalar_name in ('batch_edge_type_idx',
                        'batch_row_edge_type',
                        'batch_col_edge_type'):
        placeholders[scalar_name] = tf.placeholder(tf.int32, shape=(), name=scalar_name)

    placeholders['degrees'] = tf.placeholder(tf.int32)
    placeholders['dropout'] = tf.placeholder_with_default(0., shape=())

    # One sparse float32 placeholder per (i, j, k) adjacency matrix.
    for i, j in edge_types:
        for k in range(edge_types[i,j]):
            placeholders['adj_mats_%d,%d,%d' % (i, j, k)] = tf.sparse_placeholder(tf.float32)

    # A sparse placeholder is created for every (i, j) key; when `i` repeats,
    # the most recently created one is kept under 'feat_<i>'.
    for i, _ in edge_types:
        placeholders['feat_%d' % i] = tf.sparse_placeholder(tf.float32)

    return placeholders

###########################################################
#
# Load and preprocess data (This is a dummy toy example!)
#
###########################################################

####
# The following code uses artificially generated and very small networks.
# Expect less than excellent performance as these random networks do not have any interesting structure.
# The purpose of main.py is to show how to use the code!
#
# All preprocessed datasets used in the drug combination study are at: http://snap.stanford.edu/decagon:
# (1) Download datasets from http://snap.stanford.edu/decagon to your local machine.
# (2) Replace dummy toy datasets used here with the actual datasets you just downloaded.
# (3) Train & test the model.
####


In [3]:
# bio-decagon-ppi: Protein-protein interaction network
# bio-decagon-targets: Drug-target protein associations
# bio-decagon-targets-all: Drug-target protein associations culled from several curated databases
# bio-decagon-combo: Polypharmacy side effects in the form of (drug A, side effect type, drug B) triples
# bio-decagon-effectcategories: Side effect categories
# bio-decagon-mono: Side effects of individual drugs in the form of (drug A, side effect type) tuples

<h1>Datasets from the paper:</h1>
<h2>The protein-protein network:</h2>
number of proteins = 19,085, number of physical interactions = 719,402
<h2>The drug-protein network:</h2>
number of proteins = 8,934, number of drugs = 519,022, number of interactions = 8,083,600
<h2>The drug-drug network (individual):</h2>
number of drugs = 1,556, number of side effects = 5,868, number of drug-side effect associations = 286,399
<h2>The drug-drug network (combination):</h2>
number of drug combinations = 63,473, number of side effect types = 1,318, number of drug combination-side effect associations = 4,651,131
<h2>Final Network:</h2>
<h4>Number of proteins = 19,085</h4>
<h4>Number of drugs = 645</h4>
<h4>Number of protein-protein edges = 715,612</h4>
<h4>Number of drug-drug edges = 4,651,131</h4>
<h4>Number of drug-protein edges = 18,596</h4>

In [4]:
# Directory that holds the Decagon CSV files. Set the DECAGON_DATA_DIR
# environment variable to point somewhere else; the fallback is the path this
# notebook was originally written against.
DATA_DIR = os.environ.get("DECAGON_DATA_DIR", "/Users/ravanv/Desktop/Decagon/data")


def _read_decagon_csv(file_name):
    """Read one comma-separated file from DATA_DIR, using its first row as header."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name), sep=',', header=0)


protein_protein_data = _read_decagon_csv("bio-decagon-ppi.csv")
drug_target_protein_data = _read_decagon_csv("bio-decagon-targets.csv")
drug_target_protein_all_data = _read_decagon_csv("bio-decagon-targets-all.csv")
polypharmacy_side_effect_data = _read_decagon_csv("bio-decagon-combo.csv")
side_effect_categories_data = _read_decagon_csv("bio-decagon-effectcategories.csv")
side_effect_individuale_data = _read_decagon_csv("bio-decagon-mono.csv")


In [5]:
# Print the (rows, columns) shape of each loaded table, one line per table.
for table_label, table in [
    ("size of the protein-protien network:", protein_protein_data),
    ("size of the Drug-target protein associations:", drug_target_protein_data),
    ("size of the Drug-target protein associations culled from several curated databases :",
     drug_target_protein_all_data),
    ("size of the Polypharmacy side effects:", polypharmacy_side_effect_data),
    ("size of the Side effect categories:", side_effect_categories_data),
    ("size of the Side effects of individual drugs:", side_effect_individuale_data),
]:
    print(table_label, table.shape)

size of the protein-protien network: (715612, 2)
size of the Drug-target protein associations: (18690, 2)
size of the Drug-target protein associations culled from several curated databases : (131034, 2)
size of the Polypharmacy side effects: (4649441, 4)
size of the Side effect categories: (561, 3)
size of the Side effects of individual drugs: (174977, 3)


In [6]:
# Column labels of the protein-protein table as a plain Python list.
column_list = protein_protein_data.columns.tolist()
column_list

['Gene 1', 'Gene 2']

In [7]:
# Distinct values of the first column of the protein-protein table, in order
# of first appearance. Series.unique() preserves that order and runs in a
# single vectorised pass, replacing a per-row `.loc` lookup combined with a
# linear `not in` scan of a growing list (quadratic in the worst case).
# `iloc` selects the column by position, so no particular row index is assumed.
list_gene1 = protein_protein_data.iloc[:, 0].unique().tolist()

# The original loop left `prev_gene` holding the same values; keep it bound
# (as an independent copy) for any later cell that still reads it.
prev_gene = list(list_gene1)

In [8]:
# Number of distinct values collected from the first column.
len(list_gene1)

17544

In [9]:
# Distinct values of the second column of the protein-protein table, in order
# of first appearance. Series.unique() preserves that order and runs in a
# single vectorised pass, replacing a per-row `.loc` lookup combined with a
# linear `not in` scan of a growing list (quadratic in the worst case).
# `iloc` selects the column by position, so no particular row index is assumed.
list_gene2 = protein_protein_data.iloc[:, 1].unique().tolist()

# The original loop left `prev_gene` holding the same values; keep it bound
# (as an independent copy) for any later cell that still reads it.
prev_gene = list(list_gene2)

In [10]:
# Number of distinct values collected from the second column.
len(list_gene2)

17472

In [23]:
def intersection(list1, list2):
    """Return the items of ``list1`` that also occur in ``list2``.

    The order of ``list1`` is kept and repeated items in ``list1`` are
    repeated in the result.

    Args:
        list1: items to filter.
        list2: items to test membership against.

    Returns:
        A new list with every value of ``list1`` that is contained in ``list2``.
    """
    try:
        # Set lookup makes each membership test O(1) instead of scanning list2.
        lookup = set(list2)
        return [value for value in list1 if value in lookup]
    except TypeError:
        # Some element is unhashable: fall back to a plain linear scan.
        return [value for value in list1 if value in list2]
def onlylist(lst,intersect):
    """Return the items of ``lst`` that do not occur in ``intersect``.

    The order of ``lst`` is kept and repeated items in ``lst`` are repeated in
    the result.

    Args:
        lst: items to filter.
        intersect: items to exclude.

    Returns:
        A new list with every value of ``lst`` that is not contained in
        ``intersect``.
    """
    try:
        # Set lookup makes each membership test O(1) instead of scanning intersect.
        excluded = set(intersect)
        return [value for value in lst if value not in excluded]
    except TypeError:
        # Some element is unhashable: fall back to a plain linear scan.
        return [value for value in lst if value not in intersect]

In [24]:
# Values present in both gene lists, in the order they appear in list_gene2.
intersect = intersection(list1=list_gene2, list2=list_gene1)

# Values that occur in exactly one of the two gene lists.
list1_only = onlylist(lst=list_gene1, intersect=intersect)
list2_only = onlylist(lst=list_gene2, intersect=intersect)

In [21]:
# Concatenate the three groups: shared values first, then the values found
# only in the first list, then the values found only in the second list.
list_proteins = list(intersect)
list_proteins.extend(list1_only)
list_proteins.extend(list2_only)

In [22]:
# Report the size of each group and of the combined list.
for count_label, group in [
    ("number of protiens in the first column = ", list1_only),
    ("number of protiens in the second column = ", list2_only),
    ("number of protiens that exist in both columns = ", intersect),
    ("total number of protiens  = ", list_proteins),
]:
    print(count_label, len(group))

number of protiens in the first column =  1609
number of protiens in the second column =  1537
number of protiens that exist in both columns =  15935
total number of protiens  =  19081


<h2>The paper reports 19,085 proteins, while I found 19,081 proteins!</h2>