In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations
import time
import os

import tensorflow as tf
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics

from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

  from pandas.core import datetools
  return f(*args, **kwds)


In [2]:
# Train on CPU: an empty CUDA_VISIBLE_DEVICES hides the GPU from this process
# (chosen because of memory constraints).
os.environ['CUDA_VISIBLE_DEVICES'] = ""

# To train on a GPU instead, replace the assignment above with the lines below:
# os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Seed NumPy's global random generator once, before any np.random draws below.
np.random.seed(0)

###########################################################
#
# Functions
#
###########################################################


def get_accuracy_scores(edges_pos, edges_neg, edge_type):
    """Score held-out positive and negative edges of a single edge type.

    Relies on notebook-level state: ``feed_dict``, ``placeholders``,
    ``minibatch``, ``sess``, ``opt`` and ``adj_mats_orig``. ``feed_dict`` is
    modified in place.

    Args:
        edges_pos: container indexed as
            ``edges_pos[edge_type[:2]][edge_type[2]]`` that yields ``(u, v)``
            pairs expected to be 1 in ``adj_mats_orig``.
        edges_neg: same layout; pairs expected to be 0 in ``adj_mats_orig``.
        edge_type: sequence whose first two items select the node-type pair
            and whose third item selects the relation within that pair.

    Returns:
        Tuple ``(roc_sc, aupr_sc, apk_sc)``: ROC AUC, average precision and
        ``rank_metrics.apk`` at ``k=50``.

    Raises:
        AssertionError: if a positive pair is not 1 ('Problem 1') or a
            negative pair is not 0 ('Problem 0') in ``adj_mats_orig``.
    """
    # Evaluate without dropout and select the requested edge type.
    feed_dict[placeholders['dropout']] = 0
    feed_dict[placeholders['batch_edge_type_idx']] = minibatch.edge_type2idx[edge_type]
    feed_dict[placeholders['batch_row_edge_type']] = edge_type[0]
    feed_dict[placeholders['batch_col_edge_type']] = edge_type[1]
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def _logistic(z):
        return 1. / (1 + np.exp(-z))

    pair_key, rel_idx = edge_type[:2], edge_type[2]

    # Positive edges: every one gets a running id that doubles as its
    # "relevant item" label for the ranking metric.
    pos_scores, relevant_ids, ranking = [], [], []
    for edge_id, (u, v) in enumerate(edges_pos[pair_key][rel_idx]):
        prob = _logistic(rec[u, v])
        pos_scores.append(prob)
        assert adj_mats_orig[pair_key][rel_idx][u, v] == 1, 'Problem 1'

        relevant_ids.append(edge_id)
        ranking.append((prob, edge_id))

    # Negative edges continue the id sequence but are never marked relevant.
    neg_scores = []
    for edge_id, (u, v) in enumerate(edges_neg[pair_key][rel_idx], start=len(pos_scores)):
        prob = _logistic(rec[u, v])
        neg_scores.append(prob)
        assert adj_mats_orig[pair_key][rel_idx][u, v] == 0, 'Problem 0'

        ranking.append((prob, edge_id))

    preds_all = np.nan_to_num(np.hstack([pos_scores, neg_scores]))
    labels_all = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])

    # Highest score first; the sort is stable, so ties keep insertion order.
    ranking.sort(key=lambda scored: scored[0], reverse=True)
    ranked_ids = list(zip(*ranking))[1]

    return (
        metrics.roc_auc_score(labels_all, preds_all),
        metrics.average_precision_score(labels_all, preds_all),
        rank_metrics.apk(relevant_ids, ranked_ids, k=50),
    )


def construct_placeholders(edge_types):
    """Create the TensorFlow placeholders keyed by name.

    Args:
        edge_types: mapping from ``(i, j)`` node-type pairs to the number of
            relations for that pair.

    Returns:
        Dict with the fixed entries ``batch``, ``batch_edge_type_idx``,
        ``batch_row_edge_type``, ``batch_col_edge_type``, ``degrees`` and
        ``dropout``, one sparse ``adj_mats_<i>,<j>,<k>`` entry per relation,
        and one sparse ``feat_<i>`` entry per row node type.
    """
    placeholders = {'batch': tf.placeholder(tf.int32, name='batch')}
    # Scalar int32 inputs; each placeholder is named after its dict key.
    for scalar_name in ('batch_edge_type_idx', 'batch_row_edge_type', 'batch_col_edge_type'):
        placeholders[scalar_name] = tf.placeholder(tf.int32, shape=(), name=scalar_name)
    placeholders['degrees'] = tf.placeholder(tf.int32)
    placeholders['dropout'] = tf.placeholder_with_default(0., shape=())

    # One sparse adjacency input per (row type, column type, relation index).
    for row_type, col_type in edge_types:
        for rel in range(edge_types[row_type, col_type]):
            adj_key = 'adj_mats_%d,%d,%d' % (row_type, col_type, rel)
            placeholders[adj_key] = tf.sparse_placeholder(tf.float32)

    # One sparse feature input per row node type; a repeated row type simply
    # overwrites the entry created for it earlier.
    for node_type, _ in edge_types:
        placeholders['feat_%d' % node_type] = tf.sparse_placeholder(tf.float32)

    return placeholders

###########################################################
#
# Load and preprocess data (This is a dummy toy example!)
#
###########################################################

####
# The following code uses artificially generated and very small networks.
# Expect less than excellent performance as these random networks do not have any interesting structure.
# The purpose of main.py is to show how to use the code!
#
# All preprocessed datasets used in the drug combination study are available at http://snap.stanford.edu/decagon. To use them:
# (1) Download datasets from http://snap.stanford.edu/decagon to your local machine.
# (2) Replace dummy toy datasets used here with the actual datasets you just downloaded.
# (3) Train & test the model.
####


In [102]:
val_test_size = 0.05
n_genes = 6
n_drugs = 4
n_drugdrug_rel_types = 2
# Toy gene-gene network: 2 groups of 3 nodes, edge probability 0.3 inside a
# group and 0.05 between groups. The 2 * 3 node count must equal n_genes.
gene_net = nx.planted_partition_graph(2, 3, 0.3, 0.05, seed=42)

gene_adj = nx.adjacency_matrix(gene_net)
# The graph size is hard-coded separately from n_genes, so fail fast here
# rather than with a shape mismatch in a later cell if the two drift apart.
assert gene_adj.shape == (n_genes, n_genes), 'gene_net must have exactly n_genes nodes'
# Column sums of the adjacency matrix, flattened to a 1-D array of node degrees.
gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

In [103]:
print(gene_adj)

  (1, 2)	1
  (2, 1)	1
  (2, 4)	1
  (3, 4)	1
  (3, 5)	1
  (4, 2)	1
  (4, 3)	1
  (5, 3)	1


In [104]:
gene_adj[1,2]

1

In [105]:
print(gene_degrees)

[0 1 2 2 2 1]


In [106]:
gene_degrees[4]

2

In [107]:
gene_adj[1,2]

1

In [108]:
# Random binary gene-drug matrix: an entry is 1 where 10 * (standard normal draw)
# exceeds 2. The draws come from NumPy's global generator, so re-running this
# cell without re-seeding yields a different matrix.
gene_drug_noise = 10 * np.random.randn(n_genes, n_drugs)
gene_drug_adj = sp.csr_matrix((gene_drug_noise > 2).astype(int))

In [109]:
print(gene_drug_adj)

  (0, 0)	1
  (0, 2)	1
  (2, 1)	1
  (2, 2)	1
  (3, 0)	1
  (3, 1)	1
  (3, 2)	1
  (4, 1)	1
  (4, 3)	1
  (5, 0)	1
  (5, 1)	1
  (5, 2)	1
  (5, 3)	1


In [110]:
drug_gene_adj = gene_drug_adj.transpose(copy=True)

In [111]:
drug_gene_adj.shape

(4, 6)

In [112]:
print(drug_gene_adj)

  (0, 0)	1
  (2, 0)	1
  (1, 2)	1
  (2, 2)	1
  (0, 3)	1
  (1, 3)	1
  (2, 3)	1
  (1, 4)	1
  (3, 4)	1
  (0, 5)	1
  (1, 5)	1
  (2, 5)	1
  (3, 5)	1


In [113]:
drug_drug_adj_list = []
# Drug-by-drug product of the two 0/1 matrices: entry (d1, d2) counts the genes
# linked to both drugs. Use the sparse matrix's own .dot(): np.dot is not
# sparse-aware and only works on sparse operands by accident, if at all.
tmp = drug_gene_adj.dot(gene_drug_adj)

In [114]:
print(tmp)

  (3, 0)	1
  (1, 0)	2
  (2, 0)	3
  (0, 0)	3
  (3, 1)	2
  (0, 1)	2
  (2, 1)	3
  (1, 1)	4
  (3, 2)	1
  (1, 2)	3
  (2, 2)	4
  (0, 2)	3
  (2, 3)	1
  (0, 3)	1
  (3, 3)	2
  (1, 3)	2


In [115]:
# Start from an empty list so that re-running this cell rebuilds the relation
# matrices instead of appending duplicates to those from an earlier run.
drug_drug_adj_list = []
for i in range(n_drugdrug_rel_types):
    mat = np.zeros((n_drugs, n_drugs))
    # Relation type i links each unordered drug pair whose entry in tmp is
    # exactly i + 2; both (d1, d2) and (d2, d1) are set, so mat is symmetric.
    for d1, d2 in combinations(range(n_drugs), 2):
        if tmp[d1, d2] == i + 2:
            mat[d1, d2] = mat[d2, d1] = 1.
    drug_drug_adj_list.append(sp.csr_matrix(mat))
    print(mat)
# Column sums of each relation matrix, flattened to 1-D degree arrays.
drug_degrees_list = [np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in drug_drug_adj_list]

[[ 0.  1.  0.  0.]
 [ 1.  0.  0.  1.]
 [ 0.  0.  0.  0.]
 [ 0.  1.  0.  0.]]
[[ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  1.  0.  0.]
 [ 0.  0.  0.  0.]]


In [116]:
len(drug_drug_adj_list)

2

In [122]:
drug_drug_adj_list[1][0,2]

1.0

In [123]:
# Adjacency matrices keyed by (row node type, column node type), where 0 is the
# gene side and 1 is the drug side. Each value is a list with one matrix per
# relation; the gene-gene and drug-drug lists also carry transposed copies.
drug_drug_adj_transposed = [rel_adj.transpose(copy=True) for rel_adj in drug_drug_adj_list]
adj_mats_orig = dict([
    ((0, 0), [gene_adj, gene_adj.transpose(copy=True)]),
    ((0, 1), [gene_drug_adj]),
    ((1, 0), [drug_gene_adj]),
    ((1, 1), drug_drug_adj_list + drug_drug_adj_transposed),
])

In [124]:
adj_mats_orig

{(0, 0): [<6x6 sparse matrix of type '<class 'numpy.int64'>'
  	with 8 stored elements in Compressed Sparse Row format>,
  <6x6 sparse matrix of type '<class 'numpy.int64'>'
  	with 8 stored elements in Compressed Sparse Column format>],
 (0, 1): [<6x4 sparse matrix of type '<class 'numpy.int64'>'
  	with 13 stored elements in Compressed Sparse Row format>],
 (1, 0): [<4x6 sparse matrix of type '<class 'numpy.int64'>'
  	with 13 stored elements in Compressed Sparse Column format>],
 (1, 1): [<4x4 sparse matrix of type '<class 'numpy.float64'>'
  	with 4 stored elements in Compressed Sparse Row format>,
  <4x4 sparse matrix of type '<class 'numpy.float64'>'
  	with 4 stored elements in Compressed Sparse Row format>,
  <4x4 sparse matrix of type '<class 'numpy.float64'>'
  	with 4 stored elements in Compressed Sparse Column format>,
  <4x4 sparse matrix of type '<class 'numpy.float64'>'
  	with 4 stored elements in Compressed Sparse Column format>]}

In [125]:
adj_mats_orig[0,0]

[<6x6 sparse matrix of type '<class 'numpy.int64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 <6x6 sparse matrix of type '<class 'numpy.int64'>'
 	with 8 stored elements in Compressed Sparse Column format>]

In [53]:
adj_mats_orig[0,1]

[<6x4 sparse matrix of type '<class 'numpy.int64'>'
 	with 15 stored elements in Compressed Sparse Row format>]

In [54]:
adj_mats_orig[1,0]

[<4x6 sparse matrix of type '<class 'numpy.int64'>'
 	with 15 stored elements in Compressed Sparse Column format>]

In [16]:
drug_degrees_list

[array([ 1.,  2.,  2.,  3.]), array([ 0.,  0.,  0.,  0.])]

In [126]:
# Degree vectors per node type (0 = genes, 1 = drugs), each list repeated twice
# to line up with the doubled relation lists in adj_mats_orig.
degrees = {
    0: [gene_degrees] * 2,
    1: drug_degrees_list * 2,
}

# featureless (genes): an identity matrix stands in for node features
gene_identity = sp.identity(n_genes)
gene_nonzero_feat, gene_num_feat = gene_identity.shape
gene_feat = preprocessing.sparse_to_tuple(gene_identity.tocoo())

# features (drugs): likewise an identity matrix
drug_identity = sp.identity(n_drugs)
drug_nonzero_feat, drug_num_feat = drug_identity.shape
drug_feat = preprocessing.sparse_to_tuple(drug_identity.tocoo())

# data representation, keyed by node type
num_feat = {0: gene_num_feat, 1: drug_num_feat}
nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}
feat = {0: gene_feat, 1: drug_feat}

# Shapes of every relation matrix, keyed like adj_mats_orig.
edge_type2dim = {}
for node_pair, rel_mats in adj_mats_orig.items():
    edge_type2dim[node_pair] = [rel_mat.shape for rel_mat in rel_mats]

# Decoder name per node-type pair: 'dedicom' for drug-drug, 'bilinear' otherwise.
edge_type2decoder = {node_pair: 'bilinear' for node_pair in [(0, 0), (0, 1), (1, 0)]}
edge_type2decoder[(1, 1)] = 'dedicom'

In [18]:
# Path of the PPI test edge list (a CSV whose 'Gene 1' and 'Gene 2' columns are
# read below). Set the PPI_TEST_CSV environment variable to point elsewhere
# instead of editing the notebook; the default is the original local path.
ppi_test_path = os.environ.get('PPI_TEST_CSV', '/Users/ravanv/Desktop/test_ppi.csv')
protein_protein_test = pd.read_csv(ppi_test_path, sep=',', header=0)

In [19]:
# Count table of the edge list: rows are 'Gene 1' ids, columns are 'Gene 2' ids.
df = pd.crosstab(
    index=protein_protein_test['Gene 1'],
    columns=protein_protein_test['Gene 2'],
)
df

Gene 2,2775,2781,4914,5677,8089,29785,51343
Gene 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1234,0,0,0,1,0,0,0
114787,1,1,1,0,1,1,1


In [20]:
# Union of df's column labels and row labels, i.e. every gene id that appears
# in either position of the count table.
idx = df.columns.union(df.index)
idx

Int64Index([1234, 2775, 2781, 4914, 5677, 8089, 29785, 51343, 114787], dtype='int64')

In [21]:
# Expand df to a square table over every id in idx; rows and columns that were
# not present before are filled with 0. Note that this rebinds df.
df = df.reindex(index = idx, columns=idx, fill_value=0)
print (df)

        1234    2775    2781    4914    5677    8089    29785   51343   114787
1234         0       0       0       0       1       0       0       0       0
2775         0       0       0       0       0       0       0       0       0
2781         0       0       0       0       0       0       0       0       0
4914         0       0       0       0       0       0       0       0       0
5677         0       0       0       0       0       0       0       0       0
8089         0       0       0       0       0       0       0       0       0
29785        0       0       0       0       0       0       0       0       0
51343        0       0       0       0       0       0       0       0       0
114787       0       1       1       1       0       1       1       1       0


In [23]:
# Same count table with the roles swapped: rows are 'Gene 2' ids, columns are
# 'Gene 1' ids.
df1 = pd.crosstab(
    index=protein_protein_test['Gene 2'],
    columns=protein_protein_test['Gene 1'],
)
df1

Gene 1,1234,114787
Gene 2,Unnamed: 1_level_1,Unnamed: 2_level_1
2775,0,1
2781,0,1
4914,0,1
5677,1,0
8089,0,1
29785,0,1
51343,0,1


In [24]:
# Expand df1 to the same square id-by-id layout as df, filling new cells with 0.
df1 = df1.reindex(index = idx, columns=idx, fill_value=0)
print (df1)

        1234    2775    2781    4914    5677    8089    29785   51343   114787
1234         0       0       0       0       0       0       0       0       0
2775         0       0       0       0       0       0       0       0       1
2781         0       0       0       0       0       0       0       0       1
4914         0       0       0       0       0       0       0       0       1
5677         1       0       0       0       0       0       0       0       0
8089         0       0       0       0       0       0       0       0       1
29785        0       0       0       0       0       0       0       0       1
51343        0       0       0       0       0       0       0       0       1
114787       0       0       0       0       0       0       0       0       0


In [27]:
# Element-wise sum of the two reindexed tables, so every listed pair (a, b)
# has an entry at both (a, b) and (b, a).
# NOTE(review): a pair listed more than once, or in both orders, would give a
# value above 1 here — confirm whether a strictly 0/1 matrix is expected.
df_add = df1.add(df, fill_value=0)

In [28]:
df_add

Unnamed: 0,1234,2775,2781,4914,5677,8089,29785,51343,114787
1234,0,0,0,0,1,0,0,0,0
2775,0,0,0,0,0,0,0,0,1
2781,0,0,0,0,0,0,0,0,1
4914,0,0,0,0,0,0,0,0,1
5677,1,0,0,0,0,0,0,0,0
8089,0,0,0,0,0,0,0,0,1
29785,0,0,0,0,0,0,0,0,1
51343,0,0,0,0,0,0,0,0,1
114787,0,1,1,1,0,1,1,1,0


In [30]:
df_val = df_add.values

In [34]:
df_val[0,4]

1