In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import ddi
import sys

In [37]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
from ddi.dataset import *

In [38]:
from ddi.utilities import *
from ddi.run_workflow import *

In [39]:
rawdata_dir = '../data/raw/'
processed_dir = '../data/processed/'
up_dir = '..'

In [40]:
report_available_cuda_devices()

number of GPUs available: 5
cuda:0, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:1, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:2, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:3, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:4, name:GeForce GTX 1080 Ti
total memory av

In [41]:
n_gpu = torch.cuda.device_count()
n_gpu

5

### Preparing dataset 

In [42]:
DSdataset_name = 'DS1' # or DS2, DS3

# For DS3:
interact_matfname_DS3 = 'NCRDInteractionMat'
# interact_matfname_DS3 = 'CRDInteractionMat'

In [43]:
dataset_configs = {'DS1':{'DSdataset_name':'DS1', 
                          'fname_suffix':"_Jacarrd_sim.csv",
                          'similarity_types':['enzyme',
                                              'indication',
                                              'offsideeffect',
                                              'pathway',
                                              'sideeffect',
                                              'target',
                                              'transporter',
                                              'chem'],
                          'interact_matfname':'drug_drug_matrix',
                          'exp_iden':'simtypeall',
                          'kernel_option':'sqeuclidean',
                          'data_fname':'data_v1',
                          'ddi_interaction_labels_pth':os.path.join(up_dir, rawdata_dir, 'DS1', 'drug_drug_matrix.csv')}, 
                   'DS2':{'DSdataset_name':'DS2',
                          'fname_suffix':'.csv',
                          'similarity_types':['simMatrix'],
                          'interact_matfname':'ddiMatrix',
                          'exp_iden':'simtypeall',
                          'kernel_option':'correlation',
                          'ddi_interaction_labels_pth':os.path.join(up_dir, rawdata_dir, 'DS2', 'ddiMatrix.csv'),
                          'data_fname':'data_v1'}, 
                   'DS3':{'DSdataset_name':'DS3',
                          'fname_suffix':"Mat.csv",
                          'similarity_types':['ATCSimilarity',
                                              'chemicalSimilarity',
                                              'distSimilarity',
                                              'GOSimilarity',
                                              'ligandSimilarity',
                                              'seqSimilarity',
                                              'SideEffectSimilarity'],
                          'interact_matfname':['NCRDInteractionMat', 'CRDInteractionMat'],
                          'exp_iden':['simtypeall_NCRDInteractionMat', 'simtypeall_CRDInteractionMat'],
                          'kernel_option':'sqeuclidean',
                          'ddi_interaction_labels_pth':[os.path.join(up_dir, rawdata_dir, 'DS3', 'NCRDInteractionMat.csv'), os.path.join(up_dir, rawdata_dir, 'DS3', 'CRDInteractionMat.csv')],
                          'data_fname':'data_v1'}}

dict_interact_matfname = {'NCRDInteractionMat': 0, 'CRDInteractionMat':1}

In [44]:
ds_config = dataset_configs[DSdataset_name]

fname_suffix = ds_config["fname_suffix"]
similarity_types = ds_config["similarity_types"]
kernel_option = ds_config["kernel_option"]
data_fname = ds_config["data_fname"]
interact_matfname = ds_config["interact_matfname"]
exp_iden = ds_config["exp_iden"]
ddi_interaction_labels_pth = ds_config["ddi_interaction_labels_pth"]

if DSdataset_name == 'DS3':
    int_interact_matfname = dict_interact_matfname[interact_matfname_DS3]
    interact_matfname = interact_matfname[int_interact_matfname]
    exp_iden = exp_iden[int_interact_matfname]
    ddi_interaction_labels_pth = ddi_interaction_labels_pth[int_interact_matfname]

In [45]:
y = preprocess_labels(ddi_interaction_labels_pth, DSdataset_name)

In [46]:
report_label_distrib(y), y.shape

class: 0 norm count: 0.6758430189887775
class: 1 norm count: 0.3241569810112225


(None, (149878,))

In [47]:
num_drugs = get_num_drugs(ddi_interaction_labels_pth, DSdataset_name)
num_drugs

548

In [48]:
interaction_mat = get_interaction_mat(ddi_interaction_labels_pth, DSdataset_name)
interaction_mat

array([[0, 1, 1, ..., 1, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [49]:
sid_ddipairs_map = construct_sampleid_ddipairs(interaction_mat)
sid_ddipairs_map

{0: (0, 1),
 1: (0, 2),
 2: (0, 3),
 3: (0, 4),
 4: (0, 5),
 5: (0, 6),
 6: (0, 7),
 7: (0, 8),
 8: (0, 9),
 9: (0, 10),
 10: (0, 11),
 11: (0, 12),
 12: (0, 13),
 13: (0, 14),
 14: (0, 15),
 15: (0, 16),
 16: (0, 17),
 17: (0, 18),
 18: (0, 19),
 19: (0, 20),
 20: (0, 21),
 21: (0, 22),
 22: (0, 23),
 23: (0, 24),
 24: (0, 25),
 25: (0, 26),
 26: (0, 27),
 27: (0, 28),
 28: (0, 29),
 29: (0, 30),
 30: (0, 31),
 31: (0, 32),
 32: (0, 33),
 33: (0, 34),
 34: (0, 35),
 35: (0, 36),
 36: (0, 37),
 37: (0, 38),
 38: (0, 39),
 39: (0, 40),
 40: (0, 41),
 41: (0, 42),
 42: (0, 43),
 43: (0, 44),
 44: (0, 45),
 45: (0, 46),
 46: (0, 47),
 47: (0, 48),
 48: (0, 49),
 49: (0, 50),
 50: (0, 51),
 51: (0, 52),
 52: (0, 53),
 53: (0, 54),
 54: (0, 55),
 55: (0, 56),
 56: (0, 57),
 57: (0, 58),
 58: (0, 59),
 59: (0, 60),
 60: (0, 61),
 61: (0, 62),
 62: (0, 63),
 63: (0, 64),
 64: (0, 65),
 65: (0, 66),
 66: (0, 67),
 67: (0, 68),
 68: (0, 69),
 69: (0, 70),
 70: (0, 71),
 71: (0, 72),
 72: (0, 73

### Generate datapartitions (i.e. train/val, test indices)

In [53]:
dpartitions = get_stratified_partitions(y, num_folds=10, valid_set_portion=0.1, random_state=42)

fold_num: 0
train data
class: 0 norm count: 0.6758511050156094
class: 1 norm count: 0.3241488949843906
validation data
class: 0 norm count: 0.6758099191934168
class: 1 norm count: 0.3241900808065831
test data
class: 0 norm count: 0.67580731251668
class: 1 norm count: 0.32419268748332

-------------------------
fold_num: 1
train data
class: 0 norm count: 0.6758511050156094
class: 1 norm count: 0.3241488949843906
validation data
class: 0 norm count: 0.6758099191934168
class: 1 norm count: 0.3241900808065831
test data
class: 0 norm count: 0.67580731251668
class: 1 norm count: 0.32419268748332

-------------------------
fold_num: 2
train data
class: 0 norm count: 0.6758511050156094
class: 1 norm count: 0.3241488949843906
validation data
class: 0 norm count: 0.6758099191934168
class: 1 norm count: 0.3241900808065831
test data
class: 0 norm count: 0.67580731251668
class: 1 norm count: 0.32419268748332

-------------------------
fold_num: 3
train data
class: 0 norm count: 0.6758511050156094
c

In [54]:
# dump data on disk
targetdata_dir = create_directory(exp_iden, os.path.join(up_dir, processed_dir, DSdataset_name, data_fname))
ReaderWriter.dump_data(dpartitions, os.path.join(targetdata_dir, 'data_partitions.pkl'))

path_current_dir ../../data/processed/DS1/data_v1


### GIP computation for each fold

In [55]:
def get_nan_idx(imat_mask):
    r, c = np.where(np.isnan(imat_mask))
    d = {}
    for i in range(len(r)):
        ridx= r[i]
        cidx= c[i]
        if ridx in d:
            d[ridx].append(cidx)
        else:
            d[ridx] = [cidx]
    return d

def impute_nan(intmat, sim_mat, k=5, mask_value=np.nan):

    mat = intmat.copy()
    sim = sim_mat.copy()
    np.fill_diagonal(mat,0)
    np.fill_diagonal(sim,0)
    (row,col) = mat.shape
    
    d = get_nan_idx(intmat)
    nanw_m = np.ones(mat.shape)
    for i, num_nan in enumerate(np.isnan(mat).sum(axis=1)):
        if num_nan == 0:
            continue
        else:
            curr_sim_vec = sim[i,:]
            topk_indx = np.argsort(curr_sim_vec)[-k:]
            coeff = curr_sim_vec[topk_indx]
            norm = sum(coeff)

            A = mat[topk_indx, :].copy()
            A = A*coeff.reshape(-1,1)
            vec_imat = np.nansum(A, axis=0)
            
            # update nan positions
            mat[i,d[i]] = vec_imat[d[i]]
            if norm > 0:
                mat[i,d[i]] = mat[i,d[i]]/norm

            # compute percent of nan in computation
            nanw_vec = 1-np.isnan(A).sum(axis=0)/(A.shape[0])
            nanw_m[i,d[i]] = nanw_vec[d[i]]

    return mat, nanw_m

def weight_inferred_mat(nanw_mat_lst, infer_mat_lst):
    res_m = np.zeros(infer_mat_lst[0].shape)
    nanw_m_accum = np.zeros(nanw_mat_lst[0].shape)
    for i in range(len(nanw_mat_lst)):
        nanw_m = nanw_mat_lst[i]
        infer_m = infer_mat_lst[i]
        res_m += nanw_m*infer_m
        nanw_m_accum += nanw_m
        
    return res_m/nanw_m_accum

def compute_gip_kernel(intmat, k_bandwidth, option='correlation'):
    """computes gaussian kernel from 2D matrix
    
       Approach based on van Laarhoven et al. doi:10.1093/bioinformatics/btr500
    
    """
    assert option in {'correlation', 'sqeuclidean'}
    
    mat = intmat.copy()
    np.fill_diagonal(mat, 1) # to compensate for pair interactions
    
    r, c = mat.shape # 2D matrix
    # computes pairwise correlation
    dist_kernel = squareform(pdist(mat, metric=option))
    print('nan',np.isnan(dist_kernel).sum())
    if option == 'correlation':
        avg_len = np.max(dist_kernel, axis=1, keepdims=True)
        avg_len[np.where(avg_len <= 0)] = 1.
        out = 1-dist_kernel/avg_len
    else:
        avg_len = (scpnorm(mat, axis=1, keepdims=True)**2) * 1/c
        avg_len[np.where(avg_len <= 0)] = 1.
        gamma = k_bandwidth/avg_len
        out = np.exp(-gamma*dist_kernel)
    return out

### Using masking and inference with gip computation

In [56]:
gip_perfold = {}
for fold_id in dpartitions:
    masked_intermat = interaction_mat.copy()
    masked_intermat = masked_intermat.astype(np.float)
    for dsettype in ('validation', 'test'):
        # get validation/test ddi pair indices
        sids = dpartitions[fold_id][dsettype]
        a = [sid_ddipairs_map[sid][0] for sid in sids]
        b = [sid_ddipairs_map[sid][1] for sid in sids]
        # set to nan
        masked_intermat[tuple([a,b])] = np.nan
        masked_intermat[tuple([b,a])] = np.nan
        
    intermat_infer_lst = []
    nanw_mat_lst = []
    for similarity_type in similarity_types:
        print('similarity_type', similarity_type)
        siminput_feat_pth = os.path.join(up_dir, rawdata_dir, DSdataset_name, '{}{}'.format(similarity_type, fname_suffix))
        sim_mat = get_similarity_matrix(siminput_feat_pth, DSdataset_name)
        imat_infer, nanw_m = impute_nan(masked_intermat, sim_mat, k=15)
        intermat_infer_lst.append(imat_infer)
        nanw_mat_lst.append(nanw_m)
        
    infer_mat_fus = weight_inferred_mat(nanw_mat_lst, intermat_infer_lst)

    print('norm(infer_mat-interaction_mat)', np.linalg.norm(infer_mat_fus - interaction_mat))

    # compute GIP here
    gip_kernel = compute_gip_kernel(infer_mat_fus, 1., kernel_option)
    print('norm(gip_kernel-interaction_mat)',np.linalg.norm(gip_kernel - interaction_mat))
    t = gip_kernel-interaction_mat
    print(np.sum(np.abs(t) > 0.5)/(t.size - t.shape[0]))
    gip_perfold[fold_id] = gip_kernel

similarity_type enzyme
similarity_type indication
similarity_type offsideeffect
similarity_type pathway
similarity_type sideeffect
similarity_type target
similarity_type transporter
similarity_type chem
norm(infer_mat-interaction_mat) 95.85996802784743
nan 0
norm(gip_kernel-interaction_mat) 312.59558538149577
0.32598513457612194
similarity_type enzyme
similarity_type indication
similarity_type offsideeffect
similarity_type pathway
similarity_type sideeffect
similarity_type target
similarity_type transporter
similarity_type chem
norm(infer_mat-interaction_mat) 95.91048645102023
nan 0
norm(gip_kernel-interaction_mat) 312.59558538149577
0.32598513457612194
similarity_type enzyme
similarity_type indication
similarity_type offsideeffect
similarity_type pathway
similarity_type sideeffect
similarity_type target
similarity_type transporter
similarity_type chem
norm(infer_mat-interaction_mat) 95.75408444617713
nan 0
norm(gip_kernel-interaction_mat) 312.59558538149577
0.32598513457612194
similar

### Compute features from similarity matrices

#### check if similarity matrix is symmetric

In [57]:
num_sim_types = len(similarity_types)
for similarity_type in similarity_types:
    siminput_feat_pth = os.path.join(up_dir, rawdata_dir, DSdataset_name, '{}{}'.format(similarity_type, fname_suffix))
    sim_mat = get_similarity_matrix(siminput_feat_pth, DSdataset_name)   
    print(np.allclose(sim_mat, np.transpose(sim_mat)))

True
True
True
True
True
True
True
True


In [58]:
num_sim_types = len(similarity_types)
X_feats = []
for similarity_type in similarity_types:
    siminput_feat_pth = os.path.join(up_dir, rawdata_dir, DSdataset_name, '{}{}'.format(similarity_type, fname_suffix))
    X_feat = preprocess_features(siminput_feat_pth, DSdataset_name, fill_diag=None)   
    X_feats.append(X_feat)
X_feat_cat = np.concatenate(X_feats, axis=1)
print("X_feat_cat", X_feat_cat.shape)

X_feat_cat (149878, 8784)


In [59]:
X = create_setvector_features(X_feat_cat, 2*num_sim_types)
X.shape

(149878, 16, 549)

In [60]:
X_a = X[:,list(range(0,2*num_sim_types,2))].copy()
X_b = X[:,list(range(1,2*num_sim_types,2))].copy()

In [61]:
from ddi.utilities import format_bytes
print(format_bytes(X_feat_cat.size * X_feat_cat.itemsize))
print(format_bytes(y.size * y.itemsize))

(4.9, 'gigabytes')
(585.46, 'kilobytes')


In [62]:
# clear unused objects
del X_feats
del X_feat_cat
del X_feat

In [63]:
device_cpu = get_device(to_gpu=False)
device_gpu = get_device(True, index=0)

In [64]:
# dtype is float32 since we will use sigmoid (binary outcome)
y_tensor = torch.tensor(y, dtype = torch.int64, device = device_cpu) 
X_a = torch.tensor(X_a, dtype = torch.float32, device = device_cpu)
X_b = torch.tensor(X_b, dtype = torch.float32, device = device_cpu)
ddi_datatensor = DDIDataTensor(X_a, X_b, y_tensor)

In [65]:
targetdata_dir

'../../data/processed/DS1/data_v1/simtypeall'

In [66]:
# dump data on disk
ReaderWriter.dump_tensor(X_a, os.path.join(targetdata_dir, 'X_a.torch'))
ReaderWriter.dump_tensor(X_b, os.path.join(targetdata_dir, 'X_b.torch'))
ReaderWriter.dump_tensor(y_tensor, os.path.join(targetdata_dir, 'y_tensor.torch'))

### Construct GIP datatensor for each fold

In [67]:
gip_dtensor_perfold = {}
for fold_id in gip_perfold:
    print('fold_id:', fold_id)
    gip_mat = gip_perfold[fold_id]
    print('gip_mat:', gip_mat.shape)
    gip_feat = get_features_from_simmatrix(gip_mat)
    gip_all = create_setvector_features(gip_feat, 2)
    print('gip_all:', gip_all.shape)
    X_a_gip = gip_all[:,list(range(0,2*1,2))].copy()
    X_b_gip = gip_all[:,list(range(1,2*1,2))].copy()
    print('X_a_gip:', X_a_gip.shape)
    X_a_gip = torch.tensor(X_a_gip, dtype = torch.float32, device = device_cpu)
    X_b_gip = torch.tensor(X_b_gip, dtype = torch.float32, device = device_cpu)
    gip_datatensor = GIPDataTensor(X_a_gip, X_b_gip)
    gip_dtensor_perfold[fold_id] = gip_datatensor

fold_id: 0
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 1
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 2
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 3
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 4
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 5
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 6
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 7
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 8
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)
fold_id: 9
gip_mat: (548, 548)
gip_all: (149878, 2, 549)
X_a_gip: (149878, 1, 549)


In [68]:
# dump data on disk
ReaderWriter.dump_tensor(gip_dtensor_perfold, os.path.join(targetdata_dir, 'gip_dtensor_perfold.torch'))