In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import ddi
import sys

In [3]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
from ddi.dataset import *

In [4]:
from ddi.utilities import *
from ddi.run_workflow import *

In [5]:
import glob

In [6]:
rawdata_dir = '../data/raw/'
processed_dir = '../data/processed/'
up_dir = '..'

In [7]:
report_available_cuda_devices()

number of GPUs available: 8
cuda:0, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:1, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:2, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:3, name:GeForce GTX 1080 Ti
total memory available: 10.91650390625 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB

cuda:4, name:GeForce GTX 1080 Ti
total memory av

In [8]:
n_gpu = torch.cuda.device_count()
n_gpu

8

### Preparing dataset 

In [9]:
DSdataset_name = 'DS3' # or DS2, DS3

# For DS3:
interact_matfname_DS3 = 'NCRDInteractionMat'
# interact_matfname_DS3 = 'CRDInteractionMat'

In [10]:
dataset_configs = {'DS1':{'DSdataset_name':'DS1', 
                          'fname_suffix':"_Jacarrd_sim.csv",
                          'similarity_types':['enzyme',
                                              'indication',
                                              'offsideeffect',
                                              'pathway',
                                              'sideeffect',
                                              'target',
                                              'transporter',
                                              'chem'],
                          'interact_matfname':'drug_drug_matrix',
                          'exp_iden':'simtypeall',
                          'kernel_option':'sqeuclidean',
                          'data_fname':'data_v1',
                          'ddi_interaction_labels_pth':os.path.join(up_dir, rawdata_dir, 'DS1', 'drug_drug_matrix.csv')}, 
                   'DS2':{'DSdataset_name':'DS2',
                          'fname_suffix':'.csv',
                          'similarity_types':['simMatrix'],
                          'interact_matfname':'ddiMatrix',
                          'exp_iden':'simtypeall',
                          'kernel_option':'correlation',
                          'ddi_interaction_labels_pth':os.path.join(up_dir, rawdata_dir, 'DS2', 'ddiMatrix.csv'),
                          'data_fname':'data_v1'}, 
                   'DS3':{'DSdataset_name':'DS3',
                          'fname_suffix':"Mat.csv",
                          'similarity_types':['ATCSimilarity',
                                              'chemicalSimilarity',
                                              'distSimilarity',
                                              'GOSimilarity',
                                              'ligandSimilarity',
                                              'seqSimilarity',
                                              'SideEffectSimilarity'],
                          'interact_matfname':['NCRDInteractionMat', 'CRDInteractionMat'],
                          'exp_iden':['simtypeall_NCRDInteractionMat', 'simtypeall_CRDInteractionMat'],
                          'kernel_option':'sqeuclidean',
                          'ddi_interaction_labels_pth':[os.path.join(up_dir, rawdata_dir, 'DS3', 'NCRDInteractionMat.csv'), os.path.join(up_dir, rawdata_dir, 'DS3', 'CRDInteractionMat.csv')],
                          'data_fname':'data_v1'}}

dict_interact_matfname = {'NCRDInteractionMat': 0, 'CRDInteractionMat':1}

In [11]:
ds_config = dataset_configs[DSdataset_name]

fname_suffix = ds_config["fname_suffix"]
similarity_types = ds_config["similarity_types"]
kernel_option = ds_config["kernel_option"]
data_fname = ds_config["data_fname"]
interact_matfname = ds_config["interact_matfname"]
exp_iden = ds_config["exp_iden"]
ddi_interaction_labels_pth = ds_config["ddi_interaction_labels_pth"]

if DSdataset_name == 'DS3':
    int_interact_matfname = dict_interact_matfname[interact_matfname_DS3]
    interact_matfname = interact_matfname[int_interact_matfname]
    exp_iden = exp_iden[int_interact_matfname]
    ddi_interaction_labels_pth = ddi_interaction_labels_pth[int_interact_matfname]

In [12]:
num_drugs = get_num_drugs(ddi_interaction_labels_pth, DSdataset_name)

In [13]:
interaction_mat = get_interaction_mat(ddi_interaction_labels_pth, DSdataset_name)

In [14]:
sid_ddipairs_map = construct_sampleid_ddipairs(interaction_mat)

### Read relevant data stub

In [16]:
# read data from disk
device_cpu = get_device(to_gpu=False)
device_gpu = get_device(True, index=0)
targetdata_dir = create_directory(exp_iden, os.path.join(up_dir, processed_dir, DSdataset_name, data_fname))
dpartitions = ReaderWriter.read_data(os.path.join(targetdata_dir, 'data_partitions.pkl'))

X_a = ReaderWriter.read_tensor(os.path.join(targetdata_dir, 'X_a.torch'), device_cpu)
X_b = ReaderWriter.read_tensor(os.path.join(targetdata_dir, 'X_b.torch'), device_cpu)
y_tensor = ReaderWriter.read_tensor(os.path.join(targetdata_dir, 'y_tensor.torch'), device_cpu)

gip_dtensor_perfold =  ReaderWriter.read_tensor(os.path.join(targetdata_dir, 'gip_dtensor_perfold.torch'), device_cpu)

path_current_dir ../../data/processed/DS3/data_v1


### Run from here

In [17]:
ddi_datatensor = DDIDataTensor(X_a, X_b, y_tensor)

In [18]:
datatensor_partitions = generate_partition_datatensor(ddi_datatensor, gip_dtensor_perfold, dpartitions)

In [19]:
# confirm that we separate PartitionDataTensor object and same reference to DDIDataTensor object!
for fold_num in datatensor_partitions:
    for dsettype in ('train', 'validation', 'test'):
        print(f'fold_num:{fold_num}, dsettype:{dsettype}')
        print('ID(PartitionDataTensor)', id(datatensor_partitions[fold_num][dsettype]))
        print('ID(DDIDataTensor)', id(datatensor_partitions[fold_num][dsettype].ddi_datatensor))
        print('ID(GIPDataTensor)', id(datatensor_partitions[fold_num][dsettype].gip_datatensor))
        print()

fold_num:0, dsettype:train
ID(PartitionDataTensor) 47661235389272
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235390784

fold_num:0, dsettype:validation
ID(PartitionDataTensor) 47661235391064
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235390784

fold_num:0, dsettype:test
ID(PartitionDataTensor) 47661235389384
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235390784

fold_num:1, dsettype:train
ID(PartitionDataTensor) 47661235390728
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235391176

fold_num:1, dsettype:validation
ID(PartitionDataTensor) 47661235391400
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235391176

fold_num:1, dsettype:test
ID(PartitionDataTensor) 47661235391456
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235391176

fold_num:2, dsettype:train
ID(PartitionDataTensor) 47661235391512
ID(DDIDataTensor) 47661235390896
ID(GIPDataTensor) 47661235391232

fold_num:2, dsettype:validation
ID(PartitionDataTensor) 47661

### Train and Evaluate workflow

In [20]:
from ddi.run_workflow import *

In [21]:
def build_dditrf_config_map(input_dim, similarity_type, model_name, hyperparam_opt, loss_func='nllloss', margin=0.5, loss_w=0.5):
    hyperparam_config = DDITrfHyperparamConfig(*hyperparam_opt)
    fold_num = -1 
    fdtype = torch.float32
    mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func, margin=margin, loss_w=loss_w)
    return mconfig, options

In [22]:
num_drugs

807

In [23]:
input_embed_dim = [128]
num_attn_heads = [1,2]
num_transformer_units = [1]
p_dropout = [0.3, 0.45]
nonlin_func = [nn.ReLU()]
mlp_embed_factor = [2]
pooling_mode = ['attn']
dist_opt = ['cosine']
l2_reg = [0,1e-8]
batch_size = [400]
num_epochs = [200]
loss_w = [0.05]

In [24]:
hyperparam_space = list(itertools.product(*[input_embed_dim, num_attn_heads,  num_transformer_units, p_dropout,
                                                nonlin_func, mlp_embed_factor,pooling_mode,dist_opt, l2_reg, batch_size,
                                                num_epochs, loss_w]))
print(len(hyperparam_space))

8


In [29]:
hyperparam_opt = (input_embed_dim,num_attn_heads, num_transformer_units, p_dropout, 
                  nonlin_func, mlp_embed_factor, pooling_mode, dist_opt,
                  l2_reg, batch_size, num_epochs)


In [32]:
exp_dir = create_directory(exp_iden, os.path.join(processed_dir, DSdataset_name, 'experiments'))
exp_dir

'../data/processed/DS3/experiments/simtypeall_NCRDInteractionMat'

In [33]:
num_folds=10

# Training

In [38]:
def spawn_q_process(q_process):
    print(">>> spawning hyperparam search process")
    q_process.start()
    
def join_q_process(q_process):
    q_process.join()
    print("<<< joined hyperparam search process")
    
def create_q_process(hyperparam_comb, gpu_num, datatensor_partition, exp_dir, num_drugs, queue, exp_iden):
    fold_gpu_map = {0:gpu_num}
    return mp.Process(target=ddi.run_workflow.train_test_hyperparam_conf, args=(hyperparam_comb, 
                                                                                gpu_num, 
                                                                                datatensor_partition, 
                                                                                fold_gpu_map, 
                                                                                exp_dir, 
                                                                                num_drugs, 
                                                                                queue,
                                                                                exp_iden))

In [40]:
import torch.multiprocessing as mp
mp.set_start_method("spawn", force=True)

queue = mp.Queue()
q_processes = []

for q_i in range(min(n_gpu, len(hyperparam_space))):
    q_process = create_q_process(hyperparam_comb=hyperparam_space[q_i], 
                                 gpu_num=q_i, 
                                 datatensor_partition={0:datatensor_partitions[0]}, 
                                 exp_dir=exp_dir, 
                                 num_drugs=num_drugs, 
                                 queue=queue,
                                 exp_iden=exp_iden)
    q_processes.append(q_process)
    spawn_q_process(q_process)

spawned_processes = n_gpu
    
for q_i in range(len(hyperparam_space)):
    join_q_process(q_processes[q_i])
    released_gpu_num = queue.get()
    print("released_gpu_num:", released_gpu_num)
    if(spawned_processes < len(hyperparam_space)):
        q_process = create_q_process(hyperparam_comb=hyperparam_space[spawned_processes], 
                             gpu_num=released_gpu_num, 
                             datatensor_partition={0:datatensor_partitions[0]}, 
                             exp_dir=exp_dir, 
                             num_drugs=num_drugs, 
                             queue=queue,
                             exp_iden=exp_iden)
        q_processes.append(q_process)
        spawn_q_process(q_process)
        spawned_processes = spawned_processes + 1

>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
>>> spawning hyperparam search process
<<< joined hyperparam search process
released_gpu_num: 3
<<< joined hyperparam search process
released_gpu_num: 2
<<< joined hyperparam search process
released_gpu_num: 0
<<< joined hyperparam search process
released_gpu_num: 1
<<< joined hyperparam search process
released_gpu_num: 5
<<< joined hyperparam search process
released_gpu_num: 7
<<< joined hyperparam search process
released_gpu_num: 4
<<< joined hyperparam search process
released_gpu_num: 6
