In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import ddi
import sys

In [None]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns

In [None]:
from ddi.dataset import *
from ddi.utilities import *
from ddi.run_workflow import *

In [None]:
# Relative locations of the raw and processed data.
rawdata_dir = '../data/raw/'
processed_dir = '../data/processed/'
# Prefix that several path builders below join in front of the directories above.
up_dir = '..'

In [None]:
# Project helper (not defined in this notebook); presumably reports the CUDA devices visible to this process.
report_available_cuda_devices()

In [None]:
# Number of CUDA devices visible to torch; it sizes fold_gpu_map further down.
# NOTE(review): `torch` is never imported explicitly in the visible cells —
# presumably it arrives through the ddi star-imports; confirm.
n_gpu = torch.cuda.device_count()
n_gpu

### Preparing dataset 

In [None]:
# Dataset to run; used as a key into `dataset_configs` below.
DSdataset_name = 'DS1' # or DS2, DS3

# For DS3:
# Only consulted when DSdataset_name == 'DS3': picks which of the two DS3
# interaction matrices is used.
# interact_matfname_DS3 = 'NCRDInteractionMat'
interact_matfname_DS3 = 'CRDInteractionMat'

# Forwarded to generate_partition_datatensor below.
train_Siamese = True

In [None]:
# Per-dataset settings, keyed by dataset name.
# For DS3 the entries 'interact_matfname', 'exp_iden' and
# 'ddi_interaction_labels_pth' are two-element lists ordered as
# [NCRDInteractionMat variant, CRDInteractionMat variant].
dataset_configs = {
    'DS1': {
        'DSdataset_name': 'DS1',
        'fname_suffix': "_Jacarrd_sim.csv",
        'similarity_types': [
            'enzyme',
            'indication',
            'offsideeffect',
            'pathway',
            'sideeffect',
            'target',
            'transporter',
            'chem',
        ],
        'interact_matfname': 'drug_drug_matrix',
        'exp_iden': 'simtypeall',
        'kernel_option': 'sqeuclidean',
        'data_fname': 'data_v1',
        'ddi_interaction_labels_pth': os.path.join(
            up_dir, rawdata_dir, 'DS1', 'drug_drug_matrix.csv'
        ),
    },
    'DS2': {
        'DSdataset_name': 'DS2',
        'fname_suffix': '.csv',
        'similarity_types': ['simMatrix'],
        'interact_matfname': 'ddiMatrix',
        'exp_iden': 'simtypeall',
        'kernel_option': 'correlation',
        'ddi_interaction_labels_pth': os.path.join(
            up_dir, rawdata_dir, 'DS2', 'ddiMatrix.csv'
        ),
        'data_fname': 'data_v1',
    },
    'DS3': {
        'DSdataset_name': 'DS3',
        'fname_suffix': "Mat.csv",
        'similarity_types': [
            'ATCSimilarity',
            'chemicalSimilarity',
            'distSimilarity',
            'GOSimilarity',
            'ligandSimilarity',
            'seqSimilarity',
            'SideEffectSimilarity',
        ],
        'interact_matfname': ['NCRDInteractionMat', 'CRDInteractionMat'],
        'exp_iden': [
            'simtypeall_NCRDInteractionMat',
            'simtypeall_CRDInteractionMat',
        ],
        'kernel_option': 'sqeuclidean',
        'ddi_interaction_labels_pth': [
            os.path.join(up_dir, rawdata_dir, 'DS3', 'NCRDInteractionMat.csv'),
            os.path.join(up_dir, rawdata_dir, 'DS3', 'CRDInteractionMat.csv'),
        ],
        'data_fname': 'data_v1',
    },
}

# Position of each DS3 interaction matrix inside the list-valued DS3 entries,
# derived from the DS3 entry itself so the two cannot drift apart.
dict_interact_matfname = {
    matfname: position
    for position, matfname in enumerate(dataset_configs['DS3']['interact_matfname'])
}

In [None]:
# Unpack the settings of the selected dataset into notebook-level names.
ds_config = dataset_configs[DSdataset_name]

# Entries that hold a single value for every dataset.
data_fname = ds_config["data_fname"]
fname_suffix = ds_config["fname_suffix"]
kernel_option = ds_config["kernel_option"]
similarity_types = ds_config["similarity_types"]

# Entries that are two-element lists for DS3 and single values otherwise.
interact_matfname = ds_config["interact_matfname"]
exp_iden = ds_config["exp_iden"]
ddi_interaction_labels_pth = ds_config["ddi_interaction_labels_pth"]

if DSdataset_name == 'DS3':
    # Reduce the DS3 lists to the variant chosen via interact_matfname_DS3.
    int_interact_matfname = dict_interact_matfname[interact_matfname_DS3]
    interact_matfname, exp_iden, ddi_interaction_labels_pth = (
        interact_matfname[int_interact_matfname],
        exp_iden[int_interact_matfname],
        ddi_interaction_labels_pth[int_interact_matfname],
    )

In [None]:
# Drug count for the selected dataset; the model's input_dim below is num_drugs + 1.
num_drugs = get_num_drugs(ddi_interaction_labels_pth, DSdataset_name)
num_drugs

In [None]:
# Interaction matrix obtained from the labels path; input to construct_sampleid_ddipairs below.
interaction_mat = get_interaction_mat(ddi_interaction_labels_pth, DSdataset_name)
interaction_mat

In [None]:
# Built from interaction_mat; not referenced again in the visible cells.
sid_ddipairs_map = construct_sampleid_ddipairs(interaction_mat)
sid_ddipairs_map

### Read relevant data stub

In [None]:
# Load the stored partitions and tensors of the selected experiment from disk.
device_cpu = get_device(to_gpu=False)
device_gpu = get_device(True, index=0)
targetdata_dir = create_directory(
    exp_iden,
    os.path.join(up_dir, processed_dir, DSdataset_name, data_fname),
)
dpartitions = ReaderWriter.read_data(os.path.join(targetdata_dir, 'data_partitions.pkl'))

# All four tensors are read onto the CPU device, in the listed order.
X_a, X_b, y_tensor, gip_dtensor_perfold = (
    ReaderWriter.read_tensor(os.path.join(targetdata_dir, tensor_fname), device_cpu)
    for tensor_fname in (
        'X_a.torch',
        'X_b.torch',
        'y_tensor.torch',
        'gip_dtensor_perfold.torch',
    )
)

### Run from here

In [None]:
# Bundle y_tensor, X_a and X_b into a single DDIDataTensor.
ddi_datatensor = DDIDataTensor(y=y_tensor, X_a=X_a, X_b=X_b)

In [None]:
# Per-fold partition objects; indexed below as datatensor_partitions[fold_num][dsettype].
datatensor_partitions = generate_partition_datatensor(ddi_datatensor, gip_dtensor_perfold, dpartitions, train_Siamese)

In [None]:
# Sanity check: print object ids for every (fold, split) to confirm that each
# PartitionDataTensor is a separate object while they all reference the same
# DDIDataTensor object.
for fold_num in datatensor_partitions:
    for dsettype in ('train', 'validation', 'test'):
        print(f'fold_num:{fold_num}, dsettype:{dsettype}')
        current_partition = datatensor_partitions[fold_num][dsettype]
        print('ID(PartitionDataTensor)', id(current_partition))
        print('ID(DDIDataTensor)', id(current_partition.ddi_datatensor))
        print('ID(GIPDataTensor)', id(current_partition.gip_datatensor))
        print()

### Train and Evaluate workflow

In [None]:
from ddi.run_workflow import *

In [None]:
def build_dditrf_config_map(input_dim, similarity_type, model_name, hyperparam_opt, loss_func='nllloss', margin=0.5, loss_w=0.5):
    """Build the ``(mconfig, options)`` pair for one model/experiment.

    Args:
        input_dim: forwarded unchanged to ``generate_models_config``.
        similarity_type: forwarded unchanged to ``generate_models_config``.
        model_name: ``'NDD'`` or ``'Transformer'``; selects which hyperparameter
            container is instantiated.
        hyperparam_opt: sequence unpacked positionally into the selected
            hyperparameter container, so its order must match that
            container's fields.
        loss_func: forwarded as keyword to ``generate_models_config``.
        margin: forwarded as keyword to ``generate_models_config``.
        loss_w: forwarded as keyword to ``generate_models_config``.

    Returns:
        The ``(mconfig, options)`` tuple returned by ``generate_models_config``.

    Raises:
        ValueError: if ``model_name`` is neither ``'NDD'`` nor ``'Transformer'``.
    """
    if model_name == 'NDD':
        hyperparam_config = NDDHyperparamConfig(*hyperparam_opt)
    elif model_name == 'Transformer':
        hyperparam_config = DDITrfHyperparamConfig(*hyperparam_opt)
    else:
        # Without this branch an unknown name left `hyperparam_config` unbound
        # and surfaced later as an unrelated UnboundLocalError.
        raise ValueError(
            f"unsupported model_name {model_name!r}; expected 'NDD' or 'Transformer'"
        )
    # Fixed to -1 here; presumably a placeholder replaced per fold downstream — TODO confirm.
    fold_num = -1
    fdtype = torch.float32
    mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func, margin=margin, loss_w=loss_w)
    return mconfig, options

In [None]:
# Show num_drugs again; input_dim below is derived from it.
num_drugs

In [None]:
# Default Transformer hyperparameters. The first eleven are packed positionally
# into `hyperparam_opt` below; loss_w and margin_v are passed separately to
# build_dditrf_config_map.
# NOTE(review): `nn` is not imported explicitly in the visible cells —
# presumably it arrives through the ddi star-imports; confirm.
input_embed_dim = None
num_attn_heads = 2
num_transformer_units = 1
p_dropout = 0.3
nonlin_func = nn.ReLU()
mlp_embed_factor = 2
pooling_mode = 'attn'
dist_opt = 'cosine'
l2_reg = 1e-6
batch_size = 1000
num_epochs = 100
loss_w = 0.05
margin_v = 1.

In [None]:
# Overrides applied only for DS3 with the CRD interaction matrix. Compared with
# the defaults set in the previous cell, the values that change are
# num_attn_heads, p_dropout, l2_reg, batch_size and num_epochs.
# The overrides stay in the kernel afterwards: re-run the defaults cell when
# switching to another dataset.
if (DSdataset_name == 'DS3' and interact_matfname_DS3 == 'CRDInteractionMat'):
    input_embed_dim = None
    num_attn_heads = 4
    num_transformer_units = 1
    p_dropout = 0.45
    nonlin_func = nn.ReLU()
    mlp_embed_factor = 2
    pooling_mode = 'attn'
    dist_opt = 'cosine'
    l2_reg = 1e-8
    batch_size = 400
    num_epochs = 200
    loss_w = 0.05
    margin_v = 1.

In [None]:
#Transformer (Siamese)

# Unpacked positionally (`*hyperparam_opt`) into DDITrfHyperparamConfig by
# build_dditrf_config_map, so the order of the entries matters.
hyperparam_opt = (input_embed_dim,num_attn_heads, num_transformer_units, p_dropout, 
                  nonlin_func, mlp_embed_factor, pooling_mode, dist_opt,
                  l2_reg, batch_size, num_epochs)

In [None]:
# NDD

# fc1_dim, fc2_dim, p_dropout, l2_reg, batch_size, num_epochs = 400,300,0.5,0,200,20
# hyperparam_opt = (fc1_dim, fc2_dim, p_dropout, l2_reg, batch_size, num_epochs)

In [None]:
# Build the Transformer configuration from the hyperparameters chosen above.
# (The missing comma after loss_func made this cell a SyntaxError.)
mconfig, options = build_dditrf_config_map(input_dim=num_drugs+1,
                                           similarity_type=exp_iden,
                                           model_name='Transformer',
                                           hyperparam_opt=hyperparam_opt,
                                           loss_func='nllloss',
                                           margin=margin_v,
                                           loss_w=loss_w)
# mconfig, options = build_dditrf_config_map(input_dim=(num_drugs+1)*(len(similarity_types)+1)*2, 
#                                            similarity_type=exp_iden, 
#                                            model_name='NDD', #'Transformer' 
#                                            hyperparam_opt=hyperparam_opt, 
#                                            loss_func='bceloss', #'nllloss'
#                                            margin=0.5, 
#                                            loss_w=0.5)

In [None]:
# Display the generated model config and options for inspection.
mconfig, options

In [None]:
# Parent directory for run folders (tr_val_dir is created under it below).
# NOTE(review): unlike targetdata_dir, this path is built without the `up_dir`
# prefix, so it resolves one level lower — confirm this is intended.
exp_dir = create_directory(exp_iden, os.path.join(processed_dir, DSdataset_name, 'experiments'))

In [None]:
# Fold count taken from the partition mapping; passed to build_performance_dfs below.
num_folds=len(datatensor_partitions)

# Training

In [None]:
# Tuple handed to every worker process through create_q_process.
config_map = (mconfig, options)

In [None]:
# NOTE(review): identical to the earlier exp_dir cell; presumably repeated so the
# training section can be run on its own — confirm it is still needed.
exp_dir = create_directory(exp_iden, os.path.join(processed_dir, DSdataset_name, 'experiments'))

In [None]:
# Timestamped run directory under exp_dir, named like exp_2024-01-31_12-00-00.
time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
tr_val_dir = create_directory(f'exp_{time_stamp}', exp_dir)
tr_val_dir

In [None]:
def spawn_q_process(q_process):
    """Print a launch message, then call ``q_process.start()``."""
    print(">>> spawning hyperparam search process")
    q_process.start()
    
def join_q_process(q_process):
    """Block on ``q_process.join()``, then print a completion message."""
    q_process.join()
    print("<<< joined hyperparam search process")
    
def create_q_process(datatensor_partition, config_map, tr_val_dir, fold_gpu_map):
    """Build, without starting, a process targeting ``train_test_partition``.

    The four arguments are passed through positionally, in this order, as the
    target's arguments.
    """
    worker_args = (datatensor_partition, config_map, tr_val_dir, fold_gpu_map)
    return mp.Process(
        target=ddi.run_workflow.train_test_partition,
        args=worker_args,
    )

In [None]:
# Inspect the fold -> split -> partition mapping (the output can be long).
datatensor_partitions

In [None]:
# First fold handled by this run; fold (start_part + k) is mapped to GPU index k,
# giving one fold per available GPU.
start_part = 5

fold_gpu_map = {
    fold: gpu_index
    for gpu_index, fold in enumerate(range(start_part, start_part + n_gpu))
}
fold_gpu_map

In [None]:
import torch.multiprocessing as mp
# Use the "spawn" start method; force=True allows re-running this cell after a
# start method has already been set in the kernel.
mp.set_start_method("spawn", force=True)

queue = mp.Queue()  # NOTE(review): not referenced anywhere else in the visible notebook
q_processes = []

# Start one worker per fold in fold_gpu_map; each worker receives only its own
# fold's partitions plus the full fold -> GPU map.
for q_i in fold_gpu_map:
    q_process = create_q_process({q_i: datatensor_partitions[q_i]}, config_map, tr_val_dir, fold_gpu_map)
    q_processes.append(q_process)
    spawn_q_process(q_process)

# Join exactly the processes that were started, instead of indexing by
# range(n_gpu), so the cell stays correct if fold_gpu_map is edited to hold a
# different number of folds than there are GPUs.
for q_process in q_processes:
    join_q_process(q_process)

In [None]:
# Collect and show the metrics of the 'train' split for the run in tr_val_dir.
auc_df, aupr_df, f1_df = build_performance_dfs(
    similarity_types,
    os.path.relpath(tr_val_dir, '..'),
    num_folds,
    'train',
)

for perf_name, perf_df in zip(('auc', 'aupr', 'f1'), (auc_df, aupr_df, f1_df)):
    print(perf_name)
    display(perf_df)
    # First row of the 'mean' column.
    print(perf_df['mean'].values[0])
    print("|" * 25)

In [None]:
# Collect and show the metrics of the 'test' split for the run in tr_val_dir.
auc_df, aupr_df, f1_df = build_performance_dfs(
    similarity_types,
    os.path.relpath(tr_val_dir, '..'),
    num_folds,
    'test',
)

for perf_name, perf_df in zip(('auc', 'aupr', 'f1'), (auc_df, aupr_df, f1_df)):
    print(perf_name)
    display(perf_df)
    # First row of the 'mean' column.
    print(perf_df['mean'].values[0])
    print("|" * 25)