In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import ogb
from tqdm import tqdm
import hiplot as hip
from copy import deepcopy
import datetime

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch.utils.data import Subset, TensorDataset

In [None]:
# Resolve the notebook's working directory and its parent directory.
cwd = os.getcwd()
print(cwd)
cwd_parent = os.path.abspath(os.path.join(cwd, os.pardir))
print(cwd_parent)

# Make the parent directory importable (the `deepadr` import further down
# presumably relies on this). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
if cwd_parent not in sys.path:
    sys.path.append(cwd_parent)

In [None]:
import deepadr
from deepadr.dataset import *
from deepadr.utilities import *
from deepadr.chemfeatures import *
from deepadr.train_functions import *
from deepadr.model_gnn_ogb import GNN, DeepAdr_SiameseTrf, ExpressionNN

In [None]:
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

In [None]:
# Locations relative to the notebook's working directory:
# the parent folder, then the raw and processed data folders beneath it.
up_dir = '..'
rawdata_dir, processed_dir = '../data/raw/', '../data/processed/'

In [None]:
# Judging by its name, this reports the CUDA devices visible to this process.
# The helper is not defined in this notebook; presumably it comes from one of
# the `deepadr` wildcard imports — TODO confirm.
report_available_cuda_devices()

In [None]:
# Number of CUDA devices PyTorch can see. The hyperparameter-search cell
# starts at most this many worker processes at the same time.
n_gpu = torch.cuda.device_count()
n_gpu

In [None]:
# Device handle requested with to_gpu=False (i.e. a CPU device, assuming
# `get_device` behaves as its arguments suggest — TODO confirm).
device_cpu = get_device(to_gpu=False)
# GPU variant kept for reference; the search workers receive a GPU index instead.
# device_gpu = get_device(True, index=0)

In [None]:
# Record the software environment alongside the results:
# PyTorch version, the CUDA version PyTorch was built with, and the Python version.
print("torch:", torch.__version__)
print("CUDA:", torch.version.cuda)
print(sys.version)

### Preparing dataset 

In [None]:
# Label definition used to pick the dataset variant. Valid combinations:
#   score = 'total_thresh'                                             with score_val in {4, 3, 2}
#   score in {'loewe_thresh', 'hsa_thresh', 'bliss_thresh', 'zip_thresh'} with score_val = 1

score = 'total_thresh'
score_val = 4

In [None]:
# Folder name of the dataset variant, derived from the score settings chosen above.
DSdataset_name = f'DrugComb_{score}_{score_val}'

# Sub-folder holding this version of the processed data.
data_fname = 'data_v1' # v2 for baseline models

In [None]:
# Root folder of the selected dataset variant; `create_directory` is expected
# to return the path it was given (its result is reused as a path below).
targetdata_dir = create_directory(os.path.join(processed_dir, DSdataset_name, data_fname))

# Sub-folders beneath the root, created in this order: raw, processed, experiments.
targetdata_dir_raw, targetdata_dir_processed, targetdata_dir_exp = (
    create_directory(os.path.join(targetdata_dir, subfolder))
    for subfolder in ("raw", "processed", "experiments")
)
# # ReaderWriter.dump_data(dpartitions, os.path.join(targetdata_dir, 'data_partitions.pkl'))
print(targetdata_dir)

In [None]:
%%time

# Prerequisite: run the "DDoS_Dataset_Generation" notebook first; presumably it
# produces the files under `targetdata_dir` that this dataset is loaded from.

dataset = MoleculeDataset(root=targetdata_dir)

In [None]:
# Short summary of the loaded dataset, one item per line after a leading blank line.
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

In [None]:
# used_dataset = dataset

# To experiment on a smaller subset, raise the divisor below (1 keeps the full
# dataset); the subset is always the leading `smaller_dataset_len` entries.
smaller_dataset_len = int(len(dataset)/1)
used_dataset = dataset[:smaller_dataset_len]

In [None]:
# Split the labels into 5 folds with a fixed random_state; each fold is indexed
# below by 'train', 'validation' and 'test'.
# The labels are read from `used_dataset.data.y` and cut to the same prefix
# length that was used to build `used_dataset`.
# NOTE(review): presumably `.data.y` holds the labels of the full underlying
# dataset rather than of the slice, which is why it is sliced again — confirm.
fold_partitions = get_stratified_partitions(used_dataset.data.y[:smaller_dataset_len],
                                            num_folds=5, valid_set_portion=0.1, random_state=42)

In [None]:
# Sizes of the three index sets in the first fold.
print(f"Number of training graphs: {len(fold_partitions[0]['train'])}")
print(f"Number of validation graphs: {len(fold_partitions[0]['validation'])}")
print(f"Number of testing graphs: {len(fold_partitions[0]['test'])}")


In [None]:
# Baseline training/model parameters. This dict is handed to `generate_tp_hp`
# together with one hyperparameter combination for every experiment; presumably
# entries named in `hp_names` are overridden per run — TODO confirm.
tp = dict(
    # batching / schedule length
    batch_size=300,
    num_epochs=50,

    # graph network settings
    emb_dim=300,
    gnn_type="gatv2",
    num_layer=5,
    graph_pooling="mean",  # alternative noted by the author: attention

    # embedding / attention settings
    input_embed_dim=None,
    gene_embed_dim=1,
    num_attn_heads=2,
    num_transformer_units=1,
    p_dropout=0.3,
    # nonlin_func=nn.ReLU(),
    mlp_embed_factor=2,
    pooling_mode='attn',
    dist_opt='cosine',

    # optimisation settings
    base_lr=3e-4,  # 3e-4
    max_lr_mul=5,
    l2_reg=1e-5,
    loss_w=1.,
    margin_v=1.,

    # expression-network settings
    expression_dim=64,
    expression_input_size=908,
    exp_H1=4096,
    exp_H2=2048,
)

In [None]:
# Candidate values for each searched hyperparameter. Every variable named in
# `hp_names` must be defined here as a list, because the search space is built
# by looking these names up in globals(). Commented-out entries are grids that
# are currently not searched.
# input_embed_dim = [128]
# num_attn_heads = [2] # 2,4
# num_transformer_units = [1]
emb_dim = [100]
num_layer = [5]
p_dropout = [0.3]
# nonlin_func = [nn.ReLU()]
# mlp_embed_factor = [2]
# pooling_mode = ['attn']
# dist_opt = ['cosine']
l2_reg = [1e-7] #0
# batch_size = [300]
# num_epochs = [200]
# loss_w = [0.95] # 0.05, 
base_lr = [3e-4]
max_lr_mul = [10]
exp_H1 = [512, 1024, 2048, 4096]
exp_H2 = [512, 1024]


In [None]:
# Names of the hyperparameters being searched. The order matters: it fixes the
# position of each value inside every tuple of `hyperparam_space`.
hp_names = [
    "num_layer",
    "p_dropout",
    "l2_reg",
    "emb_dim",
    # "loss_w",
    # "num_transformer_units",
    # "batch_size",
    "base_lr",
    "max_lr_mul",
    "exp_H1",
    "exp_H2",
]

In [None]:
# Sanity check: show the candidate list of every searched hyperparameter, in `hp_names` order.
[globals()[i] for i in hp_names]

In [None]:
# Imported explicitly: no import visible in this notebook brings `itertools`
# into scope, so the cell would otherwise depend on a wildcard import leaking it.
import itertools

# Cartesian product of the candidate lists; each tuple holds one value per
# hyperparameter, ordered like `hp_names`.
hyperparam_space = list(itertools.product(*(globals()[name] for name in hp_names)))
print(len(hyperparam_space))

In [None]:
# First hyperparameter combination, as a quick look at the tuple layout.
hyperparam_space[0]

In [None]:
def spawn_q_process(q_process):
    """Log a start marker and start one hyperparameter-search worker.

    Args:
        q_process: Object exposing ``start()``; in this notebook, the process
            returned by ``create_q_process``.
    """
    print(">>> spawning hyperparam search process")
    q_process.start()
    
def join_q_process(q_process):
    """Block until one hyperparameter-search worker exits, then log a marker.

    Args:
        q_process: Object exposing ``join()``; in this notebook, a process
            previously started through ``spawn_q_process``.
    """
    q_process.join()
    print("<<< joined hyperparam search process")
    
def create_q_process(queue, used_dataset, gpu_num, tphp, exp_dir, partition):
    """Build, without starting, a worker process for one experiment.

    The process targets ``deepadr.train_functions.run_exp`` and receives the
    six arguments unchanged and in the same order. ``mp`` is looked up when the
    function is called, so it must have been imported by then.

    Returns:
        The ``mp.Process`` instance; the caller is responsible for starting it.
    """
    worker_args = (queue, used_dataset, gpu_num, tphp, exp_dir, partition)
    return mp.Process(target=deepadr.train_functions.run_exp, args=worker_args)

In [None]:
import torch.multiprocessing as mp

# Workers are started with "spawn" rather than "fork" (the usual requirement
# for child processes that use CUDA).
mp.set_start_method("spawn", force=True)

queue = mp.Queue()        # workers report the GPU index they have released
q_processes = []          # every worker process, in launch order
gpu_owner = {}            # GPU index -> worker currently running on it

# Only the first fold is used for the hyperparameter search.
partition = fold_partitions[0]


def _launch_experiment(hp_index, gpu_num):
    """Create the experiment folder for `hyperparam_space[hp_index]` and start
    a worker for it on GPU `gpu_num`. Returns the started process."""
    time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    exp_dir = create_directory(os.path.join(targetdata_dir_exp, "exp_"+str(hp_index)+"_"+time_stamp))
    create_directory(os.path.join(exp_dir, "predictions"))
    tphp = generate_tp_hp(tp, hyperparam_space[hp_index], hp_names)

    q_process = create_q_process(queue, used_dataset, gpu_num, tphp, exp_dir, partition)
    q_processes.append(q_process)
    gpu_owner[gpu_num] = q_process
    spawn_q_process(q_process)
    return q_process


# Without a GPU nothing could be launched and the wait loop below would never
# receive a result; fail early with a clear message instead.
if hyperparam_space and n_gpu < 1:
    raise RuntimeError("hyperparameter search needs at least one CUDA device (n_gpu == 0)")

print("Start: " + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

# Initial wave: one experiment per GPU (or fewer if the grid is smaller).
for q_i in range(min(n_gpu, len(hyperparam_space))):
    _launch_experiment(q_i, q_i)

# Number of combinations launched so far == index of the next one to launch.
spawned_processes = len(q_processes)

# Each experiment is expected to put exactly one GPU index on the queue when it
# finishes (assumption about `run_exp` — TODO confirm), so collect one result
# per hyperparameter combination.
for _ in range(len(hyperparam_space)):
    # Read the queue BEFORE joining: this reacts to whichever worker finishes
    # first and avoids joining a process that still has data queued.
    released_gpu_num = queue.get()
    print("released_gpu_num:", released_gpu_num)

    # Wait for the worker that owned this GPU to exit before reusing the device.
    finished_process = gpu_owner.pop(released_gpu_num, None)
    if finished_process is not None:
        join_q_process(finished_process)

    if spawned_processes < len(hyperparam_space):
        # Launch the next *unlaunched* combination on the freed GPU.
        _launch_experiment(spawned_processes, released_gpu_num)
        spawned_processes = spawned_processes + 1

# Safety net: make sure no worker is left unjoined (no-op for those already joined).
for q_process in q_processes:
    q_process.join()

In [None]:
# Timestamp printed when this cell runs, to compare against the "Start:" line of the search cell.
print("End: " + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))


### Hyperparameter search results

In [None]:
import glob
# Collect the experiment folders to summarise.
# NOTE(review): this looks inside a "hyp" sub-folder, whereas the search cell
# creates its "exp_*" folders directly under `targetdata_dir_exp` — presumably
# finished runs are moved into "hyp" by hand; confirm, otherwise this finds nothing.
exp_dirs = glob.glob(targetdata_dir_exp+"/hyp/exp_*")
len(exp_dirs)

In [None]:
# One dict per experiment: its hyperparameters plus the best test AUPR reached.
exp_results = []

for edir in exp_dirs:
    print(edir)

    hp_path = os.path.join(edir, "hyperparameters.json")
    curves_path = os.path.join(edir, "curves.csv")

    # An interrupted or failed run may lack one of the files; skip it (visibly)
    # rather than aborting the whole summary.
    if not (os.path.isfile(hp_path) and os.path.isfile(curves_path)):
        print("  skipped: hyperparameters.json or curves.csv is missing")
        continue

    # `typ` must be the documented lowercase value "series".
    hp = pd.read_json(hp_path, typ="series").to_dict()
    # Best value of the 'test_aupr' column over the recorded training curve.
    hp['maxTestAUPR'] = pd.read_csv(curves_path)['test_aupr'].max()
    exp_results.append(hp)

In [None]:
# Interactive HiPlot view of all collected experiments (hyperparameters plus maxTestAUPR).
hip.Experiment.from_iterable(exp_results).display(force_full_width=False)