In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import ogb
from tqdm import tqdm
import hiplot as hip
from copy import deepcopy
import datetime

In [None]:
from joblib import dump as jdump
from joblib import load as jload

In [None]:
import torch

# Record the library and interpreter versions used for this run.
for _label, _version in (("torch:", torch.__version__), ("CUDA:", torch.version.cuda)):
    print(_label, _version)
print(sys.version)

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch.utils.data import Subset, TensorDataset

In [None]:
# Put the directory two levels above the notebook's working directory on
# sys.path (presumably so that `import deepadr` in the next cell resolves).
cwd = os.getcwd()
print(cwd)
cwd_parent = os.path.abspath(os.path.join(cwd, '../../'))
print(cwd_parent)

# Guarded so that re-running this cell does not stack duplicate entries
# on sys.path.
if cwd_parent not in sys.path:
    sys.path.append(cwd_parent)

In [None]:
import deepadr
from deepadr.dataset import *
from deepadr.utilities import *
from deepadr.run_workflow import *
from deepadr.chemfeatures import *
from deepadr.hyphelper import *
from deepadr.model_gnn_ogb import GNN, DeepAdr_SiameseTrf, ExpressionNN
from ogb.graphproppred import Evaluator

In [None]:
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

In [None]:
# Project data locations, relative to the notebook's working directory.
up_dir = '..'
rawdata_dir, processed_dir = '../data/raw/', '../data/processed/'

In [None]:
# Report the CUDA devices available to this process.
# NOTE(review): the helper is not defined in this notebook; it presumably comes
# from one of the `deepadr` star imports — confirm.
report_available_cuda_devices()

In [None]:
# Number of CUDA devices PyTorch reports; displayed as the cell output.
n_gpu = torch.cuda.device_count()
n_gpu

### Preparing dataset 

In [None]:
# Score name and value that identify the dataset variant; both are embedded in
# the dataset directory name (DrugComb_<score>_<score_val>).
score, score_val = 'total_thresh', 4

In [None]:
# Name of the processed dataset folder, derived from the score settings.
DSdataset_name = f'DrugComb_{score}_{score_val}'

# Data version to load:
#   v_1: GNN
#   v_2: alternative (baseline) models
data_fname = 'data_v2'

In [None]:
# Resolve the dataset root and its three sub-folders. The return value of
# `create_directory` is used as the path of the folder in the cells below.
targetdata_dir = create_directory(os.path.join(processed_dir, DSdataset_name, data_fname))
targetdata_dir_raw, targetdata_dir_processed, targetdata_dir_exp = (
    create_directory(os.path.join(targetdata_dir, subfolder))
    for subfolder in ("raw", "processed", "experiments")
)

In [None]:
# Load the three pre-computed inputs from the dataset's raw folder:
# flat per-entry features, the target vector and the expression features.
xFlat, y, expression = (
    ReaderWriter.read_data(os.path.join(targetdata_dir_raw, pkl_name))
    for pkl_name in ('X_flat.pkl', 'y.pkl', 'expression.pkl')
)

In [None]:
# Each value of xFlat is a sequence of tensors: concatenate it into one vector,
# then stack the vectors into a matrix with one row per entry.
xFlatMat = torch.stack([torch.cat(parts) for parts in xFlat.values()])
xFlatMat.shape

In [None]:
# Append the expression features to the flat features along dim 1 (columns),
# giving the combined feature matrix used by the baseline models.
x = torch.cat([xFlatMat, torch.tensor(expression)], dim=1)
x.shape

In [None]:
# Inspect the targets loaded from y.pkl.
y

In [None]:
# Build the cross-validation splits from the targets: 5 folds with a fixed seed.
# Each fold is indexed below by 'train', 'validation' and 'test'.
# NOTE(review): valid_set_portion=0.1 is presumably the fraction held out for
# validation — confirm against get_stratified_partitions.
fold_partitions = get_stratified_partitions(y,
                                            num_folds=5,
                                            valid_set_portion=0.1,
                                            random_state=42)

In [None]:
# Split sizes of the first fold, as a quick sanity check of the partitioning.
print(f"Number of training graphs: {len(fold_partitions[0]['train'])}")
print(f"Number of validation graphs: {len(fold_partitions[0]['validation'])}")
print(f"Number of testing graphs: {len(fold_partitions[0]['test'])}")


### Alternative Models (Baselines)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# NumPy view of the feature tensor for scikit-learn; the targets are used as loaded.
x_np, y_np = x.numpy(), y

In [None]:
# Show the folder that will receive the per-fold experiment outputs.
targetdata_dir_exp

In [None]:
# Fallback label used in the file name of the saved model.
# NOTE(review): the following cell reassigns model_name, so this value only
# takes effect if that cell is skipped.
model_name = "Default"

In [None]:
# Baseline estimator evaluated in the cross-validation loop below; `model_name`
# becomes the prefix of the saved joblib file.
# random_state is pinned (same seed as the fold partitioning) because the tree
# permutes the features at every split, so equally good splits can be resolved
# differently from run to run when the seed is left unset.
model = DecisionTreeClassifier(random_state=42)
model_name = "DecisionTree"

In [None]:
# Run tag: every fold directory created below ends with this time stamp, and
# the "Scores" cells use it to find this run's outputs again.
time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
print("Start: " + time_stamp)

for q_i in range(len(fold_partitions)):
    partition = fold_partitions[q_i]
    # Only the train and test index sets are used here; the validation split
    # of the partition is not touched.
    ids_train, ids_test = partition['train'], partition['test']

    # Standardise with statistics of the training rows only, then apply the
    # same transform to the test rows.
    x_np_train = np.take(x_np, ids_train, axis=0)
    scaler = StandardScaler().fit(x_np_train)

    part_train_x = scaler.transform(x_np_train)
    part_train_y = np.take(y_np, ids_train, axis=0)

    x_np_test = np.take(x_np, ids_test, axis=0)
    part_test_x = scaler.transform(x_np_test)
    part_test_y = np.take(y_np, ids_test, axis=0)

    # Per-fold output folder with sub-folders for predictions and the fitted model.
    exp_dir = create_directory(os.path.join(targetdata_dir_exp, f"fold_{q_i}_{time_stamp}"))
    for subdir in ("predictions", "modelStates"):
        create_directory(os.path.join(exp_dir, subdir))

    # Fit on the training rows and predict the test rows.
    model_fit = model.fit(part_train_x, part_train_y)
    prob_scores_arr = model_fit.predict_proba(part_test_x)
    pred_class = model_fit.predict(part_test_x)
    ref_class = part_test_y

    # A single evaluation pass per fold, logged as epoch 0 of the test split.
    epoch, dsettype = 0, 'test'

    # Column 1 of the probability matrix is passed as the score; the report is
    # also written to <exp_dir>/test.log, which the "Scores" cells read back.
    dset_perf = perfmetric_report(
        pred_class, ref_class, prob_scores_arr[:, 1], epoch,
        outlog=os.path.join(exp_dir, dsettype + ".log"),
    )

    # Persist the per-sample predictions and the fitted estimator of this fold.
    predictions_df = build_predictions_df(ids_test, ref_class, pred_class, prob_scores_arr)
    predictions_df.to_csv(
        os.path.join(exp_dir, 'predictions', f'epoch_{epoch}_predictions_{dsettype}.csv')
    )
    jdump(model_fit, os.path.join(exp_dir, 'modelStates', f'{model_name}_modelFit.joblib'))

print("End: " + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

### Scores

In [None]:
# Run tag of the cross-validation run whose scores are collected below.
time_stamp

In [None]:
import glob

# Collect the fold directories of this run (their names end with the run's
# time stamp).
#  - os.path.join builds the pattern instead of a hard-coded "/".
#  - glob.escape keeps special characters in the directory path from being
#    interpreted as wildcards.
#  - sorted() makes the fold order deterministic; glob returns matches in
#    arbitrary order.
exp_dirs = sorted(
    glob.glob(os.path.join(glob.escape(targetdata_dir_exp), "fold_*_" + time_stamp))
)
len(exp_dirs)

In [None]:
# 0-based line positions in test.log from which the two metrics are read; the
# values are stored in the order of the df_folds columns (AUPR, then AUC).
# NOTE(review): these positions depend on the layout written by
# perfmetric_report — confirm whenever that report format changes.
AUPR_LINE_IDX = 18
AUC_LINE_IDX = 22

folds = []

for edir in exp_dirs:
    # Directory names look like "fold_<k>_<time_stamp>"; extract <k>.
    # basename/normpath replaces split("/") so the parsing does not depend on
    # the path separator or on a trailing separator.
    fold = os.path.basename(os.path.normpath(edir)).split('_')[1]
    with open(os.path.join(edir, "test.log")) as f:
        lines = f.read().splitlines()
    folds.append([fold, float(lines[AUPR_LINE_IDX]), float(lines[AUC_LINE_IDX])])

In [None]:
# Per-fold metric table, indexed by fold id.
df_folds = pd.DataFrame(folds, columns=["Fold", "AUPR", "AUC"])
df_folds = df_folds.set_index("Fold")
df_folds

In [None]:
# Mean of each metric column across folds, as an array in column order (AUPR, AUC).
df_folds.mean(axis=0).values

In [None]:
# Pass the two fold-averaged metrics (AUPR, AUC) to F_score.
# NOTE(review): F_score is not defined in this notebook (presumably provided by
# a deepadr star import); confirm what it computes and its argument order.
F_score(*df_folds.mean(axis=0).values)