![](https://scikit-learn.org/stable/_images/grid_search_workflow.png)

In [1292]:
import numpy as np

import pickle

from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, matthews_corrcoef

from keras.utils import np_utils

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import SGD

from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
from skorch.callbacks import TensorBoard

In [1293]:
# import configurations (file paths, etc.)
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
configFile = '../cluster/data/medinfmk/ddi/config/config.yml'

with open(configFile, 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=Loader)

In [1294]:
pathInput = cfg['filePaths']['dirRaw']
pathOutput = cfg['filePaths']['dirProcessed']
# path to store python binary files (pickles)
# in order not to recalculate them every time
pathPickles = cfg['filePaths']['dirProcessedFiles']['dirPickles']
pathRuns = cfg['filePaths']['dirProcessedFiles']['dirRuns']
datasetDirs = cfg['filePaths']['dirRawDatasets']
DS1_path = str(datasetDirs[0])

In [1295]:
# !tensorboard --logdir ../cluster/data/medinfmk/ddi/processed/runs/

In [1296]:
# def prepare_data(input_fea, input_lab, seperate=False):
#     offside_sim_path = input_fea
#     drug_interaction_matrix_path = input_lab
#     drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=",")
#     interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=",")
#     #print(drug_fea.shape)
#     #print(interaction.shape)
#     #return
#     train = []
#     label = []
#     tmp_fea=[]
#     drug_fea_tmp = []
#     for i in range(0, interaction.shape[0]):
#         for j in range(0, interaction.shape[1]):
#             label.append(interaction[i,j])
#             drug_fea_tmp = list(drug_fea[i])
#             if seperate:
        
#                  tmp_fea = (drug_fea_tmp,drug_fea_tmp)

#             else:
#                  tmp_fea = drug_fea_tmp + drug_fea_tmp
#             train.append(tmp_fea)

#     return np.array(train), np.array(label)

In [1297]:
def prepare_data(input_fea, input_lab, seperate=False):
    offside_sim_path = input_fea
    drug_interaction_matrix_path = input_lab
    drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=",")
    interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=",")
    #print(drug_fea.shape)
    #print(interaction.shape)
    #return
    train = []
    label = []
    tmp_fea=[]
    drug_fea_tmp = []
            
    for i in range(0, (interaction.shape[0]-1)):
        for j in range((i+1), interaction.shape[1]):
            #print(i,j)
    #return
            label.append(interaction[i,j])
            drug_fea_tmp_1 = list(drug_fea[i])
            drug_fea_tmp_2 = list(drug_fea[j])
            if seperate:
                 tmp_fea = (drug_fea_tmp_1,drug_fea_tmp_2)
            else:
                 tmp_fea = drug_fea_tmp_1 + drug_fea_tmp_2
            train.append(tmp_fea)

    return np.array(train), np.array(label)

In [1298]:
def transfer_array_format(data):
    formated_matrix1 = []
    formated_matrix2 = []
    for val in data:
        formated_matrix1.append(val[0])
        formated_matrix2.append(val[1])
    return np.array(formated_matrix1), np.array(formated_matrix2)

In [1299]:
def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
        print(y)
    return y, encoder

In [1300]:
def preprocess_names(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    if categorical:
        labels = np_utils.to_categorical(labels)
    return labels, encoder

In [1301]:
#X_prep = np.repeat(np.arange(1,6),5).reshape((-1,5))

In [1302]:
#y_prep = np.random.binomial(1, 0.5, size = 25).reshape((5,5))
#y_prep = np.arange(0,25).reshape((5,5))

In [1303]:
input_fea = pathInput+DS1_path+"/offsideeffect_Jacarrd_sim.csv"
###input_fea = pathInput+DS1_path+"/dummy/X_dummy.csv"
###input_fea = pathInput+DS1_path+"/chem_Jacarrd_sim.csv"
###input_fea = pathOutput+"/finalsimddd.txt"
input_lab = pathInput+DS1_path+"/drug_drug_matrix.csv"
###input_lab = pathInput+DS1_path+"/dummy/y_dummy.csv"

In [1304]:
# def check_symmetric(a, rtol=1e-05, atol=1e-08):
#     return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [1305]:
# np.savetxt(input_fea, X_prep.astype(int), fmt='%i', delimiter=",")
# np.savetxt(input_lab, y_prep.astype(int), fmt='%i', delimiter=",")

In [1306]:
# X,y = prepare_data(input_fea, input_lab, seperate = False)

In [1307]:
# X.shape

(149878, 1096)

In [1308]:
# y.shape

(149878,)

In [1309]:
#X_data1, X_data2 = transfer_array_format(X)
#X = np.concatenate((X_data1, X_data2), axis = 1)
###Y, encoder = preprocess_labels(y)

In [1310]:
#dataPicklePath = pathPickles+"/data_X_y_chem_Jaccard.p"
dataPicklePath = pathPickles+"/data_X_y_offside_Jaccard.p"
#dataPicklePath = pathPickles+"/data_X_y_SNFmat.p"

with open(dataPicklePath, 'wb') as f:
    pickle.dump([X, y], f)

In [1311]:
# with open(dataPicklePath, 'rb') as f:
#     X, y = pickle.load(f)

In [1312]:
# # X, y = make_classification(1500, 1000, n_informative=10, random_state=0)
X = X.astype(np.float32)
y = y.astype(np.int64)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [1313]:
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [1314]:
# tX = torch.from_numpy(X).type(torch.float32)
# ty = torch.from_numpy(y).type(torch.int64)

# dataSet = TensorDataset(tX, ty)
# dataLoader = DataLoader(dataSet)

In [1315]:
# def report_available_cuda_devices():
#     n_gpu = torch.cuda.device_count()
#     print('number of GPUs available:', n_gpu)
#     for i in range(n_gpu):
#         print("cuda:{}, name:{}".format(i, torch.cuda.get_device_name(i)))
#         device = torch.device('cuda', i)
#         get_cuda_device_stats(device)
#         print()
        
# def get_cuda_device_stats(device):
#     print('total memory available:', torch.cuda.get_device_properties(device).total_memory/(1024**3), 'GB')
#     print('total memory allocated on device:', torch.cuda.memory_allocated(device)/(1024**3), 'GB')
#     print('max memory allocated on device:', torch.cuda.max_memory_allocated(device)/(1024**3), 'GB')
#     print('total memory cached on device:', torch.cuda.memory_cached(device)/(1024**3), 'GB')
#     print('max memory cached  on device:', torch.cuda.max_memory_cached(device)/(1024**3), 'GB')

In [1316]:
class NDD(nn.Module):
    def __init__(self, D_in=model_input_dim, H1=400, H2=300, D_out=2, drop=0.5):
        super(NDD, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(D_in, H1) # Fully Connected
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = F.relu(self.fc2(x))
        x = self.drop(x)
        x = self.fc3(x)
        return x

In [1317]:
# Params

# Model
model_input_dim = X.shape[1]
D_in, H1, H2, D_out, drop = model_input_dim, 400, 300, 2, 0.5
# Training
#batch_size, epochs = 100, 20
#print_iter = int(epochs / 10)
# SGD
#learning_rate, momentum, weight_decay, nesterov = 0.01, 0.9, 1e-6, True

# Construct our model by instantiating the class defined above
model = NDD(D_in, H1, H2, D_out, drop)

# if torch.cuda.device_count() > 1:
#   print("Let's use", torch.cuda.device_count(), "GPUs!")
#   # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
#   model = nn.DataParallel(model)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# #device = "cpu"
# model.to(device)

writer = SummaryWriter(pathRuns+"test_40epochs_100batch_optim")

In [1318]:
#device = torch.device("cpu")

In [1319]:
callbacks = []

In [1320]:
#auc = EpochScoring(scoring='roc_auc', lower_is_better=False)
#callbacks.append(auc)

In [1321]:
callbacks.append(TensorBoard(writer))

In [1322]:
#optimizer=SGD(momentum=0.9, weight_decay=1e-6, nesterov=True)

In [1323]:
net = NeuralNetClassifier(
    model,
    criterion=nn.CrossEntropyLoss,
    max_epochs=20,
    optimizer=SGD,
    optimizer__lr=0.01,
    optimizer__momentum=0.9,    
    optimizer__weight_decay=1e-6,    
    optimizer__nesterov=True,    
    batch_size=100,
    callbacks=callbacks,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    device=device,
)

In [1324]:
# pipe = Pipeline([
#     ('net', net),
# ])

# pipe.fit(X, y)
# y_proba = pipe.predict_proba(X)

In [1325]:
# for data in dataLoader:
#     X,y = data
#     X = X.to(device)
#     y = y.to(device)
#     print("Outside: input size", X.size(), y.size(), X.device, y.device)

In [1326]:
# params = {
#     'lr': [0.1],
#     'max_epochs': [5],
#     'module__H1': [300],
#     'module__H2': [200, 100],
# }
# gs = GridSearchCV(net, params, refit=True, cv=3, scoring='accuracy')

# gs.fit(X_train, y_train)
# print(gs.best_score_, gs.best_params_)

In [1327]:
# y_pred = gs.predict(X_test)

In [1328]:
net.fit(X_train, y_train)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.5503[0m       [32m0.6003[0m        [35m0.6463[0m  3.0106
      2        [36m0.4986[0m       [32m0.7117[0m        [35m0.5407[0m  3.0374
      3        [36m0.4904[0m       0.6668        0.5783  2.9714
      4        [36m0.4861[0m       0.7091        0.5552  2.5985
      5        [36m0.4861[0m       [32m0.7429[0m        [35m0.5060[0m  3.0570
      6        [36m0.4804[0m       0.7058        0.5437  3.0792
      7        0.4806       [32m0.7637[0m        [35m0.4919[0m  2.9267
      8        0.4823       [32m0.7676[0m        [35m0.4918[0m  2.9078
      9        [36m0.4783[0m       0.7648        0.4933  3.0386
     10        [36m0.4775[0m       0.7605        0.4976  2.9310
     11        [36m0.4766[0m       0.7578        0.4921  3.2826
     12        [36m0.4664[0m       0.7622        [35m0.4890[0m  3.1057
     13      

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=NDD(
    (fc1): Linear(in_features=1096, out_features=400, bias=True)
    (fc2): Linear(in_features=400, out_features=300, bias=True)
    (fc3): Linear(in_features=300, out_features=2, bias=True)
    (drop): Dropout(p=0.5, inplace=False)
  ),
)

In [1329]:
y_pred = net.predict(X_test)

In [1330]:
roc_auc_score(y_test, y_pred), f1_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred)

(0.6829205482702533,
 0.6000645577792124,
 0.4371237772761475,
 0.9566694112803623)