![Grid search workflow](https://scikit-learn.org/stable/_images/grid_search_workflow.png)

In [172]:
import numpy as np

import pickle

from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from keras.utils import np_utils

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring

In [152]:
# import configurations (file paths, etc.)
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
# Location of the YAML configuration file (a relative path, so it is resolved
# against the notebook's current working directory).
configFile = '../cluster/data/medinfmk/ddi/config/config.yml'

# Parse the configuration into `cfg`, which later cells index for file paths.
# NOTE(review): `Loader` is the full (non-safe) YAML loader imported above.
# If this file can ever come from an untrusted source, prefer a safe loader
# (e.g. yaml.safe_load) -- confirm before changing.
with open(configFile, 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=Loader)

In [153]:
# All directory settings live under the 'filePaths' section of the config;
# look that section up once and read the individual entries from it.
_filePathsCfg = cfg['filePaths']

pathInput = _filePathsCfg['dirRaw']
pathOutput = _filePathsCfg['dirProcessed']

# Directory for Python binary files (pickles), kept so that intermediate
# results do not have to be recalculated on every run.
pathPickles = _filePathsCfg['dirProcessedFiles']['dirPickles']

# List of dataset directories; the first entry is used as dataset 1.
datasetDirs = _filePathsCfg['dirRawDatasets']
DS1_path = str(datasetDirs[0])

In [154]:
def prepare_data(input_fea, input_lab, seperate=False):
    """Build one feature vector and one label for every ordered drug pair.

    Parameters
    ----------
    input_fea : str or path-like
        Comma-separated file of floats; row ``k`` is used as the feature
        vector of drug ``k``.
    input_lab : str or path-like
        Comma-separated file of integers; entry ``[i, j]`` is the label of
        the pair ``(i, j)``.
    seperate : bool, default False
        (Spelling kept so existing callers keep working.) If True, the two
        drugs' feature vectors are kept apart and the result has shape
        ``(n_pairs, 2, n_features)``; otherwise they are concatenated into
        one row of length ``2 * n_features``.

    Returns
    -------
    train : numpy.ndarray
        Pair features, ordered row-major over ``(i, j)`` (``j`` varies
        fastest).
    label : numpy.ndarray
        Flat array of the labels in the same pair order.

    Raises
    ------
    ValueError
        If the feature file has fewer rows than the label matrix has rows or
        columns, so some drug index would have no feature vector.
    """
    # ndmin=2 keeps single-row / single-column files two-dimensional.
    drug_fea = np.loadtxt(input_fea, dtype=float, delimiter=",", ndmin=2)
    interaction = np.loadtxt(input_lab, dtype=int, delimiter=",", ndmin=2)

    n_rows, n_cols = interaction.shape
    if drug_fea.shape[0] < max(n_rows, n_cols):
        raise ValueError(
            "feature matrix has %d rows but the label matrix is %d x %d"
            % (drug_fea.shape[0], n_rows, n_cols)
        )

    # Pair k = i * n_cols + j. The first half of each sample is drug i's
    # features and the second half is drug j's features. (Using drug i for
    # both halves would make every pair in a row indistinguishable.)
    first_drug = np.repeat(drug_fea[:n_rows], n_cols, axis=0)
    second_drug = np.tile(drug_fea[:n_cols], (n_rows, 1))

    if seperate:
        train = np.stack((first_drug, second_drug), axis=1)
    else:
        train = np.concatenate((first_drug, second_drug), axis=1)

    # flatten() is row-major and returns a copy, matching the pair order above.
    label = interaction.flatten()
    return train, label

In [155]:
def transfer_array_format(data):
    """Split two-part samples into two parallel arrays.

    Each element of ``data`` is indexed at positions 0 and 1; the position-0
    items are collected into the first returned array and the position-1
    items into the second, preserving the input order.
    """
    # Walk the input once, pulling out both parts of every sample.
    halves = [(sample[0], sample[1]) for sample in data]
    first_parts = np.array([pair[0] for pair in halves])
    second_parts = np.array([pair[1] for pair in halves])
    return first_parts, second_parts

In [156]:
def preprocess_labels(labels, encoder=None, categorical=True):
    """Encode labels as integers and optionally one-hot encode them.

    Parameters
    ----------
    labels : array-like
        Raw labels.
    encoder : object with ``transform``, optional
        Already-fitted label encoder to reuse. When omitted, a new
        ``LabelEncoder`` is fitted on ``labels``.
    categorical : bool, default True
        If True, the integer codes are passed through
        ``np_utils.to_categorical``.

    Returns
    -------
    y : numpy.ndarray
        The int32 codes, or their categorical form when ``categorical`` is
        True.
    encoder : object
        The encoder that produced the codes (the supplied one, or the newly
        fitted one).
    """
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(labels)
    # Transform outside the branch above: a caller-supplied encoder must be
    # applied too, otherwise `y` would never be assigned.
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder

In [157]:
def preprocess_names(labels, encoder=None, categorical=True):
    """Optionally convert ``labels`` to categorical form and return an encoder.

    When no (truthy) ``encoder`` is supplied, a new ``LabelEncoder`` is
    fitted on ``labels``. Note that the encoder is only fitted and returned;
    it is not applied to ``labels`` here. With ``categorical`` set, the
    labels are passed directly to ``np_utils.to_categorical``; otherwise they
    are returned unchanged.

    Returns a ``(labels, encoder)`` tuple.
    """
    if not encoder:
        encoder = LabelEncoder().fit(labels)
    converted = np_utils.to_categorical(labels) if categorical else labels
    return converted, encoder

In [158]:
# input_fea = pathInput+DS1_path+"/offsideeffect_Jacarrd_sim.csv"
# ###input_fea = pathOutput+"/finalsimddd.txt"
# input_lab = pathInput+DS1_path+"/drug_drug_matrix.csv"
# X, y = prepare_data(input_fea, input_lab, seperate = True)

In [159]:
# X_data1, X_data2 = transfer_array_format(X)
# X = np.concatenate((X_data1, X_data2), axis = 1)
# model_input_dim = X.shape[1]
# ###Y, encoder = preprocess_labels(y)

In [160]:
# dataPicklePath = pathPickles+"/data_X_y.p"

# with open(dataPicklePath, 'wb') as f:
#     pickle.dump([X, y], f)

In [161]:
# Location of the cached feature/label arrays. Assigned in this cell so that
# loading works on a fresh kernel (Restart & Run All) instead of relying on
# a value left over from an earlier session.
dataPicklePath = pathPickles+"/data_X_y.p"

# NOTE: pickle.load can execute arbitrary code -- only load files that this
# project wrote itself.
with open(dataPicklePath, 'rb') as f:
    X, y = pickle.load(f)

In [162]:
# # X, y = make_classification(1500, 1000, n_informative=10, random_state=0)
# Cast to float32 features and int64 class indices, the dtypes used by
# torch's default layer parameters and by CrossEntropyLoss targets.
X = X.astype(np.float32)
y = y.astype(np.int64)

# Width of one input row. The network class and the model parameters further
# down read this name, so it has to be defined before those cells run.
model_input_dim = X.shape[1]

# tX = torch.from_numpy(X).type(torch.float32)
# ty = torch.from_numpy(y).type(torch.int64)

# dataSet = TensorDataset(tX, ty)
# dataLoader = DataLoader(dataSet)

In [163]:
# def report_available_cuda_devices():
#     n_gpu = torch.cuda.device_count()
#     print('number of GPUs available:', n_gpu)
#     for i in range(n_gpu):
#         print("cuda:{}, name:{}".format(i, torch.cuda.get_device_name(i)))
#         device = torch.device('cuda', i)
#         get_cuda_device_stats(device)
#         print()
        
# def get_cuda_device_stats(device):
#     print('total memory available:', torch.cuda.get_device_properties(device).total_memory/(1024**3), 'GB')
#     print('total memory allocated on device:', torch.cuda.memory_allocated(device)/(1024**3), 'GB')
#     print('max memory allocated on device:', torch.cuda.max_memory_allocated(device)/(1024**3), 'GB')
#     print('total memory cached on device:', torch.cuda.memory_cached(device)/(1024**3), 'GB')
#     print('max memory cached  on device:', torch.cuda.max_memory_cached(device)/(1024**3), 'GB')

In [164]:
class NDD(nn.Module):
    """Feed-forward network with two hidden layers and dropout.

    Parameters
    ----------
    D_in : int
        Size of one input row. The default is the notebook-level
        ``model_input_dim``, captured when this class statement runs.
    H1, H2 : int
        Output sizes of the first and second fully connected layers.
    D_out : int
        Output size of the final layer.
    drop : float
        Value passed to ``nn.Dropout``.
    """

    def __init__(self, D_in=model_input_dim, H1=500, H2=300, D_out=2, drop=0.5):
        super().__init__()
        # Fully connected (affine, y = Wx + b) layers: D_in -> H1 -> H2 -> D_out.
        self.fc1 = nn.Linear(D_in, H1)
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        # A single dropout module, reused after each hidden activation.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Return the output of the last linear layer (no final activation)."""
        hidden1 = self.drop(F.relu(self.fc1(x)))
        hidden2 = self.drop(F.relu(self.fc2(hidden1)))
        return self.fc3(hidden2)

In [165]:
# Params

# Model
D_in = model_input_dim  # input width
H1 = 400                # first hidden layer
H2 = 300                # second hidden layer
D_out = 2               # network outputs
drop = 0.5              # dropout value
# Training
#batch_size, epochs = 100, 20
#print_iter = int(epochs / 10)
# SGD
#learning_rate, momentum, weight_decay, nesterov = 0.01, 0.9, 1e-6, True

# Instantiate the network with the parameters chosen above.
model = NDD(D_in=D_in, H1=H1, H2=H2, D_out=D_out, drop=drop)

# if torch.cuda.device_count() > 1:
#   print("Let's use", torch.cuda.device_count(), "GPUs!")
#   # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
#   model = nn.DataParallel(model)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# #device = "cpu"
# model.to(device)

In [166]:
#device = torch.device("cpu")

In [173]:
# Per-epoch ROC AUC scoring callback; a higher score counts as better.
auc = EpochScoring(
    scoring='roc_auc',
    lower_is_better=False,
)

In [174]:
# Hand skorch the module *class* together with explicit module__* arguments.
# When an already-built module instance is passed and a search later sets
# module__H1 / module__H2, skorch rebuilds the module from those parameters
# alone, so the other constructor arguments chosen for the instance would be
# silently replaced by the class defaults.
net = NeuralNetClassifier(
    NDD,
    module__D_in=D_in,
    module__H1=H1,
    module__H2=H2,
    module__D_out=D_out,
    module__drop=drop,
    criterion=nn.CrossEntropyLoss,
    #max_epochs=10,
    #lr=0.1,
    callbacks=[auc],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    device=device,
)

In [175]:
# pipe = Pipeline([
#     ('net', net),
# ])

# pipe.fit(X, y)
# y_proba = pipe.predict_proba(X)

In [176]:
# for data in dataLoader:
#     X,y = data
#     X = X.to(device)
#     y = y.to(device)
#     print("Outside: input size", X.size(), y.size(), X.device, y.device)

In [177]:
# Hyper-parameter grid; the module__ prefix routes a value to the network's
# constructor, the other keys go to the skorch estimator itself.
params = dict(
    lr=[0.1],
    max_epochs=[5],
    module__H1=[300],
    module__H2=[200, 100],
)

# 3-fold search scored by accuracy; refit=False means no final model is
# trained on the full data after the search.
gs = GridSearchCV(
    estimator=net,
    param_grid=params,
    refit=False,
    cv=3,
    scoring='accuracy',
)
gs.fit(X, y)

print(gs.best_score_, gs.best_params_)

Re-initializing module because the following parameters were re-set: H1, H2.
Re-initializing module because the following parameters were re-set: H1, H2.
  epoch    roc_auc    train_loss    valid_acc    valid_loss     dur
-------  ---------  ------------  -----------  ------------  ------
      1     [36m0.6455[0m        [32m0.5812[0m       [35m0.6764[0m        [31m0.6050[0m  3.5251
      2     [36m0.6872[0m        [32m0.5529[0m       0.6764        0.7468  3.5309
      3     0.4552        [32m0.5422[0m       0.6764        0.9118  3.1823
      4     0.5310        [32m0.5350[0m       0.6764        0.7842  3.8062
      5     0.4786        [32m0.5296[0m       0.6764        0.9074  3.6634
Re-initializing module because the following parameters were re-set: H1, H2.
Re-initializing module because the following parameters were re-set: H1, H2.
  epoch    roc_auc    train_loss    valid_acc    valid_loss     dur
-------  ---------  ------------  -----------  ------------  ------