![](https://scikit-learn.org/stable/_images/grid_search_workflow.png)

In [377]:
import warnings
warnings.filterwarnings('ignore')

In [378]:
import numpy as np
import pandas as pd

import pickle

from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, matthews_corrcoef, precision_recall_curve, auc

from keras.utils import np_utils

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import SGD

import skorch
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
from skorch.callbacks import TensorBoard
from skorch.helper import predefined_split

In [379]:
# import configurations (file paths, etc.)
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
configFile = '../cluster/data/medinfmk/ddi/config/config.yml'

with open(configFile, 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=Loader)

In [380]:
pathInput = cfg['filePaths']['dirRaw']
pathOutput = cfg['filePaths']['dirProcessed']
# path to store python binary files (pickles)
# in order not to recalculate them every time
pathPickles = cfg['filePaths']['dirProcessedFiles']['dirPickles']
pathRuns = cfg['filePaths']['dirProcessedFiles']['dirRuns']
pathPaperScores = cfg['filePaths']['dirRawFiles']['paper-individual-metrics-scores']
datasetDirs = cfg['filePaths']['dirRawDatasets']
DS1_path = str(datasetDirs[0])

# Helper Functions

In [381]:
def prepare_data(input_fea, input_lab, seperate=False):
    offside_sim_path = input_fea
    drug_interaction_matrix_path = input_lab
    drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=",")
    interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=",")
    
    train = []
    label = []
    tmp_fea=[]
    drug_fea_tmp = []
            
    for i in range(0, (interaction.shape[0]-1)):
        for j in range((i+1), interaction.shape[1]):
            label.append(interaction[i,j])
            drug_fea_tmp_1 = list(drug_fea[i])
            drug_fea_tmp_2 = list(drug_fea[j])
            if seperate:
                tmp_fea = (drug_fea_tmp_1,drug_fea_tmp_2)
            else:
                # list of (1x548x2) matrices for the CNN input
                tmp_fea = np.column_stack((np.array(drug_fea_tmp_1), np.array(drug_fea_tmp_2)))
                # add 3rd dimension of Conv2D input
                tmp_fea = np.expand_dims(tmp_fea, axis=0)
            train.append(tmp_fea)
            
    return np.asarray(train), np.array(label)

In [382]:
def transfer_array_format(data):
    formated_matrix1 = []
    formated_matrix2 = []
    for val in data:
        formated_matrix1.append(val[0])
        formated_matrix2.append(val[1])
    return np.array(formated_matrix1), np.array(formated_matrix2)

In [383]:
def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder

In [384]:
def preprocess_names(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    if categorical:
        labels = np_utils.to_categorical(labels)
    return labels, encoder

In [385]:
def getStratifiedKFoldSplit(X,y,n_splits):
    skf = StratifiedKFold(n_splits=n_splits, random_state=42)
    return skf.split(X,y)

In [386]:
# class NDD(nn.Module):
#     def __init__(self, D_in=1096, H1=300, H2=400, D_out=2, drop=0.5):
#         super(NDD, self).__init__()
#         # an affine operation: y = Wx + b
#         self.fc1 = nn.Linear(D_in, H1) # Fully Connected
#         self.fc2 = nn.Linear(H1, H2)
#         self.fc3 = nn.Linear(H2, D_out)
#         self.drop = nn.Dropout(drop)
#         self._init_weights()

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = self.drop(x)
#         x = F.relu(self.fc2(x))
#         x = self.drop(x)
#         x = self.fc3(x)
#         return x
    
#     def _init_weights(self):
#         for m in self.modules():
#             if(isinstance(m, nn.Linear)):
#                 m.weight.data.normal_(0, 0.05)
#                 m.bias.data.uniform_(-1,0)

In [387]:
class NDD(nn.Module):
    def __init__(self, D_in=8736, H1=256, H2=64, D_out=2, drop=0.5):
        super(NDD, self).__init__()
#         torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, 
#                 dilation=1, groups=1, bias=True, padding_mode='zeros')
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=32, kernel_size=2, stride=1, padding=0)
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(D_in, H1) # Fully Connected
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        self.drop = nn.Dropout(drop)
#         self._init_weights()

    def forward(self, x):
        #Computes the activation of the first convolution
        #Size changes from (1, 548, 2) to (32, 546, 1)
        x = F.relu(self.conv1(x))
        
        #Size changes from (32, 546, 1) to (32, 273, 1)
        x = self.pool(x)
        
        #Reshape data to input to the input layer of the neural net
        #Size changes from (32, 273, 1) to (1, 8736)
        #Recall that the -1 infers this dimension from the other given dimension
        x = x.view(-1, 32 * 273)
        
        #Computes the activation of the first fully connected layer
        #Size changes from (1, 8736) to (1, 256)
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        
        #Computes the activation of the second fully connected layer
        #Size changes from (1, 256) to (1, 64)
        x = F.relu(self.fc2(x))
        x = self.drop(x)
        
        
        x = self.fc3(x)
        return x
    
    def _init_weights(self):
        for m in self.modules():
            if(isinstance(m, nn.Linear)):
                m.weight.data.normal_(0, 0.05)
                m.bias.data.uniform_(-1,0)

In [388]:
def updateSimilarityDFSingleMetric(df, sim_type, metric, value):
    df.loc[df['Similarity'] == sim_type, metric ] = round(value,3)
    return df

In [389]:
def updateSimilarityDF(df, sim_type, AUROC, AUPR, F1, Rec, Prec):
    df = updateSimilarityDFSingleMetric(df, sim_type, 'AUC', AUROC)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'AUPR', AUPR)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'F-measure', F1)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'Recall', Rec)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'Precision', Prec)
    return df

In [390]:
def getNetParamsStr(net, str_hidden_layers_params, net_params_to_print=["max_epochs", "batch_size"]):
    net_params = [val for sublist in [[x,net.get_params()[x]] for x in net_params_to_print] for val in sublist]
    net_params_str = '-'.join(map(str, net_params))
    return(net_params_str+str_hidden_layers_params)

In [391]:
def writeReplicatedIndividualScoresCSV(net, df, destination, str_hidden_layers_params):
    filePath = destination + "replicatedIndividualScores_" + getNetParamsStr(net, str_hidden_layers_params) + ".csv"
    df.to_csv(path_or_buf = filePath, index=False)

In [392]:
def getNDDClassifier(Xy_test, D_in=8736, H1=256, H2=64, D_out=2, drop=0.5):
    model = NDD(D_in, H1, H2, D_out, drop)
    
    net = NeuralNetClassifier(
        model,
        criterion=nn.CrossEntropyLoss,
        max_epochs=20,
        optimizer=SGD,
        optimizer__lr=0.01,
        optimizer__momentum=0.9,    
        optimizer__weight_decay=1e-6,    
        optimizer__nesterov=True,    
        batch_size=200,
        callbacks=callbacks,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        device=device,
        train_split=predefined_split(Xy_test),
    )
    return net

In [393]:
def avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits):
    AUROC /= kfold_nsplits
    AUPR /= kfold_nsplits
    F1 /= kfold_nsplits
    Rec /= kfold_nsplits
    Prec /= kfold_nsplits
    return AUROC, AUPR, F1, Rec, Prec

# Run

In [394]:
df_paperIndividualScores = pd.read_csv(pathPaperScores)

df_replicatedIndividualScores = df_paperIndividualScores.copy()
# Copy scores table and set them to 0
for col in df_replicatedIndividualScores.columns:
    if col != 'Similarity':
        df_replicatedIndividualScores[col].values[:] = 0

In [395]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
soft = nn.Softmax(dim=1)

In [396]:
do_prepare_data = True
do_train_model = True
kfold_nsplits = 5
# similaritiesToRun = df_paperIndividualScores['Similarity']
similaritiesToRun = ["sideeffect"]

for similarity in similaritiesToRun:
    input_fea = pathInput+DS1_path+"/" + similarity + "_Jacarrd_sim.csv"
    input_lab = pathInput+DS1_path+"/drug_drug_matrix.csv"
    dataPicklePath = pathPickles+"data_X_y_" + similarity + "_Jaccard_CNN.p"
    
    # Prepare data if not available
    if do_prepare_data:
        X,y = prepare_data(input_fea, input_lab, seperate = False)

        with open(dataPicklePath, 'wb') as f:
            pickle.dump([X, y], f)

    # Load X,y and split in to train, test
    with open(dataPicklePath, 'rb') as f:
        X, y = pickle.load(f)
    
    X = X.astype(np.float32)
    y = y.astype(np.int64)
    
    print(X.shape, y.shape)
    
    # Define model
#     D_in, H1, H2, D_out, drop = X.shape[1], 300, 400, 2, 0.5
#     str_hidden_layers_params = "-H1-" + str(H1) + "-H2-" + str(H2)
    str_hidden_layers_params = "-CNN-8736-256-64"

    callbacks = []
    
    AUROC, AUPR, F1, Rec, Prec = 0,0,0,0,0
    kFoldSplit = getStratifiedKFoldSplit(X,y,n_splits=kfold_nsplits)
    for i, indices in enumerate(kFoldSplit):
        train_index = indices[0]
        test_index = indices[1]
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
        # Create Network Classifier
        Xy_test = skorch.dataset.Dataset(X_test, y_test)
#         print(Xy_test.shape)
#         net = getNDDClassifier(Xy_test, D_in, H1, H2, D_out, drop)
        net = getNDDClassifier(Xy_test=Xy_test)        

        
        # Fit and save OR load model
        modelPicklePath = pathPickles+"model_params/model_params_fold" + str(i) + "_" + str_hidden_layers_params+ "_" + similarity + ".p"
        if do_train_model:
            net.fit(X_train, y_train)
            net.save_params(f_params=modelPicklePath)
        else:
            net.initialize()  # This is important!
            net.load_params(f_params=modelPicklePath)

        # Make predictions
        y_pred = net.predict(X_test)
        lr_probs = soft(net.forward(X_test))[:,1]
        lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)

        AUROC += roc_auc_score(y_test, y_pred)
        AUPR += auc(lr_recall, lr_precision)
        F1 += f1_score(y_test, y_pred)
        Rec += recall_score(y_test, y_pred)
        Prec += precision_score(y_test, y_pred)
        
        print(i, similarity, AUROC, AUPR, F1, Rec, Prec)
        
    
    AUROC, AUPR, F1, Rec, Prec = avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits)
    print(similarity, AUROC, AUPR, F1, Rec, Prec)
    
    # Fill replicated metrics
    updateSimilarityDF(df_replicatedIndividualScores, similarity, AUROC, AUPR, F1, Rec, Prec)
    
# Write CSV
writeReplicatedIndividualScoresCSV(net, df_replicatedIndividualScores, pathRuns, str_hidden_layers_params)

(149878, 1, 548, 2) (149878,)


RuntimeError: Given input size: (32x547x1). Calculated output size: (32x273x0). Output size is too small

# Compare to Paper

In [None]:
print(df_paperIndividualScores)

In [None]:
print(df_replicatedIndividualScores)

In [None]:
diff_metrics = ['AUC', 'AUPR', 'F-measure', 'Recall', 'Precision']
df_diff = df_paperIndividualScores[diff_metrics] - df_replicatedIndividualScores[diff_metrics]
df_diff_abs = df_diff.abs()
df_diff_percent = (df_diff_abs / df_paperIndividualScores[diff_metrics]) * 100

In [None]:
df_diff

In [None]:
from seaborn import heatmap
heatmap(df_diff, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
heatmap(df_diff_abs, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
heatmap(df_diff_percent, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(df_paperIndividualScores[diff_metrics],
                   df_replicatedIndividualScores[diff_metrics])