![](https://scikit-learn.org/stable/_images/grid_search_workflow.png)

In [1]:
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import pickle

from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, matthews_corrcoef, precision_recall_curve, auc

from keras.utils import np_utils

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import SGD

import skorch
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
from skorch.callbacks import TensorBoard
from skorch.helper import predefined_split

Using TensorFlow backend.


In [3]:
# import configurations (file paths, etc.)
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
configFile = '../cluster/data/medinfmk/ddi/config/config.yml'

with open(configFile, 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=Loader)

In [4]:
pathInput = cfg['filePaths']['dirRaw']
pathOutput = cfg['filePaths']['dirProcessed']
# path to store python binary files (pickles)
# in order not to recalculate them every time
pathPickles = cfg['filePaths']['dirProcessedFiles']['dirPickles']
pathRuns = cfg['filePaths']['dirProcessedFiles']['dirRuns']
pathPaperScores = cfg['filePaths']['dirRawFiles']['paper-individual-metrics-scores']
datasetDirs = cfg['filePaths']['dirRawDatasets']
DS1_path = str(datasetDirs[0])

# Helper Functions

In [5]:
def prepare_data(input_fea, input_lab, seperate=False):
    offside_sim_path = input_fea
    drug_interaction_matrix_path = input_lab
    drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=",")
    interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=",")
    
    train = []
    label = []
    tmp_fea=[]
    drug_fea_tmp = []
            
    for i in range(0, (interaction.shape[0]-1)):
        for j in range((i+1), interaction.shape[1]):
            label.append(interaction[i,j])
            drug_fea_tmp_1 = list(drug_fea[i])
            drug_fea_tmp_2 = list(drug_fea[j])
            if seperate:
                 tmp_fea = (drug_fea_tmp_1,drug_fea_tmp_2)
            else:
                 tmp_fea = drug_fea_tmp_1 + drug_fea_tmp_2
            train.append(tmp_fea)

    return np.array(train), np.array(label)

In [6]:
def transfer_array_format(data):
    formated_matrix1 = []
    formated_matrix2 = []
    for val in data:
        formated_matrix1.append(val[0])
        formated_matrix2.append(val[1])
    return np.array(formated_matrix1), np.array(formated_matrix2)

In [7]:
def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
#         print(y)
    return y, encoder

In [8]:
def preprocess_names(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    if categorical:
        labels = np_utils.to_categorical(labels)
    return labels, encoder

In [9]:
def getStratifiedKFoldSplit(X,y,n_splits):
    skf = StratifiedKFold(n_splits=n_splits, random_state=42)
    return skf.split(X,y)

In [52]:
class NDD(nn.Module):
    def __init__(self, D_in=1096, H1=300, H2=400, D_out=1, drop=0.5):
        super(NDD, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(D_in, H1) # Fully Connected
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        self.drop = nn.Dropout(drop)
        self._init_weights()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = F.relu(self.fc2(x))
        x = self.drop(x)
        x = self.fc3(x)
        return x
    
    def _init_weights(self):
        for m in self.modules():
            if(isinstance(m, nn.Linear)):
                m.weight.data.normal_(0, 0.05)
                m.bias.data.uniform_(-1,0)

In [48]:
ndd = NDD()

In [49]:
ndd(x)

tensor([-0.0055], grad_fn=<AsStridedBackward>)

In [50]:
x = torch.randn(1, 1096)
print(x.size())
y = ndd(x)
print(y.size())
print(y)

torch.Size([1, 1096])
torch.Size([1])
tensor([-0.5538], grad_fn=<AsStridedBackward>)


In [54]:
def updateSimilarityDFSingleMetric(df, sim_type, metric, value):
    df.loc[df['Similarity'] == sim_type, metric ] = round(value,3)
    return df

In [55]:
def updateSimilarityDF(df, sim_type, AUROC, AUPR, F1, Rec, Prec):
    df = updateSimilarityDFSingleMetric(df, sim_type, 'AUC', AUROC)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'AUPR', AUPR)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'F-measure', F1)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'Recall', Rec)
    df = updateSimilarityDFSingleMetric(df, sim_type, 'Precision', Prec)
    return df

In [29]:
def getNetParamsStr(net, str_hidden_layers_params, net_params_to_print=["max_epochs", "batch_size"]):
    net_params = [val for sublist in [[x,net.get_params()[x]] for x in net_params_to_print] for val in sublist]
    net_params_str = '-'.join(map(str, flattened))
    return(net_params_str+str_hidden_layers_params)

In [30]:
def writeReplicatedIndividualScoresCSV(net, df, destination, str_hidden_layers_params):
    filePath = destination + "replicatedIndividualScores_" + getNetParamsStr(net, str_hidden_layers_params) + ".csv"
    df.to_csv(path_or_buf = filePath, index=False)

In [31]:
def getNDDClassifier(D_in, H1, H2, D_out, drop, Xy_test):
    model = NDD(D_in, H1, H2, D_out, drop)
    
    net = NeuralNetClassifier(
        model,
#         criterion=nn.CrossEntropyLoss,
        criterion=nn.BCEWithLogitsLoss,
        max_epochs=20,
        optimizer=SGD,
        optimizer__lr=0.01,
        optimizer__momentum=0.9,    
        optimizer__weight_decay=1e-6,    
        optimizer__nesterov=True,    
        batch_size=200,
        callbacks=callbacks,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        device=device,
        train_split=predefined_split(Xy_test),
    )
    return net

In [32]:
def avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits):
    AUROC /= kfold_nsplits
    AUPR /= kfold_nsplits
    F1 /= kfold_nsplits
    Rec /= kfold_nsplits
    Prec /= kfold_nsplits
    return AUROC, AUPR, F1, Rec, Prec

# Run

In [33]:
df_paperIndividualScores = pd.read_csv(pathPaperScores)

df_replicatedIndividualScores = df_paperIndividualScores.copy()

for col in df_replicatedIndividualScores.columns:
    if col != 'Similarity':
        df_replicatedIndividualScores[col].values[:] = 0

In [34]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
soft = nn.Softmax(dim=1)

In [36]:
pathPickles

'../cluster/data/medinfmk/ddi/processed/pickles/'

In [38]:
similarity = "sideeffect"

input_fea = pathInput+DS1_path+"/" + similarity + "_Jacarrd_sim.csv"
input_lab = pathInput+DS1_path+"/drug_drug_matrix.csv"
dataPicklePath = pathPickles+"data_X_y_" + similarity + "_Jaccard.p"

print("Preparing " + similarity + " data...")
X,y = prepare_data(input_fea, input_lab, seperate = False)

Preparing sideeffect data...


In [41]:
X.shape, X.dtype, y.shape, y.dtype

((149878, 1096), dtype('float64'), (149878,), dtype('int64'))

In [53]:
do_prepare_data = False
do_train_model = True
kfold_nsplits = 5
# similaritiesToRun = df_paperIndividualScores['Similarity']
similaritiesToRun = ["sideeffect"]
sigmoid = torch.nn.Sigmoid()

for similarity in similaritiesToRun:
    input_fea = pathInput+DS1_path+"/" + similarity + "_Jacarrd_sim.csv"
    input_lab = pathInput+DS1_path+"/drug_drug_matrix.csv"
    dataPicklePath = pathPickles+"data_X_y_" + similarity + "_Jaccard.p"

    # Define model
    D_in, H1, H2, D_out, drop = X.shape[1], 300, 400, 1, 0.5
    str_hidden_layers_params = "-H1-" + str(H1) + "-H2-" + str(H2)
    callbacks = []
    
    # Prepare data if not available
    if do_prepare_data:
        print("Preparing " + similarity + " data...")
        X,y = prepare_data(input_fea, input_lab, seperate = False)

        with open(dataPicklePath, 'wb') as f:
            pickle.dump([X, y], f)

    # Load X,y and split in to train, test
    with open(dataPicklePath, 'rb') as f:
        X, y = pickle.load(f)
    

    y = np.reshape(y, (y.shape[0], 1))

    
    X = X.astype(np.float32)
    y = y.astype(np.float32)   

    
#     y_cat = np_utils.to_categorical(y)
    
    AUROC, AUPR, F1, Rec, Prec = 0,0,0,0,0
    kFoldSplit = getStratifiedKFoldSplit(X,y,n_splits=kfold_nsplits)
    for i, indices in enumerate(kFoldSplit):
        print("Running fold" + str(i) + " for " + similarity +"...")
        
        train_index = indices[0]
        test_index = indices[1]
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
#         y_train, y_test = y_cat[train_index], y_cat[test_index]
    
        # Create Network Classifier
        Xy_test = skorch.dataset.Dataset(X_test, y_test)
        net = getNDDClassifier(D_in, H1, H2, D_out, drop, Xy_test)
        
        # Fit and save OR load model
        modelPicklePath = pathPickles+"model_params/model_params_fold" + str(i) + "_" + str_hidden_layers_params+ "_" + similarity + ".p"
        if do_train_model:
            net.fit(X_train, y_train)
            net.save_params(f_params=modelPicklePath)
        else:
            net.initialize()  # This is important!
            net.load_params(f_params=modelPicklePath)

        # Make predictions
        y_pred = net.predict(X_test)
        lr_probs = sigmoid(net.forward(X_test))
        lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)

        AUROC += roc_auc_score(y_test, y_pred)
        AUPR += auc(lr_recall, lr_precision)
        F1 += f1_score(y_test, y_pred)
        Rec += recall_score(y_test, y_pred)
        Prec += precision_score(y_test, y_pred)
        
        print(i, similarity, AUROC, AUPR, F1, Rec, Prec)
        
    
    AUROC, AUPR, F1, Rec, Prec = avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits)
    print(similarity, AUROC, AUPR, F1, Rec, Prec)
    
    # Fill replicated metrics
    updateSimilarityDF(df_replicatedIndividualScores, similarity, AUROC, AUPR, F1, Rec, Prec)
    
# Write CSV
writeReplicatedIndividualScoresCSV(net, df_replicatedIndividualScores, pathRuns, str_hidden_layers_params)

Running fold0 for sideeffect...
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.6005[0m       [32m0.6758[0m        [35m0.6059[0m  2.9418
      2        [36m0.5774[0m       0.6758        [35m0.5969[0m  2.9599
      3        [36m0.5613[0m       0.6758        0.6023  2.9342
      4        [36m0.5426[0m       0.6758        0.6269  2.9451
      5        [36m0.5280[0m       0.6758        [35m0.5921[0m  2.8710
      6        [36m0.5104[0m       0.6758        0.6048  2.9335
      7        [36m0.5033[0m       0.6758        0.6138  2.9124
      8        [36m0.4941[0m       0.6758        [35m0.5819[0m  2.9125
      9        0.4945       0.6758        0.6152  2.9284
     10        [36m0.4941[0m       0.6758        0.5917  2.9108
     11        0.4978       0.6758        [35m0.5592[0m  2.9134
     12        0.5006       0.6758        0.5623  2.9682
     13        0.5026       0.6758 

NameError: name 'updateSimilarityDFSingleMetric' is not defined

In [66]:
np.unique(y_pred, return_counts=True)

(array([0]), array([29975]))

(array([0., 1.], dtype=float32), array([20259,  9716]))

In [56]:
updateSimilarityDF(df_replicatedIndividualScores, similarity, AUROC, AUPR, F1, Rec, Prec)


Unnamed: 0,Similarity,AUC,AUPR,F-measure,Recall,Precision
0,chem,0.0,0.0,0.0,0.0,0.0
1,target,0.0,0.0,0.0,0.0,0.0
2,transporter,0.0,0.0,0.0,0.0,0.0
3,enzyme,0.0,0.0,0.0,0.0,0.0
4,pathway,0.0,0.0,0.0,0.0,0.0
5,indication,0.0,0.0,0.0,0.0,0.0
6,sideeffect,0.5,0.549,0.0,0.0,0.0
7,offsideeffect,0.0,0.0,0.0,0.0,0.0


# Compare to Paper

In [59]:
print(df_paperIndividualScores)

      Similarity    AUC   AUPR  F-measure  Recall  Precision
0           chem  0.631  0.455      0.527   0.899      0.373
1         target  0.787  0.642      0.617   0.721      0.540
2    transporter  0.682  0.568      0.519   0.945      0.358
3         enzyme  0.734  0.599      0.552   0.579      0.529
4        pathway  0.767  0.623      0.587   0.650      0.536
5     indication  0.802  0.654      0.632   0.740      0.551
6     sideeffect  0.778  0.601      0.619   0.748      0.528
7  offsideeffect  0.782  0.606      0.617   0.764      0.517


In [None]:
print(df_replicatedIndividualScores)

In [57]:
diff_metrics = ['AUC', 'AUPR', 'F-measure', 'Recall', 'Precision']
df_diff = df_paperIndividualScores[diff_metrics] - df_replicatedIndividualScores[diff_metrics]
df_diff_abs = df_diff.abs()
df_diff_percent = (df_diff_abs / df_paperIndividualScores[diff_metrics]) * 100

In [58]:
df_diff

Unnamed: 0,AUC,AUPR,F-measure,Recall,Precision
0,0.631,0.455,0.527,0.899,0.373
1,0.787,0.642,0.617,0.721,0.54
2,0.682,0.568,0.519,0.945,0.358
3,0.734,0.599,0.552,0.579,0.529
4,0.767,0.623,0.587,0.65,0.536
5,0.802,0.654,0.632,0.74,0.551
6,0.278,0.052,0.619,0.748,0.528
7,0.782,0.606,0.617,0.764,0.517


In [None]:
from seaborn import heatmap
heatmap(df_diff, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
heatmap(df_diff_abs, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
heatmap(df_diff_percent, yticklabels=df_paperIndividualScores["Similarity"])

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(df_paperIndividualScores[diff_metrics],
                   df_replicatedIndividualScores[diff_metrics])