In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from bayes_opt import BayesianOptimization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,explained_variance_score,mean_squared_error,median_absolute_error
from scipy.stats import pearsonr
import openpyxl
from keras.layers import Input, Dense, Conv1D, Lambda
from keras.models import Model, Sequential
from keras.utils.vis_utils import plot_model
from keras import backend as K

## Pre processing data

In [2]:
def prepTrainSet(filename):
    all_Data = pd.read_excel(filename, engine='openpyxl', index_col=0)
    #print(all_Data.head(2))
    seq_data = []
    str_data=[]
    exp_out = []
    for index,item in all_Data.iterrows():
        seq_values = []
        str_values = []
        for key in['perjxW', 'sum_hydW','sumxxcW', 'walx','wbzd','H','numB','numA']:
            str_values.append(item[key])

        for key in['GA','GC','AA', 'CA','X','perX']:
            seq_values.append(item[key])
        exp_out.append(item['Experimental'])

        seq_data.append(seq_values)
        str_data.append(str_values)
    
    return str_data,seq_data,exp_out

## Hyper parameter tuning Neural Network

In [10]:
# Hyper-parameter tuning attempt
def tuning(filename):
    
    def created_model():
        model = tf.keras.Sequential(layers=[
        tf.keras.layers.Dense(12, input_dim=6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')])
            
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        #history=model.fit(X, np.array(exp_out), verbose=1)
        
        #pred_fin = model.predict(X_train)
        
        #return pred_fin, exp_out
        return model
    
    seed = 7
    np.random.seed(7)
    
    str_train,seq_train,exp_out = prepTrainSet(filename)

    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
    
    model = KerasRegressor(build_fn=created_model, verbose=0)
    
    
   # define the grid search parameters
    batch_size = [10, 20, 40, 60, 80, 100]
    epochs = [10, 50, 100, 200]
    param_grid = dict(batch_size=batch_size, epochs=epochs)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
    grid_result = grid.fit(X, np.array(exp_out))
    
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
    

In [11]:
tuning('../data2/singleDNA.xlsx')

  model = KerasRegressor(build_fn=created_model, verbose=0)
2022-04-23 14:08:09.264063: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-23 14:08:09.264063: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-23 14:08:09.264064: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To ena

Best: 34.958778 using {'batch_size': 40, 'epochs': 200}
-1.397353 (3.218117) with: {'batch_size': 10, 'epochs': 10}
6.088290 (6.931802) with: {'batch_size': 10, 'epochs': 50}
14.875121 (21.124999) with: {'batch_size': 10, 'epochs': 100}
18.059123 (9.915744) with: {'batch_size': 10, 'epochs': 200}
-5.746497 (5.051279) with: {'batch_size': 20, 'epochs': 10}
10.203030 (5.030236) with: {'batch_size': 20, 'epochs': 50}
12.329368 (10.377695) with: {'batch_size': 20, 'epochs': 100}
28.001565 (16.157373) with: {'batch_size': 20, 'epochs': 200}
-7.084646 (18.813093) with: {'batch_size': 40, 'epochs': 10}
9.145712 (6.702134) with: {'batch_size': 40, 'epochs': 50}
2.633755 (4.153235) with: {'batch_size': 40, 'epochs': 100}
34.958778 (37.650106) with: {'batch_size': 40, 'epochs': 200}
8.022015 (5.142830) with: {'batch_size': 60, 'epochs': 10}
5.318695 (2.573435) with: {'batch_size': 60, 'epochs': 50}
1.790025 (4.635187) with: {'batch_size': 60, 'epochs': 100}
23.235147 (5.897667) with: {'batch_siz

## Neural Network

In [21]:
def neuralN(filename):

    str_train,seq_train,exp_out = prepTrainSet(filename)
    
    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
        print(X.shape)
        print(X)
        
        
        model = tf.keras.Sequential(layers=[
            tf.keras.layers.Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'),
            tf.keras.layers.Dense(10, kernel_initializer='normal', activation='relu'),
            tf.keras.layers.Dense(1, kernel_initializer='normal')])
        
        model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
        
        history=model.fit(X, np.array(exp_out), batch_size=5, epochs=20, verbose=1, validation_split=0.2)
        
        pred_fin = model.predict(np.array(X_train))
        
        model.summary()
        
        #print(pred_fin)
        
        #printAllAccMetrics(pred_fin,exp_out)
        
        return pred_fin, exp_out

In [23]:
def printAllAccMetrics(pred_fin,exp_out):
    exp_out = np.array(exp_out)
    exp_out = exp_out.reshape(24,1)
    
    pred_fin = np.squeeze(pred_fin)
    exp_out = np.squeeze(exp_out)
    
    #print(pred_fin.shape)
    #print(exp_out.shape)
    
    print('Mean Absolute Error(MAE):',mean_absolute_error(pred_fin,exp_out))
    corr, corr2 = pearsonr(y_exp,exp_out)
    print("Pearson's correlation coefficient: ",corr, corr2)
    print('R2 value:', r2_score(pred_fin,exp_out))

In [24]:
def writePreds(complex_type,pred_out,exp_out):
    outlist = []
    filename = ''
    for pred,exp in zip(pred_out,exp_out):
        outlist.append([pred,exp])
    if complex_type == 'SS':
        filename = '../data3/NNSS.xlsx'
    elif complex_type == 'MISC':
        filename = '../data3/NNMisc.xlsx'
    elif complex_type == 'DDI':
        filename = '../data3/NNDDI.xlsx'
    elif complex_type == 'DDII':
        filename = '../data3/NNDDII.xlsx'
    else:
        filename = '../data3/NNDDIII.xlsx'
    
    df = pd.DataFrame(outlist,columns=['NN','Expected'])
    df.to_excel(filename,engine='openpyxl')


In [15]:
y_pred, y_exp = neuralN('../data3/singleDNA.xlsx')
printAllAccMetrics(y_pred, y_exp)
writePreds('SS',y_pred, y_exp)

(8, 8)
[[2.17970050e+01 1.57000000e+02 1.40000000e+02 3.93257000e+04
  9.20510000e+03 1.71800000e+03 1.40000000e+01 2.60000000e+01]
 [2.42320819e+01 9.30000000e+01 7.50000000e+01 1.62147900e+04
  5.81933000e+03 8.58000000e+02 1.10000000e+01 1.30000000e+01]
 [2.60000000e+01 6.60000000e+01 4.90000000e+01 3.54277000e+03
  7.94007000e+03 5.56000000e+02 1.20000000e+01 4.00000000e+00]
 [2.48704663e+01 6.20000000e+01 4.00000000e+01 7.55601000e+03
  6.11193000e+03 5.41000000e+02 8.00000000e+00 6.00000000e+00]
 [2.51764706e+01 1.28000000e+02 8.50000000e+01 3.02051100e+04
  6.43357000e+03 1.24100000e+03 9.00000000e+00 2.00000000e+01]
 [2.17391304e+01 5.70000000e+01 5.50000000e+01 1.04765800e+04
  6.29316000e+03 5.74000000e+02 6.00000000e+00 7.00000000e+00]
 [2.59259259e+01 1.33000000e+02 1.29000000e+02 1.70962000e+04
  1.59119900e+04 1.21700000e+03 2.10000000e+01 1.10000000e+01]
 [2.46644295e+01 1.76000000e+02 1.38000000e+02 4.13232500e+04
  1.15666300e+04 1.71100000e+03 2.00000000e+01 2.8000000

In [25]:
y_pred, y_exp = neuralN('../data3/DNAI.xlsx')
printAllAccMetrics(y_pred, y_exp)
writePreds('DDI',y_pred, y_exp)

(24, 8)
[[2.92279412e+01 1.69000000e+02 1.46000000e+02 3.37566300e+04
  9.85635000e+03 1.57200000e+03 1.60000000e+01 1.70000000e+01]
 [2.66888151e+01 2.80000000e+02 2.28000000e+02 5.10655200e+04
  2.20345300e+04 2.58500000e+03 3.30000000e+01 3.10000000e+01]
 [3.05732484e+01 1.06000000e+02 7.50000000e+01 1.93834700e+04
  4.76428000e+03 9.13000000e+02 7.00000000e+00 1.50000000e+01]
 [2.86585366e+01 1.13000000e+02 5.80000000e+01 1.81680100e+04
  6.52749000e+03 9.34000000e+02 1.20000000e+01 1.20000000e+01]
 [2.66666667e+01 5.60000000e+01 3.70000000e+01 7.24589000e+03
  3.46307000e+03 4.62000000e+02 5.00000000e+00 5.00000000e+00]
 [1.48880105e+01 1.69000000e+02 1.78000000e+02 4.26050600e+04
  1.79303200e+04 2.12000000e+03 2.80000000e+01 2.40000000e+01]
 [2.32616941e+01 2.29000000e+02 1.80000000e+02 4.39747200e+04
  1.94032200e+04 2.27300000e+03 2.70000000e+01 3.00000000e+01]
 [2.20338983e+01 4.50000000e+01 4.80000000e+01 1.16649600e+04
  1.86299000e+03 5.17000000e+02 3.00000000e+00 9.000000

In [None]:
y_pred, y_exp = neuralN('../data3/DNAII.xlsx')
printAllAccMetrics(y_pred, y_exp)
writePreds('DDII',y_pred, y_exp)

In [None]:
y_pred, y_exp = neuralN('../data3/DNAIII.xlsx')
printAllAccMetrics(y_pred, y_exp)
writePreds('DDIII',y_pred, y_exp)

In [None]:
y_pred, y_exp = neuralN('../data3/MISC.xlsx')
printAllAccMetrics(y_pred, y_exp)
writePreds('MISC',y_pred, y_exp)

## Siamese Neural Network

In [37]:
#def siamese_single(filename):

str_train,seq_train,exp_out = prepTrainSet('../data2/singleDNA.xlsx')

for X_train in [str_train,seq_train]:
    X = np.array(X_train)

X = tf.reshape(X, [8,6])

#input_shape=(8,6)
               
left_input = Input(X.shape)
right_input = Input(X.shape)

model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=(X.shape)),
    tf.keras.layers.MaxPooling1D(2, 2),
    tf.keras.layers.Conv1D(128, kernel_size=2, activation='relu'),
    tf.keras.layers.MaxPooling1D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2048, activation='sigmoid')
])

encoded_l = model(left_input)
encoded_r = model(right_input)

L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
L1_distance = L1_layer([encoded_l, encoded_r])

#subtracted = tf.keras.layers.Subtract()([encoded_l, encoded_r])
output = Dense(1, activation='sigmoid')(L1_distance)
model = Model(inputs=[left_input, right_input], outputs=output)

optimizer= Adam(learning_rate=0.0006)
model.compile(loss='binary_crossentropy', optimizer=optimizer)

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 8, 6)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 8, 6)]       0           []                               
                                                                                                  
 sequential_7 (Sequential)      (None, 2048)         281920      ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 2048)         0           ['sequential_7[0][0]',       

In [38]:
model.fit([X,np.array(exp_out)], epochs=20)

Epoch 1/20


ValueError: in user code:

    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/vidhi/opt/anaconda3/lib/python3.8/site-packages/keras/engine/input_spec.py", line 263, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 8, 6), found shape=(None, 6)


In [None]:
#siamese_single('../data2/singleDNA.xlsx')

In [None]:
def siamese_dnaI(filename):
    
    str_train,seq_train,exp_out = prepTrainSet(filename)

    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
    X = X.reshape(24,6,1)

    input_shape=(24,6)
    left_input = Input(input_shape)
    right_input = Input(input_shape)


    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu',input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Conv1D(128, kernel_size=2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4096, activation='sigmoid')
    ])

    encoded_l = model(left_input)
    encoded_r = model(right_input)

    subtracted = tf.keras.layers.Subtract()([encoded_l, encoded_r])
    prediction = Dense(1, activation='sigmoid')(subtracted)
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    optimizer= Adam(learning_rate=0.0006)
    siamese_net.compile(loss='binary_crossentropy', optimizer=optimizer)

    siamese_net.summary()

In [None]:
siamese_dnaI('../data2/DNAI.xlsx')

In [None]:
def siamese_dnaII(filename):
    
    str_train,seq_train,exp_out = prepTrainSet(filename)

    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
    X = X.reshape(50,6,1)

    input_shape=(50,6)
    left_input = Input(input_shape)
    right_input = Input(input_shape)


    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu',input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Conv1D(128, kernel_size=2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4096, activation='sigmoid')
    ])

    encoded_l = model(left_input)
    encoded_r = model(right_input)

    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    #subtracted = tf.keras.layers.Subtract()([encoded_l, encoded_r])
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    optimizer= Adam(learning_rate=0.0006)
    siamese_net.compile(loss='binary_crossentropy', optimizer=optimizer)

    siamese_net.summary()

In [None]:
siamese_dnaII('../data2/DNAII.xlsx')

In [None]:
def siamese_dnaIII(filename):
    
    str_train,seq_train,exp_out = prepTrainSet(filename)

    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
    X = X.reshape(27,6,1)

    input_shape=(27,6)
    left_input = Input(input_shape)
    right_input = Input(input_shape)


    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu',input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Conv1D(128, kernel_size=2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4096, activation='sigmoid')
    ])

    encoded_l = model(left_input)
    encoded_r = model(right_input)

    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    #subtracted = tf.keras.layers.Subtract()([encoded_l, encoded_r])
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    optimizer= Adam(learning_rate=0.0006)
    siamese_net.compile(loss='binary_crossentropy', optimizer=optimizer)

    siamese_net.summary()

In [None]:
siamese_dnaIII('../data2/DNAIII.xlsx')

In [None]:
def siamese_misc(filename):
    
    str_train,seq_train,exp_out = prepTrainSet(filename)

    for X_train in [str_train,seq_train]:
        X = np.array(X_train)
    X = X.reshape(8,6,1)

    input_shape=(8,6)
    left_input = Input(input_shape)
    right_input = Input(input_shape)


    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu',input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Conv1D(128, kernel_size=2, activation='relu'),
        tf.keras.layers.MaxPooling1D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4096, activation='sigmoid')
    ])

    encoded_l = model(left_input)
    encoded_r = model(right_input)

    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    #subtracted = tf.keras.layers.Subtract()([encoded_l, encoded_r])
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    optimizer= Adam(learning_rate=0.0006)
    siamese_net.compile(loss='binary_crossentropy', optimizer=optimizer)

    siamese_net.summary()

In [None]:
siamese_misc('../data2/MISC.xlsx')