In [33]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from math import floor

tf.random.set_seed(1)

In [46]:
def r_squared(y_true,y_pred):
    rss = tf.reduce_sum(tf.square(y_true - y_pred))
    tss = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true))) 
    r_2 = 1 - (rss/tss)
    return r_2

def get_valid_sample(X_train, y_train):
    split = floor(X_train.shape[0] * 0.80)
    train_feat = X_train[:split,:]
    valid_feat = X_train[split:,:]
    train_target = y_train[:split]
    valid_target = y_train[split:]
    
    return train_feat, train_target, valid_feat, valid_target

def build_and_compile_model(norm): 
    model = keras.Sequential([norm, layers.Dense(25, activation = 'relu'), layers.Dense(25, activation = 'relu'),
                              layers.Dropout(0.5), layers.Dense(1)])
    model.compile(loss = 'mean_absolute_error', metrics = [r_squared], optimizer = keras.optimizers.Adam(0.001))

    return model

def plot_loss(history, file_name, i):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.title(f'loss_set{i}')
    plt.xlabel('Epoch')
    plt.ylabel('Error pIC50')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'loss_{file_name}.png')
    plt.clf()

def plot_metrics(history, file_name, i):
    plt.plot(history.history['r_squared'], label='r2')
    plt.plot(history.history['val_r_squared'], label='val_r2')
    plt.title(f'r2_set{i}')
    plt.xlabel('Epoch')
    plt.ylabel('r2_pIC50')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'r2_{file_name}.png')
    plt.clf()
  

In [47]:
test_set = 3
val_dict = {}
trn_dict = {}
test_dict = {}
for i in range(test_set):
    X_train = np.load(f'EGFR_Chembl_X_train_{i}.npy')
    y_train = np.load(f'EGFR_Chembl_y_train_{i}.npy')
    X_test = np.load(f'EGFR_Chembl_X_test_{i}.npy')
    y_test = np.load(f'EGFR_Chembl_y_test_{i}.npy')

    X_train = X_train[:,:-1]
    X_test = X_test[:,:-1]
    
    train_feat, train_target, valid_feat, valid_target = get_valid_sample(X_train,y_train)
    normalizer = keras.layers.Normalization(axis = -1)
    normalizer.adapt(train_feat)
    qsar_model = build_and_compile_model(normalizer)
    train_feat = tf.data.Dataset.from_tensor_slices(train_feat)
    train_target = tf.data.Dataset.from_tensor_slices(train_target)
    valid_feat = tf.data.Dataset.from_tensor_slices(valid_feat)
    valid_target = tf.data.Dataset.from_tensor_slices(valid_target)
    test_feat = tf.data.Dataset.from_tensor_slices(X_test)
    test_target = tf.data.Dataset.from_tensor_slices(y_test)
    train_ds = tf.data.Dataset.zip((train_feat,train_target))
    valid_ds = tf.data.Dataset.zip((valid_feat,valid_target))    
    test_ds = tf.data.Dataset.zip((test_feat,test_target))
    
    history = qsar_model.fit(train_ds, epochs = 100, verbose = 0, batch_size = 32, validation_data = valid_ds)
    name = f'set_{i}'
    plot_loss(history, name, i)
    plot_metrics(history, name, i)
    val_dict[f'{name}_loss'] = history.history['val_loss']
    trn_dict[f'{name}_loss'] = history.history['loss']
    val_dict[f'{name}_r2'] = history.history['val_r_squared']
    trn_dict[f'{name}_r2'] = history.history['r_squared']
    scores = qsar_model.evaluate(test_ds.batch(1), verbose = 0)
    test_dict[name] = dict(zip(qsar_model.metrics_names, scores))

    qsar_model.save(f'qsar_regression_{name}.h5', overwrite = True, include_optimizer = True, save_format = 'h5')
    tf.keras.backend.clear_session()


<Figure size 640x480 with 0 Axes>

In [36]:
tf.keras.backend.clear_session()

In [48]:
val_dict

{'set_0_loss': [1.211976408958435,
  1.1154204607009888,
  1.0746577978134155,
  1.0623873472213745,
  1.0570858716964722,
  1.0523754358291626,
  1.0276284217834473,
  0.9881190657615662,
  0.9836685061454773,
  0.9935552477836609,
  0.9779582023620605,
  0.984666109085083,
  0.9705508947372437,
  0.9763493537902832,
  0.9562316536903381,
  0.9609781503677368,
  0.9558896422386169,
  0.9768694043159485,
  0.9588438272476196,
  0.9503061175346375,
  0.9454380869865417,
  0.948413610458374,
  0.9410049915313721,
  0.9609151482582092,
  0.9443671703338623,
  0.9527363777160645,
  0.9440712928771973,
  0.9369041323661804,
  0.934604287147522,
  0.9476453065872192,
  0.9472845196723938,
  0.941111147403717,
  0.9226370453834534,
  0.938842236995697,
  0.9361026883125305,
  0.9321692585945129,
  0.9349587559700012,
  0.9307971596717834,
  0.9438202381134033,
  0.9468563199043274,
  0.9470325112342834,
  0.9348472952842712,
  0.9392264485359192,
  0.960748016834259,
  0.9524771571159363,
  0

In [44]:
trn_dict

{'set_0_loss': [1.7410516738891602,
  1.3982230424880981,
  1.3224350214004517,
  1.2627803087234497,
  1.213301658630371,
  1.176038146018982,
  1.159254550933838,
  1.1268723011016846,
  1.070223093032837,
  1.045920729637146,
  1.0175594091415405,
  1.0254498720169067,
  1.0065726041793823,
  0.9786569476127625,
  0.9660206437110901,
  0.9453806281089783,
  0.9342862963676453,
  0.9300321936607361,
  0.9162766933441162,
  0.9119118452072144,
  0.8912145495414734,
  0.8982961773872375,
  0.8771365880966187,
  0.8751102685928345,
  0.8755108118057251,
  0.8661985993385315,
  0.8616637587547302,
  0.8527924418449402,
  0.8513721823692322,
  0.8475378751754761,
  0.8455312848091125,
  0.8436403274536133,
  0.8402904868125916,
  0.8315609097480774,
  0.8343095779418945,
  0.8300461173057556,
  0.8293368220329285,
  0.8208667635917664,
  0.8185117840766907,
  0.8165185451507568,
  0.8094125986099243,
  0.8199524879455566,
  0.8123093247413635,
  0.8127266764640808,
  0.8091090321540833,
 

In [39]:
test_dict

{'set_0': {'loss': 0.9556497931480408, 'r_squared': -inf},
 'set_1': {'loss': 0.8666932582855225, 'r_squared': -inf},
 'set_2': {'loss': 0.8699333667755127, 'r_squared': -inf}}

In [49]:
import pandas as pd

In [56]:
val_df =pd.DataFrame(val_dict)
val_df.head(5)

Unnamed: 0,set_0_loss,set_0_r2,set_1_loss,set_1_r2,set_2_loss,set_2_r2
0,1.211976,-inf,1.233141,-inf,1.20841,-inf
1,1.11542,-inf,1.118449,-inf,1.166657,-inf
2,1.074658,-inf,1.03531,-inf,1.071673,-inf
3,1.062387,-inf,1.031782,-inf,1.05087,-inf
4,1.057086,-inf,1.043604,-inf,1.035593,-inf


In [52]:
trn_df = pd.DataFrame(trn_dict)
trn_df.head(5)

Unnamed: 0,set_0_loss,set_0_r2,set_1_loss,set_1_r2,set_2_loss,set_2_r2
0,2.12587,-inf,2.048622,-inf,2.146338,-inf
1,1.58571,-inf,1.57661,-inf,1.614256,-inf
2,1.449605,-inf,1.463446,-inf,1.47626,-inf
3,1.360585,-inf,1.377032,-inf,1.378465,-inf
4,1.278966,-inf,1.28368,-inf,1.283819,-inf


In [55]:
test_df = pd.DataFrame(test_dict)
test_df.head(5)

Unnamed: 0,set_0,set_1,set_2
loss,0.951887,0.914784,0.880484
r_squared,-inf,-inf,-inf


In [57]:
val_df.to_csv('validation_df.csv', index = False)
trn_df.to_csv('training_df.csv', index = False)
test_df.to_csv('test_df.csv')