# In [0]:
# -*- ChAIkeras.py -*-
"""
Created Oct 2019

author: Timothy E H Allen
"""
#%%

# Import the usual suspects

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow import keras
from tensorflow.keras import layers
from keras import regularizers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.utils import class_weight
import random

# DEFINE INPUTS FOR MODEL TRAINING
#
#   receptor               biological target
#   input_data_a .. _e     clustered datasets, one CSV file per fold (a to e)
#   rng_1, rng_2           random numbers for dataset shuffle and
#                          train/validation split
#   validation_proportion  fraction of data to be used as validation set
#   beta                   l2 regularisation rate
#   neurons                neurons per hidden layer
#   hidden_layers          number of hidden layers, must be 1, 2 or 3
#   LR                     learning rate
#   epochs                 number of training iterations
#   model_path             location to save model file post training

receptor = "AR"

# The five fold files sit in the same directory and differ only in the fold
# letter, so the paths are generated from one shared prefix.
_clustered_data_dir = "/content/drive/My Drive/Andy_data/clustered_data/"
(input_data_a,
 input_data_b,
 input_data_c,
 input_data_d,
 input_data_e) = [
    _clustered_data_dir + receptor + " fold_" + fold + " fingerprint ECFP4 10000.csv"
    for fold in "abcde"
]

# Seeds are drawn in the order rng_1, rng_2.
rng_1, rng_2 = random.randrange(1, 1000), random.randrange(1, 1000)

# Data split and network hyper-parameters
validation_proportion = 0.25
beta = 0.1
neurons = 100
hidden_layers = 2
LR = 0.001
epochs = 100

model_path = "/content/drive/My Drive/Andy_data/" + receptor + " model.h5"

print("Welcome to ChAI", "Dataset loading...", sep="\n")

# Reading The Dataset

def read_dataset(input_data, n_features=10000):
    """Load one CSV file and split it into features and encoded labels.

    The first ``n_features`` columns of the file are taken as the feature
    matrix and the column directly after them as the class label.

    Args:
        input_data: Location of the CSV file, passed to ``pandas.read_csv``.
        n_features: Number of leading columns that hold features. Defaults
            to 10000, the value previously hard-coded here.

    Returns:
        A tuple ``(X, Y)``: ``X`` holds the values of the feature columns and
        ``Y`` the labels after encoding with ``LabelEncoder``.

    Raises:
        IndexError: If the file has no column at position ``n_features``.
    """
    df = pd.read_csv(input_data)
    X = df[df.columns[0:n_features]].values
    y = df[df.columns[n_features]]

    # Encode the dependent variable.
    # NOTE(review): the encoder is fitted on this file alone, so the
    # label -> integer mapping only matches between files that contain the
    # same set of labels - confirm every fold contains all classes.
    encoder = LabelEncoder()
    Y = encoder.fit(y).transform(y)

    print("X.shape =", X.shape)
    print("Y.shape =", Y.shape)
    print("y.shape =", y.shape)
    return (X, Y)

# Load the five folds. Fold "a" is kept aside as the test set; folds "b" to
# "e" are pooled and then divided into training and validation data.
Xa, Ya = read_dataset(input_data_a)
Xb, Yb = read_dataset(input_data_b)
Xc, Yc = read_dataset(input_data_c)
Xd, Yd = read_dataset(input_data_d)
Xe, Ye = read_dataset(input_data_e)

X = np.concatenate((Xb, Xc, Xd, Xe))
Y = np.concatenate((Yb, Yc, Yd, Ye))


# Shuffle the pooled dataset (seeded with rng_1)

X, Y = shuffle(X, Y, random_state=rng_1)

# Convert the dataset into train and validation sets (seeded with rng_2)

train_x, valid_x, train_y, valid_y = train_test_split(
    X, Y, test_size=validation_proportion, random_state=rng_2)
test_x = Xa
test_y = Ya

# Inspect the shape of the training and validation data

print("Dimensionality of data:")
print("Train x shape =", train_x.shape)
print("Train y shape =", train_y.shape)
print("Validation x shape =", valid_x.shape)
print("Validation y shape =", valid_y.shape)

# Balanced class weights for the training labels.
# - `classes` and `y` are passed by keyword because newer scikit-learn
#   releases no longer accept them positionally.
# - compute_class_weight returns an array ordered like `classes`, whereas
#   Keras' fit() expects `class_weight` as a dict {class index: weight},
#   so the array is converted before it is handed to the model.
class_labels = np.unique(train_y)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=class_labels,
                                                     y=train_y)
class_weights = {int(label): float(weight)
                 for label, weight in zip(class_labels, balanced_weights)}

#Define the model in keras

print("Constructing model architecture")

# Fail fast on an unsupported depth: previously this case only printed the
# message and the script then crashed with a NameError on `inputs`.
if hidden_layers not in (1, 2, 3):
    raise ValueError("Number of hidden layers outside this model scope, please choose 1, 2 or 3")

# Input layer followed by `hidden_layers` identical ReLU layers named
# dense_1, dense_2, ... and a 2-unit softmax output layer.
inputs = keras.Input(shape=(10000,), name='digits')
x = inputs
for layer_index in range(1, hidden_layers + 1):
    x = layers.Dense(neurons,
                     activation='relu',
                     kernel_initializer='random_uniform',
                     bias_initializer='zeros',
                     # Regulariser taken from tensorflow.keras (the `keras`
                     # name imported from tensorflow) so that every model
                     # component comes from the same Keras implementation.
                     kernel_regularizer=keras.regularizers.l2(beta),
                     name='dense_%d' % layer_index)(x)
outputs = layers.Dense(2, activation='softmax', name='predictions')(x)

model = keras.Model(inputs=inputs, outputs=outputs)

# `learning_rate` is the documented argument name; `lr` is a deprecated alias.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=LR),
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

print('Commencing model training...')

# The validation set is handed to fit() so that validation loss and metrics
# are monitored at the end of each epoch. The returned "history" object
# holds a record of the loss values and metric values during training.
training_options = {
    'batch_size': 128,
    'epochs': epochs,
    'class_weight': class_weights,
    'validation_data': (valid_x, valid_y),
}
history = model.fit(train_x, train_y, **training_options)

# Evaluate the trained model, first on the training data ...
print('\n# Evaluate on training data')
train_results = model.evaluate(train_x, train_y, batch_size=128)
print('train loss, train acc:', train_results)

# ... and then on the validation data
print('\n# Evaluate on validaiton data')
validation_results = model.evaluate(valid_x, valid_y, batch_size=128)
print('validation loss, validation acc:', validation_results)

# Write the trained model to disk
model.save(model_path)
print('Model saved to ' + model_path)

# Predict for every split, in the order validation, training, test
pred_valid_y, pred_train_y, pred_test_y = (
    model.predict(features, verbose=1)
    for features in (valid_x, train_x, test_x)
)

# Plot the training history: one figure for the loss values, then one for
# the accuracy values, each with a training and a validation curve.
for history_key, axis_label in (('loss', 'loss'),
                                ('sparse_categorical_accuracy', 'accuracy')):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title('model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# Define predicted class labels using argmax over the model outputs

pred_train_y_binary, pred_valid_y_binary, pred_test_y_binary = (
    np.argmax(class_scores, axis=1)
    for class_scores in (pred_train_y, pred_valid_y, pred_test_y)
)

# Calculate and display confusion matrices for the training, validation
# and test data, in that order

np.set_printoptions(precision=2)
for split_name, true_labels, predicted_labels in (
        ("Training", train_y, pred_train_y_binary),
        ("Validation", valid_y, pred_valid_y_binary),
        ("Test", test_y, pred_test_y_binary)):
    cm = confusion_matrix(true_labels, predicted_labels)
    print("Confusion matrix (%s), without normalisation" % split_name)
    print(cm)

# Attempt a ROC curve
def plot_roc(pred,y):
    """Draw and show a ROC curve, with the area under it in the legend.

    pred -- scores handed to roc_curve as the predicted values
    y    -- true labels handed to roc_curve

    The figure is displayed with plt.show(); nothing is returned.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(y, pred)
    area_under_curve = auc(false_positive_rate, true_positive_rate)
    curve_label = "ROC curve (area = %0.2f)" % area_under_curve

    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
    # Dashed diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], "k--")

    # Axis ranges
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    # Labelling
    plt.title("Receiver Operating Characteristic (ROC)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")

    plt.show()

# ROC curves for each data split. In every case column 1 of the model
# output is used as the score and the encoded labels as the ground truth.

# First plot Training data ROC
print("Training data ROC Curve")

y_score = np.array(pred_train_y)[:, 1]
y_true = np.array(train_y)
plot_roc(y_score, y_true)

# Then plot Validation data ROC
print("Validation data ROC Curve")

y_score_2 = np.array(pred_valid_y)[:, 1]
y_true_2 = np.array(valid_y)
plot_roc(y_score_2, y_true_2)

# Finally plot Test data ROC
print("Test data ROC Curve")

y_score_3 = np.array(pred_test_y)[:, 1]
y_true_3 = np.array(test_y)
plot_roc(y_score_3, y_true_3)

# End the cycle: reset the default TensorFlow graph. One call is enough;
# the second, copy-pasted call was redundant.
tf.reset_default_graph()


print("END")

