In [1]:
import csv
import numpy as np
import Loader
import tensorflow as tf
import Transformer
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.losses import MeanSquaredError

# Load the three data splits through the project-local Loader module.
# Each loader call returns five values, unpacked here in the same order for every split.
# NOTE(review): judging by the names these are compound ids, SMILES strings, feature labels,
# the feature matrix and the activity values -- confirm against Loader.
# NOTE(review): defaultValue=0 is presumably the fill value for missing entries -- confirm.
compoundsTrain, smilesTrain, labelsTrain, compoundDataTrain, activitiesTrain = Loader.getTrain(defaultValue=0)
compoundsTest, smilesTest, labelsTest, compoundDataTest, activitiesTest = Loader.getTest(defaultValue=0)
compoundsValidate, smilesValidate, labelsValidate, compoundDataValidate, activitiesValidate = Loader.getValidate(defaultValue=0)

In [5]:

# L1 kernel regularizer (factor 0.001) passed to the Dense layers of the models below.
l1Reg = keras.regularizers.L1(.001)

# Adam optimizer configuration referenced when the models below are compiled.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False
)


def runMinPCA(Xtrain,Ytrain, labelsTrain, compoundDataTest, compoundDataValidate, classVal, epochsCount=3):
    """Train a classifier on a strongly PCA-reduced feature set and return its predictions.

    The features are reduced with Transformer.applyPCA (endDims=[1,1,2,1,2,3,2,4]),
    passed through Transformer.useAverageFD (mean of fusion/docking) and normalised
    with Transformer.normalizeData before a dense network is fitted.

    Parameters:
        Xtrain: training feature matrix.
        Ytrain: integer class targets for Xtrain.
        labelsTrain: feature labels handed to Transformer.applyPCA.
        compoundDataTest: test feature matrix (transformed alongside the others,
            not used for fitting or prediction).
        compoundDataValidate: validation feature matrix.
        classVal: integer class targets for the validation set.
        epochsCount: number of training epochs; defaults to 3, matching the
            sibling run* functions.

    Returns:
        [predictionsTrain, predictionsVal]: predicted class indices (argmax over
        the two output logits) for the training and validation sets.
    """
    # transform the data, use pca and mean of fusion/docking
    labelsPCA, trainPCA, testPCA, valPCA = Transformer.applyPCA(
        labelsTrain, Xtrain, compoundDataTest, compoundDataValidate,
        endDims=[1,1,2,1,2,3,2,4])
    _, trainData = Transformer.useAverageFD(labelsPCA, trainPCA)
    _, testData = Transformer.useAverageFD(labelsPCA, testPCA)
    _, valData = Transformer.useAverageFD(labelsPCA, valPCA)

    # normalize data
    trainData, testData, valData = Transformer.normalizeData(
        trainData, testData, valData, newMean=0, newStd=1)

    inputDim = np.shape(trainData)[1] #how many inputs

    # build and run model
    # The final layer emits raw logits (no activation) because the loss is
    # configured with from_logits=True.
    model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(inputDim, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(80, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(120, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(100, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(80, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(40, activation='relu', kernel_regularizer = l1Reg),
            tf.keras.layers.Dense(2)
        ])

    # Every model gets its own optimizer built from the shared configuration:
    # an optimizer instance keeps per-variable state, so it must not be reused
    # across independently trained models.
    modelOptimizer = type(optimizer).from_config(optimizer.get_config())
    model.compile(optimizer=modelOptimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True), metrics=['accuracy'])

    model.fit(trainData, Ytrain, validation_data = (valData, classVal),
        epochs=epochsCount, batch_size=4, verbose = 0) #verbose = 0 for no printing

    # Predict on the transformed data the model was trained on, not the raw inputs.
    predictionsTrain = np.argmax(model.predict(trainData), axis=1)
    predictionsVal = np.argmax(model.predict(valData), axis=1)
    return [ predictionsTrain, predictionsVal ]


def runPCANN(Xtrain,Ytrain, labelsTrain, compoundDataTest, compoundDataValidate, classVal):
    """Train a dense classifier on PCA-reduced features and return its predictions.

    The features are reduced with Transformer.applyPCA (endDims=[1,1,9,4,6,7,3,16]),
    passed through Transformer.useMaxFD and normalised with Transformer.normalizeData.
    The network is trained for 3 epochs with batch size 4.

    Parameters:
        Xtrain: training feature matrix.
        Ytrain: integer class targets for Xtrain.
        labelsTrain: feature labels handed to Transformer.applyPCA.
        compoundDataTest: test feature matrix (transformed alongside the others,
            not used for fitting or prediction).
        compoundDataValidate: validation feature matrix.
        classVal: integer class targets for the validation set.

    Returns:
        [predictionsTrain, predictionsVal]: predicted class indices (argmax over
        the two output logits) for the training and validation sets.
    """
    labelsPCA, trainPCA, testPCA, valPCA = Transformer.applyPCA(labelsTrain,  Xtrain,
                                                        compoundDataTest, compoundDataValidate,
                                                        endDims=[1,1,9,4,6,7,3,16])

    # NOTE(review): the mean-aggregated variant is computed but never used below.
    # The calls are kept in case Transformer.useAverageFD has side effects --
    # confirm and remove if it is pure.
    labelsMeanPCA, trainMeanPCA = Transformer.useAverageFD(labelsPCA, trainPCA)
    _, testMeanPCA = Transformer.useAverageFD(labelsPCA, testPCA)
    _, valMeanPCA = Transformer.useAverageFD(labelsPCA, valPCA)

    # The max-aggregated variant is the one the model is trained on.
    dataLabels, trainMaxPCA = Transformer.useMaxFD(labelsPCA, trainPCA)
    _, testMaxPCA = Transformer.useMaxFD(labelsPCA, testPCA)
    _, valMaxPCA = Transformer.useMaxFD(labelsPCA, valPCA)

    trainData, _, valData = Transformer.normalizeData(
        trainMaxPCA, testMaxPCA, valMaxPCA, newMean=0, newStd=1)

    # The final layer emits raw logits, matching from_logits=True in the loss.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(len(dataLabels), activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(200, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(300, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(200, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(100, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(50, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(10, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(2)
    ])

    # Every model gets its own optimizer built from the shared configuration:
    # an optimizer instance keeps per-variable state, so it must not be reused
    # across independently trained models.
    modelOptimizer = type(optimizer).from_config(optimizer.get_config())
    model.compile(optimizer=modelOptimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=['accuracy'])

    model.fit(trainData, Ytrain, validation_data = (valData, classVal), epochs=3, batch_size=4, verbose = 0)
    predictionsTrain = np.argmax(model.predict(trainData), axis=1)
    predictionsVal = np.argmax(model.predict(valData), axis=1)
    return [ predictionsTrain, predictionsVal ]



def runBasicNN(Xtrain,Ytrain, labelsTrain, compoundDataTest, compoundDataValidate, classVal):
    """Train a dense classifier on the normalised, un-reduced features.

    The inputs are only normalised with Transformer.normalizeData (no PCA);
    the network is trained for 3 epochs with batch size 4.

    Parameters:
        Xtrain: training feature matrix.
        Ytrain: integer class targets for Xtrain.
        labelsTrain: feature labels; only their count is used, as the width of
            the first Dense layer.
        compoundDataTest: test feature matrix (normalised alongside the others,
            not used for fitting or prediction).
        compoundDataValidate: validation feature matrix.
        classVal: integer class targets for the validation set.

    Returns:
        [predictionsTrain, predictionsVal]: predicted class indices (argmax over
        the two output logits) for the training and validation sets.
    """
    # The final layer emits raw logits, matching from_logits=True in the loss.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(len(labelsTrain), activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(200, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(300, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(200, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(100, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(50, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(10, activation='relu', kernel_regularizer = l1Reg),
        tf.keras.layers.Dense(2)
    ])

    # Every model gets its own optimizer built from the shared configuration:
    # an optimizer instance keeps per-variable state, so it must not be reused
    # across independently trained models.
    modelOptimizer = type(optimizer).from_config(optimizer.get_config())
    model.compile(optimizer=modelOptimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])

    trainData, _, valData = Transformer.normalizeData(
        Xtrain, compoundDataTest, compoundDataValidate, newMean=0, newStd=1)

    model.fit(trainData, Ytrain, validation_data = (valData, classVal), epochs=3, batch_size=4, verbose = 0)
    predictionsTrain = np.argmax(model.predict(trainData), axis=1)
    predictionsVal = np.argmax(model.predict(valData), axis=1)
    return [ predictionsTrain, predictionsVal ]


def averageValues(acc, valAcc, threshold=0.7):
    """Average per-run accuracies for each experiment setting, skipping weak runs.

    A run is kept only when its training accuracy is strictly greater than
    ``threshold``; the matching validation accuracy is kept or dropped with it.

    Parameters:
        acc: sequence of sequences; acc[i][j] is the training accuracy of run j
            in setting i.
        valAcc: same layout as ``acc``, holding validation accuracies.
        threshold: minimum (exclusive) training accuracy for a run to count.
            Defaults to 0.7.

    Returns:
        (accPlot, valAccPlot): two lists with one mean per setting. A setting
        with no qualifying run yields 0.0 in both lists.
    """
    accPlot = []
    valAccPlot = []
    for i, accRuns in enumerate(acc):
        keptRuns = [j for j, runAcc in enumerate(accRuns) if runAcc > threshold]
        # Divide by 1 when nothing qualified so the setting averages to 0.0
        # instead of raising ZeroDivisionError.
        divisor = max(len(keptRuns), 1)
        accPlot.append(sum(accRuns[j] for j in keptRuns) / divisor)
        valAccPlot.append(sum(valAcc[i][j] for j in keptRuns) / divisor)
    return accPlot, valAccPlot

def aggregate(predictions):
    """Majority vote over several equally long 0/1 prediction vectors.

    Returns a boolean array that is True where strictly more than half of the
    given predictions are 1; an exact tie therefore yields False.
    """
    voteTotals = np.sum(np.asarray(predictions, dtype=float), axis=0)
    majority = len(predictions) / 2
    return voteTotals > majority

In [6]:
# Binary class targets for each split.
classTrain = Transformer.toBinaryClassification(activitiesTrain)
classVal = Transformer.toBinaryClassification(activitiesValidate)
classTest = Transformer.toBinaryClassification(activitiesTest)

# Fractions of the training set that are discarded before fitting.
dataDropped = [0,.1,.25,.5,.75,.9]

# One inner list of per-run accuracies is appended per entry of dataDropped.
accBNN = []
valAccBNN = []
accPNN = []
valAccPNN = []
accMin = []
valAccMin = []

accAGG = []
valAccAGG = []
for percent in dataDropped:
    currentBNNAccs = []
    currentBNNValAccs = []
    currentPNNAccs = []
    currentPNNValAccs = []
    currentMinAccs = []
    currentMinValAccs = []

    currentAGGAccs = []
    currentAGGValAccs = []

    for run in range(3):

        if(percent == 0):
            # train_test_split rejects test_size=0, so the full set is used directly.
            Xtrain = compoundDataTrain
            Ytrain = classTrain
        else:
            # Keep a random (1 - percent) share of the training data; the rest is discarded.
            (Xtrain,_,Ytrain,_) = model_selection.train_test_split(compoundDataTrain, classTrain, test_size=percent)

        # Each run* call returns [training predictions, validation predictions].
        histBNN = runBasicNN(Xtrain, Ytrain, labelsTrain, compoundDataTest, compoundDataValidate,classVal)
        currentBNNAccs.append(np.mean(histBNN[0] == Ytrain))
        currentBNNValAccs.append(np.mean(histBNN[1] == classVal))

        histPNN = runPCANN(Xtrain, Ytrain, labelsTrain, compoundDataTest, compoundDataValidate,classVal)
        currentPNNAccs.append(np.mean(histPNN[0] == Ytrain))
        currentPNNValAccs.append(np.mean(histPNN[1] == classVal))

        histMIN = runMinPCA(Xtrain, Ytrain, labelsTrain, compoundDataTest, compoundDataValidate,classVal)
        currentMinAccs.append(np.mean(histMIN[0] == Ytrain))
        currentMinValAccs.append(np.mean(histMIN[1] == classVal))

        # Majority vote of the three models. Validation predictions are scored
        # against the validation targets, not the training targets.
        currentAGGAccs.append(np.mean(aggregate([histBNN[0], histPNN[0], histMIN[0]]) == Ytrain ))
        currentAGGValAccs.append(np.mean(aggregate([histBNN[1], histPNN[1], histMIN[1]]) == classVal ))

    accBNN.append(currentBNNAccs)
    valAccBNN.append(currentBNNValAccs)
    accPNN.append(currentPNNAccs)
    valAccPNN.append(currentPNNValAccs)
    accMin.append(currentMinAccs)
    valAccMin.append(currentMinValAccs)

    accAGG.append(currentAGGAccs)
    valAccAGG.append(currentAGGValAccs)
   

Epoch 1/3
Epoch 2/3
Epoch 3/3
bcut2d retention: [0.99364773]
	total: 99.36477273412314%
chi retention: [0.9541968]
	total: 95.41967968597893%
paoe retention: [0.31495127 0.19509321 0.1390042  0.08838179 0.07155396 0.04590061
 0.03243    0.02673268 0.02402293]
	total: 93.80706737004675%
smr retention: [0.50634726 0.24568468 0.08792319 0.07435827]
	total: 91.43133907840051%
slogp retention: [0.44620276 0.22134465 0.15753302 0.04213752 0.03159298 0.02850562]
	total: 92.73165471968959%
estate_vsa retention: [0.29224011 0.18964809 0.14361318 0.10642656 0.07394495 0.06391846
 0.05513575]
	total: 92.49271047439831%
vsa_estate retention: [0.49255304 0.32866098 0.09719245]
	total: 91.84064669216798%
fr retention: [0.32153498 0.12578177 0.09944384 0.0570485  0.05494644 0.04503917
 0.03646285 0.02896244 0.02597271 0.02289905 0.01957315 0.01726898
 0.01514305 0.01174622 0.01137843 0.00991619]
	total: 90.31177786512335%
Epoch 1/3
Epoch 2/3
Epoch 3/3
1.0
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [None]:


# Plot the averaged training accuracy of each network against the dropped-data fraction.
fig, ax = plt.subplots(figsize=(18,8))
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)

# Collapse the per-run accuracy lists into one mean per dropped fraction and report them.
accPlotBNN, valAccPlotBNN = averageValues(accBNN, valAccBNN)
print("BNN Acc:",accPlotBNN, "\nBNN valAcc:", valAccPlotBNN)

accPlotPNN, valAccPlotPNN = averageValues(accPNN, valAccPNN)
print("PNN Acc:",accPlotPNN, "\nPNN valAcc:", valAccPlotPNN)

# Only the training accuracies are drawn; the validation means are printed above.
for curveValues, curveLabel in (
    (accPlotBNN, "Basic Neural Network"),
    (accPlotPNN, "PCA Neural Network"),
):
    ax.plot(dataDropped, curveValues, label=curveLabel)

ax.legend(loc='right', fontsize=15)
# NOTE(review): the title says "(Non PCA)" although a PCA network curve is plotted -- confirm wording.
ax.set_title("Ein Accuracies at Different Data Percentages (Non PCA)", fontsize=20)
ax.set_xlabel("Dropped Percentage", fontsize=20)
ax.set_ylabel("Accuracy", fontsize=20)
ax.set_facecolor("black")