In [1]:
import os
import time
import pandas as pd
import itertools
import numpy as np
import tensorflow as tf
import gc
from random import *

from scipy import stats
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.utils import shuffle

import keras
from keras import backend as K
from keras import models, layers, regularizers
from keras.layers import Dense, Conv2D, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.utils import multi_gpu_model
from keras.backend.tensorflow_backend import set_session
from keras.backend.tensorflow_backend import clear_session
from keras.backend.tensorflow_backend import get_session
from keras import initializers
import tensorflow

os.environ["CUDA_VISIBLE_DEVICES"]="0"

#import tensorflow as tf
#from keras import backend as K
#config = tf.ConfigProto()
#config.gpu_options.allow_growth=True
#sess = tf.Session(config=config)
#K.set_session(sess)

Using TensorFlow backend.


In [2]:
def reset_weights(model):
    session = K.get_session()
    for layer in model.layers: 
        if isinstance(layer, keras.engine.network.Network):
            reset_weights(layer)
            continue
        for v in layer.__dict__.values():
            if hasattr(v, 'initializer'):
                v.initializer.run(session=session)

# Reset Keras Session
def reset_keras():
    sess = get_session()
    clear_session()
    sess.close()
    sess = get_session()

    try:
        del classifier # this is from global space - change this as you need
    except:
        pass

    print(gc.collect()) # if it's done something you should see a number being outputted

    # use the same config as you used to create the session
    config = tensorflow.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1
    config.gpu_options.visible_device_list = "0"
    set_session(tensorflow.Session(config=config))

reset_keras()

0


In [3]:
class MDframes_DL:
    def __init__(self):
        self.MDQSAR_directory = "G:/My Drive/NCSU/DF/P5/MDQSAR_BS_LG/"
        self.retrieve_data_labels()
        self.retrieve_cpd_filenames()
        
    def retrieve_data_labels(self):
        '''Retrieve the names of compounds along with binding affinities and save them in dictionary.'''
        cpd_affmanager = open("G:/My Drive/NCSU/DF/P5/Previous Attempts/DL_1/DeepLearning/aff.csv",'r')
        cpd_affmanager.readline()
        self.cpd_data_labels = {i[0]:float(i[1]) for i in (x.strip().split(',') for x in cpd_affmanager)} # make dictionary of compound IDs (string) matched with binding affinities (float)
        
    def retrieve_cpd_filenames(self):
        '''Retrieve the names of all cpds files in the folder and shuffle the list.'''
        
        self.cpd_filenames = os.listdir(self.MDQSAR_directory) # contains the names of all compounds in the folder
        shuffle(self.cpd_filenames)
        
    def BatchGenerator(self,file_names):
        "Based on files in path and generate data and labels batch by batch."
        initial = 0
        batch_size = 1
        while initial < len(file_names):
            pre_data = []
            labels = []
            for i in range(batch_size):
                if initial < len(file_names):
                    if os.path.isfile(self.MDQSAR_directory+file_names[initial]): 
                        each_cpd = pd.read_csv(self.MDQSAR_directory+file_names[initial]).astype(float)
                        df = each_cpd.fillna(0)
                        scaler = MinMaxScaler()
                        each_cpd_scaled = scaler.fit_transform(df)
                        pre_data.append(each_cpd_scaled)
                        labels.append(self.cpd_data_labels[file_names[initial][:-4]])
                        initial += 1
            pre_data = np.nan_to_num(np.array(pre_data))
            shaped_data = pre_data.reshape((pre_data.shape[0], pre_data.shape[1] * pre_data.shape[2]))
            scaler = MinMaxScaler(feature_range=(0, 1))
            final_data = scaler.fit_transform(shaped_data)
            yield final_data, labels
      
    def read_data(self, files):
        pre_data = []
        labels = []
        for i in files:
            each_cpd = pd.read_csv(self.MDQSAR_directory+i,engine='python').astype(float)
            df = each_cpd.fillna(0)
            scaler = MinMaxScaler()
            each_cpd_scaled = scaler.fit_transform(df)
            pre_data.append(each_cpd_scaled)
            labels.append(self.cpd_data_labels[i[:-4]])
        pre_data = np.nan_to_num(np.array(pre_data))
        shaped_data = pre_data.reshape((pre_data.shape[0], pre_data.shape[1] * pre_data.shape[2]))
        scaler = MinMaxScaler(feature_range=(0, 1))
        data = scaler.fit_transform(shaped_data)
        return data, labels

    def split_train_test(self):
        '''Split the list of compound names. train_data has the first 800 rows, and the test the rest. Returns the list of train cpd names and test cpd names.'''
        train_data_file_names = self.cpd_filenames[:800]
        test_data_file_names = self.cpd_filenames[800:]
        return train_data_file_names, test_data_file_names
    
    def split_partitions(self, filenames, folds):
        num_val_samples = len(filenames) // folds+1
        one_fold = []
        nine_folds = []
        for i in range(folds):
            one_fold_data = filenames[i * num_val_samples: (i + 1) * num_val_samples] # prepares the validation data: data from partition # k
            one_fold.append(one_fold_data)
            
            nine_fold_data = np.concatenate([filenames[:i * num_val_samples],filenames[(i + 1) * num_val_samples:]],axis=0).tolist() # prepares the training data: data from all other partitions
            nine_folds.append(nine_fold_data)
        return one_fold, nine_folds 
      

In [4]:
directory = "G:/My Drive/NCSU/DF/P5/DLNN_nestedCV/MD_Ki/"

sample = MDframes_DL()

train_data_files, test_data_files = ['CHEMBL3689632.csv', 'CHEMBL3685032.csv', 'CHEMBL473102.csv', 'CHEMBL3689561.csv', 'CHEMBL3689680.csv', 'CHEMBL3689568.csv', 'CHEMBL3689635.csv', 'CHEMBL3689612.csv', 'CHEMBL3689697.csv', 'CHEMBL3685194.csv', 'CHEMBL3689578.csv', 'CHEMBL403731.csv', 'CHEMBL3689598.csv', 'CHEMBL3689682.csv', 'CHEMBL241749.csv', 'CHEMBL3689580.csv', 'CHEMBL3689617.csv', 'CHEMBL3689634.csv', 'CHEMBL238561.csv', 'CHEMBL3689739.csv', 'CHEMBL2397805.csv', 'CHEMBL443929.csv', 'CHEMBL3689648.csv', 'CHEMBL401447.csv', 'CHEMBL3685013.csv', 'CHEMBL1785020.csv', 'CHEMBL238774.csv', 'CHEMBL3689677.csv', 'CHEMBL408966.csv', 'CHEMBL3685178.csv', 'CHEMBL1775035.csv', 'CHEMBL3685051.csv', 'CHEMBL2030556.csv', 'CHEMBL3685107.csv', 'CHEMBL3685138.csv', 'CHEMBL3689636.csv', 'CHEMBL3689551.csv', 'CHEMBL3685163.csv', 'CHEMBL3394092.csv', 'CHEMBL3689619.csv', 'CHEMBL438610.csv', 'CHEMBL3685078.csv', 'CHEMBL3685155.csv', 'CHEMBL3689695.csv', 'CHEMBL3685004.csv', 'CHEMBL3685098.csv', 'CHEMBL3689611.csv', 'CHEMBL3685135.csv', 'CHEMBL3689583.csv', 'CHEMBL3617738.csv', 'CHEMBL3685006.csv', 'CHEMBL3689637.csv', 'CHEMBL3685036.csv', 'CHEMBL3689729.csv', 'CHEMBL454120.csv', 'CHEMBL3689593.csv', 'CHEMBL235756.csv', 'CHEMBL3689642.csv', 'CHEMBL3685024.csv', 'CHEMBL3689620.csv', 'CHEMBL3685146.csv', 'CHEMBL376993.csv', 'CHEMBL3685173.csv', 'CHEMBL3689665.csv', 'CHEMBL3689685.csv', 'CHEMBL3689653.csv', 'CHEMBL255522.csv', 'CHEMBL3639885.csv', 'CHEMBL685.csv', 'CHEMBL3685027.csv', 'CHEMBL402568.csv', 'CHEMBL3685167.csv', 'CHEMBL83847.csv', 'CHEMBL507594.csv', 'CHEMBL3689651.csv', 'CHEMBL270452.csv', 'CHEMBL210766.csv', 'CHEMBL3685075.csv', 'CHEMBL255324.csv', 'CHEMBL3685070.csv', 'CHEMBL3689623.csv', 'CHEMBL235758.csv', 'CHEMBL3689725.csv', 'CHEMBL3689691.csv', 'CHEMBL3685079.csv', 'CHEMBL255368.csv', 'CHEMBL551542.csv', 'CHEMBL2397818.csv', 'CHEMBL3689640.csv', 'CHEMBL3639840.csv', 'CHEMBL3685088.csv', 'CHEMBL3689566.csv', 'CHEMBL3689599.csv', 'CHEMBL3689658.csv', 'CHEMBL3689698.csv', 'CHEMBL3639884.csv', 'CHEMBL3394091.csv', 'CHEMBL1241489.csv', 'CHEMBL473727.csv', 'CHEMBL3689543.csv', 'CHEMBL1836677.csv', 'CHEMBL1836680.csv', 'CHEMBL3689712.csv', 'CHEMBL3689602.csv', 'CHEMBL3689606.csv', 'CHEMBL3689689.csv', 'CHEMBL3685044.csv', 'CHEMBL3689654.csv', 'CHEMBL3685019.csv', 'CHEMBL3685137.csv', 'CHEMBL3689672.csv', 'CHEMBL3689720.csv', 'CHEMBL3689746.csv', 'CHEMBL3689718.csv', 'CHEMBL3685066.csv', 'CHEMBL3689667.csv', 'CHEMBL3689556.csv', 'CHEMBL3685124.csv', 'CHEMBL3689747.csv', 'CHEMBL3685104.csv', 'CHEMBL3685157.csv', 'CHEMBL271008.csv', 'CHEMBL3685174.csv', 'CHEMBL272288.csv', 'CHEMBL2397799.csv', 'CHEMBL3689701.csv', 'CHEMBL3689655.csv', 'CHEMBL3394083.csv', 'CHEMBL3685095.csv', 'CHEMBL3689708.csv', 'CHEMBL3685058.csv', 'CHEMBL3685039.csv', 'CHEMBL3685110.csv', 'CHEMBL3685170.csv', 'CHEMBL3685166.csv', 'CHEMBL3685116.csv', 'CHEMBL3685153.csv', 'CHEMBL3689699.csv', 'CHEMBL3689722.csv', 'CHEMBL3685150.csv', 'CHEMBL3689711.csv', 'CHEMBL3689656.csv', 'CHEMBL3685062.csv', 'CHEMBL2397800.csv', 'CHEMBL3689554.csv', 'CHEMBL3685181.csv', 'CHEMBL3685057.csv', 'CHEMBL3685042.csv', 'CHEMBL3685195.csv', 'CHEMBL270958.csv', 'CHEMBL3689690.csv', 'CHEMBL3689594.csv', 'CHEMBL3689646.csv', 'CHEMBL408291.csv', 'CHEMBL400211.csv', 'CHEMBL3689713.csv', 'CHEMBL3685055.csv', 'CHEMBL3689693.csv', 'CHEMBL3689676.csv', 'CHEMBL3689696.csv', 'CHEMBL3685034.csv', 'CHEMBL3689721.csv', 'CHEMBL3689743.csv', 'CHEMBL3689744.csv', 'CHEMBL455164.csv', 'CHEMBL3685180.csv', 'CHEMBL3689647.csv', 'CHEMBL1775041.csv', 'CHEMBL3685120.csv', 'CHEMBL3689616.csv', 'CHEMBL3689604.csv', 'CHEMBL514140.csv', 'CHEMBL3685028.csv', 'CHEMBL401238.csv', 'CHEMBL475739.csv', 'CHEMBL3685026.csv', 'CHEMBL1242754.csv', 'CHEMBL3685142.csv', 'CHEMBL288280.csv', 'CHEMBL2397796.csv', 'CHEMBL3685067.csv', 'CHEMBL3689659.csv', 'CHEMBL3689584.csv', 'CHEMBL3689709.csv', 'CHEMBL255165.csv', 'CHEMBL3685086.csv', 'CHEMBL3685190.csv', 'CHEMBL409082.csv', 'CHEMBL3689686.csv', 'CHEMBL241750.csv', 'CHEMBL3685074.csv', 'CHEMBL3685094.csv', 'CHEMBL3689644.csv', 'CHEMBL3685172.csv', 'CHEMBL3689740.csv', 'CHEMBL3685056.csv', 'CHEMBL3685043.csv', 'CHEMBL3689645.csv', 'CHEMBL3685091.csv', 'CHEMBL3685198.csv', 'CHEMBL3689565.csv', 'CHEMBL3689633.csv', 'CHEMBL3685080.csv', 'CHEMBL3685030.csv', 'CHEMBL1836675.csv', 'CHEMBL3684996.csv', 'CHEMBL3685035.csv', 'CHEMBL3685081.csv', 'CHEMBL2397812.csv', 'CHEMBL3684995.csv', 'CHEMBL3689719.csv', 'CHEMBL3689607.csv', 'CHEMBL2397806.csv', 'CHEMBL3685121.csv', 'CHEMBL453358.csv', 'CHEMBL3685196.csv', 'CHEMBL3689724.csv', 'CHEMBL3685014.csv', 'CHEMBL404666.csv', 'CHEMBL3684999.csv', 'CHEMBL454923.csv', 'CHEMBL454119.csv', 'CHEMBL3685112.csv', 'CHEMBL3617732.csv', 'CHEMBL3685127.csv', 'CHEMBL3685131.csv', 'CHEMBL3685012.csv', 'CHEMBL410931.csv', 'CHEMBL3689749.csv', 'CHEMBL2397798.csv', 'CHEMBL3685031.csv', 'CHEMBL3685017.csv', 'CHEMBL2397816.csv', 'CHEMBL3689679.csv', 'CHEMBL3689552.csv', 'CHEMBL3685046.csv', 'CHEMBL3689652.csv', 'CHEMBL455163.csv', 'CHEMBL399627.csv', 'CHEMBL3685171.csv', 'CHEMBL392301.csv', 'CHEMBL3685077.csv', 'CHEMBL3685063.csv', 'CHEMBL391724.csv', 'CHEMBL3689666.csv', 'CHEMBL1775046.csv', 'CHEMBL3689540.csv', 'CHEMBL3689754.csv', 'CHEMBL3685177.csv', 'CHEMBL3689681.csv', 'CHEMBL3689683.csv', 'CHEMBL2397809.csv', 'CHEMBL3685022.csv', 'CHEMBL3685065.csv', 'CHEMBL3685097.csv', 'CHEMBL1785021.csv', 'CHEMBL3685087.csv', 'CHEMBL2397802.csv', 'CHEMBL270492.csv', 'CHEMBL3689579.csv', 'CHEMBL236375.csv', 'CHEMBL3685023.csv', 'CHEMBL3685114.csv', 'CHEMBL2030558.csv', 'CHEMBL255738.csv', 'CHEMBL3685130.csv', 'CHEMBL2397801.csv', 'CHEMBL3689601.csv', 'CHEMBL206575.csv', 'CHEMBL3685008.csv', 'CHEMBL3685093.csv', 'CHEMBL3685037.csv', 'CHEMBL2397804.csv', 'CHEMBL3288854.csv', 'CHEMBL3685029.csv', 'CHEMBL3685085.csv', 'CHEMBL508027.csv', 'CHEMBL3685140.csv', 'CHEMBL3689738.csv', 'CHEMBL291273.csv', 'CHEMBL3689702.csv', 'CHEMBL406658.csv', 'CHEMBL3685045.csv', 'CHEMBL3685143.csv', 'CHEMBL3689742.csv', 'CHEMBL3689751.csv', 'CHEMBL3689562.csv', 'CHEMBL3689673.csv', 'CHEMBL3689581.csv', 'CHEMBL3689582.csv', 'CHEMBL3685007.csv', 'CHEMBL3685059.csv', 'CHEMBL3685134.csv', 'CHEMBL508986.csv', 'CHEMBL3685071.csv', 'CHEMBL3685183.csv', 'CHEMBL3685016.csv', 'CHEMBL272999.csv', 'CHEMBL3685139.csv', 'CHEMBL3689555.csv', 'CHEMBL3685128.csv', 'CHEMBL3689558.csv', 'CHEMBL3689544.csv', 'CHEMBL3685147.csv', 'CHEMBL270454.csv', 'CHEMBL3685187.csv', 'CHEMBL3685105.csv', 'CHEMBL3685072.csv', 'CHEMBL273090.csv', 'CHEMBL40798.csv', 'CHEMBL3685021.csv', 'CHEMBL3685162.csv', 'CHEMBL3685141.csv', 'CHEMBL3689692.csv', 'CHEMBL3689630.csv', 'CHEMBL3689572.csv', 'CHEMBL2397814.csv', 'CHEMBL2408778.csv', 'CHEMBL3685165.csv', 'CHEMBL3685053.csv', 'CHEMBL3685185.csv', 'CHEMBL3689564.csv', 'CHEMBL2397797.csv', 'CHEMBL3689660.csv', 'CHEMBL3689707.csv', 'CHEMBL2397810.csv', 'CHEMBL3685191.csv', 'CHEMBL3685159.csv', 'CHEMBL3685100.csv', 'CHEMBL3689650.csv', 'CHEMBL3685041.csv', 'CHEMBL475336.csv', 'CHEMBL428029.csv', 'CHEMBL3685054.csv', 'CHEMBL2397803.csv', 'CHEMBL3684997.csv', 'CHEMBL3689587.csv', 'CHEMBL3689687.csv', 'CHEMBL1775036.csv', 'CHEMBL3689664.csv', 'CHEMBL378175.csv', 'CHEMBL3685073.csv', 'CHEMBL3689735.csv', 'CHEMBL3689538.csv', 'CHEMBL3689639.csv', 'CHEMBL3685122.csv', 'CHEMBL3689678.csv', 'CHEMBL1836681.csv', 'CHEMBL3685038.csv', 'CHEMBL3689688.csv', 'CHEMBL3689592.csv', 'CHEMBL2397808.csv', 'CHEMBL3685148.csv', 'CHEMBL3689591.csv', 'CHEMBL516070.csv', 'CHEMBL3685076.csv', 'CHEMBL207471.csv', 'CHEMBL1629808.csv', 'CHEMBL3685064.csv', 'CHEMBL3685123.csv', 'CHEMBL3685132.csv', 'CHEMBL3685092.csv', 'CHEMBL3689700.csv', 'CHEMBL3689649.csv', 'CHEMBL241539.csv', 'CHEMBL564746.csv', 'CHEMBL3689753.csv', 'CHEMBL3689657.csv', 'CHEMBL412298.csv', 'CHEMBL41910.csv', 'CHEMBL3689547.csv', 'CHEMBL3685018.csv', 'CHEMBL3689661.csv', 'CHEMBL3689703.csv', 'CHEMBL3685069.csv', 'CHEMBL3689546.csv', 'CHEMBL3689590.csv', 'CHEMBL3685047.csv', 'CHEMBL3685108.csv', 'CHEMBL3685125.csv', 'CHEMBL3689755.csv', 'CHEMBL3685168.csv', 'CHEMBL3689613.csv', 'CHEMBL454922.csv', 'CHEMBL3689596.csv', 'CHEMBL3685050.csv', 'CHEMBL3689615.csv', 'CHEMBL261827.csv', 'CHEMBL3689704.csv', 'CHEMBL3689539.csv', 'CHEMBL3685061.csv', 'CHEMBL207360.csv', 'CHEMBL3685003.csv', 'CHEMBL3689641.csv', 'CHEMBL3685186.csv', 'CHEMBL3689576.csv', 'CHEMBL3685197.csv', 'CHEMBL3685089.csv', 'CHEMBL238559.csv', 'CHEMBL3689750.csv', 'CHEMBL3689745.csv', 'CHEMBL241540.csv', 'CHEMBL3685145.csv', 'CHEMBL3685084.csv', 'CHEMBL3689553.csv', 'CHEMBL255225.csv', 'CHEMBL3685033.csv', 'CHEMBL3689674.csv', 'CHEMBL3685106.csv', 'CHEMBL3689621.csv', 'CHEMBL429600.csv', 'CHEMBL473103.csv', 'CHEMBL206046.csv', 'CHEMBL3685068.csv', 'CHEMBL271009.csv', 'CHEMBL3689574.csv', 'CHEMBL3689716.csv', 'CHEMBL3689737.csv', 'CHEMBL1775045.csv', 'CHEMBL415264.csv', 'CHEMBL3689614.csv', 'CHEMBL3689671.csv', 'CHEMBL3685111.csv', 'CHEMBL3689706.csv', 'CHEMBL3685156.csv', 'CHEMBL3685133.csv', 'CHEMBL473726.csv', 'CHEMBL473923.csv', 'CHEMBL3689710.csv', 'CHEMBL475337.csv', 'CHEMBL3689643.csv', 'CHEMBL3685025.csv', 'CHEMBL3685151.csv', 'CHEMBL3689595.csv', 'CHEMBL3393986.csv', 'CHEMBL3685060.csv', 'CHEMBL3685002.csv', 'CHEMBL3689669.csv', 'CHEMBL3689684.csv', 'CHEMBL3685096.csv', 'CHEMBL3689730.csv', 'CHEMBL379308.csv', 'CHEMBL3685113.csv', 'CHEMBL3685154.csv', 'CHEMBL1836676.csv', 'CHEMBL3684998.csv', 'CHEMBL3689705.csv', 'CHEMBL3685001.csv', 'CHEMBL3685176.csv', 'CHEMBL515885.csv', 'CHEMBL3689629.csv', 'CHEMBL3685119.csv', 'CHEMBL3689662.csv', 'CHEMBL3685126.csv', 'CHEMBL3685152.csv', 'CHEMBL3685184.csv', 'CHEMBL3689550.csv', 'CHEMBL255164.csv', 'CHEMBL1775043.csv', 'CHEMBL3689610.csv', 'CHEMBL238304.csv', 'CHEMBL1775042.csv', 'CHEMBL3689545.csv', 'CHEMBL3689723.csv', 'CHEMBL235757.csv', 'CHEMBL3685179.csv', 'CHEMBL3685005.csv', 'CHEMBL3689694.csv', 'CHEMBL402288.csv', 'CHEMBL3689727.csv', 'CHEMBL3689585.csv', 'CHEMBL3393071.csv'], ['CHEMBL3685175.csv', 'CHEMBL3689626.csv', 'CHEMBL3685160.csv', 'CHEMBL3689624.csv', 'CHEMBL3689575.csv', 'CHEMBL475740.csv', 'CHEMBL3689549.csv', 'CHEMBL406845.csv', 'CHEMBL3685136.csv', 'CHEMBL3685102.csv', 'CHEMBL3685115.csv', 'CHEMBL3685164.csv', 'CHEMBL1775039.csv', 'CHEMBL209617.csv', 'CHEMBL3639839.csv', 'CHEMBL3685049.csv', 'CHEMBL1775037.csv', 'CHEMBL411043.csv', 'CHEMBL3685161.csv', 'CHEMBL3689668.csv', 'CHEMBL3685052.csv', 'CHEMBL3685000.csv', 'CHEMBL1775044.csv', 'CHEMBL3689628.csv', 'CHEMBL289125.csv', 'CHEMBL3394082.csv', 'CHEMBL3685015.csv', 'CHEMBL3685011.csv', 'CHEMBL3689559.csv', 'CHEMBL3689752.csv', 'CHEMBL272947.csv', 'CHEMBL3685010.csv', 'CHEMBL271010.csv', 'CHEMBL3689618.csv', 'CHEMBL3685182.csv', 'CHEMBL3685099.csv', 'CHEMBL3689627.csv', 'CHEMBL3685082.csv', 'CHEMBL3689675.csv', 'CHEMBL3689670.csv', 'CHEMBL3689663.csv', 'CHEMBL270286.csv', 'CHEMBL312214.csv', 'CHEMBL1165511.csv', 'CHEMBL1775040.csv', 'CHEMBL3689600.csv', 'CHEMBL230686.csv', 'CHEMBL3617740.csv', 'CHEMBL1773092.csv', 'CHEMBL3689589.csv', 'CHEMBL3685129.csv', 'CHEMBL3689717.csv', 'CHEMBL203493.csv', 'CHEMBL3685149.csv', 'CHEMBL473517.csv', 'CHEMBL510861.csv', 'CHEMBL3685083.csv', 'CHEMBL1775038.csv', 'CHEMBL1775047.csv', 'CHEMBL2397817.csv', 'CHEMBL2397807.csv', 'CHEMBL3685117.csv', 'CHEMBL3689573.csv', 'CHEMBL3689560.csv', 'CHEMBL3689622.csv', 'CHEMBL3689638.csv', 'CHEMBL2397815.csv', 'CHEMBL402891.csv', 'CHEMBL3685040.csv', 'CHEMBL3689748.csv', 'CHEMBL3685103.csv', 'CHEMBL454144.csv', 'CHEMBL3689625.csv', 'CHEMBL473516.csv', 'CHEMBL272169.csv', 'CHEMBL3685144.csv', 'CHEMBL3685020.csv', 'CHEMBL3689548.csv', 'CHEMBL3685109.csv', 'CHEMBL207986.csv', 'CHEMBL3685118.csv', 'CHEMBL515135.csv', 'CHEMBL3685090.csv']
all_data_files = train_data_files + test_data_files
shuffle(all_data_files)

# print(len(train_data_files), len(test_data_files), len(all_data_files))
# outer_k = 10
# test_fold,train_fold = sample.split_partitions(all_data_files,outer_k)

['CHEMBL2397810.csv',
 'CHEMBL3689582.csv',
 'CHEMBL270454.csv',
 'CHEMBL3689627.csv',
 'CHEMBL2397800.csv',
 'CHEMBL473517.csv',
 'CHEMBL3685178.csv',
 'CHEMBL3685159.csv',
 'CHEMBL3639885.csv',
 'CHEMBL1785021.csv',
 'CHEMBL404666.csv',
 'CHEMBL3685047.csv',
 'CHEMBL403731.csv',
 'CHEMBL3689735.csv',
 'CHEMBL3689680.csv',
 'CHEMBL475739.csv',
 'CHEMBL3685011.csv',
 'CHEMBL3689595.csv',
 'CHEMBL428029.csv',
 'CHEMBL3685029.csv',
 'CHEMBL230686.csv',
 'CHEMBL3685147.csv',
 'CHEMBL3685034.csv',
 'CHEMBL3689676.csv',
 'CHEMBL3685015.csv',
 'CHEMBL291273.csv',
 'CHEMBL3685166.csv',
 'CHEMBL1773092.csv',
 'CHEMBL3689664.csv',
 'CHEMBL3685191.csv',
 'CHEMBL3689616.csv',
 'CHEMBL3685174.csv',
 'CHEMBL1775043.csv',
 'CHEMBL564746.csv',
 'CHEMBL3685148.csv',
 'CHEMBL3685142.csv',
 'CHEMBL3689632.csv',
 'CHEMBL453358.csv',
 'CHEMBL3689746.csv',
 'CHEMBL3689696.csv',
 'CHEMBL3689712.csv',
 'CHEMBL3685122.csv',
 'CHEMBL3685070.csv',
 'CHEMBL3689648.csv',
 'CHEMBL3685036.csv',
 'CHEMBL3689724.csv'

In [5]:
tdata, ttargets = sample.read_data(test_data_files[:1])
input_nodes = tdata.shape[1]

In [6]:
def convert_time(second):
    day = second/86400
    hour = (day - int(day))*24
    minute = (hour - int(hour))*60
    second = round((minute - int(minute))*60,4)
    return(str(int(day)) + ' DAYS: '+ str(int(hour)) + ' HOURS: '+ str(int(minute)) + ' MINUTES: ' + str(second) + ' SECONDS')

In [7]:
def build_model(shape):
    model = models.Sequential()
    model.add(layers.Dense(200,  kernel_regularizer=regularizers.l2(0.001), activation ='relu', input_shape=(shape,)))
    model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation ='relu'))
    model.add(layers.Dense(80, kernel_regularizer=regularizers.l2(0.001), activation ='relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop',loss='mse',metrics=['mae'])
    return model

In [8]:
train_data, train_targets = sample.read_data(train_data_files)
test_data, test_targets = sample.read_data(test_data_files)

In [9]:
all_data, all_targets = sample.read_data(all_data_files)

In [11]:
# Let's rebuild the model, fit it with training data and make predictions on the external test set 10 times

num_epochs = 500
batch_size = 10

new_train_data_files = [i[:-4] for i in train_data_files]
new_test_data_files = [i[:-4] for i in test_data_files]
new_all_data_files = [i[:-4] for i in all_data_files]

df_trainfit_train = pd.DataFrame(np.column_stack([new_train_data_files, train_targets]), columns=['IDs', 'Experimental Aff']) 
df_trainfit_test = pd.DataFrame(np.column_stack([new_test_data_files, test_targets]), columns=['IDs', 'Experimental Aff']) 
df_finalfit = pd.DataFrame(np.column_stack([new_all_data_files, all_targets]), columns=['IDs', 'Experimental Aff']) 

tentimes_start_time = time.time()


model_name = 'model'
fresh_model = build_model(input_nodes)
reset_weights(fresh_model)
fresh_model.fit(train_data, np.array(train_targets), epochs=num_epochs, batch_size=batch_size, verbose=0)

train_predictions = fresh_model.predict(train_data)
train_predictions_list  = np.concatenate(train_predictions, axis=0).tolist()


if all(x == train_predictions_list[0] for x in train_predictions_list):
    print("All predictions in train list are equal")

df_trainfit_train['PredictedAff'] = train_predictions_list

FM_test_predictions = fresh_model.predict(test_data)
FM_test_predictions_list  = np.concatenate(FM_test_predictions, axis=0).tolist()

R2 =  stats.linregress(test_targets, FM_test_predictions_list)[2]**2
mae_test = mean_absolute_error(test_targets, FM_test_predictions_list)
print('Test R2, Test mae : ', R2, mae_test)

if all(x == FM_test_predictions_list[0] for x in FM_test_predictions_list):
    print("All predictions in test list are equal")

df_trainfit_test['PredictedAff'] = FM_test_predictions_list

model_filename_trainfit_model = directory+'trainfit.h5'
fresh_model.save(model_filename_trainfit_model)
                        
reset_weights(fresh_model)
del fresh_model, train_data, train_targets, test_data, test_targets
reset_keras()

# Final model -- fitting with all data
final_model = build_model(input_nodes)
reset_weights(final_model)

final_model.fit(all_data, np.array(all_targets), epochs=num_epochs, batch_size=batch_size, verbose=0)
#     all_mse_score, all_mae_score = final_model.evaluate(all_data, all_targets)

final_all_predictions = final_model.predict(all_data)
final_all_predictions_list  = np.concatenate(final_all_predictions, axis=0).tolist()

if all(x == final_all_predictions_list[0] for x in final_all_predictions_list):
    print("All predictions in final list are equal")

model_filename_final_model = directory+'finalmodel.h5'
final_model.save(model_filename_final_model)

reset_weights(final_model)
reset_keras()

del final_model, all_data, all_targets

df_finalfit['PredictedAff'] = final_all_predictions_list
df_trainfit_train.to_csv(directory+'trainfit_train_predictions.csv',index=False)
df_trainfit_test.to_csv(directory+'trainfit_test_predictions.csv',index=False)
df_finalfit.to_csv(directory+'finalmodel_predictions.csv',index=False)

del df_trainfit_train, df_trainfit_test, df_finalfit

tentimes_duration = convert_time(time.time()-tentimes_start_time)

[3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467, 3.0445220470428467,

In [None]:
print(tentimes_duration)

In [None]:
# k-fold validation

start_time = time.time()

outer_k = 10
test_fold,train_fold = sample.split_partitions(all_data_files,outer_k)  #OUTER FOLDS

cv_frame = pd.DataFrame()
  
for i in range(outer_k):

    print('processing fold #', i)
    outer_train = train_fold[i]
    outer_test  = test_fold[i]
    
    train_fold[i], test_fold[i] = None, None
    
    outerCV__train_data, outerCV__train_targets = sample.read_data(outer_train)
    outerCV__test_data, outerCV__test_targets = sample.read_data(outer_test)

    cv_model = build_model(input_nodes) # builds the Keras model (already compiled)
    reset_weights(cv_model)
    
    cv_model.fit(outerCV__train_data, np.array(outerCV__train_targets), 
                       epochs = num_epochs, batch_size=batch_size, verbose=0)

    outerCV_predictions = cv_model.predict(outerCV__test_data)
    outerCV_predictions_list  = np.concatenate(outerCV_predictions, axis=0).tolist()

    cv_predictions.append(outerCV_predictions_list)
    cv_targets.append(outerCV__test_targets)
    cv_IDs.append(outer_test)

    reset_weights(cv_model)
    reset_keras()
    gc.collect()
    del cv_model
    
outerCV__targets_combined  = list(itertools.chain.from_iterable(cv_targets))
outerCV__predictions_combined = list(itertools.chain.from_iterable(cv_predictions))
outerCV__IDs_combined = list(itertools.chain.from_iterable(cv_IDs))

if all(x == outerCV__predictions_combined[0] for x in outerCV__predictions_combined):
    print("All predictions in list are equal")

cv_frame['PredictedAff'] = outerCV__predictions_combined

cv_frame.insert(loc=0, column='IDs', value = outerCV__IDs_combined)
cv_frame.insert(loc=1, column='ExperimentalAff', value = outerCV__targets_combined)  
cv_frame.to_csv(directory+'CV_BestModel_Predictions.csv',index=False)


#reset_keras()
duration = convert_time(time.time()-start_time)

print(duration)