In [2]:
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
import os
import itertools
import numpy as np
import time

from sklearn.utils import shuffle
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import colors

from sklearn.decomposition import PCA

from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split, RandomizedSearchCV, GridSearchCV

import keras
from keras import backend as K
from keras import models, layers, regularizers
from keras.layers import Dense, Conv2D, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.utils import multi_gpu_model
from keras.backend.tensorflow_backend import set_session
from keras.backend.tensorflow_backend import clear_session
from keras.backend.tensorflow_backend import get_session
from keras import initializers
import tensorflow

Using TensorFlow backend.


In [3]:
def convert_time(second):
    day = second/86400
    hour = (day - int(day))*24
    minute = (hour - int(hour))*60
    second = round((minute - int(minute))*60,4)
    return(str(int(day)) + ' DAYS: '+ str(int(hour)) + ' HOURS: '+ str(int(minute)) + ' MINUTES: ' + str(second) + ' SECONDS')

In [4]:
def data_process(pre_all_data, desc='desc'):
    '''
    fname should have desc or fing.
    Process data based on fingerprint or descriptor. desc = 'fing' for fingerprint, everything else is descriptors. 
    Return processed data.
    '''
    
    ################ USE THIS FOR BITS / FINGERPRINTS ################
    
    if 'fing' in desc:
    #         all_data = pre_all_data 
        imputer    = SimpleImputer()
        scaler     = preprocessing.MinMaxScaler()
        all_data   = scaler.fit_transform(imputer.fit_transform(pre_all_data)) 

    ################ USE THIS FOR EXPLICIT DESCRIPTORS / FINGERPRINTS ################

    else:

        # convert the dataframe to numpy array
        pre_all_data = pre_all_data.convert_objects(convert_numeric=True)
        pre_all_data = pre_all_data.fillna(0)
        # all_data_np = pre_all_data.values 

        # drop numeric columns
        temp = pre_all_data.apply(pd.to_numeric,errors='coerce').isnull().any()
        numeric_error_columns = [pre_all_data.columns[i] for i in range(len(temp)) if temp[i] == True]
        print('Below are the numeric Error Columns. They will be removed. ')
        print(numeric_error_columns)
        combined_data = pre_all_data.drop(columns=numeric_error_columns)

        # preprocess data, normalize columns
        imputer    = SimpleImputer()
        scaler     = preprocessing.MinMaxScaler()
        all_data   = scaler.fit_transform(imputer.fit_transform(combined_data)) 

        print("all_data :", all_data.shape)
    
    return all_data

In [5]:
def reduce_dimensions(all_data):
    '''
    Up to 100 dimensions, check and see if we can find variabilities explained up to 95% using PCA.
    If so, return the set with reduced dimension.
    Else, return the original all_data. 
    
    '''
    max_component = all_data.shape[1]
    all_components = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 50)]
    possible_components = [i for i in all_components if i<max_component]
    variablity_explained = 0
    i=0
    while variablity_explained <=95:
        if i>= len(possible_components):
            print('Cannot find variability explained above 95% for 100 dimensions.')
            pca_result = all_data

        else:
            pca = PCA(n_components = possible_components[i])
            pca_result = pca.fit_transform(all_data)
            all_percents = []
            for v in pca.explained_variance_ratio_:
                each_percent = round(v*100,2)
        #         print('Explained variation per principal component: {}%'.format(each_percent))
                all_percents.append(each_percent)

            variablity_explained = sum(all_percents)
            i+=1    

        print(variablity_explained, possible_components[i])
    return pca_result
    

In [6]:
def split_partitions(DATA,TARGETS,IDS, folds):
    num_val_samples = len(DATA) // folds+1
    one_fold = []
    nine_folds = []
    for i in range(folds):
        one_fold_data = DATA[i * num_val_samples: (i + 1) * num_val_samples] # prepares the validation data: data from partition # k
        one_fold_targets = TARGETS[i * num_val_samples: (i + 1) * num_val_samples]
        one_fold_IDs = IDS[i * num_val_samples: (i + 1) * num_val_samples]
        one_fold += [[one_fold_data, one_fold_targets, one_fold_IDs]]
        
        # prepares the training data: data from all other partitions
        nine_fold_data = np.concatenate([DATA[:i * num_val_samples],DATA[(i + 1) * num_val_samples:]],axis=0)
        nine_fold_targets = np.concatenate([TARGETS[:i * num_val_samples],TARGETS[(i + 1) * num_val_samples:]],axis=0)
        nine_fold_IDs = np.concatenate([IDS[:i * num_val_samples],IDS[(i + 1) * num_val_samples:]],axis=0)
        nine_folds += [[nine_fold_data,nine_fold_targets,nine_fold_IDs]]
    return one_fold, nine_folds   

In [None]:
def build_model(shape):
    model = models.Sequential()
    model.add(layers.Dense(200, input_shape=(shape,)))
    model.add(layers.Dense(100, activation ='relu'))
    model.add(LeakyReLU(alpha=.001))
    model.add(layers.Dense(60, activation ='relu'))
    model.add(LeakyReLU(alpha=.001))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(40, activation ='relu'))
    model.add(layers.Dense(20, activation ='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='adam',loss='mse',metrics=['mae'])
    return model

In [8]:
start_time = time.time()

outer_k = 10

directory = 'G:/My Drive/NCSU/2019 Summer/Data/Databases_Processed/Accepted/RS_12&above/Ester_in_ring_RS_12&above/Duplicates Removed/macrolactone_ester_in_R12&above/target_CHEMBL364/'

implicit = ['maccs_fing.csv', 'ecfp6_fing.csv']

explicit = ['mordred_desc.csv', '2Drdkit_desc.csv', 'mrc_desc.csv', 'mordred_mrc_desc.csv', 'all_comb.csv']

all_folders = implicit + explicit 

cv_frame = pd.DataFrame()

for desc in all_folders:
    print('Processing ' + desc + '...')
    cv_frame = pd.DataFrame()

    combined = pd.read_csv(directory + desc, sep=',')
    combined = shuffle(combined)


    all_IDs     = combined[['ID']]
    pre_all_data    = combined.drop(columns=['Aff', 'ID'])
    all_targets  = np.concatenate(combined[['Aff']].values, axis=0).tolist()

    all_data = data_process(pre_all_data, desc)
    all_data = reduce_dimensions(all_data)
    

    test_fold,train_fold = split_partitions(all_data,all_targets,all_IDs,outer_k)  #OUTER FOLDS

    all_outerCV__predictions = []
    all_outerCV__IDs = [] 
    all_outerCV__targets = []
    
    ###########################################
    

    for i in range(outer_k):    
        # For each external fold, redefine the train and test splits which have been predefined.
        print('Data Partition ',str(i), ' ...')

        outer_train = train_fold[i]
        outer_test  = test_fold[i]

        outerCV__test_data, outerCV__test_targets, outerCV__test_ids  = test_fold[i][0], test_fold[i][1], test_fold[i][2].values.flatten().tolist()
        outerCV__train_data, outerCV__train_targets, outerCV__train_ids = train_fold[i][0], train_fold[i][1], train_fold[i][2].tolist()

        ### DNN ###

        best_model = None
        best_model = build_model(input_nodes)
        reset_weights(best_model)

        best_model.fit(outerCV__train_data, outerCV__train_targets,epochs = num_epochs, batch_size=10, verbose=0)

        outerCV__predictions = best_model.predict(outerCV__test_data)
        outerCV__predictions_list  = np.concatenate(outerCV__predictions, axis=0).tolist()
        all_outerCV__predictions.append(outerCV__predictions_list)

        #     cv_model_path = directory+'cv_10_fold'
        #     cv_model_file = cv_model_path + '/fold_'+str(i)+'.h5'
        #     if not os.path.exists(cv_model_path):
        #         os.mkdir(cv_model_path)
        #         if os.path.exists(cv_model_file):
        #             os.remove(cv_model_file)
        #     best_model.save(cv_model_file)

        reset_weights(best_model)
        K.clear_session()
        gc.collect()
        del best_model

        

        ### targets and IDs ###

        all_outerCV__targets.append(outerCV__test_targets)
        all_outerCV__IDs.append(outerCV__test_ids)

    ##############################

    outerCV__predictions_combined   = list(itertools.chain.from_iterable(all_outerCV__predictions))
    
    
    outerCV__targets_combined  = list(itertools.chain.from_iterable(all_outerCV__targets))
    outerCV__predictions_combined = list(itertools.chain.from_iterable(all_outerCV__predictions))
    outerCV__IDs_combined = list(itertools.chain.from_iterable(all_outerCV__IDs))
    
    cv_frame[desc[:-9]+'_exp']  = outerCV__targets_combined
    cv_frame[desc[:-9]+'_IDs']  = outerCV__IDs_combined
    cv_frame[desc[:-9]+'_pred'] = outerCV__predictions_combined
    
    result_directory = directory+'predictions/5-fold/pca_tuned/'
    
cv_frame.to_csv(result_directory+'DNN.csv',index=False)

duration = convert_time(time.time()-start_time)
print(duration)

Processing maccs_fing.csv...
76.91000000000001 30
95.06000000000002 50
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    1.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    2.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 11, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 60}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   27.7s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:  7.3min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 11.0min finished


{'max_depth': 10, 'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 1600, 'min_samples_leaf': 4, 'min_samples_split': 2}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing ecfp6_fing.csv...
53.989999999999995 30
74.60000000000002 50
83.65000000000003 70
88.97000000000004 90
92.55000000000004 111
95.15000000000003 131
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    4.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    4.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 7, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 20}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   50.4s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  8.1min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed: 14.7min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 21.8min finished


{'max_depth': 10, 'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 2000, 'min_samples_leaf': 1, 'min_samples_split': 10}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing mordred_desc.csv...


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Below are the numeric Error Columns. They will be removed. 
['MAXsLi', 'MAXssBe', 'MAXssssBe', 'MAXssBH', 'MAXsssB', 'MAXssssB', 'MAXddC', 'MAXsNH3', 'MAXssNH2', 'MAXdNH', 'MAXaaNH', 'MAXsssNH', 'MAXssssN', 'MAXsSiH3', 'MAXssSiH2', 'MAXsssSiH', 'MAXssssSi', 'MAXsPH2', 'MAXssPH', 'MAXsssP', 'MAXdsssP', 'MAXsssssP', 'MAXsSH', 'MAXdssS', 'MAXsGeH3', 'MAXssGeH2', 'MAXsssGeH', 'MAXssssGe', 'MAXsAsH2', 'MAXssAsH', 'MAXsssAs', 'MAXsssdAs', 'MAXsssssAs', 'MAXsSeH', 'MAXdSe', 'MAXssSe', 'MAXaaSe', 'MAXdssSe', 'MAXddssSe', 'MAXsSnH3', 'MAXssSnH2', 'MAXsssSnH', 'MAXssssSn', 'MAXsI', 'MAXsPbH3', 'MAXssPbH2', 'MAXsssPbH', 'MAXssssPb', 'MINsLi', 'MINssBe', 'MINssssBe', 'MINssBH', 'MINsssB', 'MINssssB', 'MINddC', 'MINsNH3', 'MINssNH2', 'MINdNH', 'MINaaNH', 'MINsssNH', 'MINssssN', 'MINsSiH3', 'MINssSiH2', 'MINsssSiH', 'MINssssSi', 'MINsPH2', 'MINssPH', 'MINsssP', 'MINdsssP', 'MINsssssP', 'MINsSH', 'MINdssS', 'MINsGeH3', 'MINssGeH2', 'MINsssGeH', 'MINssssGe', 'MINsAsH2', 'MINssAsH', 'MINsssAs', 'MINsss

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    1.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    2.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 3, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 80}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   25.6s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  4.0min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:  6.9min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 10.4min finished


{'max_depth': 80, 'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 400, 'min_samples_leaf': 4, 'min_samples_split': 10}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing 2Drdkit_desc.csv...
Below are the numeric Error Columns. They will be removed. 
[]
all_data : (241, 115)
83.91999999999999 30
98.55000000000001 50
Fitting 10 folds for each of 20 candidates, totalling 200 fits


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    2.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 3, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 80}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   25.3s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  4.0min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:  6.9min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 10.5min finished


{'max_depth': 80, 'bootstrap': True, 'max_features': 'auto', 'n_estimators': 1800, 'min_samples_leaf': 2, 'min_samples_split': 2}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing mrc_desc.csv...
Below are the numeric Error Columns. They will be removed. 
[]
all_data : (241, 91)
98.80999999999999 30
Fitting 10 folds for each of 20 candidates, totalling 200 fits


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    0.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 11, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 10}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   18.4s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:  6.8min finished


{'max_depth': 50, 'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 800, 'min_samples_leaf': 4, 'min_samples_split': 10}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing mordred_mrc_desc.csv...


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Below are the numeric Error Columns. They will be removed. 
['MAXsLi', 'MAXssBe', 'MAXssssBe', 'MAXssBH', 'MAXsssB', 'MAXssssB', 'MAXddC', 'MAXsNH3', 'MAXssNH2', 'MAXdNH', 'MAXaaNH', 'MAXsssNH', 'MAXssssN', 'MAXsSiH3', 'MAXssSiH2', 'MAXsssSiH', 'MAXssssSi', 'MAXsPH2', 'MAXssPH', 'MAXsssP', 'MAXdsssP', 'MAXsssssP', 'MAXsSH', 'MAXdssS', 'MAXsGeH3', 'MAXssGeH2', 'MAXsssGeH', 'MAXssssGe', 'MAXsAsH2', 'MAXssAsH', 'MAXsssAs', 'MAXsssdAs', 'MAXsssssAs', 'MAXsSeH', 'MAXdSe', 'MAXssSe', 'MAXaaSe', 'MAXdssSe', 'MAXddssSe', 'MAXsSnH3', 'MAXssSnH2', 'MAXsssSnH', 'MAXssssSn', 'MAXsI', 'MAXsPbH3', 'MAXssPbH2', 'MAXsssPbH', 'MAXssssPb', 'MINsLi', 'MINssBe', 'MINssssBe', 'MINssBH', 'MINsssB', 'MINssssB', 'MINddC', 'MINsNH3', 'MINssNH2', 'MINdNH', 'MINaaNH', 'MINsssNH', 'MINssssN', 'MINsSiH3', 'MINssSiH2', 'MINsssSiH', 'MINssssSi', 'MINsPH2', 'MINssPH', 'MINsssP', 'MINdsssP', 'MINsssssP', 'MINsSH', 'MINdssS', 'MINsGeH3', 'MINssGeH2', 'MINsssGeH', 'MINssssGe', 'MINsAsH2', 'MINssAsH', 'MINsssAs', 'MINsss

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    1.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    2.6s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 3, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 80}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   30.5s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  4.9min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:  8.6min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 13.1min finished


{'max_depth': None, 'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 2000, 'min_samples_leaf': 4, 'min_samples_split': 5}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




Processing all_comb.csv...


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Below are the numeric Error Columns. They will be removed. 
['MAXsLi', 'MAXssBe', 'MAXssssBe', 'MAXssBH', 'MAXsssB', 'MAXssssB', 'MAXddC', 'MAXsNH3', 'MAXssNH2', 'MAXdNH', 'MAXaaNH', 'MAXsssNH', 'MAXssssN', 'MAXsSiH3', 'MAXssSiH2', 'MAXsssSiH', 'MAXssssSi', 'MAXsPH2', 'MAXssPH', 'MAXsssP', 'MAXdsssP', 'MAXsssssP', 'MAXsSH', 'MAXdssS', 'MAXsGeH3', 'MAXssGeH2', 'MAXsssGeH', 'MAXssssGe', 'MAXsAsH2', 'MAXssAsH', 'MAXsssAs', 'MAXsssdAs', 'MAXsssssAs', 'MAXsSeH', 'MAXdSe', 'MAXssSe', 'MAXaaSe', 'MAXdssSe', 'MAXddssSe', 'MAXsSnH3', 'MAXssSnH2', 'MAXsssSnH', 'MAXssssSn', 'MAXsI', 'MAXsPbH3', 'MAXssPbH2', 'MAXsssPbH', 'MAXssssPb', 'MINsLi', 'MINssBe', 'MINssssBe', 'MINssBH', 'MINsssB', 'MINssssB', 'MINddC', 'MINsNH3', 'MINssNH2', 'MINdNH', 'MINaaNH', 'MINsssNH', 'MINssssN', 'MINsSiH3', 'MINssSiH2', 'MINsssSiH', 'MINssssSi', 'MINsPH2', 'MINssPH', 'MINsssP', 'MINdsssP', 'MINsssssP', 'MINsSH', 'MINdssS', 'MINsGeH3', 'MINssGeH2', 'MINsssGeH', 'MINssssGe', 'MINsAsH2', 'MINssAsH', 'MINsssAs', 'MINsss

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_iter': 200, 'fit_intercept': True}
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:    6.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'shrinking': True, 'degree': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    4.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'n_neighbors': 3, 'algorithm': 'auto', 'weights': 'uniform', 'leaf_size': 80}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   47.0s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  3.3min
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:  7.9min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed: 14.2min
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed: 20.9min finished


{'max_depth': 40, 'bootstrap': True, 'max_features': 'auto', 'n_estimators': 600, 'min_samples_leaf': 4, 'min_samples_split': 2}
Data Partition  0  ...




Data Partition  1  ...




Data Partition  2  ...




Data Partition  3  ...




Data Partition  4  ...




0 DAYS: 1 HOURS: 36 MINUTES: 32.257 SECONDS
