In [1]:
# REGRESSION MODEL: RANDOM FOREST (10-FOLD CROSS-VALIDATION)
import os
import pickle
import shutil
import time

import matplotlib.pyplot as plt

import itertools
import pandas as pd
import numpy as np

from scipy import stats
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, train_test_split,RandomizedSearchCV, GridSearchCV, cross_validate, cross_val_score, cross_val_predict 

# os.environ["CUDA_VISIBLE_DEVICES"]="0"



In [2]:
def convert_time(second):
    """Format a duration in seconds as a 'D DAYS: H HOURS: M MINUTES: S SECONDS' string.

    Parameters
    ----------
    second : int or float
        Elapsed time in seconds. Expected to be non-negative.

    Returns
    -------
    str
        Whole days, hours and minutes followed by the remaining seconds
        rounded to 4 decimal places (always rendered as a float, e.g. ``0.0``).
    """
    # Round once, up front, so a remainder such as 59.99996 s rolls over into the
    # next minute instead of being printed as "60.0 SECONDS".
    total = round(float(second), 4)

    # divmod keeps every unit consistent with the others; the previous chain of
    # "fractional part * 24 * 60 * 60" multiplications accumulated float error.
    days, remainder = divmod(total, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    seconds = round(seconds, 4)

    return (str(int(days)) + ' DAYS: ' + str(int(hours)) + ' HOURS: '
            + str(int(minutes)) + ' MINUTES: ' + str(seconds) + ' SECONDS')

In [3]:
# def create_directory(old_directory):
#     new_directory = old_directory+'/RF'
#     while os.path.exists(new_directory):
#         shutil.rmtree(new_directory)
#     os.mkdir(new_directory)
#     return new_directory

In [4]:
def split_partitions(DATA,TARGETS,IDS, folds):
    """Split aligned data, targets and IDs into contiguous cross-validation folds.

    The samples are cut, in their existing order, into ``folds`` consecutive
    held-out partitions whose sizes differ by at most one (the first
    ``len(DATA) % folds`` partitions get the extra sample). No shuffling is
    done here.

    Parameters
    ----------
    DATA, TARGETS, IDS : sliceable sequences of equal length
        Each must support ``len()`` and ``obj[a:b]`` slicing and be accepted
        by ``np.concatenate``.
    folds : int
        Number of partitions; must be at least 1. If ``folds`` exceeds the
        number of samples, the trailing held-out partitions are empty.

    Returns
    -------
    one_fold : list
        ``folds`` entries ``[data, targets, ids]`` holding the held-out slice
        of each input (same types as the inputs).
    nine_folds : list
        ``folds`` entries ``[data, targets, ids]`` holding everything outside
        the matching held-out slice, each built with ``np.concatenate``.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1 or the inputs differ in length.
    """
    n_samples = len(DATA)
    if folds < 1:
        raise ValueError("folds must be at least 1, got {}".format(folds))
    if len(TARGETS) != n_samples or len(IDS) != n_samples:
        raise ValueError("DATA, TARGETS and IDS must have the same length")

    # The previous fold size, len(DATA) // folds + 1, overshoots: the last folds
    # came out short or even empty (10 samples / 5 folds -> 3, 3, 3, 1, 0).
    base_size, n_larger = divmod(n_samples, folds)

    one_fold = []
    nine_folds = []
    start = 0
    for i in range(folds):
        stop = start + base_size + (1 if i < n_larger else 0)

        # held-out partition i
        one_fold.append([DATA[start:stop], TARGETS[start:stop], IDS[start:stop]])

        # training data: everything outside partition i
        nine_folds.append([
            np.concatenate([DATA[:start], DATA[stop:]], axis=0),
            np.concatenate([TARGETS[:start], TARGETS[stop:]], axis=0),
            np.concatenate([IDS[:start], IDS[stop:]], axis=0),
        ])
        start = stop
    return one_fold, nine_folds

In [5]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_features)``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values aligned with the predictions.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean of ``|prediction - label| / |label|``
        expressed in percent. A label equal to zero makes its term infinite
        (or NaN), following NumPy division semantics.
    """
    labels = np.asarray(test_labels, dtype=float)
    predictions = np.asarray(model.predict(test_features), dtype=float)

    errors = np.abs(predictions - labels)
    # Divide by the magnitude of the label: with the signed label, negative
    # targets produced negative percentages that cancelled positive ones.
    mape = 100 * np.mean(errors / np.abs(labels))
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [6]:
# Load the train and test datasets of every feature-set folder, pool them, and
# run an outer_k-fold cross-validation of a random forest regressor on the pool.
# Per folder this writes one pickled model per fold under RF/cv_models/ and one
# CSV with the held-out predictions under RF/.
main_directory = "/media/zinph/DATAPART1/P5/Data/"

# 'Lig2D_Ki', 'Lig3D-Lig2D_Ki', 'Lig3D-Lig2D-Lig_Ki',

Ki_IC50_folders = ['Lig2D_All', 'Lig3D-Lig2D_All', 'Lig3D-Lig2D-Lig_All', 'Lig3D-Lig2D-Lig-BS_All', 'Lig3D-Lig2D-Lig-BS-FPI_All']
Ki_folders = ['Lig3D-Lig2D-Lig-BS_Ki', 'Lig3D-Lig2D-Lig-BS-FPI_Ki']
IC50_folders = ['Lig2D_IC50', 'Lig3D-Lig2D_IC50', 'Lig3D-Lig2D-Lig_IC50', 'Lig3D-Lig2D-Lig-BS_IC50', 'Lig3D-Lig2D-Lig-BS-FPI_IC50']

all_folders = Ki_folders + IC50_folders + Ki_IC50_folders
outer_k = 10
# Fixed seed for the shuffle and the forests so a re-run reproduces the same
# folds and the same models.
random_seed = 42

start_time = time.time()

for f in all_folders:
    print(f)
    directory = main_directory+f+'/'

    # makedirs creates RF/ and RF/cv_models/ in one call and tolerates reruns.
    RF_directory = directory+'RF/'
    RF_model_directory = RF_directory + 'cv_models/'
    os.makedirs(RF_model_directory, exist_ok=True)

    train = pd.read_csv(directory+"trainSet.csv", sep=',')
    test  = pd.read_csv(directory+"testSet.csv", sep=',')

    # Pool both files and shuffle the rows; split_partitions cuts contiguous
    # folds, so this shuffle is what randomises fold membership.
    frames = [train, test]
    combined = pd.concat(frames)
    combined = shuffle(combined, random_state=random_seed)

    # First column holds the sample IDs, 'Aff' the target, the rest are features.
    all_IDs     = combined[combined.columns[0]]
    pre_all_data    = combined.loc[:, combined.columns != 'Aff'].drop(combined.columns[0],axis=1)
    all_targets  = combined['Aff'].tolist()

    all_data_np = pre_all_data.values

    # preprocess data: impute missing values, then min-max scale every column
    # NOTE(review): imputer and scaler are fitted on ALL rows before the folds
    # are cut, so held-out rows influence the preprocessing statistics. Consider
    # fitting them on each training fold only - confirm before changing results.
    imputer = Imputer()
    scaler     = preprocessing.MinMaxScaler()
    all_data   = scaler.fit_transform(imputer.fit_transform(all_data_np))

    print("all_data :", all_data.shape)

    test_fold,train_fold = split_partitions(all_data,all_targets,all_IDs,outer_k)

    outerCV__targets = []
    outerCV__predictions = []
    outerCV__IDs = []

    #################################################################################################

    for i in range(outer_k):

        outerCV__test_data, outerCV__test_targets, outerCV__test_ids = test_fold[i]
        outerCV__train_data, outerCV__train_targets, outerCV__train_ids = train_fold[i]

        # Fixed hyper-parameters; no search is performed in this notebook.
        cv_rf = RandomForestRegressor(n_estimators= 1600, max_depth = 90,
                               max_features = 'auto', min_samples_leaf = 1,
                               min_samples_split = 5, bootstrap = True, criterion="mae",
                               n_jobs = 10, random_state = random_seed)
        # Fit the forest on the training part of this fold
        cv_rf.fit(outerCV__train_data,outerCV__train_targets)

        # The file is a pickle despite the '.h5' suffix (kept so existing
        # readers of these paths still find the models).
        cv_model = RF_model_directory+'RF_cv_'+str(i) +'.h5'
        with open(cv_model, 'wb') as model_file:
            pickle.dump(cv_rf, model_file)

        outerCV__test_predictions = cv_rf.predict(outerCV__test_data).tolist()
        outerCV__predictions.append(outerCV__test_predictions)
        outerCV__targets.append(outerCV__test_targets)
        outerCV__IDs.append(outerCV__test_ids)
        # Drop the fitted forest before the next fold is trained.
        del cv_rf, outerCV__test_predictions

    # Flatten the per-fold lists into one row per held-out sample.
    outerCV__targets_combined  = list(itertools.chain.from_iterable(outerCV__targets))
    outerCV__predictions_combined = list(itertools.chain.from_iterable(outerCV__predictions))
    outerCV__IDs_combined = list(itertools.chain.from_iterable(outerCV__IDs))

    cv_frame = pd.DataFrame()
    cv_frame['IDs'] = outerCV__IDs_combined
    cv_frame['ExperimentalAff'] = outerCV__targets_combined
    cv_frame['PredictedAff'] = outerCV__predictions_combined

    cv_frame.to_csv(RF_directory+'RF_CV_BestModel_Predictions.csv',index=False)

duration = convert_time(time.time()-start_time)
print(duration)

Lig3D-Lig2D-Lig-BS_Ki




all_data : (555, 2389)
Lig3D-Lig2D-Lig-BS-FPI_Ki




all_data : (555, 2405)
Lig2D_IC50
all_data : (306, 647)




Lig3D-Lig2D_IC50




all_data : (306, 1149)
Lig3D-Lig2D-Lig_IC50




all_data : (306, 2153)
Lig3D-Lig2D-Lig-BS_IC50




all_data : (306, 2389)
Lig3D-Lig2D-Lig-BS-FPI_IC50




all_data : (306, 2405)
Lig2D_All




all_data : (861, 647)
Lig3D-Lig2D_All




all_data : (861, 1149)
Lig3D-Lig2D-Lig_All




all_data : (861, 2153)
Lig3D-Lig2D-Lig-BS_All




all_data : (861, 2389)
Lig3D-Lig2D-Lig-BS-FPI_All




all_data : (861, 2405)
0 DAYS: 14 HOURS: 34 MINUTES: 23.4272 SECONDS
