# Importing Libraries

In [None]:
#import default and other essential libraries
import os
import numpy as np
import pandas as pd
import sklearn
import joblib
import pickle
import csv
import sys
import random
import seaborn as sns
from functools import reduce
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
import tensorflow as tf

#Packages to split data and other preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.decomposition import PCA 
from boruta import BorutaPy
from imblearn.pipeline import Pipeline as sample_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE

#Import Classifiers
from sklearn import svm
import smote_variants as sv
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

## Package for calculating accuracy and analysis
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, cohen_kappa_score, f1_score, precision_score, recall_score, matthews_corrcoef 
from sklearn.model_selection import StratifiedKFold
from lime.lime_tabular import LimeTabularExplainer

# Data Loading

In [None]:
# Load the preprocessed feature file (the path is relative to the current working directory).
data = pd.read_csv(r'sign_electrophile_latest_preprocessed.csv') ### features in columns,molecules in rows
# Bare expression: shows the DataFrame when run as a notebook cell.
data

In [None]:
 # Drop rows whose 'smiles' value repeats, keeping the first occurrence of each.
 # The frame is modified in place and its index is renumbered (ignore_index=True).
data.drop_duplicates(subset ="smiles", keep = 'first', inplace = True, ignore_index = True)
data

In [None]:
# Drop the 'smiles' column so that only the feature columns and 'status' remain.
data=data.drop('smiles', axis=1)
data

In [None]:
# Look at class imbalance: number of rows per value of the 'status' label.
data['status'].value_counts()

# Train-Test Split

In [None]:
# Split Data: 75% train / 25% test with a fixed seed.
# The full frame (including 'status') is passed as X, so X_train/X_test still contain the label column.
# NOTE(review): the split is not stratified (no stratify=data["status"]) — confirm this is intended
# given the class imbalance inspected above.
X_train, X_test,y_train,y_test = train_test_split(data,data["status"] ,test_size=0.25, random_state=1)

In [None]:
# Remove the label column from both splits, leaving feature-only frames.
train_df_new = X_train.drop('status', axis=1)
valid_df_new = X_test.drop('status', axis=1)

# Feature Selection (Boruta)

In [None]:
def Boruta_Filteration(X_train,y_train,X_test,y_test):
    """Select features with Boruta on the training set and apply the selection to both sets.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables. A 'status' column, if present, is ignored.
    y_train, y_test : pandas.Series or pandas.DataFrame
        Class labels (any object exposing ``.values``).

    Returns
    -------
    tuple
        ``(X_train_filtered, X_test_filtered, Y_train, Y_test, final_features)``.
        The two filtered frames hold only the selected columns and carry a fresh
        0..n-1 index; the labels are returned as flat NumPy arrays; ``final_features``
        is the list of selected column names (possibly empty).
    """
    features = [f for f in X_train.columns if f != 'status']
    X_train_boruta = X_train[features].values
    Y_train_boruta = y_train.values.ravel()
    X_test_boruta = X_test[features].values
    Y_test_boruta = y_test.values.ravel()

    print('Before filteration\nTrain shape\n',X_train_boruta.shape,'\nTest shape\n',X_test_boruta.shape)

    # Base estimator for Boruta: depth-limited random forest, all cores,
    # class_weight='balanced' to compensate for the label imbalance.
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

    # The selector is fitted on the training data only, then applied to both sets.
    feat_selector = BorutaPy(rf, n_estimators=100, random_state=1)
    feat_selector.fit(X_train_boruta, Y_train_boruta)

    X_train_filtered = feat_selector.transform(X_train_boruta)
    X_test_filtered = feat_selector.transform(X_test_boruta)

    # Names of the selected features. Pairing names with the boolean support mask
    # also works when nothing is selected (np.nditer raises on an empty index array).
    final_features = [name for name, keep in zip(features, feat_selector.support_) if keep]

    print('# of Features selected:',len(final_features))

    X_train_filtered=pd.DataFrame(X_train_filtered,columns=final_features)
    X_test_filtered=pd.DataFrame(X_test_filtered,columns=final_features)

    print('After filteration\nTrain shape\n',X_train_filtered.shape,'\nTest shape\n',X_test_filtered.shape)

    return X_train_filtered,X_test_filtered,Y_train_boruta,Y_test_boruta,final_features

# t-SNE Plot

In [None]:
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show a scatter plot coloured by `data_labels`.

    The embedding uses a fixed random_state of 50. No legend is drawn and nothing is returned.
    """
    embedding = np.asarray(TSNE(n_components=2, random_state=50).fit_transform(data))
    first_axis, second_axis = embedding[:, 0], embedding[:, 1]
    plt.scatter(first_axis, second_axis, c=data_labels)
    plt.show()

In [None]:
# t-SNE view of the training features (label column removed), coloured by the training labels.
TSNE_plot(train_df_new,y_train)

In [None]:
# Class counts in the training labels before any resampling.
y_train.value_counts()

# Upsampling 

In [None]:
def Smote(traindata,trainlabel,prop,name=None):
    """Oversample with MSMOTE, plot the resampled data with t-SNE and return it as DataFrames.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Training features; the column names are carried over to the result.
    trainlabel : array-like
        Training labels (Series, single-column DataFrame or NumPy array).
    prop : float
        Passed to ``sv.MSMOTE`` as ``proportion``.
    name : str, optional
        Tag for the resampled set. Call sites in this notebook pass one as a fourth
        argument (e.g. 'Upsampled'); it is accepted for compatibility and not used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_samp, y_samp)``; ``y_samp`` is a single-column frame.
    """
    # np.asarray accepts both pandas objects and plain arrays; `.values` exists only on the former.
    labels = np.asarray(trainlabel).ravel()
    oversampler = sv.MSMOTE(proportion=prop, random_state=50)
    X_samp, y_samp = oversampler.sample(traindata.values, labels)
    TSNE_plot(X_samp, y_samp)
    X_samp = pd.DataFrame(X_samp, columns=list(traindata.columns.values))
    y_samp = pd.DataFrame(y_samp)
    return X_samp,y_samp
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show a scatter plot coloured by `data_labels`.

    The embedding uses a fixed random_state of 50. A legend with one entry per
    distinct label value is placed in the lower-right corner. Nothing is returned.
    """
    embedding = TSNE(n_components=2, random_state=50).fit_transform(data)
    scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=data_labels)
    # A bare plt.legend() finds no labelled artists on a scatter plot and draws nothing,
    # so the legend entries are derived from the scatter's colour mapping instead.
    handles, labels = scatter.legend_elements()
    plt.legend(handles, labels, loc="lower right")
    plt.show()

In [None]:
# Oversample the Boruta-filtered training set and overwrite both variables with the result.
# NOTE(review): X_train_filtered / Y_train_boruta are not assigned by any earlier cell visible here —
# presumably they are the outputs of Boruta_Filteration; confirm before running.
X_train_filtered,Y_train_boruta=Smote(X_train_filtered,Y_train_boruta,0.5)

In [None]:
# Class counts after resampling (Y_train_boruta now holds the labels returned by Smote).
Y_train_boruta.value_counts()

# Down Sampling (For genomic instability)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
def Smote(traindata,trainlabel,*,random_state=None):
    """Randomly undersample the majority class, plot the result with t-SNE and return it.

    Despite its name this variant does not oversample; it redefines ``Smote`` so that
    the same call name can be used for the down-sampling workflow.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Training features; the column names are carried over to the result.
    trainlabel : array-like
        Training labels (Series, single-column DataFrame or NumPy array).
    random_state : int, optional
        Seed forwarded to ``RandomUnderSampler``. The default ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_samp, y_samp)``; ``y_samp`` is a single-column frame.
    """
    # np.asarray accepts both pandas objects and plain arrays; `.values` exists only on the former.
    labels = np.asarray(trainlabel).ravel()
    # sampling_strategy='majority' resamples only the majority class.
    undersample = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
    X_samp, y_samp = undersample.fit_resample(traindata.values, labels)
    TSNE_plot(X_samp, y_samp)
    X_samp = pd.DataFrame(X_samp, columns=list(traindata.columns.values))
    y_samp = pd.DataFrame(y_samp)
    return X_samp,y_samp
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show a scatter plot coloured by `data_labels`.

    The embedding uses a fixed random_state of 50. A legend with one entry per
    distinct label value is placed in the lower-right corner. Nothing is returned.
    """
    embedding = TSNE(n_components=2, random_state=50).fit_transform(data)
    scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=data_labels)
    # A bare plt.legend() finds no labelled artists on a scatter plot and draws nothing,
    # so the legend entries are derived from the scatter's colour mapping instead.
    handles, labels = scatter.legend_elements()
    plt.legend(handles, labels, loc="lower right")
    plt.show()

In [None]:
# Undersample the training set and overwrite both variables with the result.
# The two-argument call matches the undersampling Smote defined in the cell above.
X_train_filtered,Y_train_boruta=Smote(X_train_filtered,Y_train_boruta)

In [None]:
# Class counts after undersampling.
Y_train_boruta.value_counts()

#**Model Training and HyperParameter Tuning** 

---



# Loading Data

In [None]:
# Seed the global random number generators so that hyperparameter tuning runs are repeatable
def seed_all(seed=123):
    """Seed Python's `random`, NumPy's and TensorFlow's global random number generators.

    Parameters
    ----------
    seed : int, default 123
        Value used for all three generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
seed_all()

In [None]:
# Make a directory for HyperParameter Tuning (relative to the current working directory).
# exist_ok=True lets the cell be re-run without raising FileExistsError.
os.makedirs('HPTuning', exist_ok=True)

In [None]:
# FROM = Path of directory from which to load the preprocessed signaturizer file
FROM='/PreProcessed/'
# TO = Path of the newly made HyperParameter Tuning directory
# NOTE(review): this is an absolute path at the filesystem root, whereas the directory-creation
# cell makes 'HPTuning' relative to the working directory — confirm both point to the same folder.
TO='/HPTuning/'

# Set the HPTuning directory as the current working directory
os.chdir(TO)

In [None]:
# Load the preprocessed signaturizer file from the FROM directory.
# Later cells read the 'status' and 'smiles' columns from this frame.
Data=pd.read_csv(FROM+'sign_proliferative_anti_preprocessed.csv')
Data

In [None]:
# Use 90% of the Data as Training Data for further hyperparameter tuning.
# Rows are drawn at random with a fixed seed; the remaining rows are not stored here.
Train=Data.sample(n=int(len(Data)*0.9), random_state=1)
Train

# Defining the Grid

## Random Forest Grid (Proliferation)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized hyperparameter search for a random forest (5-fold CV, accuracy).

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``best_estimator_``, ``best_params_`` ...).
    """
    rf = RandomForestClassifier()
    parameters = {
        # 'auto' was an alias of 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3, so only 'sqrt' is kept; the search space is unchanged.
        'max_features': ['sqrt'],
        'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap':[True, False],
        'n_estimators':[int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    }
    grid = RandomizedSearchCV(rf, parameters, scoring='accuracy', return_train_score=False, cv =5)
    grid_search=grid.fit(Train_x, Train_y)
    return grid_search

## MLP Grid (Electrophile)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized hyperparameter search for an MLP classifier (5-fold CV, accuracy).

    Takes the training features and labels and returns the fitted
    RandomizedSearchCV object.
    """
    layer_layouts = [(5,5,5),(20,30,50),(50,50,50), (50,100,50), (100,),(100,100,100),(5,2)]
    search_space = {
        'hidden_layer_sizes': layer_layouts,
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.001, 0.01, 0.02, 0.04, 0.05],
        'learning_rate': ['constant','adaptive','invscaling'],
    }
    searcher = RandomizedSearchCV(
        estimator=MLPClassifier(),
        param_distributions=search_space,
        scoring='accuracy',
        cv=5,
        return_train_score=False,
    )
    return searcher.fit(Train_x, Train_y)

## MLP Grid (Oxidative)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized hyperparameter search for a seeded MLP classifier (5-fold CV, accuracy).

    The base network uses max_iter=1000 and random_state=50. Takes the training
    features and labels and returns the fitted RandomizedSearchCV object.
    """
    base_network = MLPClassifier(max_iter=1000, random_state=50)
    layer_layouts = [(5,5,5),(20,30,50),(50,50,50), (50,100,50), (100,),(100,100,100),(5,2)]
    search_space = {
        'hidden_layer_sizes': layer_layouts,
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05,0.001,0.01],
        'learning_rate': ['constant','adaptive'],
    }
    searcher = RandomizedSearchCV(
        estimator=base_network,
        param_distributions=search_space,
        scoring='accuracy',
        cv=5,
        return_train_score=False,
    )
    return searcher.fit(Train_x, Train_y)

## SVM Grid (Epigenetics)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized hyperparameter search for a support vector classifier (5-fold CV).

    No `scoring` is given, so the search uses the estimator's default score.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    # Only the `svm` module is imported at the top of the notebook, so SVC is
    # referenced through it; a bare `SVC` raises NameError.
    svc_rand = svm.SVC(probability=True)
    parameters = {
        'kernel':('linear', 'rbf'),
        'C': [0.5, 0.6, 0.8, 1.0, 1.2, 1.5], 
        'gamma': [0.05, 0.1, 1.0, 1.2, 1.5, 2],
    }
    grid = RandomizedSearchCV(svc_rand, parameters, cv = 5)
    grid_search=grid.fit(Train_x, Train_y)
    return grid_search

## Random Forest Grid (Genomic Instability)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized hyperparameter search for a random forest (100 candidates, 10-fold CV).

    No `scoring` is given, so the search uses the estimator's default score.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    n_estimators = [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    # 'auto' was an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3.
    max_features = ['sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # An integer min_samples_split must be at least 2; a value of 1 makes the fit fail,
    # which wastes the candidates that draw it.
    min_samples_split = list(range(2,30))
    min_samples_leaf = list(range(1,20))
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, n_jobs = -1)
    rf_random.fit(Train_x, Train_y)
    return rf_random

## KNN Grid (Apoptosis)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Exhaustive grid search for a k-nearest-neighbours classifier (3-fold CV).

    Takes the training features and labels and returns the fitted GridSearchCV
    object. Every combination in the grid is evaluated.
    """
    search_space = {
        'leaf_size': list(range(1,50)),
        'n_neighbors': list(range(1,40)),
        'p': list(range(1,20)),
        'weights': ['uniform','distance'],
        'metric': ['minkowski','euclidean','manhattan'],
    }
    searcher = GridSearchCV(KNeighborsClassifier(), search_space, cv=3, verbose=2, n_jobs = -1)
    return searcher.fit(Train_x, Train_y)


# Get Labels Function

In [None]:
def get_labels(pred_test,thsd):
    """Turn probability values into discrete 0/1 labels.

    Parameters
    ----------
    pred_test : iterable of float
        Predicted probabilities.
    thsd : float
        Decision threshold; a value strictly greater than it maps to 1, anything else to 0.

    Returns
    -------
    list of int
        One label per input value, in input order.
    """
    # Iterating over the values (rather than indexing by position) also works for
    # inputs whose index is not 0..n-1, e.g. a pandas Series taken from a split.
    return [1 if prob > thsd else 0 for prob in pred_test]

# Scoring Metrics

In [None]:
def Scoring_metrices(label, pred, truth, D):
    """Compute, print and collect classification metrics, then show the ROC curve.

    `label` holds the predicted classes, `pred` the predicted scores used for the
    ROC curve, `truth` the reference labels and `D` a prefix (e.g. 'Training') put
    in front of every metric name. Returns a dict mapping '<D> <metric>:' to its value.
    """
    score = {}

    def _report(caption, value):
        # Store the metric under the prefixed name and echo it to stdout.
        key = D + caption
        score[key] = value
        print(key, value)

    _report(" Accuracy:", accuracy_score(truth, label))
    _report(" MCC Score:", matthews_corrcoef(truth, label))
    _report(" F1 Score:", f1_score(truth, label, average='macro'))

    # The ROC points are kept because the plot at the end reuses them.
    fpr, tpr, _ = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    _report(" AUC VALUE:", roc_auc)

    _report(" kappa Score:", cohen_kappa_score(truth, label))
    _report(" Precision:", precision_score(truth, label))
    _report(" Recall:", recall_score(truth, label))

    roc_plot = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=D)
    roc_plot.plot()
    plt.show()
    return score

# Running Hyperparameter Tuning of the Model

Run the following cells using the model grid (`HPTing_Model`) that was selected for the property of interest.

In [None]:
Train_Fold_outs=[]
Test_Fold_outs=[]
Best_params=[]
features = []
models=[]

# Runs 20 iterations; each one draws a new 75/25 split of Train (seeded by the
# iteration number), selects features, oversamples, tunes the model with the
# pre-defined grid and records training/testing metrics.
for i in range(20):
    print('Fold #',i)    
    
    # Split Data
    X_train, X_test,y_train,y_test = train_test_split(Train,Train["status"] ,test_size=0.25, shuffle = True, random_state=i)
    
    #Drop smiles,status from training,testing data
    x_train = X_train.drop(['status','smiles'], axis=1)
    x_test = X_test.drop(['status','smiles'], axis=1)

    #Feature selection
    x_train_filtered,x_test_filtered,y_train_filtered,y_test_filtered,selected_features = Boruta_Filteration(x_train,y_train,x_test,y_test)
    features.append(selected_features)
    
    # Boruta_Filteration returns the labels as a NumPy array; wrap them in a Series for Smote.
    y_train_filtered = pd.Series(y_train_filtered)
    
    #Oversampling (Smote takes the features, the labels and the proportion)
    Final_Xtrain,Final_Ytrain = Smote(x_train_filtered,y_train_filtered,0.5)
    
    Final_Ytrain=Final_Ytrain.values.ravel()
    Final_Xtrain = pd.DataFrame(Final_Xtrain, dtype = np.float64)
    x_test_filtered = pd.DataFrame(x_test_filtered, dtype = np.float64)
    
    #Hyperparameter Tuning
    Parameters = HPTing_Model(Final_Xtrain,Final_Ytrain)
    
    #save best parameters (full parameter set of the best estimator)
    tuned_params = Parameters.best_estimator_.get_params()
    Best_params.append(tuned_params)
    
    #build the tuned model
    #edit the parameters here according to your defined parameter space and model's grid
    rf = RandomForestClassifier(max_features=tuned_params['max_features'],
                        max_depth=tuned_params['max_depth'],
                        min_samples_split=tuned_params['min_samples_split'],
                        min_samples_leaf=tuned_params['min_samples_leaf'],
                        bootstrap=tuned_params['bootstrap'],
                        n_estimators=tuned_params['n_estimators'])
    
    #fit the built model
    rf.fit(Final_Xtrain,Final_Ytrain)
    models.append(rf)

    #Training Predictions for the model
    y_train_pred=rf.predict(Final_Xtrain)
    y_train_prob=rf.predict_proba(Final_Xtrain)

    #Save training metrics (column 1 of predict_proba is the score of the second class)
    Train_Fold_outs.append(Scoring_metrices(y_train_pred,y_train_prob[:,1],Final_Ytrain,'Training'))

    #Testing Predictions for the model
    y_test_pred=rf.predict(x_test_filtered)
    y_test_prob=rf.predict_proba(x_test_filtered)

    #Save testing metrics
    Test_Fold_outs.append(Scoring_metrices(y_test_pred,y_test_prob[:,1],y_test_filtered,'Testing'))

In [None]:
# Analyse the training metrics: one row per tuning iteration, sorted by descending Training Accuracy.
pd.DataFrame.from_dict(Train_Fold_outs).sort_values(by=['Training Accuracy:'],ascending = False)

In [None]:
# Analyse the testing metrics: one row per tuning iteration, sorted by descending Testing Accuracy.
pd.DataFrame.from_dict(Test_Fold_outs).sort_values(by=['Testing Accuracy:'],ascending = False)

In [None]:
# View the best parameters: one row per tuning iteration, one column per estimator parameter.
pd.DataFrame.from_dict(Best_params)

In [None]:
# Number of features selected for the chosen (top performing / most stable) model.
# Index 1 is used here, i.e. the second iteration of the tuning loop; change it to match the chosen run.
len(features[1])

In [None]:
# Save the feature names of the chosen run (index 1) as a one-column CSV under the absolute /HPTuning/ path.
pd.DataFrame(features[1]).to_csv('/HPTuning/anti_prol_features_rf.csv',index=False)

In [None]:
# Save the best parameters of the chosen run (index 1) as a one-row CSV with a header.
# newline='' is what the csv module expects for files it writes; without it,
# extra blank rows appear on platforms that translate line endings.
with open('/HPTuning/anti_prol_best_params_RF.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, Best_params[1].keys())
    w.writeheader()
    w.writerow(Best_params[1])

# 20 Fold Boosting


In [None]:
#Randomly split the data into two class-wise sampled parts for each fold
def Test_valid_split(Set3,frac,seed):
    """Split `Set3` into a sampled part and its complement, sampling each class separately.

    Parameters
    ----------
    Set3 : pandas.DataFrame
        Data with a 'status' column holding 0/1 labels.
    frac : float
        Fraction of each class to put into the first returned frame.
    seed : int
        Random seed used for sampling both classes.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(Test, Valid)``: ``Test`` holds the sampled rows (class 1 rows first, then
        class 0 rows) and ``Valid`` holds all remaining rows in their original order.
        Both keep the original index labels and column dtypes.
    """
    # The same seed is used for both classes so that the whole split changes from fold to fold.
    positives = Set3[Set3['status']==1].sample(frac = frac,random_state=seed)
    negatives = Set3[Set3['status']==0].sample(frac = frac,random_state=seed)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    Test = pd.concat([positives, negatives])
    # A boolean mask keeps the column dtypes and avoids a quadratic membership test.
    Valid = Set3.loc[~Set3.index.isin(Test.index)]
    print('Test set size:',len(Test),'\nValid set size:',len(Valid))
    return Test,Valid

In [None]:
f_list=features[1] #feature list of the selected (hyperparameter tuned) model
Train_Fold_outs_1=[]
Test_Fold_outs_1=[]
models_1=[]

# Refit the model with fixed hyperparameters on 20 random splits of Data and
# record training/testing metrics for each, to gauge its stability.
for i in range(20):
    print('Fold #',i)
    
    #Train-test split randomly: 90% of each class goes to Trn, the rest to Tst
    Trn,Tst = Test_valid_split(Data,0.90,i)
    Train_y, Test_y = Trn['status'],Tst['status']

    #Use selected feature list
    Train_x = Trn[f_list]
    Test_x = Tst[f_list]
    
    x_train_filtered = Train_x.values
    x_test_filtered = Test_x.values
    y_train_filtered = Train_y.values.ravel()
    y_test_filtered = Test_y.values.ravel()
    
    #Upsampling (Smote takes the features, the labels and the proportion)
    Final_Xtrain,Final_Ytrain = Smote(Train_x,Train_y,0.5)
    
    Final_Ytrain=Final_Ytrain.values.ravel()
    Final_Xtrain = pd.DataFrame(Final_Xtrain, dtype = np.float64)
    x_test_filtered = pd.DataFrame(x_test_filtered, dtype = np.float64)

    #Use the best parameters from the chosen model here
    rf = RandomForestClassifier(bootstrap= True,
                                ccp_alpha= 0.0,
                                class_weight= None,
                                criterion= 'gini',
                                max_depth= 80,
                                # 'auto' meant 'sqrt' for classifiers and was removed in scikit-learn 1.3
                                max_features= 'sqrt',
                                max_leaf_nodes= None,
                                max_samples= None,
                                min_impurity_decrease= 0.0,
                                min_samples_leaf= 1,
                                min_samples_split= 10,
                                min_weight_fraction_leaf= 0.0,
                                n_estimators= 45,
                                n_jobs= None,
                                oob_score= False,
                                random_state= None,
                                verbose= 0,
                                warm_start= False)
    
    #Fit the model
    rf.fit(Final_Xtrain,Final_Ytrain)
    models_1.append(rf)
    
    #Training prediction and saving the metrics
    y_train_pred=rf.predict(Final_Xtrain)
    y_train_prob=rf.predict_proba(Final_Xtrain)
    Train_Fold_outs_1.append(Scoring_metrices(y_train_pred,y_train_prob[:,1],Final_Ytrain,'Training'))

    #Testing prediction and saving the metrics
    y_test_pred=rf.predict(x_test_filtered)
    y_test_prob=rf.predict_proba(x_test_filtered)
    Test_Fold_outs_1.append(Scoring_metrices(y_test_pred,y_test_prob[:,1],y_test_filtered.astype('int'),'Testing'))

In [None]:
# Visualize the stability of the tuned model: box plot of each testing metric across the 20 splits.
(pd.DataFrame(Test_Fold_outs_1)).boxplot(grid=False,rot=45)

In [None]:
# Visualize the stability of the tuned model: box plot of each training metric across the 20 splits.
(pd.DataFrame(Train_Fold_outs_1)).boxplot(grid=False,rot=45)

In [None]:
# Testing metrics of the stability run, one row per split.
pd.DataFrame(Test_Fold_outs_1)

In [None]:
# Training metrics of the stability run, one row per split.
pd.DataFrame(Train_Fold_outs_1)

# Training on Whole Data 

In [None]:
# Feature table built from the whole data set: every column except 'smiles' and the 'status' label.
TRAIN = Data.drop(['smiles','status'],axis=1)
TRAIN

In [None]:
# Keep only the features selected in the chosen tuning run (index 1 of `features`).
TRAIN = TRAIN[features[1]] #Use features of the selected (hyperparameter tuned) model here
TRAIN

In [None]:
# Labels for the whole data set.
Y = Data['status']

In [None]:
# Fit the model stored for tuning run index 1 on the whole data (selected features only).
# This refits the object held in `models[1]`; no resampling is applied in this cell.
fitted = models[1].fit(TRAIN,Y)
fitted

In [None]:
# Save the final fitted model with joblib under the absolute /HPTuning/ path.
joblib.dump(fitted, '/HPTuning/anti_prol_model_rf.pkl')

# MK Ensemble (Gradient Boosting Classifier)

## Hyperparameter tuning

In [None]:
def HPTing_Randomsearch(Train_x, Train_y):
    """Randomized hyperparameter search for a gradient boosting classifier.

    Evaluates 100 sampled candidates with 3-fold CV, scored by ROC AUC.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``best_params_``, ``best_estimator_`` ...).
    """
    # Imported here because the notebook's top-level imports do not include it.
    from sklearn.ensemble import GradientBoostingClassifier

    n_estimators = [int(x) for x in np.linspace(start = 2, stop = 200, num = 20)]
    learning_rate = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
    min_samples_split = list(range(2,40))
    min_samples_leaf = list(range(1,30))
    max_depth = list(range(1,15))
    # 'auto' meant 'sqrt' for this classifier and was removed in scikit-learn 1.3.
    max_features = ['sqrt']
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}
    gbc=GradientBoostingClassifier()
    gsearch1 = RandomizedSearchCV(estimator = gbc, param_distributions = random_grid, n_iter = 100, verbose=2, scoring='roc_auc', n_jobs=-1, cv=3) 
    # Fit on the arguments, not on the notebook globals x_Train / y_Train.
    gsearch1.fit(Train_x, Train_y)
    return gsearch1


In [None]:
# GradientBoostingClassifier is not among the notebook's top-level imports, so import it here.
from sklearn.ensemble import GradientBoostingClassifier

f=0
gbc_Train_Fold_outs=[]
gbc_Test_Fold_outs=[]
gbc_DP4NC_Fold_outs=[]
gbc_Best_params=[]
gbc_models=[]

# Short names for the property columns whose names end in '_1'.
short_names = {'GInstability_1':'GI','Apoptosis_1':'Apo','Electrophile_1':'Elec','Proliferation_1':'Prf','Oxidative_1':'Oxd','Epigenetics_1':'Epig'}

# 20 rounds: new 85/15 split (seeded by the fold number), randomized search,
# refit with the best parameters, then score on the training and testing parts.
for i in range(20):
    f+=1
    print('Fold #',f)
    Train_x, Test_x, y_Train, y_Test = train_test_split(Data,Data["status"] ,test_size=0.15, random_state=f)
    Test_x = Test_x.filter(regex='_1$').rename(columns=short_names)
    Train_x = Train_x.filter(regex='_1$').rename(columns=short_names)
    # NOTE(review): Fetaure_combos is not defined in the visible part of the notebook.
    x_Train=Fetaure_combos(Train_x.copy(),1).apply(pd.to_numeric)
    x_Test=Fetaure_combos(Test_x.copy(),1).apply(pd.to_numeric)
    y_Train=y_Train.apply(int)
    y_Test=y_Test.apply(int)
    # No resampling is applied here; x_Train / x_Test are already numeric.
    Final_Xtrain=x_Train
    Final_Ytrain=y_Train.values.ravel()
    Parametrs = HPTing_Randomsearch(Final_Xtrain,Final_Ytrain)
    best = Parametrs.best_params_
    gbc_Best_params.append(best)
    model_gbc = GradientBoostingClassifier(learning_rate= best['learning_rate'],
                                      max_depth= best['max_depth'],
                                      max_features= best['max_features'],
                                      min_samples_leaf= best['min_samples_leaf'],
                                      min_samples_split= best['min_samples_split'],
                                      n_estimators =best['n_estimators'])
    model_gbc.fit(Final_Xtrain,Final_Ytrain)
    gbc_models.append(model_gbc)
    y_train_pred=model_gbc.predict(Final_Xtrain)
    y_train_prob=model_gbc.predict_proba(Final_Xtrain)
    gbc_Train_Fold_outs.append(Scoring_metrices(y_train_pred,y_train_prob[:,1],Final_Ytrain,'Training'))
    y_test_pred=model_gbc.predict(x_Test)
    y_test_prob=model_gbc.predict_proba(x_Test)
    gbc_Test_Fold_outs.append(Scoring_metrices(y_test_pred,y_test_prob[:,1],y_Test,'Testing'))
    # NOTE(review): DP4NC, Test and Tst_x are not defined in the visible part of the notebook.
    gbc_DP4NC_Fold_outs.append(DP4NC(model_gbc,Test,Tst_x,'Fold'+str(f)))


## Ensemble models (20)

In [None]:
# GradientBoostingClassifier is not among the notebook's top-level imports, so import it here.
from sklearn.ensemble import GradientBoostingClassifier

f=0
Cut5_Train_Fold_outs=[]
Cut5_Test_Fold_outs=[]
Cut5_DP4NC_Fold_outs=[]
Cut5_models=[]

# Short names for the property columns whose names end in '_1'.
short_names = {'GInstability_1':'GI','Apoptosis_1':'Apo','Electrophile_1':'Elec','Proliferation_1':'Prf','Oxidative_1':'Oxd','Epigenetics_1':'Epig'}

# 20 rounds: new 90/10 split (seeded by the fold number), fit a gradient boosting
# model with fixed hyperparameters, then score on the training and testing parts.
for i in range(f,20):
    f+=1
    print('Fold #',f)
    Train_x, Test_x, y_Train, y_Test = train_test_split(Data,Data["status"] ,test_size=0.10, random_state=f)
    Test_x = Test_x.filter(regex='_1$').rename(columns=short_names)
    Train_x = Train_x.filter(regex='_1$').rename(columns=short_names)
    # NOTE(review): Fetaure_combos is not defined in the visible part of the notebook.
    x_Train=Fetaure_combos(Train_x.copy(),1).apply(pd.to_numeric)
    x_Test=Fetaure_combos(Test_x.copy(),1).apply(pd.to_numeric)
    y_Train=y_Train.apply(int)
    y_Test=y_Test.apply(int)
    # No resampling is applied here; x_Train / x_Test are already numeric.
    Final_Xtrain=x_Train
    Final_Ytrain=y_Train.values.ravel()
    model_gbc = GradientBoostingClassifier(n_estimators = 22,
                                       min_samples_split = 11,
                                       min_samples_leaf = 6,
                                       max_features = 'sqrt',
                                       max_depth = 5,
                                       learning_rate = 0.1)
    model_gbc.fit(Final_Xtrain,Final_Ytrain)
    Cut5_models.append(model_gbc)
    y_train_pred=model_gbc.predict(Final_Xtrain)
    y_train_prob=model_gbc.predict_proba(Final_Xtrain)
    Cut5_Train_Fold_outs.append(Scoring_metrices(y_train_pred,y_train_prob[:,1],Final_Ytrain,'Training'))
    y_test_pred=model_gbc.predict(x_Test)
    y_test_prob=model_gbc.predict_proba(x_Test)
    Cut5_Test_Fold_outs.append(Scoring_metrices(y_test_pred,y_test_prob[:,1],y_Test,'Testing'))
    # NOTE(review): DP4NC, Test and Tst_x are not defined in the visible part of the notebook.
    Cut5_DP4NC_Fold_outs.append(DP4NC(model_gbc,Test,Tst_x,'Fold'+str(f)))