# Importing Libraries

In [None]:
#import default and other essential libraries
import os
import numpy as np
import pandas as pd
import sklearn
import joblib
import pickle
import csv
import sys
import random
import seaborn as sns
from functools import reduce
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
import tensorflow as tf

#Packages to split data and other preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.decomposition import PCA 
from boruta import BorutaPy
from imblearn.pipeline import Pipeline as sample_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE

#Import Classifiers
from sklearn import svm
import smote_variants as sv
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

## Package for calculating accuracy and analysis
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, cohen_kappa_score, f1_score, precision_score, recall_score, matthews_corrcoef 
from sklearn.model_selection import StratifiedKFold
from lime.lime_tabular import LimeTabularExplainer

# Working Directory

In [None]:
# set working directory
# NOTE(review): hard-coded absolute path; the relative 'Predictions/...' output
# paths written later in this notebook resolve against this directory.
os.chdir('/home/vishakhag/Projects/Metabokiller_Revision/Sakshi_Electrophiles/')

# Data Loading

In [None]:
# Load the preprocessed feature file.
# Later cells read the 'smiles' and 'status' columns of this frame.
data = pd.read_csv(r'/home/vishakhag/Projects/Metabokiller_Revision/Sakshi_Electrophiles/PreProcessed/sign_electrophile_latest_preprocessed.csv') ### features in columns,molecules in rows
data

In [None]:
 # Drop rows whose 'smiles' value repeats, keeping the first occurrence of each
data.drop_duplicates(subset ="smiles", keep = 'first', inplace = True, ignore_index = True) # modifies `data` in place; with inplace = False a new frame is returned instead
data

In [None]:
#Drop smiles column from the data
data=data.drop('smiles', axis=1)
data

In [None]:
# look at class imbalance
data['status'].value_counts()

# Train-Test Split

In [None]:
# Split Data: 25% of the rows go to the test set, seed fixed at 1.
# X_train / X_test still contain the 'status' label column at this point;
# it is dropped in the following cell.
# NOTE(review): no `stratify=` is passed — confirm an unstratified split is
# intended given the class-imbalance check above.
X_train, X_test,y_train,y_test = train_test_split(data,data["status"] ,test_size=0.25, random_state=1)

In [None]:
train_df_new = X_train.drop('status', axis=1)
valid_df_new = X_test.drop('status', axis=1)

# t-SNE Plot

In [None]:
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show it as a scatter plot.

    Args:
        data: Feature matrix handed to ``TSNE.fit_transform``.
        data_labels: Per-row values used to colour the points.
    """
    # random_state is fixed at 50 for the embedding.
    coords = np.array(TSNE(n_components=2, random_state=50).fit_transform(data))
    plt.scatter(coords[:, 0], coords[:, 1], c=data_labels)
    plt.show()

In [None]:
TSNE_plot(train_df_new,y_train)

In [None]:
y_train.value_counts()

# Boruta

In [None]:
#### Plain NumPy inputs for Boruta (feature matrices and flattened label vectors)
features = [col for col in train_df_new.columns if col != 'status']
X_train_boruta, X_test_boruta = (frame[features].values for frame in (train_df_new, valid_df_new))
Y_train_boruta, Y_test_boruta = (labels.values.ravel() for labels in (y_train, y_test))

In [None]:
X_train_boruta

In [None]:
### implementing boruta
# Random forest used by Boruta as its estimator: all cores,
# class_weight='balanced', trees limited to depth 5
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# fit Boruta on the training split only
feat_selector.fit(X_train_boruta, Y_train_boruta)

# selection mask over the input features (bare expression, not stored)
feat_selector.support_

# check ranking of features (bare expression, not stored)
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_train_filtered = feat_selector.transform(X_train_boruta)
X_test_filtered = feat_selector.transform(X_test_boruta)

In [None]:
### name of the features selected####
# Positions of the selected features. Indexing the result of np.where directly
# (instead of walking it with np.nditer) also works when Boruta selects no
# feature at all: np.nditer raises ValueError on a zero-sized operand.
indexes = np.where(feat_selector.support_)
final_features = [features[i] for i in indexes[0]]
print(final_features)

In [None]:
len(final_features)

In [None]:
# Use selected features in the training-testing data
X_train_filtered=pd.DataFrame(X_train_filtered,columns=final_features)
X_test_filtered=pd.DataFrame(X_test_filtered,columns=final_features)

In [None]:
TSNE_plot(X_train_filtered,Y_train_boruta)

In [None]:
X_train_filtered

In [None]:
Y_train_boruta = pd.Series(Y_train_boruta)

In [None]:
Y_train_boruta

In [None]:
#save final feature names 
pd.DataFrame(final_features).to_csv('Predictions/final_features_boruta.csv',index=False)

# Upsampling 

In [None]:
def Smote(traindata,trainlabel,prop):
    """Oversample the training data with MSMOTE, plot it, and return DataFrames.

    Args:
        traindata: Feature DataFrame; its column names are copied to the result.
        trainlabel: Labels aligned with `traindata` (must expose ``.values``).
        prop: Passed to ``sv.MSMOTE`` as ``proportion``.

    Returns:
        Tuple ``(features, labels)``, both as pandas DataFrames.
    """
    sampler = sv.MSMOTE(proportion=prop, random_state=50)
    resampled_X, resampled_y = sampler.sample(traindata.values, trainlabel.values)
    # Visual check of the resampled set before it is wrapped in DataFrames.
    TSNE_plot(resampled_X, resampled_y)
    feature_frame = pd.DataFrame(resampled_X)
    feature_frame.columns = list(traindata.columns.values)
    return feature_frame, pd.DataFrame(resampled_y)
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show a labelled scatter plot.

    Args:
        data: Feature matrix handed to ``TSNE.fit_transform``.
        data_labels: Per-row numeric class values used to colour the points.
    """
    tsne = TSNE(n_components=2, random_state=50)
    embedding = np.asarray(tsne.fit_transform(data))
    points = plt.scatter(embedding[:, 0], embedding[:, 1], c=data_labels)
    # Build the legend from the scatter's own colour mapping. A bare
    # plt.legend() finds no labelled artists here, so it would only emit a
    # warning and draw an empty legend.
    handles, labels = points.legend_elements()
    plt.legend(handles, labels, loc="lower right", title="Class")
    plt.show()

In [None]:
X_train_filtered,Y_train_boruta=Smote(X_train_filtered,Y_train_boruta,0.5)

In [None]:
Y_train_boruta.value_counts()

# Down Sampling (For genomic instability)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
def Smote(traindata,trainlabel):
    """Randomly undersample the majority class, plot the result, return DataFrames.

    This definition replaces the three-argument ``Smote`` defined earlier in
    the notebook.

    Args:
        traindata: Feature DataFrame; its column names are copied to the result.
        trainlabel: Labels aligned with `traindata` (must expose ``.values``).

    Returns:
        Tuple ``(features, labels)``, both as pandas DataFrames.
    """
    sampler = RandomUnderSampler(sampling_strategy='majority')
    features_res, labels_res = sampler.fit_resample(traindata.values, trainlabel.values)
    # Visual check of the resampled set before it is wrapped in DataFrames.
    TSNE_plot(features_res, labels_res)
    balanced_X = pd.DataFrame(features_res)
    balanced_X.columns = list(traindata.columns.values)
    return balanced_X, pd.DataFrame(labels_res)
def TSNE_plot(data,data_labels):
    """Embed `data` in two dimensions with t-SNE and show a labelled scatter plot.

    Args:
        data: Feature matrix handed to ``TSNE.fit_transform``.
        data_labels: Per-row numeric class values used to colour the points.
    """
    tsne = TSNE(n_components=2, random_state=50)
    embedding = np.asarray(tsne.fit_transform(data))
    points = plt.scatter(embedding[:, 0], embedding[:, 1], c=data_labels)
    # Build the legend from the scatter's own colour mapping. A bare
    # plt.legend() finds no labelled artists here, so it would only emit a
    # warning and draw an empty legend.
    handles, labels = points.legend_elements()
    plt.legend(handles, labels, loc="lower right", title="Class")
    plt.show()

In [None]:
X_train_filtered,Y_train_boruta=Smote(X_train_filtered,Y_train_boruta)

In [None]:
Y_train_boruta.value_counts()

# **Model Training and Testing**

---



## Random Forest

In [None]:
########## randomized hyper-parameter search for Random Forest
def RandomForest_GridSearch(self=None):
    """Build an unfitted RandomizedSearchCV over RandomForestClassifier settings.

    Despite the name this is a randomized search: 100 parameter settings are
    sampled from the grid below and evaluated with 5-fold cross-validation.

    Args:
        self: Unused. Kept (now optional) so existing calls that pass a
            placeholder argument keep working.

    Returns:
        The configured, not yet fitted, ``RandomizedSearchCV`` object.
    """
    tree_depths = [int(x) for x in np.linspace(10, 110, num = 11)]
    tree_depths.append(None)  # None lets trees grow without a depth limit
    random_grid = {
        'n_estimators': [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)],
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
        # scikit-learn >= 1.3, so the grid offers 'sqrt' and 'log2' instead.
        'max_features': ['sqrt', 'log2'],
        'max_depth': tree_depths,
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    rf = RandomForestClassifier()
    return RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=50, n_jobs = -1)
# The builder never uses its argument; the function object passed here is only a placeholder.
z_rf=RandomForest_GridSearch(RandomForest_GridSearch)
# Run the search on the filtered training data
z_rf.fit(X_train_filtered,Y_train_boruta)
z_rf.best_params_  

In [None]:
# Build a fresh forest from the six tuned hyper-parameters and fit it on the training set
model_rf = RandomForestClassifier(**{
    name: z_rf.best_params_[name]
    for name in ('bootstrap', 'max_depth', 'max_features',
                 'min_samples_leaf', 'min_samples_split', 'n_estimators')
})
model_rf.fit(X_train_filtered,Y_train_boruta)

In [None]:
predict_test_label_rf = model_rf.predict(X_test_filtered)#test
predict_train_label_rf = model_rf.predict(X_train_filtered)
preds_rf=model_rf.predict_proba(X_test_filtered)

In [None]:
preds_rf_train=model_rf.predict_proba(X_train_filtered)

In [None]:
pd.DataFrame(preds_rf_train).to_csv('Predictions/RF/train_preds_rf.csv')

In [None]:
pd.DataFrame(predict_test_label_rf).to_csv('Predictions/RF/predict_test_label_rf.csv')
pd.DataFrame(predict_train_label_rf).to_csv('Predictions/RF/predict_train_label_rf.csv')
pd.DataFrame(preds_rf).to_csv('Predictions/RF/preds_rf.csv')

In [None]:
pd.DataFrame(Y_test_boruta).to_csv('Predictions/Y_test_boruta.csv')

In [None]:
pd.DataFrame(Y_train_boruta).to_csv('Predictions/Y_train_boruta.csv')

Testing Reports

In [None]:
# Train/test metrics for the random forest; AUC uses column 1 of predict_proba.
# The other report cells in this notebook reuse these *_rf result names.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_rf))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_rf))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_rf))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_rf, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_rf[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_rf)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_rf))

## MLP

In [None]:
########## grid search method MLP
def MLP_classifier_Gridsearch(self):
    """Return an unfitted GridSearchCV over MLPClassifier settings.

    The search uses 5-fold cross-validation scored with 'f1'.

    Args:
        self: Unused placeholder argument.
    """
    layer_options = [(5,5,5),(20,30,50),(50,50,50), (50,100,50), (100,),(100,100,100),(5,2)]
    parameter_space = dict(
        hidden_layer_sizes=layer_options,
        activation=['tanh', 'relu'],
        solver=['sgd', 'adam'],
        alpha=[0.0001, 0.05,0.001,0.01],
        learning_rate=['constant','adaptive'],
    )
    base_mlp = MLPClassifier(max_iter=1000,random_state=50)
    return GridSearchCV(base_mlp, parameter_space, n_jobs=-1, cv=5,scoring='f1',verbose=2)

# The builder never uses its argument; X_train_filtered is passed only as a placeholder.
z_mlp=MLP_classifier_Gridsearch(X_train_filtered)
# Run the grid search on the filtered training data
z_mlp.fit(X_train_filtered,Y_train_boruta)
z_mlp.best_params_ 

In [None]:
# Refit an MLP with the tuned hyper-parameters. max_iter and random_state are
# set to the values the grid search evaluated with (max_iter=1000,
# random_state=50), so the final model is trained under the conditions the
# parameters were selected for. The defaults would stop after 200 iterations
# and use an unseeded initialisation.
model_mlp = MLPClassifier(max_iter=1000, random_state=50,
 activation=z_mlp.best_params_['activation'],
 alpha= z_mlp.best_params_['alpha'],
 hidden_layer_sizes= z_mlp.best_params_['hidden_layer_sizes'],
 learning_rate= z_mlp.best_params_['learning_rate'],
 solver=z_mlp.best_params_['solver'])
model_mlp.fit(X_train_filtered,Y_train_boruta)

In [None]:
predict_test_label_mlp = model_mlp.predict(X_test_filtered)#test
predict_train_label_mlp = model_mlp.predict(X_train_filtered)
preds_mlp=model_mlp.predict_proba(X_test_filtered)

In [None]:
train_preds_mlp=model_mlp.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_mlp).to_csv('Predictions/MLP/train_preds_mlp.csv')

In [None]:
pd.DataFrame(predict_test_label_mlp).to_csv('Predictions/MLP/predict_test_label_mlp.csv')
pd.DataFrame(predict_train_label_mlp).to_csv('Predictions/MLP/predict_train_label_mlp.csv')
pd.DataFrame(preds_mlp).to_csv('Predictions/MLP/preds_mlp.csv')

Testing Reports

In [None]:
# Train/test metrics for the MLP; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_mlp))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_mlp))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_mlp))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_mlp, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_mlp[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_mlp)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_mlp))

## SVM 

In [None]:
import scipy
from sklearn.svm import SVC
# def SVM_classifier_RandomizedSearch(self):
#         parameter_space = {'kernel':('linear', 'rbf'), "C": scipy.stats.expon(scale=.01), 
#                            "gamma": scipy.stats.expon(scale=.01)}
#         svc_rand = SVC(probability=True)
#         svc_rs = RandomizedSearchCV(svc_rand, parameter_space, n_iter = 100, n_jobs = -1, cv = 3, random_state = 2017)
#         return svc_rs

# z_svm=SVM_classifier_RandomizedSearch(X_train_filtered)
# z_svm.fit(X_train_filtered,Y_train_boruta)
# z_svm.best_params_ 

In [None]:
# svm_model = SVC(kernel = z_svm.best_params_['kernel'], C = z_svm.best_params_['C'], gamma = z_svm.best_params_['gamma'],probability=True).fit(X_train_filtered,Y_train_boruta)

In [None]:
svm_model = SVC(probability=True).fit(X_train_filtered,Y_train_boruta)

In [None]:
predict_test_label_svm = svm_model.predict(X_test_filtered)#test
predict_train_label_svm = svm_model.predict(X_train_filtered)
preds_svm=svm_model.predict_proba(X_test_filtered)

In [None]:
train_preds_svm=svm_model.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_svm).to_csv('Predictions/SVM/train_preds_svm.csv')

In [None]:
pd.DataFrame(predict_test_label_svm).to_csv('Predictions/SVM/predict_test_label_svm.csv')
pd.DataFrame(predict_train_label_svm).to_csv('Predictions/SVM/predict_train_label_svm.csv')
pd.DataFrame(preds_svm).to_csv('Predictions/SVM/preds_svm.csv')

Testing Reports

In [None]:
# Train/test metrics for the SVM; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_svm))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_svm))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_svm))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_svm, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_svm[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_svm)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_svm))

## XG Boost

In [None]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [None]:
# Base XGBoost classifier handed to the randomized search.
# `silent` was deprecated in favour of `verbosity` and is no longer recognised
# by current XGBoost releases; verbosity=0 keeps the classifier quiet.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, nthread=1)

In [None]:
folds = 5
param_comb = 5

# Stratified, shuffled k-fold splitter with a fixed seed
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Randomized search: `param_comb` sampled settings scored by ROC AUC; `cv` receives the splits produced by `skf`
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train_filtered,Y_train_boruta), verbose=3, random_state=1001 )

# Run the search. The earlier remarks here referred to a "start_time" timer;
# no such variable is visible in this part of the notebook.
z_xgb=random_search.fit(X_train_filtered,Y_train_boruta)

z_xgb.best_params_ 

In [None]:
# Final XGBoost model: the fixed settings of the search estimator plus the five
# tuned parameters (max_depth, min_child_weight, colsample_bytree, gamma, subsample).
# `silent` was deprecated in favour of `verbosity` and is no longer recognised
# by current XGBoost releases; verbosity=0 keeps the classifier quiet.
model_xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, nthread=1, max_depth=z_xgb.best_params_['max_depth'], min_child_weight=z_xgb.best_params_['min_child_weight'],colsample_bytree=z_xgb.best_params_['colsample_bytree'],gamma=z_xgb.best_params_['gamma'],subsample=z_xgb.best_params_['subsample'])
model_xgb.fit(X_train_filtered,Y_train_boruta)

In [None]:
predict_test_label_xgb = model_xgb.predict(X_test_filtered)#test
predict_train_label_xgb = model_xgb.predict(X_train_filtered)
preds_xgb=model_xgb.predict_proba(X_test_filtered)

In [None]:
train_preds_xgb = model_xgb.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_xgb).to_csv('Predictions/XGB/train_preds_xgb.csv')

In [None]:
pd.DataFrame(predict_test_label_xgb).to_csv('Predictions/XGB/predict_test_label_xgb.csv')
pd.DataFrame(predict_train_label_xgb).to_csv('Predictions/XGB/predict_train_label_xgb.csv')
pd.DataFrame(preds_xgb).to_csv('Predictions/XGB/preds_xgb.csv')

Testing Reports

In [None]:
# Train/test metrics for XGBoost; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_xgb))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_xgb))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_xgb))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_xgb, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_xgb[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_xgb)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_xgb))

## KNN Classifier

In [None]:
knn = KNeighborsClassifier(n_neighbors = 4).fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_knn = knn.predict(X_test_filtered)#test
predict_train_label_knn = knn.predict(X_train_filtered)
preds_knn=knn.predict_proba(X_test_filtered)

In [None]:
train_preds_knn = knn.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_knn).to_csv('Predictions/KNN/train_preds_knn.csv')

In [None]:
pd.DataFrame(predict_test_label_knn).to_csv('Predictions/KNN/predict_test_label_knn.csv')
pd.DataFrame(predict_train_label_knn).to_csv('Predictions/KNN/predict_train_label_knn.csv')
pd.DataFrame(preds_knn).to_csv('Predictions/KNN/preds_knn.csv')

Testing Reports

In [None]:
# Train/test metrics for KNN; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_knn))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_knn))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_knn))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_knn, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_knn[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_knn)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_knn))

## Gaussian Naive Bayes

In [None]:
gnb = GaussianNB().fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_gnb = gnb.predict(X_test_filtered)#test
predict_train_label_gnb = gnb.predict(X_train_filtered)
preds_gnb=gnb.predict_proba(X_test_filtered)

In [None]:
pd.DataFrame(predict_test_label_gnb).to_csv('Predictions/GNB/predict_test_label_gnb.csv')
pd.DataFrame(predict_train_label_gnb).to_csv('Predictions/GNB/predict_train_label_gnb.csv')
pd.DataFrame(preds_gnb).to_csv('Predictions/GNB/preds_gnb.csv')

In [None]:
train_preds_gnb = gnb.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_gnb).to_csv('Predictions/GNB/train_preds_gnb.csv')

Testing Reports

In [None]:
# Train/test metrics for Gaussian naive Bayes; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_gnb))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_gnb))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_gnb))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_gnb, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_gnb[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_gnb)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_gnb))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

# Fit one small gradient-boosting model per learning rate and print its accuracies.
# NOTE(review): the "validation" score is computed on X_test_filtered / Y_test_boruta,
# the same hold-out split the final test metrics are reported on.
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    ).fit(X_train_filtered, Y_train_boruta)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_filtered, Y_train_boruta)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_filtered, Y_test_boruta)))

In [None]:
# Final gradient-boosting model. learning_rate is hard-coded; presumably it
# should be set to the rate that scored best in the sweep above — verify
# against that output when the data changes.
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_gbc = gb_clf2.predict(X_test_filtered)#test
predict_train_label_gbc  = gb_clf2.predict(X_train_filtered)
preds_gbc =gb_clf2.predict_proba(X_test_filtered)

In [None]:
pd.DataFrame(predict_test_label_gbc).to_csv('Predictions/GBC/predict_test_label_gbc.csv')
pd.DataFrame(predict_train_label_gbc).to_csv('Predictions/GBC/predict_train_label_gbc.csv')
pd.DataFrame(preds_gbc).to_csv('Predictions/GBC/preds_gbc.csv')

In [None]:
train_preds_gbc = gb_clf2.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_gbc).to_csv('Predictions/GBC/train_preds_gbc.csv')

Testing Reports

In [None]:
# Train/test metrics for gradient boosting; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_gbc))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_gbc))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_gbc))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_gbc, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_gbc[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_gbc)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_gbc))

## Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
sgd = SGDClassifier(loss = 'modified_huber', shuffle = True, random_state= 101).fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_sgd = sgd.predict(X_test_filtered)#test
predict_train_label_sgd = sgd.predict(X_train_filtered)
preds_sgd=sgd.predict_proba(X_test_filtered)

In [None]:
pd.DataFrame(predict_test_label_sgd).to_csv('Predictions/SGD/predict_test_label_sgd.csv')
pd.DataFrame(predict_train_label_sgd).to_csv('Predictions/SGD/predict_train_label_sgd.csv')
pd.DataFrame(preds_sgd).to_csv('Predictions/SGD/preds_sgd.csv')

In [None]:
train_preds_sgd = sgd.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_sgd).to_csv('Predictions/SGD/train_preds_sgd.csv')

Testing Reports

In [None]:
# Train/test metrics for the SGD classifier; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_sgd))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_sgd))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_sgd))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_sgd, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_sgd[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_sgd)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_sgd))

## Logistic Regression

In [None]:
lr=LogisticRegression(max_iter=1000)
lr.fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_lr = lr.predict(X_test_filtered)#test
predict_train_label_lr = lr.predict(X_train_filtered)
preds_lr=lr.predict_proba(X_test_filtered)

In [None]:
pd.DataFrame(predict_test_label_lr).to_csv('Predictions/LR/predict_test_label_lr.csv')
pd.DataFrame(predict_train_label_lr).to_csv('Predictions/LR/predict_train_label_lr.csv')
pd.DataFrame(preds_lr).to_csv('Predictions/LR/preds_lr.csv')

In [None]:
train_preds_lr = lr.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_lr).to_csv('Predictions/LR/train_preds_lr.csv')

Testing Reports

In [None]:
# Train/test metrics for logistic regression; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_lr))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_lr))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_lr))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_lr, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_lr[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_lr)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_lr))

## Extra Tree Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

In [None]:
clf = ExtraTreesClassifier(n_estimators=100, random_state=101)
clf.fit(X_train_filtered, Y_train_boruta)

In [None]:
predict_test_label_et = clf.predict(X_test_filtered)#test
predict_train_label_et = clf.predict(X_train_filtered)
preds_et=clf.predict_proba(X_test_filtered)

In [None]:
pd.DataFrame(predict_test_label_et).to_csv('Predictions/ET/predict_test_label_et.csv')
pd.DataFrame(predict_train_label_et).to_csv('Predictions/ET/predict_train_label_et.csv')
pd.DataFrame(preds_et).to_csv('Predictions/ET/preds_et.csv')

In [None]:
train_preds_et = clf.predict_proba(X_train_filtered)
pd.DataFrame(train_preds_et).to_csv('Predictions/ET/train_preds_et.csv')

Testing Reports

In [None]:
# Train/test metrics for the extra-trees classifier; AUC uses column 1 of predict_proba.
# The *_rf result names are shared with the other report cells and are overwritten here.
print("Training Accuracy :",accuracy_score(Y_train_boruta, predict_train_label_et))
print("Testing Accuracy:",accuracy_score(Y_test_boruta, predict_test_label_et))
print("MCC Score:",matthews_corrcoef(Y_test_boruta, predict_test_label_et))
print("F1 Score:",f1_score(Y_test_boruta, predict_test_label_et, average='macro'))
fpr_rf, tpr_rf, _ = roc_curve(Y_test_boruta,preds_et[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
print("AUC VALUE:",roc_auc_rf)
kappa_rf=cohen_kappa_score(Y_test_boruta,predict_test_label_et)
print("kappa Score:",kappa_rf)

In [None]:
print(classification_report(Y_test_boruta, predict_test_label_et))

# External Validation (Testing)

In [None]:
#Preprocessing step for external validation datasets
def handle_missing_values(data):
    """Return a copy of `data` with missing numeric values imputed by column means.

    Infinite values and blank strings ("" and " ") are first turned into NaN.
    Every numeric column then has its NaN entries replaced by that column's
    mean. Non-numeric columns (for example names or SMILES strings) keep their
    NaN entries instead of making the mean computation fail.

    Args:
        data: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the same columns.
    """
    # infer_objects() restores a numeric dtype for columns that were stored as
    # objects only because they mixed numbers with blank strings.
    cleaned = data.replace([np.inf, -np.inf, "", " "], np.nan).infer_objects()
    numeric_cols = cleaned.select_dtypes(include="number").columns
    if len(numeric_cols):
        numeric_part = cleaned[numeric_cols]
        cleaned[numeric_cols] = numeric_part.fillna(numeric_part.mean())
    return cleaned

## Testing - FINAL Carcinogen Data - Combined

### Load Data

In [None]:
#load signaturizer feature files for external validation datasets
carc_data_1 = pd.read_csv('/home/vishakhag/Projects/Metabokiller_Revision/Feature_files/signaturizer_ISSCAN_FINAL_c_data.csv')
carc_data_1

In [None]:
data_1 = carc_data_1.drop('smiles', axis = 1)
data_1

### Data Preprocessing

In [None]:
# data preprocessing, handle missing values by imputation
data_1 = handle_missing_values(data_1)

In [None]:
data_1['status'].value_counts()

In [None]:
# Use only selected features for the external validation datasets
X_data_1 = data_1[final_features]

In [None]:
y_data_1 = data_1['status']

### Test the models' performances

In [None]:
# Fitted classifiers to evaluate on the external set, keyed by display name
models = {
    'Logistic Regression': lr,
    'Support Vector Machines': svm_model,
    'Gradient Boosting Classifier': gb_clf2,
    'Random Forest': model_rf,
    'Gaussian Naive Bayes': gnb,
    'K-Nearest Neighbor': knn,
    'Extra Tree Classifier': clf,
    'Stochastic Gradient Descent': sgd,
    'XG Boost': model_xgb,
    'MultiLayer Perceptron': model_mlp,
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# One {model name: value} dictionary per metric
accuracy, precision, recall , MCC , F1_score , AUC , kappa = {}, {}, {}, {}, {}, {}, {}

for key in models.keys():

    # Predicted labels and class probabilities on the external set
    predictions = models[key].predict(X_data_1)
    pred_probs = models[key].predict_proba(X_data_1)

    # AUC-ROC from column 1 of the probability output
    fpr, tpr, _ = roc_curve(y_data_1,pred_probs[:, 1])
    roc_auc = auc(fpr, tpr)

    # scikit-learn metrics take (y_true, y_pred). Precision and recall are not
    # symmetric: with the arguments reversed, each one reports the other's value.
    accuracy[key] = accuracy_score(y_data_1, predictions)
    precision[key] = precision_score(y_data_1, predictions)
    recall[key] = recall_score(y_data_1, predictions)
    MCC[key] = matthews_corrcoef(y_data_1, predictions)
    F1_score[key] = f1_score(y_data_1, predictions, average='macro')
    AUC[key] = roc_auc
    kappa[key] = sklearn.metrics.cohen_kappa_score(y_data_1,predictions)

In [None]:
# Gather the per-model metric dictionaries into one table (one row per model)
df_model = pd.DataFrame(
    index=models.keys(),
    columns=['Accuracy', 'Precision', 'Recall', 'MCC Score', 'F1 Score', 'AUC Value', 'kappa Score'],
).assign(**{
    'Accuracy': accuracy.values(),
    'Precision': precision.values(),
    'Recall': recall.values(),
    'MCC Score': MCC.values(),
    'F1 Score': F1_score.values(),
    'AUC Value': AUC.values(),
    'kappa Score': kappa.values(),
})

df_model

In [None]:
df_model = df_model.sort_values(by=['Accuracy'],ascending = False)
df_model

## Testing - Test Sets (4NC,DP)

### Load Data

In [None]:
#load signaturizer feature file for the test set molecules
carc_data_1 = pd.read_csv('/home/vishakhag/Projects/Metabokiller_Revision/TESTSETS/signaturizer_testdata.csv')
carc_data_1

In [None]:
data_1 = carc_data_1.drop(['smiles','name'], axis = 1)
data_1

### Data Preprocessing

In [None]:
data_1 = handle_missing_values(data_1)

In [None]:
data_1['status'].value_counts()

In [None]:
X_data_1 = data_1[final_features]

In [None]:
y_data_1 = data_1['status']

### Testing the models' performances

In [None]:
# Fitted classifiers to evaluate on the test-set molecules, keyed by display name
models = {
    'Logistic Regression': lr,
    'Support Vector Machines': svm_model,
    'Gradient Boosting Classifier': gb_clf2,
    'Random Forest': model_rf,
    'Gaussian Naive Bayes': gnb,
    'K-Nearest Neighbor': knn,
    'Extra Tree Classifier': clf,
    'Stochastic Gradient Descent': sgd,
    'XG Boost': model_xgb,
    'MultiLayer Perceptron': model_mlp,
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# One {model name: value} dictionary per metric / per-molecule output
accuracy, precision, recall , MCC , F1_score , AUC , kappa, pred_4NC, preds_DOPAC , probablity_4NC , probablity_DP = {}, {}, {}, {},{}, {}, {}, {}, {}, {}, {}

for key in models.keys():

    # Predicted labels and class probabilities on the test-set molecules
    predictions = models[key].predict(X_data_1)
    pred_probs = models[key].predict_proba(X_data_1)

    # AUC-ROC from column 1 of the probability output
    fpr, tpr, _ = roc_curve(y_data_1,pred_probs[:, 1])
    roc_auc = auc(fpr, tpr)

    # scikit-learn metrics take (y_true, y_pred). Precision and recall are not
    # symmetric: with the arguments reversed, each one reports the other's value.
    accuracy[key] = accuracy_score(y_data_1, predictions)
    precision[key] = precision_score(y_data_1, predictions)
    recall[key] = recall_score(y_data_1, predictions)
    MCC[key] = matthews_corrcoef(y_data_1, predictions)
    F1_score[key] = f1_score(y_data_1, predictions, average='macro')
    AUC[key] = roc_auc
    kappa[key] = sklearn.metrics.cohen_kappa_score(y_data_1,predictions)
    # Rows 0 and 1 are reported individually; presumably they are the 4NC and
    # DP molecules — verify against the row order of the test file.
    pred_4NC[key] = predictions[0]
    preds_DOPAC[key] = predictions[1]
    # Largest class probability of each of the two rows
    probablity_4NC[key]= np.max(pred_probs[0])
    probablity_DP[key]= np.max(pred_probs[1])

In [None]:
# Gather the per-model metrics and the two per-molecule outputs into one table (one row per model)
df_model = pd.DataFrame(
    index=models.keys(),
    columns=['Accuracy', 'Precision', 'Recall', 'MCC Score', 'F1 Score', 'AUC Value', 'kappa Score', 'Preds_4NC' , 'Preds_DP','Prob_4NC','Prob_DP'],
).assign(**{
    'Accuracy': accuracy.values(),
    'Precision': precision.values(),
    'Recall': recall.values(),
    'MCC Score': MCC.values(),
    'F1 Score': F1_score.values(),
    'AUC Value': AUC.values(),
    'kappa Score': kappa.values(),
    'Preds_4NC': pred_4NC.values(),
    'Preds_DP': preds_DOPAC.values(),
    'Prob_4NC': probablity_4NC.values(),
    'Prob_DP': probablity_DP.values(),
})

df_model

In [None]:
df_model = df_model.sort_values(by=['Accuracy'],ascending = False)
df_model

# **Hyperparameter Tuning**

---



# Upsampling

In [None]:
def Smote(traindata,trainlabel,prop,name):
    """Oversample the training data with smote_variants' MSMOTE.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Feature matrix; its column names are restored on the output.
    trainlabel : pandas.Series or pandas.DataFrame
        Labels aligned with ``traindata``.
    prop : float
        Forwarded to ``MSMOTE(proportion=...)``.
    name : str
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Resampled features and resampled labels (single-column frame).
    """
    # Fixed random_state so the resampling is repeatable
    sampler = sv.MSMOTE(proportion=prop, random_state=50)
    resampled_x, resampled_y = sampler.sample(traindata.values, trainlabel.values)

    X_samp = pd.DataFrame(resampled_x, columns=list(traindata.columns.values))
    y_samp = pd.DataFrame(resampled_y)
    return X_samp, y_samp

# DownSampling (Genomic Instability)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
def Smote_Under(traindata,trainlabel,random_state=None):
    """Randomly undersample the majority class.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Feature matrix; its column names are restored on the output.
    trainlabel : pandas.Series or pandas.DataFrame
        Labels aligned with ``traindata``.
    random_state : int or None, optional
        Seed forwarded to ``RandomUnderSampler``. The default ``None`` keeps
        the previous unseeded behaviour; pass an integer to make the
        selection of dropped rows repeatable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Resampled features and resampled labels (single-column frame).
    """
    # define undersample strategy: only the majority class is resampled
    undersample = RandomUnderSampler(sampling_strategy='majority',
                                     random_state=random_state)
    X_samp, y_samp = undersample.fit_resample(traindata.values, trainlabel.values)
    X_samp = pd.DataFrame(X_samp)
    y_samp = pd.DataFrame(y_samp)
    X_samp.columns = list(traindata.columns.values)
    return X_samp, y_samp

# Feature Selection (Boruta)

In [None]:
def Boruta_Filteration(X_train,y_train,X_test,y_test):
    """Select features with Boruta on the training split and apply them to both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; a ``'status'`` column, if present, is ignored.
    y_train, y_test : pandas.Series or pandas.DataFrame
        Labels for the two splits.

    Returns
    -------
    tuple
        ``(X_train_filtered, X_test_filtered, Y_train, Y_test, final_features)``
        where the two frames hold only the selected columns, the labels are
        flattened 1-D arrays and ``final_features`` lists the selected column
        names in their original order.
    """
    #### making files for boruta
    features = [f for f in X_train.columns if f not in ['status']]
    X_train_boruta = X_train[features].values
    Y_train_boruta = y_train.values.ravel()
    X_test_boruta = X_test[features].values
    Y_test_boruta = y_test.values.ravel()

    print('Before filteration\nTrain shape\n',X_train_boruta.shape,'\nTest shape\n',X_test_boruta.shape)

    ### implementing boruta

    # random forest using all cores, with class weights balanced against the
    # label frequencies
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators=100, random_state=1)

    # the selector is fitted on the training split only; the test split is
    # merely transformed, so it cannot influence which features are kept
    feat_selector.fit(X_train_boruta, Y_train_boruta)

    # call transform() on X to filter it down to selected features
    X_train_filtered = feat_selector.transform(X_train_boruta)
    X_test_filtered = feat_selector.transform(X_test_boruta)

    ### name of the features selected####
    # Pair each name with its support flag. Unlike iterating np.where() output
    # with np.nditer, this also works when no feature is selected.
    final_features = [name for name, keep in zip(features, feat_selector.support_) if keep]

    print('# of Features selected:',len(final_features))

    X_train_filtered=pd.DataFrame(X_train_filtered,columns=final_features)
    X_test_filtered=pd.DataFrame(X_test_filtered,columns=final_features)

    print('After filteration\nTrain shape\n',X_train_filtered.shape,'\nTest shape\n',X_test_filtered.shape)

    return X_train_filtered,X_test_filtered,Y_train_boruta,Y_test_boruta,final_features

# Loading Data

In [None]:
# Fix the random seeds so that each hyperparameter tuning run is reproducible
def seed_all(seed=123):
    """Seed Python's, NumPy's and TensorFlow's global random generators.

    Parameters
    ----------
    seed : int, optional
        Value used for all three generators (default 123, the value that was
        previously hard-coded).
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
seed_all()

In [None]:
# Make a new directory for HyperParameter Tuning.
# makedirs(..., exist_ok=True) also creates missing parent folders and does not
# fail when the cell is re-run after the directory already exists.
os.makedirs('/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF', exist_ok=True)

In [None]:
# FROM = Path of directory from which to load the preprocessed signaturizer file.
# The trailing slash matters: the loading cell concatenates FROM directly with the file name.
FROM='/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/PreProcessed/'
# TO = Path of the newly made HyperParameter Tuning directory
TO='/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF'

# Set the HPTuning directory as the current working directory,
# so that any relative path used afterwards resolves inside TO
os.chdir(TO)

In [None]:
# Read the preprocessed signaturizer table from the FROM directory
Data = pd.read_csv(os.path.join(FROM, 'sign_proliferative_anti_preprocessed.csv'))
Data

In [None]:
# Draw a fixed-seed random 90% of the rows (row count rounded down) as the
# training data for hyperparameter tuning
Train = Data.sample(random_state=1, n=int(0.9 * len(Data)))
Train

In [None]:
#RUN THIS IF LOADING A SAVED MODEL
# model = joblib.load('../Epigenetics/HPTuning/HOLY_epigenetic_model_svm.pkl')

# Defining the Grid

## Random Forest Grid (Proliferation)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized 5-fold search over random forest hyperparameters.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (scored on accuracy); the tuned model is
        available as ``best_estimator_``.
    """
    rf = RandomForestClassifier()
    parameters = {
        # 'auto' meant the same as 'sqrt' for classifiers, so the old list
        # ['auto', 'sqrt'] searched a single setting twice, and 'auto' is
        # rejected by recent scikit-learn releases. Two distinct options are
        # searched instead.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap':[True, False],
        'n_estimators':[int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    }
    grid = RandomizedSearchCV(rf, parameters, scoring='accuracy', return_train_score=False, cv =5)
    grid_search=grid.fit(Train_x, Train_y)
    return grid_search

## MLP Grid (Electrophile)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized 5-fold search over MLP hyperparameters (Electrophile grid).

    Returns the fitted ``RandomizedSearchCV`` object, scored on accuracy.
    """
    search_space = {
        'hidden_layer_sizes': [
            (5, 5, 5), (20, 30, 50), (50, 50, 50), (50, 100, 50),
            (100,), (100, 100, 100), (5, 2),
        ],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.001, 0.01, 0.02, 0.04, 0.05],
        'learning_rate': ['constant', 'adaptive', 'invscaling'],
    }
    searcher = RandomizedSearchCV(
        MLPClassifier(),
        search_space,
        cv=5,
        scoring='accuracy',
        return_train_score=False,
    )
    # fit() returns the search object itself
    return searcher.fit(Train_x, Train_y)

## MLP Grid (Oxidative)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized 5-fold search over MLP hyperparameters (Oxidative grid).

    The base network uses ``max_iter=1000`` and a fixed ``random_state=50``.
    Returns the fitted ``RandomizedSearchCV`` object, scored on accuracy.
    """
    base_network = MLPClassifier(max_iter=1000, random_state=50)
    search_space = {
        'hidden_layer_sizes': [
            (5, 5, 5), (20, 30, 50), (50, 50, 50), (50, 100, 50),
            (100,), (100, 100, 100), (5, 2),
        ],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
    }
    searcher = RandomizedSearchCV(
        base_network,
        search_space,
        scoring='accuracy',
        cv=5,
        return_train_score=False,
    )
    # fit() returns the search object itself
    return searcher.fit(Train_x, Train_y)

## SVM Grid (Epigenetics)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized 5-fold search over SVM hyperparameters.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (default estimator scoring).
    """
    # The notebook imports the `svm` module (`from sklearn import svm`), not
    # the bare `SVC` name, so the class is referenced through the module.
    # probability=True makes predict_proba available on the tuned model.
    svc_rand = svm.SVC(probability=True)
    parameters = {
        'kernel':('linear', 'rbf'),
        'C': [0.5, 0.6, 0.8, 1.0, 1.2, 1.5],
        'gamma': [0.05, 0.1, 1.0, 1.2, 1.5, 2],
    }
    grid = RandomizedSearchCV(svc_rand, parameters, cv = 5)
    grid_search=grid.fit(Train_x, Train_y)
    return grid_search

## Random Forest Grid (Genomic Instability)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Randomized search (100 candidates, 10-fold CV) over random forest hyperparameters.

    Parameters
    ----------
    Train_x : array-like
        Training features.
    Train_y : array-like
        Training labels.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object; the tuned model is ``best_estimator_``.
    """
    n_estimators = [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    # 'auto' meant the same as 'sqrt' for classifiers and is rejected by recent
    # scikit-learn releases; two distinct options are searched instead.
    max_features = ['sqrt', 'log2']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # An integer min_samples_split must be at least 2; a candidate with the
    # value 1 fails to fit and wastes one of the sampled iterations.
    min_samples_split = list(range(2,30))
    min_samples_leaf = list(range(1,20))
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, n_jobs = -1)
    rf_random.fit(Train_x, Train_y)
    return rf_random

## KNN Grid (Apoptosis)

In [None]:
def HPTing_Model(Train_x, Train_y):
    """Exhaustive 3-fold grid search over k-nearest-neighbour hyperparameters.

    Returns the fitted ``GridSearchCV`` object.

    NOTE: the grid is the full cross product, 49 * 39 * 19 * 2 * 3 = 217,854
    candidates, each fitted on 3 folds.
    """
    search_grid = {
        'leaf_size': list(range(1, 50)),
        'n_neighbors': list(range(1, 40)),
        'p': list(range(1, 20)),
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
    }
    exhaustive_search = GridSearchCV(
        KNeighborsClassifier(), search_grid, cv=3, verbose=2, n_jobs=-1
    )
    # fit() returns the search object itself
    return exhaustive_search.fit(Train_x, Train_y)


# Get Labels Function

In [None]:
def get_labels(pred_test,thsd): #Getting discrete labels from probability values
    """Threshold probability values into 0/1 labels.

    Parameters
    ----------
    pred_test : iterable of float
        Probability values. Any iterable is accepted; the values are visited
        in iteration order rather than looked up by position, so a pandas
        Series with a non-default index is handled too.
    thsd : float
        Decision threshold; a value strictly greater than it maps to 1.

    Returns
    -------
    list of int
        One label (0 or 1) per input value.
    """
    return [1 if probability > thsd else 0 for probability in pred_test]

# Scoring Metrics

In [None]:
def Scoring_metrices(label, pred, truth, D):
    """Compute, print and return classification metrics, then plot the ROC curve.

    Parameters
    ----------
    label : array-like
        Predicted class labels.
    pred : array-like
        Scores passed to ``roc_curve`` (callers pass the positive-class
        probability).
    truth : array-like
        True class labels.
    D : str
        Prefix for every dictionary key and printed line, and the name shown
        on the ROC plot (e.g. 'Training', 'Testing').

    Returns
    -------
    dict
        Metric values keyed by ``D + " <metric name>:"``.
    """
    score = {}

    def _log(metric_title, value):
        # Store the value under its prefixed key and echo the same line
        key = D + metric_title
        score[key] = value
        print(key, value)

    _log(" Accuracy:", metrics.accuracy_score(truth, label))
    _log(" MCC Score:", matthews_corrcoef(truth, label))
    _log(" F1 Score:", f1_score(truth, label, average='macro'))

    # The ROC points are kept because the plot below reuses them
    fpr, tpr, _ = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    _log(" AUC VALUE:", roc_auc)

    _log(" kappa Score:", sklearn.metrics.cohen_kappa_score(truth, label))
    _log(" Precision:", metrics.precision_score(truth, label))
    _log(" Recall:", metrics.recall_score(truth, label))

    roc_plot = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=D)
    roc_plot.plot()
    plt.show()
    return score

# Hyperparameter Tuning of the Model

In [None]:
# Results collected over the repeated splits
Train_Fold_outs, Test_Fold_outs = [], []   # metric dictionaries per split
Best_params, features, models = [], [], []  # tuned parameters, selected features, fitted models

# 20 repetitions; each one makes a fresh 75/25 split of Train, selects features,
# oversamples the training part and tunes the model with the active HPTing_Model grid
for i in range(20):
    print('Fold #',i)

    # Split Data (the repetition number is the split seed)
    X_train, X_test, y_train, y_test = train_test_split(
        Train, Train["status"], test_size=0.25, shuffle=True, random_state=i)

    # Remove the label and the SMILES identifier from the feature frames
    x_train = X_train.drop(columns=['status', 'smiles'])
    x_test = X_test.drop(columns=['status', 'smiles'])

    # Feature selection
    (x_train_filtered, x_test_filtered,
     y_train_filtered, y_test_filtered,
     selected_features) = Boruta_Filteration(x_train, y_train, x_test, y_test)
    features.append(selected_features)

    # Smote() reads .values from the labels, so wrap the array in a Series
    y_train_filtered = pd.Series(y_train_filtered)

    # Oversampling (training part only)
    Final_Xtrain, Final_Ytrain = Smote(x_train_filtered, y_train_filtered, 0.5, 'Upsampled')

    Final_Ytrain = Final_Ytrain.values.ravel()
    Final_Xtrain = pd.DataFrame(Final_Xtrain, dtype=np.float64)
    x_test_filtered = pd.DataFrame(x_test_filtered, dtype=np.float64)

    # Hyperparameter Tuning
    Parameters = HPTing_Model(Final_Xtrain, Final_Ytrain)

    # Save best parameters (read once, reused below)
    _tuned = Parameters.best_estimator_.get_params()
    Best_params.append(_tuned)

    # Build the tuned model
    # edit the parameter names here according to your defined parameter space and model's grid
    rf = RandomForestClassifier(**{
        param: _tuned[param]
        for param in ('max_features', 'max_depth', 'min_samples_split',
                      'min_samples_leaf', 'bootstrap', 'n_estimators')
    })

    # Fit the built model
    rf.fit(Final_Xtrain, Final_Ytrain)
    models.append(rf)

    # Training predictions and metrics
    y_train_pred = rf.predict(Final_Xtrain)
    y_train_prob = rf.predict_proba(Final_Xtrain)
    Train_Fold_outs.append(
        Scoring_metrices(y_train_pred, y_train_prob[:, 1], Final_Ytrain, 'Training'))

    # Testing predictions and metrics
    y_test_pred = rf.predict(x_test_filtered)
    y_test_prob = rf.predict_proba(x_test_filtered)
    Test_Fold_outs.append(
        Scoring_metrices(y_test_pred, y_test_prob[:, 1], y_test_filtered, 'Testing'))

In [None]:
# Training metrics of every split, best training accuracy first
pd.DataFrame(Train_Fold_outs).sort_values('Training Accuracy:', ascending=False)

In [None]:
# Testing metrics of every split, best testing accuracy first
pd.DataFrame(Test_Fold_outs).sort_values('Testing Accuracy:', ascending=False)

In [None]:
# Tuned parameters of every split, one row per split
pd.DataFrame(Best_params)

In [None]:
#View the number of features selected for the top performing/most stable (selected) model.
#NOTE: index 1 is the second tuning split, because the tuning loop counts from 0;
#change the index here and in the cells below if another split is chosen.
len(features[1])

In [None]:
#Save the feature names of the selected split (index 1) as a one-column CSV without the row index
pd.DataFrame(features[1]).to_csv('/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF/anti_prol_features_rf.csv',index=False)

In [None]:
#save the best parameters of the chosen model (split index 1) as a one-row CSV
# newline='' lets the csv module control the line endings itself, as its
# documentation requires; without it extra blank lines appear on some platforms.
with open('/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF/anti_prol_best_params_RF.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, Best_params[1].keys())
    w.writeheader()
    w.writerow(Best_params[1])

# 20 Fold Boosting


In [None]:
#Randomly split the data into two class-stratified parts for each fold
def Test_valid_split(Set3,frac,seed):
    """Split a labelled frame into a sampled part and its complement.

    The same fraction is drawn from the ``status == 1`` rows and from the
    ``status == 0`` rows, so the sampled part keeps the class proportions.

    Parameters
    ----------
    Set3 : pandas.DataFrame
        Data with a binary ``'status'`` column. Index labels are assumed to
        be unique.
    frac : float
        Fraction of each class that goes into the first returned frame.
    seed : int
        Random seed for the sampling of both classes.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(Test, Valid)``: the sampled rows and all remaining rows. The
        caller decides how the two parts are used.
    """
    # Both classes use the caller's seed, so the positives change from fold to
    # fold as well (they were previously sampled with a fixed seed of 1).
    positives = Set3[Set3['status']==1].sample(frac=frac, random_state=seed)
    negatives = Set3[Set3['status']==0].sample(frac=frac, random_state=seed)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    Test = pd.concat([positives, negatives])

    # Boolean mask on the index: keeps the original row order and column
    # dtypes, and avoids a quadratic membership test.
    Valid = Set3.loc[~Set3.index.isin(Test.index)]
    print('Test set size:',len(Test),'\nValid set size:',len(Valid))
    return Test,Valid

In [None]:
f_list=features[1] #feature list of the selected (hyperparameter tuned) model
Train_Fold_outs_1=[]
Test_Fold_outs_1=[]
models_1=[]

# 20 repetitions: each one re-splits the whole data set, retrains the tuned
# model on the selected features and records training and testing metrics
for i in range(20):
    print('Fold #',i)

    #Train-test split randomly: the sampled 90% is the training part, the rest the test part
    Trn,Tst = Test_valid_split(Data,0.90,i)
    Train_y, Test_y = Trn['status'],Tst['status']

    #Use selected feature list
    Train_x = Trn[f_list]
    Test_x = Tst[f_list]

    x_train_filtered = Train_x.values
    x_test_filtered = Test_x.values
    y_train_filtered = Train_y.values.ravel()
    y_test_filtered = Test_y.values.ravel()

    #Upsampling (training part only)
    Final_Xtrain,Final_Ytrain = Smote(Train_x,Train_y,0.5,'Upsamlped')

    Final_Ytrain=Final_Ytrain.values.ravel()
    Final_Xtrain = pd.DataFrame(Final_Xtrain, dtype = np.float64)
    x_test_filtered = pd.DataFrame(x_test_filtered, dtype = np.float64)

    #Use the best parameters from the chosen model here
    rf = RandomForestClassifier(bootstrap= True,
                                ccp_alpha= 0.0,
                                class_weight= None,
                                criterion= 'gini',
                                max_depth= 80,
                                # 'sqrt' is the setting that 'auto' stood for in a
                                # classifier; 'auto' itself is rejected by recent
                                # scikit-learn releases
                                max_features= 'sqrt',
                                max_leaf_nodes= None,
                                max_samples= None,
                                min_impurity_decrease= 0.0,
                                min_samples_leaf= 1,
                                min_samples_split= 10,
                                min_weight_fraction_leaf= 0.0,
                                n_estimators= 45,
                                n_jobs= None,
                                oob_score= False,
                                random_state= None,
                                verbose= 0,
                                warm_start= False)

    #Fit the model
    rf.fit(Final_Xtrain,Final_Ytrain)
    models_1.append(rf)

    #Training prediction and saving the metrics
    y_train_pred=rf.predict(Final_Xtrain)
    y_train_prob=rf.predict_proba(Final_Xtrain)
    Train_Fold_outs_1.append(Scoring_metrices(y_train_pred,y_train_prob[:,1],Final_Ytrain,'Training'))

    #Testing prediction and saving the metrics
    # the labels are cast to int before scoring
    y_test_pred=rf.predict(x_test_filtered)
    y_test_prob=rf.predict_proba(x_test_filtered)
    Test_Fold_outs_1.append(Scoring_metrices(y_test_pred,y_test_prob[:,1],y_test_filtered.astype('int'),'Testing'))

In [None]:
# Box plot of the per-fold testing metrics, to judge the stability of the tuned model
pd.DataFrame(Test_Fold_outs_1).boxplot(rot=45, grid=False)

In [None]:
# Box plot of the per-fold training metrics, to judge the stability of the tuned model
pd.DataFrame(Train_Fold_outs_1).boxplot(rot=45, grid=False)

In [None]:
# Testing metrics of every fold as a table (one row per fold)
pd.DataFrame(Test_Fold_outs_1)

In [None]:
# Training metrics of every fold as a table (one row per fold)
pd.DataFrame(Train_Fold_outs_1)

# Training on Whole Data 

In [None]:
# Feature matrix for the final fit: every column except the SMILES identifier and the label
TRAIN = Data.drop(columns=['smiles', 'status'])
TRAIN

In [None]:
# Keep only the features selected in tuning split 1, in that order
TRAIN = TRAIN[features[1]] #Use features of the selected (hyperparameter tuned) model here
TRAIN

In [None]:
# Labels for the whole data set
Y = Data['status']

In [None]:
#Fitting the model on whole data
# NOTE: fit() refits the estimator stored in models[1] in place, so `fitted` and
# models[1] are the same object afterwards. No oversampling is applied in this cell.
fitted = models[1].fit(TRAIN,Y)
fitted

In [None]:
#Save the final model (refitted on the whole data) to the HPTuning directory with joblib
joblib.dump(fitted, '/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF/HOLY_anti_prol_model_rf.pkl')

# Testing on external validation (Set3)

## Load Data

In [None]:
def get_labels(pred_test): #Getting discrete labels from probability values
    """Turn two-column class probabilities into 0/1 labels.

    Parameters
    ----------
    pred_test : numpy.ndarray
        One row per sample; column 0 holds the probability of class 0 and
        column 1 the probability of class 1 (as returned by predict_proba).

    Returns
    -------
    list of int
        0 where column 0 is strictly larger than column 1, otherwise 1
        (ties therefore go to class 1).

    Note: this definition replaces the two-argument ``get_labels`` defined in
    an earlier cell.
    """
    return [0 if sample[0] > sample[1] else 1 for sample in pred_test]

In [None]:
# Load the preprocessed signaturizer table of the external validation set (Set3)
set3_sign = pd.read_csv('/home/vishakhag/Projects/Metabokiller_Revision/Carcinogen_Sets/Signaturizer_Set3_preprocessed.csv')
set3_sign

In [None]:
# Same feature columns, in the same order, as the ones used to fit the final model
X_Test = set3_sign[features[1]]  #Use features of the selected (hyperparameter tuned) model here
X_Test

In [None]:
# True labels of the external validation set
Y_Test = set3_sign['status']

In [None]:
# Confirm if there are null values (True means at least one missing value is present);
# run handle_missing_values if any.
X_Test.isnull().values.any()

In [None]:
# Prediction using fitted (on whole data) model.
# The probabilities must be computed first: get_labels() derives the labels
# from them, so calling it before this assignment raises a NameError on a
# fresh run (or silently reuses stale values on a re-run).
Y_TEST_PROB = (fitted).predict_proba(X_Test)
Y_TEST_PRED = get_labels(Y_TEST_PROB)

In [None]:
# Metrics and ROC curve on the external validation set, using the positive-class probability
Scoring_metrices(Y_TEST_PRED,Y_TEST_PROB[:,1],Y_Test,'Valid')

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped the
# per-class precision and recall columns are exchanged.
print(classification_report(Y_Test, Y_TEST_PRED))

## Save the final predictions

In [None]:
# Empty table for the final output: class-0 probability, class-1 probability, predicted label
predictions = pd.DataFrame(columns = ["Proliferation_0","Proliferation_1","Proliferation_Preds"])

In [None]:
# Show the (still empty) predictions table
predictions

In [None]:
# Predicted class labels for the external validation set
predictions['Proliferation_Preds'] = Y_TEST_PRED

In [None]:
# Probability of class 1 (second column of predict_proba)
predictions['Proliferation_1'] = Y_TEST_PROB[:,1]

In [None]:
# Probability of class 0 (first column of predict_proba)
predictions['Proliferation_0'] = Y_TEST_PROB[:,0]

In [None]:
# Show the filled predictions table
predictions

In [None]:
# Write the final predictions to the HPTuning directory as CSV, without the row index
predictions.to_csv('/home/vishakhag/Projects/Metabokiller_Revision/Anti-Proliferation/HPTuning_RF/proliferations_predictions_rf.csv',index=False)