# Classification of IDO/TDO Activity
Classification of compounds based on IDO and TDO activity using multiple machine learning models.

## Overview
- Load preprocessed activity data
- Create multiclass labels (AA, AI, IA, II)
- Train and compare ML models
- Evaluate classification performance
- Screen the database with the best-performing model

##  Load Dataset and Libraries

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import optuna
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt

In [None]:
# Load the activity table and drop the raw SMILES column before labelling.
df_multinominal = pd.read_csv('1_input_multinominal.csv')
df_multinominal.drop(['Smiles'],axis=1, inplace= True)
# `head` has to be called; the bare attribute is only the bound method object.
df_multinominal.head()

## Generate Multi-Class Target

In [None]:
# Map the IDO/TDO activity flags of one row onto a single class code
def determine_target(row):
    """Return the two-letter class code for one compound row.

    The first letter stands for ``row["IDO_pic50"]`` and the second for
    ``row["TDO_pic50"]``. The pairs Active/Active, Active/Inactive and
    Inactive/Active give ``"AA"``, ``"AI"`` and ``"IA"``; every other
    combination of values gives ``"II"``.
    """
    ido_flag = row["IDO_pic50"]

    if ido_flag == "Active":
        tdo_flag = row["TDO_pic50"]
        if tdo_flag == "Active":
            return "AA"
        if tdo_flag == "Inactive":
            return "AI"
        return "II"

    # The TDO value is only looked up when the IDO flag is "Inactive".
    if ido_flag == "Inactive" and row["TDO_pic50"] == "Active":
        return "IA"

    return "II"

In [None]:
# Apply the function to create a new 'Target' column
df_multinominal["Target"] = df_multinominal.apply(determine_target, axis=1)
# Call `head()` so the first rows are shown rather than the bound method.
df_multinominal.head()

In [None]:
# Save the modified DataFrame to a new CSV file
df_multinominal.to_csv("multinominal_output_file.csv", index=False)

In [None]:
# Now that the multi-class labels have been saved (`multinominal_output_file.csv`), we load the PaDEL-generated molecular descriptors and fingerprints for the same set of compounds. These features are used in the analysis below.

# Data Preprocessing

# Sum and Variance 

In [None]:
df_All_features = pd.read_csv('2_All_features.csv')
df_All_features.head()

In [None]:
df_All_features.shape

In [None]:
df_All_features.drop(['Smiles', 'Name'],axis=1, inplace= True)
df_All_features.head()

In [None]:
# Remove columns with sum = 0
df_1 = df_All_features.loc[:, df_All_features.sum() != 0]

# Display the modified DataFrame
df_1.head()

In [None]:
# Remove columns with variance < 0.05
df_2 = df_1.loc[:, df_1.var() >= 0.05]

# Display the modified DataFrame
df_2.head()

In [None]:
df_2.info()

In [None]:
df_2.head()

# Split the dataset into training and testing and load the training set

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Load dataset
# The whole training table is read into `df`; every later step in this cell
# rebinds or derives from it.
df = pd.read_csv('3_training_set.csv')

# ---- Step 1: Check missing values ----
# Percentage of null entries per column; only columns with at least one
# missing value are printed.
percent_missing = df.isnull().sum() * 100 / len(df)
print("Missing value percentage by column:")
print(percent_missing[percent_missing > 0])

# ---- Step 2: Drop columns with >50% missing ----
df = df.drop(columns=percent_missing[percent_missing > 50].index)

# ---- Step 3: Fill remaining missing values (mean imputation for numerical) ----
# Only numeric columns get a mean; non-numeric columns keep their NaNs.
df = df.fillna(df.mean(numeric_only=True))

# ---- Step 4: Define X and y ----
# errors='ignore' lets this run whether or not a 'Smiles' column is present.
X = df.drop(columns=['Smiles', 'Activity'], errors='ignore')
y = df['Activity']

# ---- Step 5: Confirm no missing values ----
# NOTE(review): `assert` is skipped when Python runs with -O.
assert X.isnull().sum().sum() == 0, "There are still missing values in X"
print("No missing values remain after imputation.")

# ---- Step 6: Class distribution before SMOTE ----
# The counts assume the label is encoded as 0 / 1.
print(f'Class distribution before oversampling: total={len(y)}, class 0={sum(y==0)}, class 1={sum(y==1)}')

# ---- Step 7: Apply SMOTE ----
# NOTE(review): SMOTE is fitted on the complete dataset here and the
# train/test split is taken afterwards from `X_resampled`, so synthetic
# samples can end up in the test split -- confirm this is intended.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(f'Class distribution after SMOTE: total={len(y_resampled)}, class 0={sum(y_resampled==0)}, class 1={sum(y_resampled==1)}')


In [None]:
# ============================================================
# Missing Data Diagnostics (Mean Imputation Bias Check)
# ============================================================

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import ks_2samp

# Re-load data (for diagnostics on original dataset)
df = pd.read_csv('3_training_set.csv')
num_cols = df.select_dtypes(include=[np.number]).columns

# ---- Step 1: Mean Imputation ----
imp_mean = SimpleImputer(strategy='mean')
X_mean = pd.DataFrame(imp_mean.fit_transform(df[num_cols]), columns=num_cols)

# ---- Step 2: Kolmogorov–Smirnov (KS) test: observed vs imputed distributions ----
shifted_cols = []
for col in num_cols:
    observed = df[col].dropna()
    if len(observed) > 10:
        ks_stat, ks_p = ks_2samp(observed, X_mean[col])
        if ks_p < 0.05:
            shifted_cols.append(col)
print(f"Descriptors showing potential minor distribution shift after mean imputation: {len(shifted_cols)}")

# ---- Step 3: Compare model performance with mean, median, and KNN imputation ----
y = df['Activity']
# `num_cols` lists every numeric column of the reloaded frame, which includes
# a numerically encoded 'Activity' label. Remove it from the features,
# otherwise the classifier is handed the answer and every imputer scores
# (near-)perfectly regardless of how it fills the gaps.
X = df[num_cols].drop(columns=['Activity'], errors='ignore')
cv = KFold(5, shuffle=True, random_state=42)

imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'knn': KNNImputer(n_neighbors=5)
}

# NOTE(review): each imputer is fitted on all rows before cross-validation,
# so fold statistics are shared between train and validation parts.
for name, imputer in imputers.items():
    X_imp = imputer.fit_transform(X)
    model = RandomForestClassifier(random_state=42)
    scores = cross_val_score(model, X_imp, y, cv=cv, scoring='accuracy')
    print(f"{name} imputer: mean CV accuracy = {scores.mean():.3f} ± {scores.std():.3f}")


# ============================================================
# Save Processed Dataset
# ============================================================

# Keep the column labels that `X_resampled` already carries. `X` was rebound
# in the diagnostics above, so `X.columns` is no longer the column set SMOTE
# was fitted on; passing it as `columns=` reindexes the frame and fills every
# unmatched column with NaN.
df_resampled = pd.DataFrame(X_resampled).copy()
df_resampled['Activity'] = y_resampled
df_resampled.to_csv('train_All_features_missing_val_resampled_data.csv', index=False)
print("Resampled data saved to 'train_All_features_missing_val_resampled_data.csv'.")


In [None]:
# ============================================================
# PCA Visualization (Chemical Space Overlap)
# ============================================================

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Determine index split for visualization
n_real = len(X)
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:n_real, 0], X_pca[:n_real, 1], label='Real', alpha=0.5)
plt.scatter(X_pca[n_real:, 0], X_pca[n_real:, 1], label='Synthetic (SMOTE)', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA: Real vs SMOTE Synthetic Samples')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
X_1 = X_resampled
y_1 = y_resampled

In [None]:
# Train-test split
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

In [None]:
# Display a confirmation
print(f"Training samples: {len(X_1_train)}, Testing samples: {len(X_1_test)}")

# Model development with all features

# XGB

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from pathlib import Path
from warnings import filterwarnings
import time
from matplotlib import pyplot as plt 
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, auc, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
import xgboost as xgb
xg = xgb.XGBClassifier()
xg.fit(X_1_train, y_1_train)

In [None]:
xg_predict = xg.predict(X_1_test)
print('Accuracy_score=',(accuracy_score(y_1_test, xg_predict)))
print((confusion_matrix(y_1_test,xg_predict)))
print(classification_report(y_1_test, xg_predict))
pd.crosstab(y_1_test, xg_predict)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X_1_train, y_1_train)

In [None]:
LR_predict = LR.predict(X_1_test)

In [None]:
print('Accuracy_score=',(accuracy_score(y_1_test, LR_predict)))
print((confusion_matrix(y_1_test, LR_predict)))
print(classification_report(y_1_test, LR_predict))
pd.crosstab(y_1_test, LR_predict)

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score
SV = SVC()
SV.fit(X_1_train, y_1_train)

In [None]:
svm = SV.predict(X_1_test)

In [None]:
print("Accuracy:",accuracy_score(y_1_test, svm))
print((confusion_matrix(y_1_test, svm)))
print(classification_report(y_1_test, svm))
pd.crosstab(y_1_test, svm)

# RF

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [None]:
RF = RandomForestClassifier(random_state = 42)
RF.fit(X_1_train, y_1_train)

In [None]:
RF_pred = RF.predict(X_1_test)
accuracy_score(y_1_test, RF_pred)

In [None]:
from sklearn.metrics import confusion_matrix

print('Accuracy_score=',(accuracy_score(y_1_test, RF_pred)))
print(confusion_matrix(y_1_test, RF_pred))
print(classification_report(y_1_test, RF_pred))
pd.crosstab(y_1_test, RF_pred)

# DT

In [None]:
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_1_train, y_1_train)

In [None]:
DT_predict = DT.predict(X_1_test)
print('Accuracy_score=',(accuracy_score(y_1_test, DT_predict)))
print((confusion_matrix(y_1_test, DT_predict)))

In [None]:
print(classification_report(y_1_test, DT_predict))
pd.crosstab(y_1_test, DT_predict)

# NB

In [None]:
NB = GaussianNB() 
NB.fit(X_1_train, y_1_train)

In [None]:
NB_pred = NB.predict(X_1_test)
print('Accuracy_score=',(accuracy_score(y_1_test, NB_pred)))
print((confusion_matrix(y_1_test, NB_pred)))
print(classification_report(y_1_test, NB_pred))
pd.crosstab(y_1_test, NB_pred)

# ANN

In [None]:
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(random_state=42)

In [None]:
NN.fit(X_1_train, y_1_train)

In [None]:
from sklearn.metrics import accuracy_score

ANN = NN.predict(X_1_test)
print('Accuracy_score=',(accuracy_score(y_1_test, ANN)))
print((confusion_matrix(y_1_test, ANN)))

In [None]:
print(classification_report(y_1_test, ANN))
pd.crosstab(y_1_test, ANN)

# XGBRF

In [None]:
from xgboost import XGBRFClassifier

In [None]:
xgbrf = XGBRFClassifier()
xgbrf.fit(X_1_train,y_1_train)

In [None]:
y_predict = xgbrf.predict(X_1_test)
print(classification_report(y_1_test, y_predict))
print(accuracy_score(y_1_test, y_predict))
print((confusion_matrix(y_1_test, y_predict)))
pd.crosstab(y_1_test, y_predict)

In [None]:
df.info()

In [None]:
df.head()

# Feature Engineering - (i) Correlation filter

In [None]:
correlated_features = set()
# Correlate the numeric descriptors only:
#  * non-numeric columns (e.g. a 'Smiles' column, if present) make
#    `DataFrame.corr()` raise on pandas 2.x;
#  * the 'Activity' label is left out so the filter can neither drop the
#    label itself nor drop a descriptor just because it tracks the label.
correlation_matrix = (
    df.select_dtypes(include=[np.number])
    .drop(columns=['Activity'], errors='ignore')
    .corr()
)

In [None]:
# Walk the lower triangle row by row: a column is flagged when its absolute
# correlation with any column positioned before it exceeds 0.90.
for row_pos, colname in enumerate(correlation_matrix.columns):
    earlier_columns = correlation_matrix.iloc[row_pos, :row_pos]
    if (earlier_columns.abs() > 0.90).any():
        correlated_features.add(colname)

In [None]:
print(len(correlated_features))

In [None]:
print(correlated_features)

In [None]:
df.drop(labels=correlated_features, axis=1, inplace=True)

In [None]:
df.to_csv('correlation_filter.csv')

In [None]:
df.info()

In [None]:
# data preprocessing
df.isnull().sum()

In [None]:
statistics = df.describe()
statistics

In [None]:
X1 = df.drop('Activity', axis=1)  # Features
y1 = df['Activity']  # Labels

In [None]:
X1.columns

In [None]:
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1, test_size=0.2, random_state=42)

# Model development with correlation filter features 

# XGB

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from pathlib import Path
from warnings import filterwarnings
import time
from matplotlib import pyplot as plt 
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, auc, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
import xgboost as xgb
xg = xgb.XGBClassifier()
xg.fit(X1_train, y1_train)

In [None]:
xg_predict = xg.predict(X1_test)
print('Accuracy_score=',(accuracy_score(y1_test, xg_predict)))
print((confusion_matrix(y1_test,xg_predict)))
print(classification_report(y1_test, xg_predict))
pd.crosstab(y1_test, xg_predict)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X1_train, y1_train)

In [None]:
LR_predict = LR.predict(X1_test)

In [None]:
print('Accuracy_score=',(accuracy_score(y1_test, LR_predict)))
print((confusion_matrix(y1_test, LR_predict)))
print(classification_report(y1_test, LR_predict))
pd.crosstab(y1_test, LR_predict)

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score
SV = SVC()
SV.fit(X1_train, y1_train)

In [None]:
svm = SV.predict(X1_test)

In [None]:
print("Accuracy:",accuracy_score(y1_test, svm))
print((confusion_matrix(y1_test, svm)))
print(classification_report(y1_test, svm))
pd.crosstab(y1_test, svm)

# RF

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [None]:
RF = RandomForestClassifier(random_state = 42)
RF.fit(X1_train, y1_train)

In [None]:
RF_pred = RF.predict(X1_test)
accuracy_score(y1_test, RF_pred)

In [None]:
from sklearn.metrics import confusion_matrix

print('Accuracy_score=',(accuracy_score(y1_test, RF_pred)))
print(confusion_matrix(y1_test, RF_pred))
print(classification_report(y1_test, RF_pred))
pd.crosstab(y1_test, RF_pred)

# DT

In [None]:
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X1_train, y1_train)

In [None]:
DT_predict = DT.predict(X1_test)
print('Accuracy_score=',(accuracy_score(y1_test, DT_predict)))
print((confusion_matrix(y1_test, DT_predict)))

In [None]:
print(classification_report(y1_test, DT_predict))
pd.crosstab(y1_test, DT_predict)

# NB

In [None]:
NB = GaussianNB() 
NB.fit(X1_train, y1_train)

In [None]:
NB_pred = NB.predict(X1_test)
print('Accuracy_score=',(accuracy_score(y1_test, NB_pred)))
print((confusion_matrix(y1_test, NB_pred)))
print(classification_report(y1_test, NB_pred))
pd.crosstab(y1_test, NB_pred)

# ANN

In [None]:
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(random_state=42)

In [None]:
NN.fit(X1_train, y1_train)

In [None]:
from sklearn.metrics import accuracy_score

ANN = NN.predict(X1_test)
print('Accuracy_score=',(accuracy_score(y1_test, ANN)))
print((confusion_matrix(y1_test, ANN)))

In [None]:
print(classification_report(y1_test, ANN))
pd.crosstab(y1_test, ANN)

# XGBRF

In [None]:
from xgboost import XGBRFClassifier

In [None]:
xgbrf = XGBRFClassifier(random_state=22)
xgbrf.fit(X1_train,y1_train)

In [None]:
y_predict = xgbrf.predict(X1_test)
print(classification_report(y1_test, y_predict))
print(accuracy_score(y1_test, y_predict))
print((confusion_matrix(y1_test, y_predict)))
pd.crosstab(y1_test, y_predict)

# RFECV

In [None]:
rfecv = RFECV(
    estimator=XGBRFClassifier(),
    step=1,
    cv=StratifiedKFold(10)
)

In [None]:
rfecv.fit(X1.values, y1)

In [None]:
print('Optimal number of features: {}'.format(rfecv.n_features_))

# `grid_scores_` was removed in scikit-learn 1.2; the mean cross-validation
# score per elimination step is available as `cv_results_['mean_test_score']`.
# The old attribute is used only on releases that predate `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
# The x axis counts 1..n features, which matches the step=1 setting used
# when `rfecv` was built.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores,  linewidth=3)

plt.show()

In [None]:
print(rfecv.n_features_)

# RFECV Feature Extraction

In [None]:
import numpy as np
# Positions of the columns RFECV rejected (entries of `support_` that are False).
rejected_positions = np.flatnonzero(~rfecv.support_)
print(rejected_positions)
# Keep only the selected columns by dropping the rejected ones by name.
rejected_names = X1.columns[rejected_positions]
df1 = X1.drop(rejected_names, axis=1)
df1

In [None]:
print(df1.columns)

In [None]:
# Export the RFECV-selected feature columns to a CSV file
df1.to_csv('rfecv_xgbrf_data.csv', index=False)

# Report the file that was actually written. '5_xgbrf_train_rfecv.csv', which
# the next cell reads, is a separate file that also holds the 'Activity' column.
print("Filtered data exported successfully to rfecv_xgbrf_data.csv")

In [None]:
df1.shape

In [None]:
#Import the dataframe after including the activity column
df1 = pd.read_csv('5_xgbrf_train_rfecv.csv') #rfecv_xgbrf_data.csv
df1.head()

In [None]:
# Separate features and label by name. Slicing off "the last column" silently
# keeps 'Activity' among the features (and discards a real descriptor)
# whenever the label is not stored in the final position of the CSV.
X2 = df1.drop(columns=['Activity'])
y2 = df1['Activity']

In [None]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

## Model development with RFECV

# XGB

In [None]:
xg = xgb.XGBClassifier()
xg.fit(X2_train, y2_train)

In [None]:
xg_predict = xg.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, xg_predict)))
print((confusion_matrix(y2_test,xg_predict)))
print(classification_report(y2_test, xg_predict))
pd.crosstab(y2_test, xg_predict)

## LR

In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X2_train, y2_train)

In [None]:
LR_predict = LR.predict(X2_test)

In [None]:
print('Accuracy_score=',(accuracy_score(y2_test, LR_predict)))
print((confusion_matrix(y2_test, LR_predict)))
print(classification_report(y2_test, LR_predict))
pd.crosstab(y2_test, LR_predict)

## SVM

In [None]:
from sklearn import svm

SV = svm.SVC()
SV.fit(X2_train, y2_train)

In [None]:
svm = SV.predict(X2_test)

In [None]:
print("Accuracy:",accuracy_score(y2_test, svm))
print((confusion_matrix(y2_test, svm)))
print(classification_report(y2_test, svm))
pd.crosstab(y2_test, svm)

## RF

In [None]:
RF = RandomForestClassifier(random_state = 42)
RF.fit(X2_train, y2_train)

In [None]:
RF_pred = RF.predict(X2_test)
accuracy_score(y2_test, RF_pred)

In [None]:
from sklearn.metrics import confusion_matrix

print('Accuracy_score=',(accuracy_score(y2_test, RF_pred)))
print(confusion_matrix(y2_test, RF_pred))
print(classification_report(y2_test, RF_pred))
pd.crosstab(y2_test, RF_pred)

## DT

In [None]:
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X2_train, y2_train)

In [None]:
DT_predict = DT.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, DT_predict)))
print((confusion_matrix(y2_test, DT_predict)))

In [None]:
print(classification_report(y2_test, DT_predict))
pd.crosstab(y2_test, DT_predict)

## NB

In [None]:
NB = GaussianNB() 
NB.fit(X2_train, y2_train)

In [None]:
NB_pred = NB.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, NB_pred)))
print((confusion_matrix(y2_test, NB_pred)))
print(classification_report(y2_test, NB_pred))
pd.crosstab(y2_test, NB_pred)

## ANN

In [None]:
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(random_state=42)

In [None]:
NN.fit(X2_train, y2_train)

In [None]:
from sklearn.metrics import accuracy_score

ANN = NN.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, ANN)))
print((confusion_matrix(y2_test, ANN)))

In [None]:
print(classification_report(y2_test, ANN))
pd.crosstab(y2_test, ANN)

## XGBRF

In [None]:
from xgboost import XGBRFClassifier

In [None]:
xgbrf = XGBRFClassifier(random_state=22)
xgbrf.fit(X2_train,y2_train)

In [None]:
y_predict = xgbrf.predict(X2_test)
print(classification_report(y2_test, y_predict))
print(accuracy_score(y2_test, y_predict))
print((confusion_matrix(y2_test, y_predict)))
pd.crosstab(y2_test, y_predict)

# AUC-ROC after RFECV

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Uses the classifiers fitted in the RFECV section above
# (LR, xg, RF, DT, NB, SV, NN, xgbrf) and the X2_test / y2_test split.

# Legend label -> fitted model
models = {
    "Logistic Regression": LR,
    "XGBoost": xg,
    "Random Forest": RF,
    "Decision Tree": DT,
    "Naive Bayes": NB,
    "SVM": SV,
    "ANN": NN,
    "XGBRF": xgbrf,
}

plt.figure(figsize=(10, 8))  # Adjust figure size as needed

for name, model in models.items():
    # Score with the class-1 probability when the estimator offers it. An
    # estimator without predict_proba (e.g. an SVC created without
    # probability=True) is ranked by its decision_function instead. Checking
    # up front keeps unrelated AttributeErrors from being silently rerouted.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X2_test)[:, 1]
    else:
        y_score = model.decision_function(X2_test)

    fpr, tpr, _ = roc_curve(y2_test, y_score)
    # Stored as `roc_auc` so the `auc` function imported from sklearn.metrics
    # is not overwritten by a float.
    roc_auc = roc_auc_score(y2_test, y_score)
    plt.plot(fpr, tpr, label=f'{name} AUC = {roc_auc:.3f}')
    print(f'{name} ROC AUC: {roc_auc:.3f}')

plt.plot([0, 1], [0, 1], "k--", label="Baseline")  # Add random classifier line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='lower right')
plt.savefig('all_models_roc_curve.png') #save the figure
plt.show()

# Optuna hyperparameter tuning

# Optuna

## LR

In [None]:
def objective(trial,data=X2,target=y2):
    """Optuna objective for LogisticRegression.

    Samples one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split, so the
    # selected hyperparameters are tuned against the data later used for
    # reporting -- confirm this is intended.
    param = {
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform and matches the other objectives in this file.
        'tol' : trial.suggest_float('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_float("C", 1e-2, 1, log=True),
        'fit_intercept' : trial.suggest_categorical('fit_intercept' , [True, False]),
        'random_state' : trial.suggest_categorical('random_state' , [0, 42, 2021, 555]),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs','liblinear']),
        "n_jobs" : -1
    }
    model = LogisticRegression(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
trial = study.best_trial
print('accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

## XGB

In [None]:
def objective(trial,data=X2,target=y2):
    """Optuna objective for XGBClassifier.

    Samples one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split --
    # confirm that tuning against it is intended.
    #
    # The 0 choices were removed from the sampling fractions because XGBoost
    # documents them with the range (0, 1], and from learning_rate because a
    # zero step size cannot update the model.
    param = {
        'gamma': trial.suggest_categorical('gamma', [0, 1]),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.5,1]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,1]),
        'colsample_bynode' :trial.suggest_categorical('colsample_bynode', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]),
        'subsample': trial.suggest_categorical('subsample', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]),
        'n_estimators': trial.suggest_int('n_estimators', 1, 800),
        'learning_rate' :trial.suggest_categorical('learning_rate', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]),
        'max_delta_step':trial.suggest_int('max_delta_step', 0,80),
        'max_depth':trial.suggest_int('max_depth', 1,80),
        'random_state':trial.suggest_int('random_state', 1,500),
        'num_parallel_tree':trial.suggest_int('num_parallel_tree', 1,500),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 80),
        'verbosity':trial.suggest_categorical('verbosity', [0, 1]),
    }
    model = XGBClassifier(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
# Rebind `trial` to this study's winner; without it the prints below report
# the best trial left over from the previously run study.
trial = study.best_trial
print('accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

## RF

In [None]:
def objective(trial,data=X2,target=y2):
    """Optuna objective for RandomForestClassifier.

    Draws one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns the accuracy obtained on
    ``X2_test``/``y2_test``. ``data`` and ``target`` are accepted but not
    used by the body.
    """
    # Integer-valued settings as (name, low, high), in sampling order.
    integer_ranges = (
        ('n_estimators', 10, 1000),
        ('max_depth', 1, 90),
        ('n_jobs', 1, 80),
        ('random_state', 0, 500),
        ('min_samples_split', 5, 500),
        ('min_samples_leaf', 1, 500),
        ('max_leaf_nodes', 5, 500),
    )
    settings = {}
    for key, low, high in integer_ranges:
        settings[key] = trial.suggest_int(key, low, high)
    settings['bootstrap'] = trial.suggest_categorical('bootstrap',[True])
    settings['verbose'] = trial.suggest_categorical('verbose', [0,1])

    forest = RandomForestClassifier(**settings)
    forest.fit(X2_train,y2_train)

    return accuracy_score(y2_test, forest.predict(X2_test))


In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
trial = study.best_trial
print('Accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

## DT


In [None]:
def objective(trial,data=X2,target=y2):
    """Optuna objective for DecisionTreeClassifier.

    Samples one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split --
    # confirm that tuning against it is intended.
    param = {
        # A weight *fraction* is continuous on [0, 0.5]; it was previously
        # drawn with suggest_int, which cannot produce fractional values.
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 1,90),
        'random_state': trial.suggest_int('random_state', 0,500),
        'min_samples_split' :trial.suggest_int('min_samples_split', 5,50),
        'min_samples_leaf':trial.suggest_int('min_samples_leaf', 1,50),
        'max_leaf_nodes':trial.suggest_int('max_leaf_nodes', 5,50),
        'criterion': trial.suggest_categorical('criterion',['gini']),
        'splitter':trial.suggest_categorical('splitter', ['best']),
    }
    model = DecisionTreeClassifier(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
trial = study.best_trial
print('Accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

## NB

In [None]:
def objective(trial,data=X2,target=y2):
    """Optuna objective for GaussianNB.

    Samples ``var_smoothing`` from ``trial``, fits the model on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameter.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split --
    # confirm that tuning against it is intended.
    param = {
        "priors": None,
        # Previously fixed at 0 with nothing drawn from `trial`, so every one
        # of the study's trials evaluated the same model. Sample it on a log
        # scale instead so the study has something to tune.
        "var_smoothing": trial.suggest_float("var_smoothing", 1e-12, 1e-2, log=True),
    }
    model = GaussianNB(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
# Rebind `trial` to this study's winner; without it the prints below report
# the best trial left over from the previously run study.
trial = study.best_trial
print('accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

## SVM

In [None]:
from sklearn import svm

def objective(trial, data=X2,target=y2):
    """Optuna objective for svm.SVC.

    Samples one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split --
    # confirm that tuning against it is intended.
    param = {'C':trial.suggest_float("C", 1e-2, 1e2, log=True),
        # The bounds used to be 1e-2 .. 1e-2, which pins gamma to one value.
        # NOTE(review): 1e-4 .. 1e1 is a placeholder range -- adjust to the
        # interval that was actually intended.
        'gamma':trial.suggest_float("gamma", 1e-4, 1e1, log=True),
        'kernel':trial.suggest_categorical("kernel", ["linear", "poly", "rbf"])

    }
    model = svm.SVC(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
trial = study.best_trial
print('Accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

# ANN

In [None]:
def objective(trial, data=X2,target=y2):
    """Optuna objective for MLPClassifier.

    Samples one configuration from ``trial``, fits it on
    ``X2_train``/``y2_train`` and returns its accuracy on
    ``X2_test``/``y2_test``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Not used by the body; kept so the signature stays unchanged.
        target: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Accuracy of the fitted model on the test split.
    """
    # NOTE(review): the score is computed on the hold-out test split --
    # confirm that tuning against it is intended.
    param = {
        # The estimator's keyword is `hidden_layer_sizes`; the former key
        # `hidden_layer_size` is not an MLPClassifier argument.
        "hidden_layer_sizes": (trial.suggest_int("hidden_layer_sizes", 10, 200),),  # Single layer with size suggestion
        "activation": trial.suggest_categorical("activation", ["relu", "tanh", "logistic"]),
        "solver": trial.suggest_categorical("solver", ["adam", "sgd"]),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),  # L2 penalty (regularization)
        "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-5, 1e-1, log=True),
        "max_iter": 200,
        "early_stopping": True,  # Enable early stopping to avoid overfitting
        "random_state": 42
    }
    model = MLPClassifier(**param)
    model.fit(X2_train,y2_train)

    preds = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, preds)

    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
trial = study.best_trial
print('Accuracy:{}'.format(trial.value))
print("best hyperparameters:{}".format(trial.params))
print('Number of finished trials:', len(study.trials))

# XGBRFClassifier

In [None]:
from xgboost import XGBRFClassifier
def objective(trial, data=X2, target=y2):
    """Optuna objective: accuracy of an XGBRFClassifier for one trial.

    ``data`` and ``target`` are accepted but not read; the model is fitted on
    the notebook-level ``X2_train`` / ``y2_train`` and scored on
    ``X2_test`` / ``y2_test``.
    """
    # Settings held constant for every trial.
    fixed = {
        'base_score': 0.5,
        'booster': 'gbtree',
        'gpu_id': -1,
        'importance_type': 'gain',
        'interaction_constraints': '',
        'missing': None,
        'monotone_constraints': '()',
        'n_jobs': 8,
        'objective': 'binary:logistic',
        'random_state': 22,
        'tree_method': 'exact',
        'validate_parameters': 1,
        'verbosity': None,
    }
    # Sampled settings (suggestion order kept as in the original dict).
    # NOTE(review): check whether XGBRFClassifier honours num_parallel_tree
    # independently of n_estimators - TODO confirm against the xgboost docs.
    searched = {
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 2.0),
    }
    param = {**fixed, **searched}

    model = XGBRFClassifier(**param)
    model.fit(X2_train, y2_train)

    preds = model.predict(X2_test)
    return accuracy_score(y2_test, preds)

In [None]:
# Search the XGBRF space; the objective's return value (accuracy) is maximised.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)

# Report the best trial.
trial = study.best_trial
print(f'Accuracy:{trial.value}')
print(f"best hyperparameters:{trial.params}")
print('Number of finished trials:', len(study.trials))

## Model development - Tuned parameters

## LR

In [None]:
LR_c = LogisticRegression(tol= 0.0002872485265154782, C= 0.2507602699236452, fit_intercept= True, random_state= 42, solver= 'lbfgs')

In [None]:
LR_c.fit(X2_train, y2_train)
LR_predict_c = LR_c.predict(X2_test)

In [None]:
print('Accuracy_score=',(accuracy_score(y2_test, LR_predict_c)))
print((confusion_matrix(y2_test, LR_predict_c)))
print(classification_report(y2_test, LR_predict_c))
pd.crosstab(y2_test, LR_predict_c)

## AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned logistic regression, scored with the second
# column of predict_proba.
LR1 = LR_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, LR1)
roc_auc = roc_auc_score(y2_test, LR1)
print(f'LR ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('LR Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('LR ROC_curve.jpg')  # written before show()
plt.show()

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation (ROC AUC) of the tuned logistic regression
# on the training split.
LR_c = LogisticRegression(
    tol=0.0002872485265154782,
    C=0.2507602699236452,
    fit_intercept=True,
    random_state=42,
    solver='lbfgs',
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(LR_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation (ROC AUC) of the tuned logistic
# regression on the training split.
LR_c = LogisticRegression(
    tol=0.0002872485265154782,
    C=0.2507602699236452,
    fit_intercept=True,
    random_state=42,
    solver='lbfgs',
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(LR_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## RF

In [None]:
RF_c = RandomForestClassifier(n_estimators= 199, max_depth= 79, random_state= 1, min_samples_split= 14, min_samples_leaf= 10, max_leaf_nodes= 261, bootstrap= True, verbose= 1)
RF_c.fit(X2_train, y2_train)

In [None]:
RF_pred_c = RF_c.predict(X2_test)
print(accuracy_score(y2_test, RF_pred_c))
print(confusion_matrix(y2_test, RF_pred_c))
print(classification_report(y2_test, RF_pred_c))
pd.crosstab(y2_test, RF_pred_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned random forest, scored with the second column of
# predict_proba.
RF = RF_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, RF)
roc_auc = roc_auc_score(y2_test, RF)
print(f'RF ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('RF Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('RF ROC_curve.jpg')  # written before show()
plt.show()

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in
# KFold (and was repeated at the end of the cell).
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation (ROC AUC) of the tuned random forest on
# the training split.
RF_c = RandomForestClassifier(
    n_estimators=199, max_depth=79, random_state=1,
    min_samples_split=14, min_samples_leaf=10, max_leaf_nodes=261,
    bootstrap=True, verbose=1,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(RF_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation (ROC AUC) of the tuned random forest on
# the training split.
RF_c = RandomForestClassifier(
    n_estimators=199, max_depth=79, random_state=1,
    min_samples_split=14, min_samples_leaf=10, max_leaf_nodes=261,
    bootstrap=True, verbose=1,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(RF_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## XGB

In [None]:
import xgboost as xgb
xg_c = xgb.XGBClassifier(gamma= 1, colsample_bylevel= 1, colsample_bytree= 1, colsample_bynode= 0.6, subsample= 0.9, n_estimators= 173, learning_rate= 0.9, max_delta_step= 9, max_depth= 30, random_state= 131, num_parallel_tree= 155, min_child_weight= 19, verbosity= 1)
xg_c.fit(X2_train, y2_train)

In [None]:
xg_pred_c = xg_c.predict(X2_test)
print(accuracy_score(y2_test, xg_pred_c))
print(confusion_matrix(y2_test, xg_pred_c))
print(classification_report(y2_test, xg_pred_c))
pd.crosstab(y2_test, xg_pred_c)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned XGBoost model, scored with the second column of
# predict_proba.
xg = xg_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, xg)
roc_auc = roc_auc_score(y2_test, xg)
print(f'XGB ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('XGB Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('XGB ROC_curve.jpg')  # written before show()
plt.show()

# Cross validation

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned XGBoost settings on
# the training split. Note that xg_c is rebound to a fresh estimator here.
xg_c = xgb.XGBClassifier(
    gamma=1, colsample_bylevel=1, colsample_bytree=1, colsample_bynode=0.6,
    subsample=0.9, n_estimators=173, learning_rate=0.9, max_delta_step=9,
    max_depth=30, random_state=131, num_parallel_tree=155,
    min_child_weight=19, verbosity=1,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(xg_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned XGBoost settings
# on the training split. Note that xg_c is rebound to a fresh estimator here.
xg_c = xgb.XGBClassifier(
    gamma=1, colsample_bylevel=1, colsample_bytree=1, colsample_bynode=0.6,
    subsample=0.9, n_estimators=173, learning_rate=0.9, max_delta_step=9,
    max_depth=30, random_state=131, num_parallel_tree=155,
    min_child_weight=19, verbosity=1,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(xg_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## DT

In [None]:
DT_c = DecisionTreeClassifier(min_weight_fraction_leaf= 0, max_depth= 33, random_state= 66, min_samples_split= 24, min_samples_leaf= 4, max_leaf_nodes= 21, criterion= 'gini', splitter= 'best')
DT_c.fit(X2_train, y2_train)

In [None]:
DT_predict_c = DT_c.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, DT_predict_c)))
print((confusion_matrix(y2_test, DT_predict_c)))
print(classification_report(y2_test, DT_predict_c))
pd.crosstab(y2_test, DT_predict_c)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned decision tree, scored with the second column of
# predict_proba.
DT = DT_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, DT)
roc_auc = roc_auc_score(y2_test, DT)
print(f'DT ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('DT Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('DT ROC_curve.jpg')  # written before show()
plt.show()

# Cross val

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned decision tree
# settings on the training split. DT_c is rebound to a fresh estimator here.
DT_c = DecisionTreeClassifier(
    min_weight_fraction_leaf=0, max_depth=33, random_state=66,
    min_samples_split=24, min_samples_leaf=4, max_leaf_nodes=21,
    criterion='gini', splitter='best',
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(DT_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned decision tree
# settings on the training split. DT_c is rebound to a fresh estimator here.
DT_c = DecisionTreeClassifier(
    min_weight_fraction_leaf=0, max_depth=33, random_state=66,
    min_samples_split=24, min_samples_leaf=4, max_leaf_nodes=21,
    criterion='gini', splitter='best',
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(DT_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## NB

In [None]:
NB_c = GaussianNB(priors= None, var_smoothing= 0) 
NB_c.fit(X2_train, y2_train)

In [None]:
NB_pred_c = NB_c.predict(X2_test)
print('Accuracy_score=',(accuracy_score(y2_test, NB_pred_c)))
print((confusion_matrix(y2_test, NB_pred_c)))
print(classification_report(y2_test, NB_pred_c))
pd.crosstab(y2_test, NB_pred_c)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the Gaussian naive Bayes model, scored with the second
# column of predict_proba.
NB = NB_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, NB)
roc_auc = roc_auc_score(y2_test, NB)
print(f'NB ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('NB Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('NB ROC_curve.jpg')  # written before show()
plt.show()

# Cross Val

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of Gaussian naive Bayes on the
# training split. NB_c is rebound to a fresh estimator here.
# NOTE(review): var_smoothing=0 disables variance smoothing - confirm intended.
NB_c = GaussianNB(priors=None, var_smoothing=0)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(NB_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of Gaussian naive Bayes on the
# training split. NB_c is rebound to a fresh estimator here.
# NOTE(review): var_smoothing=0 disables variance smoothing - confirm intended.
NB_c = GaussianNB(priors=None, var_smoothing=0)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(NB_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## SVM

In [None]:
from sklearn import svm

SV_c = svm.SVC(C= 0.012300586651659302, kernel= 'linear', gamma= 0.023918109379818128, probability=True)
SV_c.fit(X2_train, y2_train)

In [None]:
svm_c = SV_c.predict(X2_test)
print("Accuracy:",accuracy_score(y2_test, svm_c))
print((confusion_matrix(y2_test, svm_c)))
print(classification_report(y2_test, svm_c))
pd.crosstab(y2_test, svm_c)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned SVM, scored with the second column of
# predict_proba (SV_c was built with probability=True in the cell above).
SVM = SV_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, SVM)
roc_auc = roc_auc_score(y2_test, SVM)
print(f'SVM ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('SVM Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('SVM ROC_curve.jpg')  # written before show()
plt.show()

# Cross Val

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned SVM settings on the
# training split. SV_c is rebound here, this time without probability=True.
SV_c = svm.SVC(
    C=0.012300586651659302,
    kernel='linear',
    gamma=0.023918109379818128,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(SV_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned SVM settings on
# the training split. SV_c is rebound here, this time without probability=True.
SV_c = svm.SVC(
    C=0.012300586651659302,
    kernel='linear',
    gamma=0.023918109379818128,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(SV_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## ANN

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
NN_cf = MLPClassifier(hidden_layer_sizes= 152, activation= 'relu', solver= 'adam', alpha= 0.00015529246552986592, learning_rate_init= 0.0006044787294862748, max_iter= 200, random_state= 42)
NN_cf.fit(X2_train, y2_train)

In [None]:
ANN_c = NN_cf.predict(X2_test)
print(accuracy_score(y2_test, ANN_c))
print(classification_report(y2_test, ANN_c))
pd.crosstab(y2_test, ANN_c)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned MLP, scored with the second column of
# predict_proba.
NN = NN_cf.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, NN)
roc_auc = roc_auc_score(y2_test, NN)
print(f'NN ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('NN Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('NN ROC_curve.jpg')  # written before show()
plt.show()

# Cross Val

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned MLP settings on the
# training split. NN_cf is rebound to a fresh estimator here.
NN_cf = MLPClassifier(
    hidden_layer_sizes=152, activation='relu', solver='adam',
    alpha=0.00015529246552986592,
    learning_rate_init=0.0006044787294862748,
    max_iter=200, random_state=42,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(NN_cf, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned MLP settings on
# the training split. NN_cf is rebound to a fresh estimator here.
NN_cf = MLPClassifier(
    hidden_layer_sizes=152, activation='relu', solver='adam',
    alpha=0.00015529246552986592,
    learning_rate_init=0.0006044787294862748,
    max_iter=200, random_state=42,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(NN_cf, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

# XGBRF

In [None]:
from xgboost import XGBRFClassifier
xgbrf_c = XGBRFClassifier(colsample_bylevel= 0.7925507241456118, colsample_bytree= 0.9554889058169239, gamma= 0.39864754821171, max_delta_step= 1, max_depth= 10, min_child_weight= 1, n_estimators= 181, num_parallel_tree= 8, reg_alpha= 0.6056467523570456, scale_pos_weight= 1.6055505228911027)
xgbrf_c.fit(X2_train,y2_train)

In [None]:
y_predict = xgbrf_c.predict(X2_test)
print(classification_report(y2_test, y_predict))
print(accuracy_score(y2_test, y_predict))
print((confusion_matrix(y2_test, y_predict)))
pd.crosstab(y2_test, y_predict)

# AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Hold-out ROC for the tuned XGBRF model, scored with the second column of
# predict_proba.
XGBRF = xgbrf_c.predict_proba(X2_test)[:, 1]
fpr1, tpr1, threshold = roc_curve(y2_test, XGBRF)
roc_auc = roc_auc_score(y2_test, XGBRF)
print(f'XGBRF ROC AUC {roc_auc:.3f}')

plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.title('XGBRF Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('XGBRF ROC_curve.jpg')  # written before show()
plt.show()

# Cross Val

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned XGBRF settings on
# the training split. xgbrf_c is rebound to a fresh estimator here.
xgbrf_c = XGBRFClassifier(
    colsample_bylevel=0.7925507241456118,
    colsample_bytree=0.9554889058169239,
    gamma=0.39864754821171,
    max_delta_step=1, max_depth=10, min_child_weight=1,
    n_estimators=181, num_parallel_tree=8,
    reg_alpha=0.6056467523570456,
    scale_pos_weight=1.6055505228911027,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(xgbrf_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned XGBRF settings on
# the training split. xgbrf_c is rebound to a fresh estimator here.
xgbrf_c = XGBRFClassifier(
    colsample_bylevel=0.7925507241456118,
    colsample_bytree=0.9554889058169239,
    gamma=0.39864754821171,
    max_delta_step=1, max_depth=10, min_child_weight=1,
    n_estimators=181, num_parallel_tree=8,
    reg_alpha=0.6056467523570456,
    scale_pos_weight=1.6055505228911027,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(xgbrf_c, X2_train, y2_train, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

# AUC-ROC - Tuned Parameters

In [None]:
# Refit every tuned model on the training split. The cross-validation cells
# above rebind these names to newly constructed estimators, so the models are
# fitted again here before the combined ROC plot below uses them.
LR_c.fit(X2_train, y2_train)
RF_c.fit(X2_train, y2_train)
xg_c.fit(X2_train, y2_train)
DT_c.fit(X2_train, y2_train)
NB_c.fit(X2_train, y2_train)
# SV_c was last built without probability=True, so it has no predict_proba.
SV_c.fit(X2_train, y2_train)
NN_cf.fit(X2_train, y2_train)
xgbrf_c.fit(X2_train, y2_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Overlay the hold-out ROC curves of all tuned models.
# Uses the fitted estimators LR_c, RF_c, xg_c, DT_c, NB_c, SV_c, NN_cf, xgbrf_c
# and the hold-out split X2_test / y2_test.

# Legend label -> fitted estimator.
models = {
    "Logistic Regression": LR_c,
    "XGBoost": xg_c,
    "Random Forest": RF_c,
    "Decision Tree": DT_c,
    "Naive Bayes": NB_c,
    "SVM": SV_c,
    "ANN": NN_cf,
    "XGBRF": xgbrf_c,
}

plt.figure(figsize=(10, 8))  # Adjust figure size as needed

for name, model in models.items():
    # Only the scoring call sits inside the try, so an AttributeError raised by
    # the metric or plotting code is not mistaken for a missing predict_proba.
    try:
        y_pred_proba = model.predict_proba(X2_test)[:, 1]
    except AttributeError:
        # e.g. an SVC built without probability=True: use the decision values.
        y_pred_proba = model.decision_function(X2_test)
    fpr, tpr, _ = roc_curve(y2_test, y_pred_proba)
    auc = roc_auc_score(y2_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} AUC = {auc:.3f}')
    print(f'{name} ROC AUC: {auc:.3f}')

plt.plot([0, 1], [0, 1], "k--", label="Baseline")  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='lower right')
plt.savefig('all_models_roc_curve.png')  # save before show()
plt.show()

# Missing values - Test Set

In [None]:
df2 = pd.read_csv('4_Testset_ml_ido_tdo.csv')
df2.head(2)

In [None]:
# A bare `df2.shape` that is not the cell's last expression is never displayed,
# so print it explicitly before the column summary.
print(df2.shape)
df2.info()

In [None]:
df2['Activity'].value_counts()

In [None]:
# Percentage of missing values per column, listed in ascending order.

percent_missing = df2.isnull().sum() * 100 / len(df2)
missing_value_df = pd.DataFrame({'column_name': df2.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    # Print the sorted table; the original printed the unsorted Series and
    # never used the table built above.
    print(missing_value_df)

In [None]:
df2.shape

In [None]:
# Descriptor columns inspected for imputation; the last line echoes the mean of
# each of them.
cols = [
    'BCUTw-1h', 'BCUTc-1l', 'BCUTc-1h', 'BCUTp-1l', 'BCUTp-1h',
    'SCH-7', 'VCH-7', 'SC-3', 'SC-5', 'VC-3',
    'SPC-4', 'SPC-5', 'SPC-6', 'VPC-4', 'VPC-5', 'VPC-6',
    'SP-0', 'SP-1', 'SP-2', 'SP-3', 'SP-4', 'SP-5', 'SP-6', 'SP-7',
    'VP-0', 'VP-1', 'VP-2', 'VP-3', 'VP-4', 'VP-5', 'VP-6', 'VP-7',
    'VABC',
]
df2[cols].mean()

In [None]:
# Columns whose missing entries are replaced by that column's own mean.
cols = [
    'BCUTw-1h', 'BCUTc-1l', 'BCUTc-1h', 'BCUTp-1l', 'BCUTp-1h',
    'SCH-7', 'VCH-7', 'SC-3', 'SC-5', 'VC-3',
    'SPC-4', 'SPC-5', 'SPC-6', 'VPC-4', 'VPC-5', 'VPC-6',
    'SP-0', 'SP-1', 'SP-2', 'SP-3', 'SP-4', 'SP-5', 'SP-6', 'SP-7',
    'VP-0', 'VP-1', 'VP-2', 'VP-3', 'VP-4', 'VP-5', 'VP-6', 'VP-7',
    'VABC',
]
# Means are taken from df2 itself, then used to fill the gaps column by column.
# NOTE(review): the fill values come from this test set, not the training data
# - confirm that is intended.
col_means = df2[cols].mean()
df2[cols] = df2[cols].fillna(col_means)

In [None]:
df2.isnull().sum()

In [None]:
# Export the DataFrame to a CSV file
df2.to_csv('output_test_final_missi_val.csv', index=False)  

# External Validation

In [None]:
# Load the external validation dataset from 6_Ext_val_xgbrf_test_rfecv.csv.
# NOTE(review): the earlier comment named output_test_final_missi_val.csv (the
# imputed test set exported above); presumably this file is that table reduced
# to the RFECV-selected features - TODO confirm.

df3 = pd.read_csv('6_Ext_val_xgbrf_test_rfecv.csv')
df3.head()

In [None]:
X3 = df3.drop(['Activity'], axis=1)  # Features
y3 = df3['Activity']  # Labels

## LR

In [None]:
# NOTE(review): this refits LR_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
LR_c.fit(X3, y3)

In [None]:
LR_predict_c = LR_c.predict(X3)
print('Accuracy_score=',(accuracy_score(y3, LR_predict_c)))
print((confusion_matrix(y3, LR_predict_c)))
print(classification_report(y3, LR_predict_c))
pd.crosstab(y3, LR_predict_c)

## AUC-ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the logistic regression on the external set, scored with the second
# column of predict_proba.
LR1 = LR_c.predict_proba(X3)[:, 1]
roc_auc_ext = roc_auc_score(y3, LR1)
fpr1, tpr1, threshold = roc_curve(y3, LR1)
plt.title('LR Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc_ext:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'LR ROC AUC {roc_auc_ext:.3f}')
# Separate file name: 'LR ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('LR ROC_curve_external.jpg')
plt.show()

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation (ROC AUC) of the tuned logistic regression
# settings on the external set.
LR = LogisticRegression(
    tol=0.0002872485265154782,
    C=0.2507602699236452,
    fit_intercept=True,
    random_state=42,
    solver='lbfgs',
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(LR, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation (ROC AUC) of the tuned logistic
# regression settings on the external set.
LR = LogisticRegression(
    tol=0.0002872485265154782,
    C=0.2507602699236452,
    fit_intercept=True,
    random_state=42,
    solver='lbfgs',
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(LR, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## XGB

In [None]:
# NOTE(review): this refits xg_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
xg_c.fit(X3, y3)

In [None]:
xg_pred_c = xg_c.predict(X3)
print(accuracy_score(y3, xg_pred_c))
print(confusion_matrix(y3, xg_pred_c))
print(classification_report(y3, xg_pred_c))
pd.crosstab(y3, xg_pred_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the XGBoost model on the external set, scored with the second column
# of predict_proba.
xg_ex = xg_c.predict_proba(X3)[:, 1]
roc_auc = roc_auc_score(y3, xg_ex)
fpr1, tpr1, threshold = roc_curve(y3, xg_ex)
plt.title('XGB Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'XGB ROC AUC {roc_auc:.3f}')
# Separate file name: 'XGB ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('XGB ROC_curve_external.jpg')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold cross-validation (ROC AUC) of the tuned XGBoost settings on the
# external set. Uses StratifiedKFold like every other CV cell in this notebook
# (the original used plain KFold here), and builds the estimator under its own
# name so the fitted xg_c stays available for the combined ROC plot below.
xg_cv = xgb.XGBClassifier(
    gamma=1, colsample_bylevel=1, colsample_bytree=1, colsample_bynode=0.6,
    subsample=0.9, n_estimators=173, learning_rate=0.9, max_delta_step=9,
    max_depth=30, random_state=131, num_parallel_tree=155,
    min_child_weight=19, verbosity=1,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(xg_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 10-fold cross-validation (ROC AUC) of the tuned XGBoost settings on the
# external set. Uses StratifiedKFold like every other CV cell in this notebook
# (the original used plain KFold here), and builds the estimator under its own
# name so the fitted xg_c stays available for the combined ROC plot below.
xg_cv = xgb.XGBClassifier(
    gamma=1, colsample_bylevel=1, colsample_bytree=1, colsample_bynode=0.6,
    subsample=0.9, n_estimators=173, learning_rate=0.9, max_delta_step=9,
    max_depth=30, random_state=131, num_parallel_tree=155,
    min_child_weight=19, verbosity=1,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(xg_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## RF

In [None]:
# NOTE(review): this refits RF_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
RF_c.fit(X3, y3)

In [None]:
RF_pred_c = RF_c.predict(X3)
print(accuracy_score(y3, RF_pred_c))
print(confusion_matrix(y3, RF_pred_c))
print(classification_report(y3, RF_pred_c))
pd.crosstab(y3, RF_pred_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the random forest on the external set, scored with the second column
# of predict_proba.
RF = RF_c.predict_proba(X3)[:, 1]
roc_auc = roc_auc_score(y3, RF)
fpr1, tpr1, threshold = roc_curve(y3, RF)
plt.title('RF Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'RF ROC AUC {roc_auc:.3f}')
# Separate file name: 'RF ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('RF ROC_curve_external.jpg')
plt.show()

In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation (ROC AUC) of the tuned random forest
# settings on the external set. The estimator gets its own name so the fitted
# RF_c stays available for the combined ROC plot below.
RF_cv = RandomForestClassifier(
    n_estimators=199, max_depth=79, random_state=1,
    min_samples_split=14, min_samples_leaf=10, max_leaf_nodes=261,
    bootstrap=True, verbose=1,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(RF_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")


In [None]:
# StratifiedKFold is what the cell uses; the original import only brought in KFold.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation (ROC AUC) of the tuned random forest
# settings on the external set. The estimator gets its own name so the fitted
# RF_c stays available for the combined ROC plot below.
RF_cv = RandomForestClassifier(
    n_estimators=199, max_depth=79, random_state=1,
    min_samples_split=14, min_samples_leaf=10, max_leaf_nodes=261,
    bootstrap=True, verbose=1,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(RF_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## DT

In [None]:
# NOTE(review): this refits DT_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
DT_c.fit(X3, y3)

In [None]:
DT_predict_c = DT_c.predict(X3)
print('Accuracy_score=',(accuracy_score(y3, DT_predict_c)))
print((confusion_matrix(y3, DT_predict_c)))
print(classification_report(y3, DT_predict_c))
pd.crosstab(y3, DT_predict_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the decision tree on the external set, scored with the second column
# of predict_proba.
DT = DT_c.predict_proba(X3)[:, 1]
roc_auc = roc_auc_score(y3, DT)
fpr1, tpr1, threshold = roc_curve(y3, DT)
plt.title('DT Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'DT ROC AUC {roc_auc:.3f}')
# Separate file name: 'DT ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('DT ROC_curve_external.jpg')
plt.show()

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned decision tree
# settings on the external set. The estimator gets its own name so the fitted
# DT_c stays available for the combined ROC plot below.
DT_cv = DecisionTreeClassifier(
    min_weight_fraction_leaf=0, max_depth=33, random_state=66,
    min_samples_split=24, min_samples_leaf=4, max_leaf_nodes=21,
    criterion='gini', splitter='best',
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(DT_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned decision tree
# settings on the external set. The estimator gets its own name so the fitted
# DT_c stays available for the combined ROC plot below.
DT_cv = DecisionTreeClassifier(
    min_weight_fraction_leaf=0, max_depth=33, random_state=66,
    min_samples_split=24, min_samples_leaf=4, max_leaf_nodes=21,
    criterion='gini', splitter='best',
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(DT_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

## NB

In [None]:
# NOTE(review): this refits NB_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
NB_c.fit(X3, y3)

In [None]:
NB_pred_c = NB_c.predict(X3)
print('Accuracy_score=',(accuracy_score(y3, NB_pred_c)))
print((confusion_matrix(y3, NB_pred_c)))
print(classification_report(y3, NB_pred_c))
pd.crosstab(y3, NB_pred_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the Gaussian naive Bayes model on the external set, scored with the
# second column of predict_proba.
NB = NB_c.predict_proba(X3)[:, 1]
roc_auc = roc_auc_score(y3, NB)
fpr1, tpr1, threshold = roc_curve(y3, NB)
plt.title('NB Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'NB ROC AUC {roc_auc:.3f}')
# Separate file name: 'NB ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('NB ROC_curve_external.jpg')
plt.show()

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of Gaussian naive Bayes on the
# external set. The estimator gets its own name so the fitted NB_c stays
# available for the combined ROC plot below.
# NOTE(review): var_smoothing=0 disables variance smoothing; the nanmean below
# suggests some folds may yield NaN - confirm.
NB_cv = GaussianNB(priors=None, var_smoothing=0)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(NB_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{np.nanmean(score)}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of Gaussian naive Bayes on the
# external set. The estimator gets its own name so the fitted NB_c stays
# available for the combined ROC plot below.
# NOTE(review): var_smoothing=0 disables variance smoothing; the nanmean below
# suggests some folds may yield NaN - confirm.
NB_cv = GaussianNB(priors=None, var_smoothing=0)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(NB_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{np.nanmean(score)}")

# SVM 

In [None]:
# NOTE(review): this refits SV_c on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
SV_c.fit(X3, y3)

In [None]:
svm_c = SV_c.predict(X3)
print("Accuracy:",accuracy_score(y3, svm_c))
print((confusion_matrix(y3, svm_c)))
print(classification_report(y3, svm_c))
pd.crosstab(y3, svm_c)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the SVM on the external set. SV_c was last built without
# probability=True (see the cross-validation cells), in which case
# predict_proba is unavailable; fall back to the decision values, which
# roc_curve / roc_auc_score accept as scores.
try:
    SVM = SV_c.predict_proba(X3)[:, 1]
except AttributeError:
    SVM = SV_c.decision_function(X3)
roc_auc = roc_auc_score(y3, SVM)
fpr1, tpr1, threshold = roc_curve(y3, SVM)
plt.title('SVM Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'SVM ROC AUC {roc_auc:.3f}')
# Separate file name: 'SVM ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('SVM ROC_curve_external.jpg')
plt.show()

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned SVM settings on the
# external set. The estimator gets its own name so the fitted SV_c stays
# available for the combined ROC plot below.
SV_cv = svm.SVC(
    C=0.012300586651659302,
    kernel='linear',
    gamma=0.023918109379818128,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(SV_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned SVM settings on
# the external set. The estimator gets its own name so the fitted SV_c stays
# available for the combined ROC plot below.
SV_cv = svm.SVC(
    C=0.012300586651659302,
    kernel='linear',
    gamma=0.023918109379818128,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(SV_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

# XGBRF

In [None]:
# NOTE(review): this refits xgbrf_c on the external set (X3, y3); the next
# cells predict on the same X3, so the reported scores are computed on the data
# the model was fitted to. Confirm whether the refit is intended.
xgbrf_c.fit(X3, y3)

In [None]:
y_predict = xgbrf_c.predict(X3)
print(classification_report(y3, y_predict))
print(accuracy_score(y3, y_predict))
pd.crosstab(y3, y_predict)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# ROC of the XGBRF model on the external set, scored with the second column of
# predict_proba.
XGBRF_ex = xgbrf_c.predict_proba(X3)[:, 1]
roc_auc = roc_auc_score(y3, XGBRF_ex)
fpr1, tpr1, threshold = roc_curve(y3, XGBRF_ex)
plt.title('XGBRF Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, label=f'AUC = {roc_auc:0.3f}')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
print(f'XGBRF ROC AUC {roc_auc:.3f}')
# Separate file name: 'XGBRF ROC_curve.jpg' is already written by the hold-out
# ROC cell and would be overwritten here.
plt.savefig('XGBRF ROC_curve_external.jpg')
plt.show()

In [None]:
# 5-fold stratified cross-validation (ROC AUC) of the tuned XGBRF settings on
# the external set. The estimator gets its own name so the fitted xgbrf_c stays
# available for the combined ROC plot below.
xgbrf_cv = XGBRFClassifier(
    colsample_bylevel=0.7925507241456118,
    colsample_bytree=0.9554889058169239,
    gamma=0.39864754821171,
    max_delta_step=1, max_depth=10, min_child_weight=1,
    n_estimators=181, num_parallel_tree=8,
    reg_alpha=0.6056467523570456,
    scale_pos_weight=1.6055505228911027,
)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
score = cross_val_score(xgbrf_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

In [None]:
# 10-fold stratified cross-validation (ROC AUC) of the tuned XGBRF settings on
# the external set. The estimator gets its own name so the fitted xgbrf_c stays
# available for the combined ROC plot below.
xgbrf_cv = XGBRFClassifier(
    colsample_bylevel=0.7925507241456118,
    colsample_bytree=0.9554889058169239,
    gamma=0.39864754821171,
    max_delta_step=1, max_depth=10, min_child_weight=1,
    n_estimators=181, num_parallel_tree=8,
    reg_alpha=0.6056467523570456,
    scale_pos_weight=1.6055505228911027,
)
kf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
score = cross_val_score(xgbrf_cv, X3, y3, cv=kf, scoring='roc_auc')
print(f"Cross Validation Scores are {score}")
print(f"Average Cross Validation score :{score.mean()}")

# ANN

In [None]:
# NOTE(review): this refits NN_cf on the external set (X3, y3); the next cells
# predict on the same X3, so the reported scores are computed on the data the
# model was fitted to. Confirm whether the refit is intended.
NN_cf.fit(X3, y3)

In [None]:
y_predict = NN_cf.predict(X3)
print(classification_report(y3, y_predict))
print(accuracy_score(y3, y_predict))
pd.crosstab(y3, y_predict)

## AUC & ROC

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Score X3 with the fitted ANN. Column 1 of predict_proba is the probability
# of the second entry in classes_ (presumably the active class; confirm
# against the label encoding).
NN1 = NN_cf.predict_proba(X3)[:, 1]
fpr1, tpr1, threshold = roc_curve(y3, NN1)
roc_auc = roc_auc_score(y3, NN1)

# Draw the ROC curve with the AUC shown in the legend.
plt.plot(fpr1, tpr1, label='AUC = %0.3f' % roc_auc)
plt.title('ANN Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

# Report the AUC as text, then write the figure to disk before displaying it.
print('ANN ROC AUC %.3f' % roc_auc)
plt.savefig('ANN ROC_curve.jpg')
plt.show()

In [None]:
# 5-fold stratified cross-validation of the tuned MLP, scored by ROC AUC.
# NOTE: this rebinds NN_cf to a fresh, unfitted estimator; cross_val_score
# fits internal clones only, so NN_cf itself is not fitted after this cell.
NN_cf = MLPClassifier(
    hidden_layer_sizes=152,
    activation='relu',
    solver='adam',
    alpha=0.00015529246552986592,
    learning_rate_init=0.0006044787294862748,
    max_iter=200,
    random_state=42,
)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(NN_cf, X3, y3, scoring='roc_auc', cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

In [None]:
# 10-fold stratified cross-validation of the tuned MLP, scored by ROC AUC.
# NOTE: this rebinds NN_cf to a fresh, unfitted estimator; cross_val_score
# fits internal clones only, so NN_cf itself is not fitted after this cell.
NN_cf = MLPClassifier(
    hidden_layer_sizes=152,
    activation='relu',
    solver='adam',
    alpha=0.00015529246552986592,
    learning_rate_init=0.0006044787294862748,
    max_iter=200,
    random_state=42,
)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(NN_cf, X3, y3, scoring='roc_auc', cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

# AUC-ROC External validation - combined

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Overlay the ROC curve of every classifier, scored on X3 / y3, in one figure
# and print each model's ROC AUC.
#
# NOTE: every estimator below must already be fitted when this cell runs. The
# cross-validation cells rebind NN_cf and xgbrf_c to fresh, unfitted instances
# (cross_val_score only fits clones), so re-run the matching .fit(...) cell
# before this one.

models = {
    "Logistic Regression": LR_c,
    "XGBoost": xg_c,
    "Random Forest": RF_c,
    "Decision Tree": DT_c,
    "Naive Bayes": NB_c,
    "SVM": SV_c,
    "ANN Network": NN_cf,
    "XGBRF": xgbrf_c,
}

plt.figure(figsize=(10, 8))  # Adjust figure size as needed

for name, model in models.items():
    # Pick the scoring method by capability rather than by catching
    # AttributeError: sklearn's NotFittedError subclasses AttributeError, so a
    # blanket handler would swallow a "model not fitted" failure and replace it
    # with a misleading decision_function error.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X3)[:, 1]
    else:
        # e.g. an SVM built without probability estimates
        y_proba = model.decision_function(X3)

    auc = roc_auc_score(y3, y_proba)
    fpr, tpr, _ = roc_curve(y3, y_proba)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")
    print(f"{name} ROC AUC: {auc:.3f}")

plt.plot([0, 1], [0, 1], "k--", label="Baseline")  # Chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curves")
plt.legend(loc="lower right")
plt.savefig("all_models_roc_curve.png")  # Save before show() so the file is not blank
plt.show()

# Applicability domain evaluation

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

# Distance-to-centroid applicability domain (AD): an external compound is
# "within AD" when its distance to the training-set centroid in standardised
# descriptor space does not exceed mean + k * SD of the training distances.

# ---- Fit the scaler on the training data only, then reuse it for X3 ----
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train)
X3_scaled = scaler.transform(X3)

# ---- Fit final model ----
# NOTE: this refits the shared xgbrf_c object in place on the scaled training data.
xgbrf_c.fit(X2_train_scaled, y2_train)

# ---- Compute the training centroid and distances ----
# cdist uses its default metric (Euclidean) and returns an (n_samples, 1) array.
train_centroid = np.mean(X2_train_scaled, axis=0)
train_distances = cdist(X2_train_scaled, [train_centroid])

# ---- AD threshold = mean + AD_SD_MULTIPLIER x SD of the training distances ----
# 3 is the value this analysis uses; a smaller multiplier (e.g. 2) gives a
# stricter domain.
AD_SD_MULTIPLIER = 3
AD_threshold = np.mean(train_distances) + AD_SD_MULTIPLIER * np.std(train_distances)
print(f"Applicability Domain threshold: {AD_threshold:.3f}")

# ---- Compute distances for external validation compounds ----
ext_distances = cdist(X3_scaled, [train_centroid])
within_AD = (ext_distances <= AD_threshold).flatten()

# ---- Make predictions for external dataset ----
XGBRF_preds = xgbrf_c.predict(X3_scaled)

# ---- Combine results into a dataframe (one row per external compound) ----
AD_results = pd.DataFrame({
    'Distance_to_Centroid': ext_distances.flatten(),
    'Within_AD': within_AD,
    'XGBRF_Pred': XGBRF_preds,
    'Actual_Activity': y3
})

# ---- Summarize AD coverage ----
within_count = np.sum(within_AD)
total = len(X3)
print(f"Compounds within AD: {within_count} / {total}")
AD_results.to_csv("External_AD_Results.csv", index=False)
# ---- View sample results ----
AD_results.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# ---- One point per external compound: row position vs. distance to centroid,
# ---- coloured green when inside the applicability domain and pink otherwise
plt.figure(figsize=(8, 6))
plt.scatter(
    np.arange(len(AD_results)),
    AD_results['Distance_to_Centroid'],
    c=AD_results['Within_AD'].map({True: 'green', False: 'pink'}),
    label='Compounds',
)

# ---- Horizontal line at the AD threshold ----
plt.axhline(
    AD_threshold,
    color='magenta',
    linestyle='--',
    linewidth=2,
    label=f'AD Threshold = {AD_threshold:.2f}',
)

# ---- Titles, axis labels and legend ----
plt.title("Applicability Domain Evaluation (XGBRF Model)", fontsize=14)
plt.ylabel("Distance to Training Set Centroid", fontsize=12)
plt.xlabel("Compound Index (External Validation Set)", fontsize=12)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

# Screening of the AnalytiCon Discovery database

In [None]:
# Load the screening table from the Excel workbook and preview its first rows.
df4 = pd.read_excel('7_MEGxp_data.xlsx')
df4.head()

In [None]:
# Count the missing values in each column of df4.
df4.isnull().sum()

In [None]:
# Percentage of missing values in each column of df4, listed from the least
# to the most incomplete column.

percent_missing = df4.isnull().sum() * 100 / len(df4)
missing_value_df = pd.DataFrame({'column_name': df4.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    # Print the sorted column of the table built above; previously the unsorted
    # percent_missing Series was printed and the sorted table was never used.
    print(missing_value_df['percent_missing'])

In [None]:
# Dimensions (rows, columns) of df4.
df4.shape

In [None]:
# Descriptor columns to inspect; the cell displays the mean of each one.
cols = [
    'AATSC6s', 'AATSC7s', 'AATSC8s',
    'VE3_DzZ', 'VE3_Dzs',
    'BCUTc-1l', 'BCUTp-1h',
    'SpMax5_Bhm', 'SpMax6_Bhm', 'SpMax5_Bhv',
    'SpMin8_Bhe', 'SpMax3_Bhp', 'SpMax1_Bhs',
    'SCH-7', 'VCH-7', 'SC-3', 'SC-5',
    'nHBd', 'SHBint4', 'SHsOH', 'SsssCH', 'StsC',
    'minHCsats', 'minsCH3', 'minssCH2', 'mindsCH', 'mindO', 'minsF',
    'maxHBd', 'maxHBa', 'maxaaCH',
]
df4[cols].mean()

In [None]:
# Descriptor columns whose gaps are imputed with that column's own mean.
# NOTE(review): the means come from the screening table itself rather than
# from the model's training data; confirm this is the intended imputation.
cols = [
    'AATSC6s', 'AATSC7s', 'AATSC8s',
    'VE3_DzZ', 'VE3_Dzs',
    'BCUTc-1l', 'BCUTp-1h',
    'SpMax5_Bhm', 'SpMax6_Bhm', 'SpMax5_Bhv',
    'SpMin8_Bhe', 'SpMax3_Bhp', 'SpMax1_Bhs',
    'SCH-7', 'VCH-7', 'SC-3', 'SC-5',
    'nHBd', 'SHBint4', 'SHsOH', 'SsssCH', 'StsC',
    'minHCsats', 'minsCH3', 'minssCH2', 'mindsCH', 'mindO', 'minsF',
    'maxHBd', 'maxHBa', 'maxaaCH',
]

# Compute all column means once; DataFrame.fillna matches them to columns by label.
df4[cols] = df4[cols].fillna(df4[cols].mean())

# Confirm that no missing values remain in the imputed columns.
missing_values_after = df4[cols].isnull().sum()
print(missing_values_after)

In [None]:
# Dimensions (rows, columns) of df4.
df4.shape

In [None]:
# Count the missing values in each column of df4.
df4.isnull().sum()

In [None]:
# Write df4 to a CSV file without the index column.
df4.to_csv('missingval_xgbrf_AD_predictions_output.csv', index=False)  # Adjust the filename as needed

In [None]:
# Load the CSV file into df5 and preview its first rows.
df5 = pd.read_csv('missingval_xgbrf_AD_predictions_output.csv')
df5.head()

In [None]:
# Select the descriptor and fingerprint columns of df5 as the feature matrix x.
# NOTE(review): x is later passed to xgbrf_c.predict, so this list presumably
# has to match the columns (and order) of the data the model is fitted on;
# verify against X3.
x = df5.loc[:,['ATSC6p','ATSC5i','ATSC6i','AATSC6m','AATSC7m','AATSC8m','AATSC4v','AATSC6v','AATSC7v','AATSC8v','AATSC7i','AATSC8i','AATSC6s','AATSC7s','AATSC8s','MATS8m','MATS8e','MATS8s','GATS5c','GATS6c','GATS7c','GATS8c','GATS6m','GATS8m','GATS8v','GATS5e','GATS7e','GATS8e','GATS8p','GATS4s','GATS8s','VE3_DzZ','VE3_Dzs','BCUTc-1l','BCUTp-1h','SpMax5_Bhm','SpMax6_Bhm','SpMax5_Bhv','SpMin8_Bhe','SpMax3_Bhp','SpMax1_Bhs','SCH-7','VCH-7','SC-3','SC-5','CrippenLogP','nHBd','SHBint4','SHsOH','SsssCH','StsC','minHCsats','minsCH3','minssCH2','mindsCH','mindO','minsF','maxHBd','maxHBa','maxaaCH','IC2','IC3','CIC1','CIC3','MIC0','ZMIC1','ZMIC2','MLogP','MDEC-22','MDEC-33','MDEN-12','MDEN-23','MLFER_BH','MLFER_S','MLFER_E','MLFER_L','R_TpiPCTPC','nRotB','GGI3','GGI5','TopoPSA','XLogP','FP17','FP166','FP354','FP395','FP493','FP682','FP875','FP876','FP884','FP912','FP928','ExtFP48','ExtFP99','ExtFP354','ExtFP374','ExtFP579','ExtFP685','ExtFP704','ExtFP819','ExtFP924','ExtFP931','ExtFP993','ExtFP996','GraphFP21','GraphFP73','GraphFP149','GraphFP170','GraphFP356','GraphFP402','GraphFP453','GraphFP458','GraphFP641','GraphFP780','GraphFP1016','KRFP1566','AD2D627','AD2D628']]

In [None]:
# Display the selected feature matrix.
x

In [None]:
# Refit the XGBRF model on X3 / y3, then predict a class for every row of x.
# NOTE(review): the applicability-domain cell fitted xgbrf_c on the scaled
# training set (X2_train_scaled); here it is refitted on the unscaled X3
# instead. Confirm that X3 is the intended training data for the screening
# model.
xgbrf_c.fit(X3, y3)
xgbrf_AD = xgbrf_c.predict(x)

In [None]:
# Display the array of predicted classes for the screened rows.
xgbrf_AD

In [None]:
# Method 1: wrap the predictions alone in a DataFrame and write them to Excel,
# keeping the positional index as the first column.
df6 = pd.DataFrame(xgbrf_AD)
df6.to_excel('Output.xlsx', index= True)

In [None]:
# Wrap the predictions in a single-column DataFrame ('Prediction') that shares
# x's index, so each prediction stays aligned with its feature row.
predictions_df = pd.DataFrame(xgbrf_AD, index=x.index, columns=['Prediction'])

In [None]:
# Append the 'Prediction' column to the feature matrix; axis=1 concatenates
# column-wise and aligns rows on the shared index.
result_df = pd.concat([x, predictions_df], axis=1)

In [None]:
# Write the features together with their predicted class to an Excel file.
result_df.to_excel('xgbrf_AD_predictions_output.xlsx')