# Import Libraries

In [1]:
import sys
import os
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from statistics import mean
from rdkit.Chem import Descriptors, rdMolDescriptors, PandasTools, AllChem
from scipy.stats import loguniform, randint, uniform
from boruta import BorutaPy
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix,matthews_corrcoef
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, f1_score, roc_curve, precision_score, cohen_kappa_score, accuracy_score, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# loading of calculated discriptors

In [2]:
import pandas as pd
import numpy as np
Location = r'C:\Users\wanih\\CXCR4\\Dataset_training_descriptors.csv'
df = pd.read_csv(Location)
df.head()

Unnamed: 0.1,Unnamed: 0,Ids,Activity,Smiles,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,pk1,1,CN1CCN(CC1)c1cc(CN2CCC[C@H]2c2ncccc2C)nc(C)n1,4.747386,4.747386,0.396813,0.396813,0.828903,21.740741,...,0,0,0,0,0,0,0,0,0,0
1,1,pk2,1,CN1CCCN(CC1)c1cc(CN2CCC[C@H]2c2ncccc2C)nc(C)n1,4.770823,4.770823,0.395146,0.395146,0.812758,21.821429,...,0,0,0,0,0,0,0,0,0,0
2,2,pk3,1,CN(Cc1cc(ncn1)N1CCN(C)CC1)[C@H]1CCCc2cccnc12,4.674182,4.674182,0.376853,0.376853,0.840163,21.038462,...,0,0,0,0,0,0,0,0,0,0
3,3,pk4,1,C(N1CCNCCc2cccc(CCNCC1)n2)c1ccc(CN2CCNCCc3cccc...,4.807727,4.807727,0.975697,0.975697,0.379775,19.428571,...,0,0,0,0,0,0,0,0,0,0
4,4,pk5,1,CC(C)(C)OC(=O)Cn1c2ccccc2c2ccnc(CN(CCCCN)[C@H]...,13.011991,13.011991,0.139849,-0.543564,0.233446,15.763158,...,0,0,0,0,0,0,0,0,1,0


In [3]:
df['Activity'].value_counts()

Activity
1    381
0    227
Name: count, dtype: int64

In [4]:
x = df.iloc[:, 4:]
y = df["Activity"]

# Data Spliting

In [75]:
from sklearn.model_selection import train_test_split
#Dataset Spliting
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state= 2026) # different values of random seed were used [0, 42, 123, 1001, 2026]

# Data Normalization

In [76]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Fit and Transform the data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
with open("standard_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [77]:
lst = x_train.columns
lst2 = x_test.columns

In [78]:
x_train_df = pd.DataFrame(x_train_scaled,columns=lst)
x_test_df = pd.DataFrame(x_test_scaled,columns=lst2)

# Features Selection using boruta 

In [79]:
#Step 1: Initialize Random Forest Classifier
rf = RandomForestClassifier(n_jobs=-1,random_state= 0)

#Step 2: Initialize Boruta
boruta_selector = BorutaPy(rf,n_estimators='auto',random_state= 0)

#Step 3: Fit Boruta on the data
boruta_selector.fit(x_train_df.values,y_train.values)

# Step 4: Get selected features
selected_features = x_train_df.columns[boruta_selector.support_]
#Convert the selected features to a DataFrame to save as Excel
selected_features_df = pd.DataFrame(selected_features, columns=['Selected Features'])
#Save the selected features to an Excel file
selected_features_df.to_csv('cxcr4_selected_features.csv',index=False)
selected_features_df

x_train_selected = x_train_df[selected_features]
x_test_selected = x_test_df[selected_features]

#Step 5:Train Random Forest on selected features
rf.fit(x_train_selected[selected_features],y_train)
feature_importances = rf.feature_importances_
#Create a DataFrame of selected features and their importances
selected_df = pd.DataFrame({
    'Selected Features': selected_features,
    'Feature Importance': feature_importances
}).sort_values(by='Feature Importance', ascending=False)
#Round feature importances to two decimal places
selected_df['Feature Importance'] = selected_df['Feature Importance'].round(3)
#save the dataframe
selected_df.to_csv('cxcr4_important_data.csv',index=False)

In [80]:
# Convert Index to list (important for portability)
selected_features_list = list(selected_features)

with open("selected_features.pkl", "wb") as f:
    pickle.dump(selected_features_list, f)

# hyper tunning

# Decision Tree

In [57]:
model_dt = DecisionTreeClassifier(random_state= 0,max_depth=5)
#tuning parameters
param_grid = {
    'splitter': ['best', 'random'],
    'min_samples_leaf': [1,3,5,7],
    'criterion': ['gini', 'entropy'],
    # 'max_depth':[1,3,5,7,9],
    'max_features':[1,3,5,7,9,11,13,15,17]
    }
grid_search = GridSearchCV(model_dt,param_grid=param_grid,scoring='roc_auc',cv= 5,n_jobs=-1)

#Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)

#best parameters found by the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: ",grid_search.best_score_)

Best parameters: {'criterion': 'gini', 'max_features': 15, 'min_samples_leaf': 5, 'splitter': 'best'}
Best cross-validation score:  0.7798088410991637


# KNeighborsClassifier

In [58]:
model_knn = KNeighborsClassifier(algorithm='auto')
#tuning parameters
param_grid = {
    'leaf_size': list(range(1,10)),
    'n_neighbors': list(range(1,10)),
    'metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev'],
   
}
#Deterministic cross-validation splitting
cv = KFold(n_splits=5, shuffle=True, random_state=1)
#grid search
grid_search=GridSearchCV(model_knn,param_grid=param_grid,cv=cv,scoring='roc_auc',n_jobs=-1)

#Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)

#best parameters found by the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:",grid_search.best_score_)

Best parameters: {'leaf_size': 1, 'metric': 'chebyshev', 'n_neighbors': 9}
Best cross-validation score: 0.7837311450777456


# Support Vector Machines (SVM)

In [59]:
model_svm = SVC(random_state= 0,class_weight='balanced')
#tunning parameters
param_grid = {
    'C': [0.001, 0.01, 0.1],
    'gamma': [0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'rbf','sigmoid']
}
grid_search = GridSearchCV(model_svm,param_grid=param_grid,cv=5,scoring='roc_auc',n_jobs=-1)

#Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)

#Print the best parameters found by the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: ",grid_search.best_score_)

Best parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score:  0.7786140979689367


# LogisticRegression

In [60]:
model_lr=LogisticRegression(max_iter=40,random_state= 0)
#tunning parameters
param_grid = {
    'C': [0.1, 0.01, 0.001, 1],
    'solver': ['liblinear','saga'],
    'tol': [0.1,0.01,0.001]
}
grid_search=GridSearchCV(model_lr,param_grid=param_grid,cv=5,scoring='roc_auc',n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)
# Print the best parameters found by the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:",grid_search.best_score_)

Best parameters: {'C': 1, 'solver': 'liblinear', 'tol': 0.001}
Best cross-validation score: 0.7724014336917564


# AdaBoost decision tree (ABDT)

In [61]:
# Initialize a base learner (DecisionTreeClassifier with max depth 1)
base_learner = DecisionTreeClassifier(max_depth=1,random_state= 0)

# Initialize AdaBoost with the base estimator (use 'estimator' instead of 'base_estimator')
model_adbst = AdaBoostClassifier(estimator=base_learner,random_state= 0)
#tuning parameters
param_grid = {
    'n_estimators': [10,15,20,25,30,35,40,45,50],
    'algorithm': ['SAMME'],
    'learning_rate': [0.001,0.01, 0.1,1]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(model_adbst,param_grid=param_grid,cv=5,scoring='roc_auc',n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)

# Print the best parameters found by the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters: {'algorithm': 'SAMME', 'learning_rate': 1, 'n_estimators': 50}
Best cross-validation score:  0.7678614097968935




# Random Forest

In [62]:
model_rf = RandomForestClassifier(max_depth=5,random_state= 0)
#tunning parameters
param_grid = {
#     'max_depth': [3,5,7],
    'n_estimators': list(range(10,30)),
    'criterion': ['gini', 'entropy'],
    'max_features': list(range(1,14)),
    'min_samples_split': list(range(2, 10)),
    'min_samples_leaf': list(range(1, 10))
}
grid_search = GridSearchCV(model_rf,param_grid=param_grid,cv=5,scoring='roc_auc',n_jobs=-1)
#Fit the grid search to the training data
grid_search.fit(x_train_selected,y_train)
#best parameters
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: ",grid_search.best_score_)

Best parameters: {'criterion': 'gini', 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 10}
Best cross-validation score:  0.8311230585424132


# Models and Performance matricss and Performance matrics

In [81]:
import warnings
warnings.filterwarnings("ignore")
base_learner = DecisionTreeClassifier(max_depth=1,random_state= 0)

In [82]:
#Define models
models = []
models.append(('DT', DecisionTreeClassifier(random_state= 0,max_depth= 5,criterion='entropy', max_features= 7,min_samples_leaf=5,splitter='best')))
models.append(('KNN',KNeighborsClassifier(leaf_size= 3, n_neighbors= 9,metric='manhattan',algorithm='auto')))
models.append(('SVM',SVC(random_state=0,C=0.1,gamma= 0.1,kernel='rbf',class_weight='balanced')))
models.append(('RF', RandomForestClassifier(max_depth=5,random_state=0,criterion='gini',n_estimators= 24,max_features= 2,min_samples_leaf=1,min_samples_split=5)))
models.append(('LR', LogisticRegression(C=1, random_state= 0, solver='liblinear',max_iter=40,tol=0.1)))
models.append(('ABDT',AdaBoostClassifier(estimator=base_learner,random_state=0,algorithm='SAMME',learning_rate= 1,n_estimators=50)))

# Initialize result lists and names list
result1_train = []  # For storing training ACCURACY results
result2_train = []  # For storing training ROC_AUC results
result3_train = []  # For storing training F1_SCORE results
result4_train = []  # For storing training RECALL results
result5_train = []  # For storing training SPECIFICITY results
result6_train = []  # For storing training PRESCISION results
result7_train = []  # For storing training MCC results

result1_test = []   # For storing test ACCURACY results
result2_test = []   # For storing test ROC_AUC results
result3_test = []   # For storing test F1_SCORE results
result4_test = []   # For storing test RECALL results
result5_test = []   # For storing test SPECIFICITY results
result6_test = []   # For storing test PRESCISION results
result7_test = []   # For storing test MCC results

names = []  #For storing model names

# Loop through models and calculate metrics for both training and test sets
for name, model in models:
    # Fit the model on the training data
    model.fit(x_train_selected, y_train)
    
    # Get predictions for the training and test sets
    y_train_pred = model.predict(x_train_selected)
    y_test_pred = model.predict(x_test_selected)
    
    # Calculate metrics for the training set
    accuracy_train = accuracy_score(y_train, y_train_pred)
    auc_roc_train = roc_auc_score(y_train, y_train_pred)
    f1_train = f1_score(y_train, y_train_pred)  # F1-score for training set
    recall_train = recall_score(y_train, y_train_pred)
    precision_train = precision_score(y_train, y_train_pred)
    mcc_train = matthews_corrcoef(y_train, y_train_pred)  # MCC for training set
    # Calculate specificity for the training set
    tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_train_pred).ravel()
    specificity_train = tn_train / (tn_train + fp_train)
    
    # Append the results to the training result lists
    result1_train.append(round(accuracy_train * 100, 2))          # Accuracy as percentage
    result2_train.append(round(auc_roc_train * 100, 2))           # AUC-ROC as percentage
    result3_train.append(round(f1_train, 2))                      # F1-Score as percentage
    result4_train.append(round(recall_train * 100, 2))            # Recall as percentage
    result5_train.append(round(specificity_train * 100, 2))       # Specificity as percentage
    result6_train.append(round(precision_train * 100, 2))         # Precision as percentage
    result7_train.append(round(mcc_train, 2))                     # MCC as percentage

    # Calculate metrics for the test set
    accuracy_test = accuracy_score(y_test, y_test_pred)
    auc_roc_test = roc_auc_score(y_test, y_test_pred)
    f1_test = f1_score(y_test, y_test_pred)  # F1-score for test set
    recall_test = recall_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    mcc_test = matthews_corrcoef(y_test, y_test_pred)  # MCC for test set

    # Calculate specificity for the test set (True Negative Rate)
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).ravel()
    specificity_test = tn_test / (tn_test + fp_test)
    
    # Append the results to the test result lists
    result1_test.append(round(accuracy_test * 100, 2))          # Accuracy as percentage
    result2_test.append(round(auc_roc_test * 100, 2))           # AUC-ROC as percentage
    result3_test.append(round(f1_test, 2))                      # F1-Score as percentage
    result4_test.append(round(recall_test * 100, 2))            # Recall as percentage
    result5_test.append(round(specificity_test * 100, 2))       # Specificity as percentage
    result6_test.append(round(precision_test * 100, 2))         # Precision as percentage
    result7_test.append(round(mcc_test, 2))                     # MCC as percentage
    
    # Append model name to the names list
    names.append(name)

# Create DataFrames for each metric for both training and test sets
r1_train = pd.DataFrame(result1_train, columns=['Train_accuracy'], index=names)
r2_train = pd.DataFrame(result2_train, columns=['Train_auc-roc'], index=names)
r3_train = pd.DataFrame(result3_train, columns=['Train_f1-score'], index=names)
r4_train = pd.DataFrame(result4_train, columns=['train_recall'], index=names)
r5_train = pd.DataFrame(result5_train, columns=['Train_specificity'], index=names)
r6_train = pd.DataFrame(result6_train, columns=['Train_precision'], index=names)
r7_train = pd.DataFrame(result7_train, columns=['Train_MCC'], index=names)

r1_test = pd.DataFrame(result1_test, columns=['Test_accuracy'], index=names)
r2_test = pd.DataFrame(result2_test, columns=['Test_auc-roc'], index=names)
r3_test = pd.DataFrame(result3_test, columns=['Test_f1-score'], index=names)
r4_test = pd.DataFrame(result4_test, columns=['Test_recall'], index=names)
r5_test = pd.DataFrame(result5_test, columns=['Test_specificity'], index=names)
r6_test = pd.DataFrame(result6_test, columns=['Test_precision'], index=names)
r7_test = pd.DataFrame(result7_test, columns=['Test_MCC'], index=names)
# Concatenate the DataFrames for both training and test sets along axis 1
trn_performance_matrics= pd.concat([r1_train,r2_train,r3_train,r4_train,r5_train,r6_train,r7_train], axis=1)
tst_performance_matrics = pd.concat([r1_test, r2_test,r3_test,r4_test,r5_test, r6_test, r7_test], axis=1)
# Display the final DataFrame

In [83]:
trn_performance_matrics

Unnamed: 0,Train_accuracy,Train_auc-roc,Train_f1-score,train_recall,Train_specificity,Train_precision,Train_MCC
DT,83.06,82.58,0.86,84.67,80.49,87.35,0.65
KNN,78.82,74.03,0.85,95.02,53.05,76.31,0.55
SVM,79.29,76.8,0.84,87.74,65.85,80.35,0.56
RF,86.12,82.47,0.9,98.47,66.46,82.37,0.72
LR,79.53,75.29,0.85,93.87,56.71,77.53,0.56
ABDT,82.35,78.83,0.87,94.25,63.41,80.39,0.63


In [84]:
tst_performance_matrics

Unnamed: 0,Test_accuracy,Test_auc-roc,Test_f1-score,Test_recall,Test_specificity,Test_precision,Test_MCC
DT,70.49,68.83,0.77,74.17,63.49,79.46,0.37
KNN,77.6,71.23,0.84,91.67,50.79,78.01,0.48
SVM,68.85,66.07,0.76,75.0,57.14,76.92,0.32
RF,77.05,70.44,0.84,91.67,49.21,77.46,0.47
LR,75.96,70.73,0.83,87.5,53.97,78.36,0.44
ABDT,74.86,69.9,0.82,85.83,53.97,78.03,0.42


# training of all model

In [48]:
DT=DecisionTreeClassifier(random_state= 0,max_depth= 5,criterion='entropy', max_features= 7,min_samples_leaf=5,splitter='best')
KNN=KNeighborsClassifier(leaf_size= 3, n_neighbors= 9,metric='manhattan',algorithm='auto')
SVM=SVC(random_state=0,C=0.1,gamma= 0.1,kernel='rbf',class_weight='balanced')
RF=RandomForestClassifier(max_depth=5,random_state=0,criterion='gini',n_estimators= 24,max_features= 2,min_samples_leaf=1,min_samples_split=5)
LR=LogisticRegression(C=1, random_state= 0, solver='liblinear',max_iter=40,tol=0.1)
ADBT=AdaBoostClassifier(estimator=base_learner,random_state=0,algorithm='SAMME',learning_rate= 1,n_estimators=50)

mod_dt=DT.fit(x_train_selected,y_train)
mod_knn=KNN.fit(x_train_selected,y_train)
mod_svm=SVM.fit(x_train_selected,y_train)
mod_rf=RF.fit(x_train_selected,y_train)
mod_lr=LR.fit(x_train_selected,y_train)
mod_adbt=ADBT.fit(x_train_selected,y_train)

In [49]:
import pickle

models = {
    "dt": mod_dt,
    "knn": mod_knn,
    "svm": mod_svm,
    "rf": mod_rf,
    "lr": mod_lr,
    "adbt": mod_adbt
}

for name, model in models.items():
    with open(f"{name}_model.pkl", "wb") as f:
        pickle.dump(model, f)

print("All models pickled successfully.")

All models pickled successfully.
