<a href="https://colab.research.google.com/github/xelothi/CRAF_for_a_paper/blob/main/validation_of_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Testing different ML models ability to predict CRAF inhibitory molecules based on Lipinski descriptors as features. Lipinksi descriptors are a widely used set of guidelines in medicinal chemistry for evaluating the drug-likeness or pharmacokinetic properties of potential drug compounds. Based on Lipiniski:

*   Molecular weight: should be less than 500 daltons
*   LogP(The octanol-water partition coefficient): should be less than 5.
*   Hydrogen bond donors: >= 5
*   Hydrogen bond acceptors: >= 10
*   Number of rotating bonds: >= 10

In [1]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.4


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics,svm, clone
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split,KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, rdFingerprintGenerator, AllChem, MACCSkeys
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("df_external_test.csv", index_col=[0])
df.head()

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,std_smiles,MW,LogP,NumHDonors,NumHAcceptors,numRotatingBonds,norm_value,pIC50,bioactivity
2,2,CHEMBL305178,C[C@H](Nc1nccc(-c2c(-c3cccc(C(F)(F)F)c3)nc(C3C...,506.576,6.203,2.0,6.0,6.0,1000.0,6.0,inactive
23,23,CHEMBL1336,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,464.831,5.5497,3.0,4.0,5.0,370.0,6.431798,inactive
32,32,CHEMBL1760617,CNC(=O)c1cc2ccc(CCNC(=O)Nc3cccc(S(=O)(=O)C(F)(...,480.468,3.2521,3.0,5.0,6.0,200.0,6.69897,active
33,33,CHEMBL1760618,CNC(=O)c1cc2ccc(CCNC(=O)Nc3ccc(C(F)(F)F)cc3)cc...,416.403,3.9774,3.0,3.0,5.0,370.0,6.431798,inactive
34,34,CHEMBL1760619,CNC(=O)c1cc2ccc(CCNC(=O)Nc3ccc(C)c(C(F)(F)F)c3...,430.43,4.28582,3.0,3.0,5.0,200.0,6.69897,active


In [4]:
df["activity"] = df["bioactivity"].apply(lambda x: 1 if x == "active" else 0)
df.shape

(496, 12)

In [5]:
df["activity"].value_counts()

Unnamed: 0_level_0,count
activity,Unnamed: 1_level_1
1,329
0,167


In [6]:
df = df.dropna()
df.shape

(496, 12)

In [7]:
def smiles_to_fp(smiles, method="morgan2", n_bits=32784):

    mol = Chem.MolFromSmiles(smiles)

    if method == "maccs":
        return np.array(MACCSkeys.GenMACCSKeys(mol))
    if method == "morgan2":
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
        return np.array(fpg.GetFingerprint(mol))
    if method == "morgan3":
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_bits)
        return np.array(fpg.GetFingerprint(mol))
    else:
        print(f"Warning: Wrong method specified: {method}. Default will be used instead.")
        return np.array(MACCSkeys.GenMACCSKeys(mol))

In [8]:
df["fp"] = df["std_smiles"].apply(smiles_to_fp)
df.head(3)

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,std_smiles,MW,LogP,NumHDonors,NumHAcceptors,numRotatingBonds,norm_value,pIC50,bioactivity,activity,fp
2,2,CHEMBL305178,C[C@H](Nc1nccc(-c2c(-c3cccc(C(F)(F)F)c3)nc(C3C...,506.576,6.203,2.0,6.0,6.0,1000.0,6.0,inactive,0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
23,23,CHEMBL1336,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,464.831,5.5497,3.0,4.0,5.0,370.0,6.431798,inactive,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
32,32,CHEMBL1760617,CNC(=O)c1cc2ccc(CCNC(=O)Nc3cccc(S(=O)(=O)C(F)(...,480.468,3.2521,3.0,5.0,6.0,200.0,6.69897,active,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#Models


In [13]:
import pickle
with open('model_RF.pkl', 'rb') as file:
    model_rf = pickle.load(file)

In [15]:
fingerprint_to_model = df.fp.tolist()
predictions = model_rf.predict(fingerprint_to_model)

In [16]:

# Extract features from the external dataset
X_external = df.fp.tolist()

# Make predictions using the XGBoost model
confidence_scores = xgboost_model.predict_proba(X_external)
confidence_scores = confidence_scores[:, 1]

# Metrics
y_true = df['activity']
accuracy = accuracy_score(y_true, predictions)
sens = recall_score(y_true, predictions)
spec = recall_score(y_true, predictions, pos_label=0)
auc = roc_auc_score(y_true, predictions)
rmse = mean_squared_error(y_true, predictions)
print("Accuracy on external dataset:", accuracy)

print(f"Sensitivity: {sens:.2f}")
print(f"Specificity: {spec:.2f}")
print(f"AUC: {auc:.2f}")

Accuracy on external dataset: 0.8770161290322581
Sensitivity: 0.92
Specificity: 0.79
AUC: 0.86


In [17]:
results_df = pd.DataFrame({'y_true': df['activity'], 'predictions': predictions, "confidence": confidence_scores})
results_df

Unnamed: 0,y_true,predictions,confidence
2,0,0,0.12
23,0,1,0.83
32,1,1,0.63
33,0,1,0.58
34,1,1,0.67
...,...,...,...
1079,1,1,0.98
1080,1,1,0.99
1081,1,1,0.99
1082,1,1,0.97


In [18]:
results_df.to_csv("ende.csv")