In [None]:
!pip install pytdc xgboost rdkit scikit-learn pandas numpy

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV



from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops

from tdc.single_pred import Tox

In [3]:
# Load the hERG_Karim toxicity dataset through TDC.
data = Tox(name='hERG_Karim')

# Materialise the dataset as a DataFrame; leaving `df` as the last expression
# of the cell renders it so the columns can be explored.
df = data.get_data()
df

Found local copy...
Loading...
Done!


Unnamed: 0,Drug_ID,Drug,Y
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0
...,...,...,...
13440,13440,Cc1csc(NC(=O)c2sc3nc4c(c(C(F)(F)F)c3c2N)CCC4)n1,0
13441,13441,Cc1cccc(-c2n[nH]cc2-c2ccc3ncccc3n2)n1,0
13442,13442,Cc1ccccc1-n1c(Cn2cnc3c(N)ncnc32)nc2cccc(C)c2c1=O,0
13443,13443,Cc1ccccc1-n1c(Cn2ncc3c(N)ncnc32)nc2cccc(C)c2c1=O,0


In [None]:
# Fractions passed to TDC for the train / valid / test partitions.
SPLIT_FRACTIONS = [0.8, 0.1, 0.1]
split = data.get_split(frac=SPLIT_FRACTIONS)

# Compute MACCS keys
def compute_maccs_fingerprint(smiles):
    """Return the MACCS keys fingerprint of a molecule as a NumPy array.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        np.ndarray built from the bit vector produced by
        ``MACCSkeys.GenMACCSKeys``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of
    # raising; fail here with the offending input rather than deep inside RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    maccs = MACCSkeys.GenMACCSKeys(mol)
    return np.array(maccs)

# Compute additional molecular descriptors
def compute_molecular_descriptors(smiles):
    """Return a small vector of whole-molecule descriptors.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        np.ndarray with four entries, in this order: molecular weight,
        LogP, number of aromatic rings, TPSA.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of
    # raising; fail here with the offending input rather than deep inside RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    descriptors = [
        # Molecular weight
        Descriptors.MolWt(mol),
        # LogP (octanol-water partition coefficient)
        Descriptors.MolLogP(mol),
        # Number of aromatic rings (a count, not a True/False flag)
        Descriptors.NumAromaticRings(mol),
        # TPSA
        Descriptors.TPSA(mol),
    ]

    return np.array(descriptors)

# Compute Morgan fingerprints
def compute_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan bit-vector fingerprint of a molecule as a NumPy array.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan radius forwarded to RDKit.
        nBits: length of the bit vector forwarded to RDKit.

    Returns:
        np.ndarray built from the bit vector produced by
        ``AllChem.GetMorganFingerprintAsBitVect``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of
    # raising; fail here with the offending input rather than deep inside RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    # nBits is passed by keyword so it cannot be confused with another
    # positional argument of the RDKit call.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fingerprint)

# Compute all
def compute_combined_fingerprints(smiles):
    """Build the full feature vector for one molecule.

    The result is the concatenation, in this order, of the Morgan
    fingerprint, the MACCS fingerprint and the molecular descriptors
    computed from ``smiles``.
    """
    feature_parts = (
        compute_morgan_fingerprint(smiles),
        compute_maccs_fingerprint(smiles),
        compute_molecular_descriptors(smiles),
    )
    # One flat vector per molecule.
    return np.concatenate(feature_parts)
  
# Featurise every partition and collect the matching labels, keyed by
# partition name ('train', 'valid', 'test').
feature_dict = {}
Y_dict = {}
# The loop variable is not called `set`, so the builtin is not shadowed.
for split_name in ['train', 'valid', 'test']:
    # Every partition is featurised the same way; no scaling happens here.
    smiles_column = split[split_name]['Drug']
    feature_dict[split_name] = np.stack(
        [compute_combined_fingerprints(smiles) for smiles in smiles_column]
    )
    Y_dict[split_name] = split[split_name]['Y']



In [None]:
# Scale and train.
# The scaler is fitted on the training features only; the same fitted scaler
# is then reused to transform the other partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(feature_dict['train'])

# --- Record of the randomized hyperparameter search (disabled, kept for reference) ---
# Keep track of best parameters found in search:
# model = MLPClassifier(solver='adam', learning_rate_init=0.0001, learning_rate='adaptive', batch_size=128, alpha=0.0001, activation='relu')
#
# param_distributions = {
#     'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (50, 50, 50)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'solver': ['adam', 'sgd'],
#     'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
#     'learning_rate': ['constant', 'adaptive'],
#     'learning_rate_init': [0.0001, 0.001, 0.01],
#     'batch_size': [32, 64, 128],
# }
#
# hyperparam_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_distributions,
#     n_iter=10,  # Number of random configurations to test
#     scoring='accuracy',
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     n_jobs=-1,  # Use all available cores
#     random_state=42  # For reproducibility
# )
#
# hyperparam_search.fit(X_train_scaled, Y_dict['train'])
#
# # Best parameters and score
# print("Best parameters:", hyperparam_search.best_params_)
# print("Best cross-validation score:", hyperparam_search.best_score_)

# Manually tuned hyperparameters.
# random_state fixes the weight initialisation and mini-batch sampling so that
# re-running the notebook trains the same model (same seed as the search above).
model = MLPClassifier(
    solver='adam',
    learning_rate_init=0.001,
    learning_rate='constant',
    batch_size=128,
    alpha=0.01,
    activation='relu',
    random_state=42,
)

model.fit(X_train_scaled, Y_dict['train'])


In [None]:
# Transform the validation features with the scaler fitted during training.
X_valid_scaled = scaler.transform(feature_dict['valid'])

# Disabled variant that evaluates the best estimator of the randomized search:
# y_pred = hyperparam_search.best_estimator_.predict(X_valid_scaled)
# print("MLP Accuracy:", accuracy_score(Y_dict['valid'], y_pred))
# print("MLP Classification Report:\n", classification_report(Y_dict['valid'], y_pred))

# Predict validation labels with the trained MLP and report the metrics.
y_valid_pred = model.predict(X_valid_scaled)
y_valid_true = Y_dict['valid']
print("MLP Accuracy:", accuracy_score(y_valid_true, y_valid_pred))
print("MLP Classification Report:\n", classification_report(y_valid_true, y_valid_pred))

MLP Accuracy: 0.8483271375464684
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85       688
           1       0.84      0.86      0.85       657

    accuracy                           0.85      1345
   macro avg       0.85      0.85      0.85      1345
weighted avg       0.85      0.85      0.85      1345

MLP


In [9]:
# Transform the held-out test features with the scaler fitted during training.
X_test_scaled = scaler.transform(feature_dict['test'])

# Hard label predictions and per-class probabilities from the trained MLP.
y_test_pred = model.predict(X_test_scaled)
y_test_prob = model.predict_proba(X_test_scaled)

y_test_true = Y_dict['test']
# Second probability column is used as the score for ROC AUC
# (presumably the probability of label 1, given the 0/1 labels in the report).
positive_class_prob = y_test_prob[:,1]

print("MLP Accuracy:", accuracy_score(y_test_true, y_test_pred))
print("MLP Classification Report:\n", classification_report(y_test_true, y_test_pred))
print("MLP AUC ROC: " , roc_auc_score(y_test_true, positive_class_prob))

MLP Accuracy: 0.8415178571428571
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84       648
           1       0.85      0.84      0.85       696

    accuracy                           0.84      1344
   macro avg       0.84      0.84      0.84      1344
weighted avg       0.84      0.84      0.84      1344

MLP AUC ROC:  0.908722239960267
