In [None]:
!pip install PyTDC
!pip install rdkit

In [None]:
import pandas as pd
import numpy as np
from tdc.single_pred import ADME
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# Load the PAMPA_NCATS dataset through TDC's ADME loader.
data = ADME(name='PAMPA_NCATS')

# Request the loader's default split; it is indexed by 'train' / 'valid' / 'test' below.
split = data.get_split()

In [None]:
# Pull the three partitions out of the split mapping in a single unpacking step.
train_df, valid_df, test_df = (split[part] for part in ('train', 'valid', 'test'))

In [None]:
def calculate_descriptors(smiles):
    """Compute every RDKit descriptor in ``Descriptors.descList`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Maps each descriptor name listed in ``Descriptors.descList`` to its value.
        If ``smiles`` is not a string or cannot be parsed, every value is NaN.
        If an individual descriptor function raises, only that entry is NaN.
        The set of keys is therefore the same for every input.
    """
    missing = float("nan")

    # MolFromSmiles signals an unparsable SMILES by returning None rather than
    # raising; non-string cells (e.g. a NaN in the column) are treated the same way.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {descriptor_name: missing for descriptor_name, _ in Descriptors.descList}

    descriptors = {}
    for descriptor_name, function in Descriptors.descList:
        try:
            descriptors[descriptor_name] = function(mol)
        except Exception:
            # One failing descriptor should not abort the whole dataset;
            # record it as missing so it can be imputed later.
            descriptors[descriptor_name] = missing
    return descriptors

In [None]:
# Attach a descriptor dict to every row of each split, computed from its 'Drug' column
for split_frame in (train_df, valid_df, test_df):
    split_frame['Descriptors'] = split_frame['Drug'].apply(calculate_descriptors)

In [None]:
def _to_features_and_labels(frame):
    """Expand the 'Descriptors' dicts of `frame` into a feature table; return it with the 'Y' column."""
    feature_table = pd.DataFrame(frame['Descriptors'].tolist())
    return feature_table, frame['Y']


# Feature matrix / label pairs for each split
X_train, y_train = _to_features_and_labels(train_df)
X_valid, y_valid = _to_features_and_labels(valid_df)
X_test, y_test = _to_features_and_labels(test_df)

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute missing descriptor values in the validation features.
# The column means are learned from the training features (not from the
# validation split itself), so the fill values come from the larger split and
# the fitted `imputer` can be reused unchanged on any other split.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_valid_imputed = imputer.transform(X_valid)

# Wrap the imputed array back into a DataFrame carrying the original column labels
X_valid_imputed_df = pd.DataFrame(X_valid_imputed, columns=X_valid.columns)

In [None]:
# Stack the training rows on top of the imputed validation rows (features first, then labels)
X_combined = pd.concat((X_train, X_valid_imputed_df))
y_combined = pd.concat((y_train, y_valid))

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, cohen_kappa_score, accuracy_score

# Fixed seed: a random forest bootstraps rows and subsamples features, so without
# it the cross-validation scores and the final model change on every run.
RANDOM_SEED = 42
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED)

# 5-fold cross-validated accuracy on the combined training + validation data
cv_scores = cross_val_score(rf_classifier, X_combined, y_combined, cv=5, scoring='accuracy')

# Report the per-fold scores and their mean
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

In [None]:
# Fit the forest on the combined training + validation data
rf_classifier.fit(X=X_combined, y=y_combined)

In [None]:
def evaluate_model(model, X, y):
    """Score a fitted binary classifier on one dataset.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and, ideally, ``predict_proba``
        or ``decision_function``.
    X
        Feature matrix passed to the model.
    y
        True binary labels for the rows of ``X``.

    Returns
    -------
    tuple
        ``(auc_roc, sensitivity, specificity, balanced_accuracy, kappa, accuracy)``.
    """
    y_pred = model.predict(X)

    # ROC-AUC is a ranking metric: it needs continuous scores. Feeding it hard
    # 0/1 predictions collapses the curve to a single operating point, so use
    # the positive-class probability (second column) or the decision function
    # when the model offers one, and fall back to the labels only as a last resort.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X)
    else:
        y_score = y_pred
    auc_roc = roc_auc_score(y, y_score)

    # The threshold-dependent metrics below are computed from the hard predictions.
    # For binary labels the flattened confusion matrix is ordered tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    kappa = cohen_kappa_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)

    return auc_roc, sensitivity, specificity, balanced_accuracy, kappa, accuracy

In [None]:
# Evaluate the model on test data.
# The test features are first passed through the already-fitted imputer so that any
# missing descriptor value is filled before it reaches the model, instead of
# reaching `predict` as NaN. Rows without missing values are left unchanged.
X_test_imputed_df = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

auc_roc_test, sensitivity_test, specificity_test, balanced_accuracy_test, kappa_test, accuracy = evaluate_model(rf_classifier, X_test_imputed_df, y_test)
print("\nTest Performance:")
print("AUC-ROC:", auc_roc_test)
print("Sensitivity:", sensitivity_test)
print("Specificity:", specificity_test)
print("Balanced Accuracy:", balanced_accuracy_test)
print("Cohen's Kappa:", kappa_test)
print("Accuracy:", accuracy)