In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import f1_score

# Load data (assuming it's stored in a CSV file)
data = pd.read_csv("tested_molecules.csv")
data.columns = ["SMILES", "PKM2_inhibition", "ERK2_inhibition"]

# Convert inhibition columns to integers
data["PKM2_inhibition"] = data["PKM2_inhibition"].astype(int)
data["ERK2_inhibition"] = data["ERK2_inhibition"].astype(int)

# Preprocessing for Random Forest
# Convert SMILES strings to numerical features (e.g., molecular fingerprints)
# Function to convert SMILES to Morgan fingerprints
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp)

# Apply the function to create the feature matrix
data['features'] = data['SMILES'].apply(smiles_to_fingerprint)
data = data.dropna(subset=['features'])

# Convert list of arrays to 2D array
X_rf = np.array(data['features'].tolist())

# Create the y matrix for multi-label classification
y = data[['PKM2_inhibition', 'ERK2_inhibition']].values

# Split the data into training and test sets
A_samples = data[data["PKM2_inhibition"] == 1]
B_samples = data[data["ERK2_inhibition"] == 1]
A_train, A_test = train_test_split(A_samples, test_size=0.2, random_state=42)
B_train, B_test = train_test_split(B_samples, test_size=0.2, random_state=42)
train_data_rf = pd.concat([A_train, B_train], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
test_data_rf = pd.concat([A_test, B_test], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels for Random Forest
X_train_rf, y_train_rf = np.array(train_data_rf['features'].tolist()), train_data_rf[['PKM2_inhibition', 'ERK2_inhibition']].values
X_test_rf, y_test_rf = np.array(test_data_rf['features'].tolist()), test_data_rf[['PKM2_inhibition', 'ERK2_inhibition']].values

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Iterate over the parameter grid manually
best_score = -1
best_params = None
total_combinations = len(list(ParameterGrid(param_grid)))

for i, params in enumerate(ParameterGrid(param_grid)):
    print(f"Completed {i + 1} out of {total_combinations} grid search combinations.")
    
    rf_classifier = RandomForestClassifier(**params, random_state=42)
    rf_classifier.fit(X_train_rf, y_train_rf)
    
    rf_predictions = rf_classifier.predict(X_test_rf)
    score = f1_score(y_test_rf, rf_predictions, average='samples')
    
    if score > best_score:
        best_score = score
        best_params = params

print("Best parameters found: ", best_params)

# Train the best model
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_rf, y_train_rf)
rf_predictions = best_rf.predict(X_test_rf)

# Evaluate the model
rf_report = classification_report(y_test_rf, rf_predictions, target_names=["PKM2_inhibition", "ERK2_inhibition"], zero_division=0)
print("Random Forest Model:")
print(rf_report)

Completed 1 out of 648 grid search combinations.
Completed 2 out of 648 grid search combinations.
Completed 3 out of 648 grid search combinations.
Completed 4 out of 648 grid search combinations.
Completed 5 out of 648 grid search combinations.
Completed 6 out of 648 grid search combinations.
Completed 7 out of 648 grid search combinations.
Completed 8 out of 648 grid search combinations.
Completed 9 out of 648 grid search combinations.
Completed 10 out of 648 grid search combinations.
Completed 11 out of 648 grid search combinations.
Completed 12 out of 648 grid search combinations.
Completed 13 out of 648 grid search combinations.
Completed 14 out of 648 grid search combinations.
Completed 15 out of 648 grid search combinations.
Completed 16 out of 648 grid search combinations.
Completed 17 out of 648 grid search combinations.
Completed 18 out of 648 grid search combinations.
Completed 19 out of 648 grid search combinations.
Completed 20 out of 648 grid search combinations.
Completed

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors
from tqdm import tqdm

# Load data (assuming it's stored in a CSV file)
data = pd.read_csv("tested_molecules.csv")
data.columns = ["SMILES", "PKM2_inhibition", "ERK2_inhibition"]

# Convert inhibition columns to integers
data["PKM2_inhibition"] = data["PKM2_inhibition"].astype(int)
data["ERK2_inhibition"] = data["ERK2_inhibition"].astype(int)

# Functions to convert SMILES to various fingerprints
def smiles_to_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp)

def smiles_to_maccs_keys(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = MACCSkeys.GenMACCSKeys(mol)
    return np.array(fp)

def smiles_to_rdkit_torsions(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
    return np.array(fp)

def smiles_to_rdkit_atom_pairs(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
    return np.array(fp)

# Function to convert SMILES to fingerprints
def convert_smiles_to_fingerprint(smiles, fingerprint_type):
    if fingerprint_type == 'morgan':
        return smiles_to_morgan_fingerprint(smiles)
    elif fingerprint_type == 'maccs':
        return smiles_to_maccs_keys(smiles)
    elif fingerprint_type == 'torsions':
        return smiles_to_rdkit_torsions(smiles)
    elif fingerprint_type == 'atom_pairs':
        return smiles_to_rdkit_atom_pairs(smiles)
    else:
        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")

# Apply the function to create the feature matrix for different fingerprint types
fingerprint_types = ['morgan', 'maccs', 'torsions', 'atom_pairs']
results = []

for fingerprint_type in fingerprint_types:
    print(f"Processing fingerprint type: {fingerprint_type}")
    
    data['features'] = data['SMILES'].apply(lambda x: convert_smiles_to_fingerprint(x, fingerprint_type))
    data = data.dropna(subset=['features'])
    
    # Convert list of arrays to 2D array
    X_rf = np.array(data['features'].tolist())
    
    # Create the y matrix for multi-label classification
    y = data[['PKM2_inhibition', 'ERK2_inhibition']].values
    
    # Split the data into training and test sets
    A_samples = data[data["PKM2_inhibition"] == 1]
    B_samples = data[data["ERK2_inhibition"] == 1]
    A_train, A_test = train_test_split(A_samples, test_size=0.2, random_state=42)
    B_train, B_test = train_test_split(B_samples, test_size=0.2, random_state=42)
    train_data_rf = pd.concat([A_train, B_train], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
    test_data_rf = pd.concat([A_test, B_test], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Separate features and labels for Random Forest
    X_train_rf = np.array(train_data_rf['features'].tolist())
    y_train_rf = train_data_rf[['PKM2_inhibition', 'ERK2_inhibition']].values
    X_test_rf = np.array(test_data_rf['features'].tolist())
    y_test_rf = test_data_rf[['PKM2_inhibition', 'ERK2_inhibition']].values
    
    # Define the parameter grid
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False]
    }
    
    # Iterate over the parameter grid manually
    best_score = -1
    best_params = None
    total_combinations = len(list(ParameterGrid(param_grid)))
    
    for i, params in enumerate(ParameterGrid(param_grid)):
        print(f"Completed {i + 1} out of {total_combinations} grid search combinations.")
        
        rf_classifier = RandomForestClassifier(**params, random_state=42)
        rf_classifier.fit(X_train_rf, y_train_rf)
        
        rf_predictions = rf_classifier.predict(X_test_rf)
        score = f1_score(y_test_rf, rf_predictions, average='samples')
        
        if score > best_score:
            best_score = score
            best_params = params
    
    print(f"Best parameters for {fingerprint_type} fingerprint: ", best_params)
    
    # Train the best model
    best_rf = RandomForestClassifier(**best_params, random_state=42)
    best_rf.fit(X_train_rf, y_train_rf)
    rf_predictions = best_rf.predict(X_test_rf)
    
    # Evaluate the model
    rf_report = classification_report(y_test_rf, rf_predictions, target_names=["PKM2_inhibition", "ERK2_inhibition"], zero_division=0)
    results.append((fingerprint_type, rf_report))

# Print results for all fingerprint types
for fingerprint_type, report in results:
    print(f"Results for {fingerprint_type} fingerprint:")
    print(report)


Processing fingerprint type: morgan
Completed 1 out of 648 grid search combinations.
Completed 2 out of 648 grid search combinations.
Completed 3 out of 648 grid search combinations.
Completed 4 out of 648 grid search combinations.
Completed 5 out of 648 grid search combinations.
Completed 6 out of 648 grid search combinations.
Completed 7 out of 648 grid search combinations.
Completed 8 out of 648 grid search combinations.
Completed 9 out of 648 grid search combinations.
Completed 10 out of 648 grid search combinations.
Completed 11 out of 648 grid search combinations.
Completed 12 out of 648 grid search combinations.
Completed 13 out of 648 grid search combinations.
Completed 14 out of 648 grid search combinations.
Completed 15 out of 648 grid search combinations.
Completed 16 out of 648 grid search combinations.
Completed 17 out of 648 grid search combinations.
Completed 18 out of 648 grid search combinations.
Completed 19 out of 648 grid search combinations.
Completed 20 out of 648