In [8]:
import pandas as pd
import numpy as np
import requests
from pathlib import Path
import joblib

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# Define file paths (relative to the notebook's working directory)
DATA_DIR = Path('./modeling/data')
DATA_FILE = DATA_DIR / 'chembl_2487_data.csv'
MODEL_FILE = Path('./modeling/models/svr_model.joblib')

# Ensure directories exist.
# parents=True is required because both targets are nested under ./modeling:
# without it, mkdir raises FileNotFoundError on a fresh checkout where the
# intermediate ./modeling directory has not been created yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODEL_FILE.parent.mkdir(parents=True, exist_ok=True)

In [9]:
# Morgan fingerprint settings, named so the featurisation is easy to audit
MORGAN_RADIUS = 2
MORGAN_FP_SIZE = 2048

# Shared generator: built a single time here and reused by every smiles_to_fp call
fpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=MORGAN_RADIUS,
    fpSize=MORGAN_FP_SIZE,
)

def prepare_training_data(timeout=30):
    """Download the CHEMBL2487 activity data to ``DATA_FILE`` unless it is already cached.

    If ``DATA_FILE`` exists the function returns immediately. Otherwise it
    requests one page (``limit=1000``) of activities with a non-null
    ``pchembl_value`` from the ChEMBL web service, keeps the
    ``canonical_smiles`` and ``pchembl_value`` columns (the latter renamed to
    ``pIC50`` and coerced to numeric), drops incomplete rows and writes the
    result as CSV.

    NOTE(review): the query does not filter on the activity type, so the
    ``pIC50`` column may contain pChEMBL values derived from other
    measurement types as well -- confirm this is intended.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. The data is written to ``DATA_FILE`` as a side effect.

    Raises:
        ValueError: If the response lacks the required columns (this includes
            an empty ``activities`` list).
        Exception: Any download or processing error is printed and re-raised.
    """
    if DATA_FILE.exists():
        print(f"Found existing training data at {DATA_FILE}")
        return

    print("Training data not found. Downloading from ChEMBL...")
    url = 'https://www.ebi.ac.uk/chembl/api/data/activity.json?target_chembl_id=CHEMBL2487&pchembl_value__isnull=false&limit=1000'
    required_columns = ['canonical_smiles', 'pchembl_value']

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        activities = data['activities']
        df = pd.DataFrame(activities)

        # Fail loudly instead of returning silently: a silent return leaves no
        # CSV behind and the next cell would die on an unrelated FileNotFoundError.
        if any(column not in df.columns for column in required_columns):
            raise ValueError(
                f"Required columns not in response. Available columns: {df.columns.tolist()}"
            )

        # Only a single page is fetched. If the service reports more matching
        # records than were returned, say so rather than truncating silently.
        # The page_meta/total_count field is read defensively; when it is
        # absent the check is simply skipped.
        total_count = (data.get('page_meta') or {}).get('total_count')
        if isinstance(total_count, int) and total_count > len(activities):
            print(
                f"Warning: only {len(activities)} of {total_count} available "
                "activity records were downloaded (single page)."
            )

        # Build the cleaned frame as a chain of copies: renaming in place and
        # assigning into a column subset can trigger SettingWithCopyWarning.
        df = (
            df[required_columns]
            .rename(columns={'pchembl_value': 'pIC50'})
            .assign(pIC50=lambda frame: pd.to_numeric(frame['pIC50'], errors='coerce'))
            .dropna()
        )

        # Make sure the target directory is there even if the config cell's
        # mkdir was skipped or the directory was removed in the meantime.
        DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(DATA_FILE, index=False)
        print(f"Successfully downloaded and saved training data to {DATA_FILE}")
    except Exception as e:
        print(f"Failed to download or process training data: {e}")
        raise

def smiles_to_fp(smiles):
    """Convert a SMILES string to a Morgan fingerprint using the modern RDKit API.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The fingerprint produced by the module-level ``fpgen`` generator, or
        ``None`` when the SMILES cannot be parsed or fingerprinting fails
        (for example when a non-string value such as NaN is passed in).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Use the new generator to get the fingerprint
        return fpgen.GetFingerprint(mol)
    except Exception:
        # Deliberately best-effort: a bad row yields None and is dropped later.
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit still propagate and a long featurisation run can be
        # interrupted.
        return None

In [10]:
# Make sure the training CSV is available; returns immediately when DATA_FILE already exists.
prepare_training_data()

Found existing training data at modeling\data\chembl_2487_data.csv


In [11]:
print("Loading dataset...")
df = pd.read_csv(DATA_FILE)

print("Generating Morgan fingerprints from SMILES...")
# Featurise every structure, then discard the rows whose SMILES could not be
# converted (smiles_to_fp returns None for those).
fingerprints = df['canonical_smiles'].apply(smiles_to_fp)
df = df.assign(fingerprint=fingerprints).dropna(subset=['fingerprint'])

# Feature matrix: one row per molecule; target vector: the pIC50 column
X = np.array(list(df['fingerprint']))
y = df['pIC50'].to_numpy()

n_fingerprints = len(X)
print(f"Generated {n_fingerprints} fingerprints.")

Loading dataset...
Generating Morgan fingerprints from SMILES...
Generated 999 fingerprints.


In [12]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train = len(X_train)
print(f"Training SVR model on {n_train} samples...")

# RBF-kernel support vector regressor; hyperparameters spelled out explicitly
svr_params = {'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.1}
model = SVR(**svr_params)
model.fit(X_train, y_train)

print("Model training complete.")

Training SVR model on 799 samples...
Model training complete.


In [13]:
print("Evaluating model performance on the test set...")

# Predict on the held-out samples and score them with the coefficient of determination
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Test Set R-squared (R2): {r2:.4f}")

Evaluating model performance on the test set...
Test Set R-squared (R2): 0.7855


In [14]:
# Persist the fitted estimator so it can be reloaded without retraining
print(f"Saving model to {MODEL_FILE}...")
joblib.dump(value=model, filename=MODEL_FILE)
print("Model saved successfully.")

Saving model to modeling\models\svr_model.joblib...
Model saved successfully.
