In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib
import numpy as np
import os

In [2]:
# --- Load ESOL data ---
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
df = pd.read_csv(url)
df = df[["smiles", "measured log solubility in mols per litre"]]
df.columns = ["smiles", "logS"]

In [3]:
df.head()

Unnamed: 0,smiles,logS
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,CC(C)=CCCC(C)=CC(=O),-2.06
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,c1ccsc1,-1.33


In [4]:
# --- Convert SMILES to Morgan Fingerprints ---
def smiles_to_fp(smi):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    return np.array(fp)

df["fingerprint"] = df["smiles"].apply(smiles_to_fp)
df = df[df["fingerprint"].notnull()]  # remove invalid SMILES

X = np.stack(df["fingerprint"].values)
y = df["logS"].values



In [5]:
# --- Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
from xgboost import XGBRegressor
model = XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📉 RMSE: {rmse:.3f}")
print(f"📏 MAE: {mae:.3f}")
print(f"📈 R²: {r2:.3f}")


📉 RMSE: 1.213
📏 MAE: 0.844
📈 R²: 0.743


In [18]:

# --- Save model ---
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/logS_model.pkl")
print("✅ Model saved to models/logS_model.pkl")


✅ Model saved to models/logS_model.pkl


# Approach 2 using RDKIT descriptors, to capture more chemistry as data is small

In [19]:
import pandas as pd
import numpy as np
import os
import joblib

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from xgboost import XGBRegressor

# --- Load ESOL dataset ---
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
df = pd.read_csv(url)
df = df[["smiles", "measured log solubility in mols per litre"]]
df.columns = ["smiles", "logS"]

# --- Morgan Fingerprint ---
def smiles_to_fp(smi):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    return np.array(fp)

# --- RDKit Descriptors ---
def compute_rdkit_descriptors(smi):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.FractionCSP3(mol),
        Descriptors.HeavyAtomCount(mol)
    ]

# --- Generate features ---
df["fingerprint"] = df["smiles"].apply(smiles_to_fp)
df["descriptors"] = df["smiles"].apply(compute_rdkit_descriptors)

# Drop invalid rows
df = df[df["fingerprint"].notnull() & df["descriptors"].notnull()]

# Combine features
X_fp = np.stack(df["fingerprint"].values)      # shape: (n_samples, 2048)
X_desc = np.stack(df["descriptors"].values)    # shape: (n_samples, 8)
X = np.hstack([X_fp, X_desc])                  # shape: (n_samples, 2056)
y = df["logS"].values

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train XGBoost Regressor ---
model2 = XGBRegressor(objective='reg:squarederror', random_state=42)
model2.fit(X_train, y_train)

# --- Evaluate ---
y_pred = model2.predict(X_test)

rmse = mean_squared_error(y_test, y_pred)  
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📉 RMSE: {rmse:.3f}")
print(f"📏 MAE: {mae:.3f}")
print(f"📈 R²: {r2:.3f}")



📉 RMSE: 0.551
📏 MAE: 0.523
📈 R²: 0.883


In [20]:
# --- Save model ---
os.makedirs("models", exist_ok=True)
joblib.dump(model2, "models/logS_model_xgb_hybrid.pkl")
print("✅ Model saved to models/logS_model_xgb_hybrid.pkl")


✅ Model saved to models/logS_model_xgb_hybrid.pkl
