In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("AqSolDB.csv", index_col=0)
df.head()

Unnamed: 0,SMILES,LogS
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127
1,O=C1Nc2cccc3cccc1c23,-3.254767
2,Clc1ccc(C=O)cc1,-2.177078
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065


In [3]:
from rdkit import Chem

# helper: take a SMILES string → return sanitized canonical SMILES or None if invalid
def sanitize_smiles(smiles_str):
    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return None  # invalid / can't parse
    Chem.SanitizeMol(mol)  # ensures valence, aromaticity, etc.
    return Chem.MolToSmiles(mol, canonical=True)  # write back a clean canonical SMILES

# helper: apply to a whole dataframe
def sanitize_dataframe(df, smiles_col="SMILES"):
    df = df.copy()

    # generate sanitized SMILES
    df["SMILES_sanitized"] = df[smiles_col].apply(sanitize_smiles)

    # drop rows where SMILES failed to sanitize (None)
    df = df.dropna(subset=["SMILES_sanitized"]).reset_index(drop=True)

    return df

df_clean = sanitize_dataframe(df, smiles_col="SMILES")

# ---- (optional) sanity check ----
print("| AqSolDB:", len(df), "→", len(df_clean))

df_clean.head()




| AqSolDB: 9982 → 9982


Unnamed: 0,SMILES,LogS,SMILES_sanitized
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,CCCCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-]
1,O=C1Nc2cccc3cccc1c23,-3.254767,O=C1Nc2cccc3cccc1c23
2,Clc1ccc(C=O)cc1,-2.177078,O=Cc1ccc(Cl)cc1
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)...
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...


In [4]:
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize_smiles(smiles_str):
    if pd.isna(smiles_str):
        return None

    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return None

    # 1. Normalize functional groups etc.
    normalizer = rdMolStandardize.Normalizer()
    mol = normalizer.normalize(mol)

    # 2. Disconnect metals
    metal_disconnector = rdMolStandardize.MetalDisconnector()
    mol = metal_disconnector.Disconnect(mol)

    # 3. Remove fragments, keep largest organic
    fl = rdMolStandardize.LargestFragmentChooser()
    mol = fl.choose(mol)

    # 4. Reionize
    reionizer = rdMolStandardize.Reionizer()
    mol = reionizer.reionize(mol)

    # 5. Canonical SMILES
    return Chem.MolToSmiles(mol, canonical=True)


def add_standardized_smiles(df, smiles_col="SMILES"):
    df = df.copy()
    df["SMILES_standardized"] = df[smiles_col].apply(standardize_smiles)
    df = df.dropna(subset=["SMILES_standardized"]).reset_index(drop=True)
    return df

df_std = add_standardized_smiles(df_clean, smiles_col="SMILES_sanitized")

print("| AqSolDB std rows:", len(df_std))

df_std.head()


[16:13:10] Initializing Normalizer
[16:13:10] Running Normalizer
[16:13:10] Initializing MetalDisconnector
[16:13:10] Running MetalDisconnector
[16:13:10] Running LargestFragmentChooser
[16:13:10] Fragment: [Br-]
[16:13:10] New largest fragment: [Br-] (1)
[16:13:10] Fragment: CCCCCCCCCCCCCCCCCC[N+](C)(C)C
[16:13:10] New largest fragment: CCCCCCCCCCCCCCCCCC[N+](C)(C)C (68)
[16:13:10] Initializing Normalizer
[16:13:10] Running Normalizer
[16:13:10] Initializing MetalDisconnector
[16:13:10] Running MetalDisconnector
[16:13:10] Running LargestFragmentChooser
[16:13:10] Initializing Normalizer
[16:13:10] Running Normalizer
[16:13:10] Initializing MetalDisconnector
[16:13:10] Running MetalDisconnector
[16:13:10] Running LargestFragmentChooser
[16:13:10] Initializing Normalizer
[16:13:10] Running Normalizer
[16:13:10] Initializing MetalDisconnector
[16:13:10] Running MetalDisconnector
[16:13:10] Running LargestFragmentChooser
[16:13:10] Fragment: [Zn+2]
[16:13:10] New largest fragment: [Zn+2]

| AqSolDB std rows: 9982


[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisconnector
[16:13:14] Running LargestFragmentChooser
[16:13:14] Initializing Normalizer
[16:13:14] Running Normalizer
[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisconnector
[16:13:14] Running LargestFragmentChooser
[16:13:14] Initializing Normalizer
[16:13:14] Running Normalizer
[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisconnector
[16:13:14] Running LargestFragmentChooser
[16:13:14] Initializing Normalizer
[16:13:14] Running Normalizer
[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisconnector
[16:13:14] Running LargestFragmentChooser
[16:13:14] Initializing Normalizer
[16:13:14] Running Normalizer
[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisconnector
[16:13:14] Running LargestFragmentChooser
[16:13:14] Initializing Normalizer
[16:13:14] Running Normalizer
[16:13:14] Initializing MetalDisconnector
[16:13:14] Running MetalDisc

Unnamed: 0,SMILES,LogS,SMILES_sanitized,SMILES_standardized
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,CCCCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,O=C1Nc2cccc3cccc1c23,-3.254767,O=C1Nc2cccc3cccc1c23,O=C1Nc2cccc3cccc1c23
2,Clc1ccc(C=O)cc1,-2.177078,O=Cc1ccc(Cl)cc1,O=Cc1ccc(Cl)cc1
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)...,CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)c1
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...


In [5]:
# 2. Count how many times each standardized molecule appears
counts = (
    df_std.groupby("SMILES_standardized")
    .size()
    .reset_index(name="n_measurements")
)

# 3. Keep only molecules that show up more than once
dupes = counts[counts["n_measurements"] > 1]

print("Number of unique standardized SMILES with multiple measurements:",
      len(dupes))

display(dupes.head())

# 4. Join back to see all measurements per duplicated molecule
dupe_details = (
    df_std.merge(dupes, on="SMILES_standardized", how="inner")
)

# Optional: summary stats per molecule (mean, std, etc.)
summary_stats = (
    dupe_details.groupby("SMILES_standardized")
    .agg(
        n_measurements=("LogS", "count"),
        values=("LogS", list),
        mean_LogS=("LogS", "mean"),
        std_LogS=("LogS", "std"),
        min_LogS=("LogS", "min"),
        max_LogS=("LogS", "max"),
    )
    .reset_index()
    .sort_values("n_measurements", ascending=False)
)

print("Per-molecule measurement summary (first few rows):")
display(summary_stats.head())

# If you want to inspect one molecule’s disagreement manually:
example_smiles = summary_stats.iloc[0]["SMILES_standardized"]
print("Example SMILES with repeats:", example_smiles)

display(
    dupe_details[dupe_details["SMILES_standardized"] == example_smiles]
)

Number of unique standardized SMILES with multiple measurements: 204


Unnamed: 0,SMILES_standardized,n_measurements
46,C,4
138,C/C=C/C,2
172,C1=CC2C3C=CC(C3)C2C1,2
178,C1=CCCC1,2
184,C1=C\CC/C=C\CC/1,2


Per-molecule measurement summary (first few rows):


Unnamed: 0,SMILES_standardized,n_measurements,values,mean_LogS,std_LogS,min_LogS,max_LogS
144,O=S(=O)([O-])[O-],18,"[-4.5661230993, -3.9794501756, -7.7060474791, ...",-1.382235,2.808417,-7.706047,1.041524
146,O=[N+]([O-])[O-],15,"[-2.642008137, -2.5035796109, 1.1700973958, -0...",-0.063009,1.257608,-2.642008,1.377095
127,O=C([O-])[O-],15,"[-5.5472319177, -5.0220270242, -5.0664023631, ...",-3.26791,2.33029,-5.547232,0.930069
140,O=P([O-])([O-])[O-],15,"[-4.1419008964, -3.0559869633999996, -4.605114...",-2.854341,2.263743,-7.338159,0.592898
14,CC(=O)[O-],14,"[0.2936043702, -0.3755357121, 0.4692256086, -0...",0.001521,0.777442,-2.386501,0.833767


Example SMILES with repeats: O=S(=O)([O-])[O-]


Unnamed: 0,SMILES,LogS,SMILES_sanitized,SMILES_standardized,n_measurements
71,O.O.O.O.[Pb].[Pb].[Pb].[Pb].[Pb].[O-][S]([O-])...,-4.566123,O.O.O.O.O=S(=O)([O-])[O-].[Pb].[Pb].[Pb].[Pb]....,O=S(=O)([O-])[O-],18
122,[Pb++].[O-][S]([O-])(=O)=O.O=[Pb].O=[Pb].O=[Pb],-3.97945,O=S(=O)([O-])[O-].O=[Pb].O=[Pb].O=[Pb].[Pb+2],O=S(=O)([O-])[O-],18
154,[O--].[Zr+4].[O-][S]([O-])(=O)=O,-7.706047,O=S(=O)([O-])[O-].[O-2].[Zr+4],O=S(=O)([O-])[O-],18
229,[Na+].[Na+].[O-][S]([O-])(=O)=O,0.126334,O=S(=O)([O-])[O-].[Na+].[Na+],O=S(=O)([O-])[O-],18
239,[OH-].[Cr+3].[O-][S]([O-])(=O)=O,0.782342,O=S(=O)([O-])[O-].[Cr+3].[OH-],O=S(=O)([O-])[O-],18
243,[O--].[O--].[O--].[O--].[O--].[O--].[Al+3].[Al...,-3.074227,O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O...,O=S(=O)([O-])[O-],18
291,[Cl].[Fe].[O-][S]([O-])(=O)=O,0.540234,O=S(=O)([O-])[O-].[Cl].[Fe],O=S(=O)([O-])[O-],18
293,[Fe++].[O-][S]([O-])(=O)=O,0.631333,O=S(=O)([O-])[O-].[Fe+2],O=S(=O)([O-])[O-],18
322,[O-][S]([O-])(=O)=O.O=[V++],0.457119,O=S(=O)([O-])[O-].O=[V+2],O=S(=O)([O-])[O-],18
329,[Fe+3].[Fe+3].[O-][S]([O-])(=O)=O.[O-][S]([O-]...,1.041524,O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O...,O=S(=O)([O-])[O-],18


In [6]:
# build Mol objects from your standardized SMILES
df_std["Mol"] = df_std["SMILES_standardized"].apply(lambda s: Chem.MolFromSmiles(s) if pd.notna(s) else None)

# drop rows with invalid molecules
df_std = df_std.dropna(subset=["Mol"]).reset_index(drop=True)

print(f"Kept {len(df_std)} molecules after Mol creation")

[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:14] Unusual charge on atom 0 number of radical electrons set to zero


Kept 9982 molecules after Mol creation


In [7]:
from sklearn.feature_selection import VarianceThreshold

def clean_descriptor_df(features_matrix):
    # 1) force numeric; non-numeric (like error strings) → NaN
    fm_n = features_matrix.apply(pd.to_numeric, errors="coerce")

    # 2) drop columns with too many NaNs
    fm_n = fm_n.dropna(axis=1)

    # 4) drop zero-variance columns
    sel = VarianceThreshold(0.0)
    fm_n2 = pd.DataFrame(
        sel.fit_transform(fm_n),
        columns=fm_n.columns[sel.get_support()],
        index=fm_n.index,
    )
    return fm_n2

In [8]:
from rdkit.Chem import MACCSkeys

def maccs_from_mol(mol):
    fp = MACCSkeys.GenMACCSKeys(mol)
    return list(fp)  # 167 bits

X_maccs = df_std["Mol"].apply(maccs_from_mol)
X_maccs = pd.DataFrame(X_maccs.tolist(), index=df_std.index)
print("MACCS shape:", X_maccs.shape)

MACCS shape: (9982, 167)


In [9]:
X_maccs_clean = clean_descriptor_df(X_maccs)
print("MACCS clean shape:", X_maccs_clean.shape)

MACCS clean shape: (9982, 162)


In [10]:
X_maccs_clean.head()

Unnamed: 0,3,5,6,7,8,9,10,11,12,13,...,156,157,158,159,160,161,162,163,164,165
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,1,1,1,1,1


In [11]:
import os
from mordred import Calculator, descriptors

# build calculator (ignore_3D=True keeps it fast)
calc = Calculator(descriptors, ignore_3D=True)

# parallelly compute using existing Mol objects
mols = df_std["Mol"].tolist()
rows = [list(r) for r in calc.map(mols, nproc=os.cpu_count(), quiet=True)]
X_mordred = pd.DataFrame(rows, columns=[str(d) for d in calc.descriptors], index=df_std.index)

# clean: replace inf, coerce errors-to-NaN, drop very sparse cols, impute, drop zero-variance
X_mordred = X_mordred.replace([np.inf, -np.inf], np.nan).apply(pd.to_numeric, errors="coerce")

print("Mordred shape (cleaned):", X_mordred.shape)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:32] Unusual charge on atom 0 number of radical electrons set to zero


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


[16:13:34] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:34] Unusual charge on atom 0 number of radical electrons set to zero
[16:13:34] Unusual charge on atom 0 number of radical electrons set to zero


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


[16:14:05] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:05] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:05] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:08] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:08] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:08] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:20] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:20] Unusual charge on atom 0 number of radical electrons set to zero
[16:14:20] Unusual charge on atom 0 number of radical electrons set to zero


Mordred shape (cleaned): (9982, 1613)


In [12]:
X_mordred_clean = clean_descriptor_df(X_mordred)
print("MACCS clean shape:", X_mordred_clean.shape)

MACCS clean shape: (9982, 788)


In [13]:
X_mordred_clean.head()

Unnamed: 0,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,0.0,1.0,26.071695,2.12132,4.24264,26.071695,1.185077,3.91795,2.951912,0.134178,...,0.0,8.878079,53.22866,312.362477,4.593566,1716.0,19.0,88.0,86.0,5.375
1,0.0,0.0,17.518893,2.503145,4.873772,17.518893,1.347607,3.552497,3.395892,0.261222,...,7.050123,9.74455,59.616539,169.052764,8.452638,207.0,21.0,74.0,91.0,2.777778
2,0.0,0.0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,2.856388,0.317376,...,0.0,8.590258,37.289972,140.002892,10.000207,90.0,9.0,40.0,43.0,2.166667
3,1.0,0.0,33.660518,2.453396,4.906793,33.660518,1.294635,4.179367,4.336419,0.166785,...,0.0,10.169576,60.967815,345.149618,7.343609,1602.0,43.0,134.0,158.0,5.777778
4,0.0,0.0,42.485221,2.398507,4.736543,42.485221,1.370491,4.465186,5.450586,0.175825,...,8.486528,10.35115,90.719091,422.220557,6.921648,3248.0,42.0,174.0,206.0,6.555556


In [14]:
from sklearn.model_selection import train_test_split

y = df_std["LogS"]  # target

X_train_mord, X_test_mord, y_train, y_test = train_test_split(
    X_mordred_clean, y, test_size=0.2, random_state=42
)

X_train_maccs, X_test_maccs, _, _ = train_test_split(
    X_maccs_clean, y, test_size=0.2, random_state=42
)

In [15]:
from sklearn.metrics import mean_squared_error, r2_score

def train_test_model(model, X_train, X_test, y_train):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, y_pred

def evaluate(preds, y_test):
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    return {"RMSE": rmse, "R2": r2}

In [16]:
from sklearn.ensemble import RandomForestRegressor

rf_mordred, preds = train_test_model(RandomForestRegressor(random_state=42, n_jobs=-1), X_train_mord, X_test_mord, y_train)
rf_mordred_preds = evaluate(preds, y_test)

rf_maccs, preds = train_test_model(RandomForestRegressor(random_state=42, n_jobs=-1), X_train_maccs, X_test_maccs, y_train)
rf_maccs_preds = evaluate(preds, y_test)

print("Mordred + RF:", rf_mordred_preds)
print("MACCS + RF:", rf_maccs_preds)

Mordred + RF: {'RMSE': 1.0783172393351856, 'R2': 0.7856508572054841}
MACCS + RF: {'RMSE': 1.3055871669554853, 'R2': 0.6857752427555215}


In [17]:
from sklearn.svm import SVR

svm_maccs, preds = train_test_model(SVR(), X_train_maccs, X_test_maccs, y_train)
svm_maccs_preds = evaluate(preds, y_test)


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

svm_mordred = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR())
])

svm_mordred, preds = train_test_model(svm_mordred, X_train_mord, X_test_mord, y_train)
svm_mordred_preds = evaluate(preds, y_test)

print("Mordred + SVM:", svm_mordred_preds)
print("MACCS + SVM:", svm_maccs_preds)

Mordred + SVM: {'RMSE': 1.129866303200361, 'R2': 0.7646670346144492}
MACCS + SVM: {'RMSE': 1.287081580237302, 'R2': 0.6946198498654658}


In [18]:
results = {"Mordred + RF": rf_mordred_preds, "Mordred + SVM": svm_mordred_preds, "MACCS + RF": rf_maccs_preds, "MACCS + SVM": svm_maccs_preds}
pd.DataFrame(results).T

Unnamed: 0,RMSE,R2
Mordred + RF,1.078317,0.785651
Mordred + SVM,1.129866,0.764667
MACCS + RF,1.305587,0.685775
MACCS + SVM,1.287082,0.69462


In [19]:
challenge_1 = pd.read_csv("challenge_data_1.csv")
challenge_1.head()

Unnamed: 0,SMILES,LogS
0,COc1ccc(Cl)c(Nc2ncnc3cc(OCCN4CCN(C(C)=O)CC4)cc...,-4.369572
1,CCN(C(=O)Cc1ccc(S(C)(=O)=O)cc1)C1CCN(CCC(c2ccc...,-4.159894
2,N#CC1(NC(=O)[C@@H]2CCCC[C@H]2C(=O)N2CCc3[nH]c4...,-4.130182
3,CN(C)C(=N)c1ccc(C(=O)N2CCN(S(=O)(=O)c3ccc4cc(B...,-2.879986
4,N#Cc1cccnc1-c1ccc(C(=O)Nc2ccccc2N)cc1,-4.449772


In [20]:
challenge_2 = pd.read_csv("challenge_data_2.csv")
challenge_2.head()

Unnamed: 0,SMILES,LogS
0,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,-8.54802
1,CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1,-8.071409
2,CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C...,-6.925969
3,CC#CC(=O)N[C@H]1CCCN(c2c(F)cc(C(N)=O)c3[nH]c(C...,-7.53528
4,C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...,-7.709963


In [21]:
# --- prepare (same sanitize → standardize → Mol) ---
def prepare(df):
    df = sanitize_dataframe(df, smiles_col="SMILES")
    df = add_standardized_smiles(df, smiles_col="SMILES_sanitized")
    df["Mol"] = df["SMILES_standardized"].apply(lambda s: Chem.MolFromSmiles(s) if pd.notna(s) else None)
    df = df.dropna(subset=["Mol"]).reset_index(drop=True)
    return df

# Prepare challenge sets
challenge_1_prep = prepare(challenge_1)
challenge_2_prep = prepare(challenge_2)

# Mordred featurization (same calc/map you already use)
mols_1 = challenge_1_prep["Mol"].tolist()
mols_2 = challenge_2_prep["Mol"].tolist()

rows_1 = [list(r) for r in calc.map(mols_1, nproc=os.cpu_count(), quiet=True)]
rows_2 = [list(r) for r in calc.map(mols_2, nproc=os.cpu_count(), quiet=True)]

X_c1_mord = pd.DataFrame(rows_1, columns=[str(d) for d in calc.descriptors], index=challenge_1_prep.index)
X_c2_mord = pd.DataFrame(rows_2, columns=[str(d) for d in calc.descriptors], index=challenge_2_prep.index)

# Clean with your function
X_c1_mord = clean_descriptor_df(X_c1_mord)
X_c2_mord = clean_descriptor_df(X_c2_mord)

# Align to training columns
X_c1_mord = X_c1_mord.reindex(columns=X_train_mord.columns)
X_c2_mord = X_c2_mord.reindex(columns=X_train_mord.columns)

# Predict with your trained RF (Mordred)
pred_c1_rf_mord = rf_mordred.predict(X_c1_mord)
pred_c2_rf_mord = rf_mordred.predict(X_c2_mord)

c1_metrics = evaluate(pred_c1_rf_mord, challenge_1_prep["LogS"])
c2_metrics = evaluate(pred_c2_rf_mord, challenge_2_prep["LogS"])

challenge_results = {"Challenge Data #1": c1_metrics, "Challenge Data #2": c2_metrics}
pd.DataFrame(challenge_results).T

[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:18:40] Initializing MetalDisconnector
[16:18:40] Running MetalDisconnector
[16:18:40] Running LargestFragmentChooser
[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:18:40] Initializing MetalDisconnector
[16:18:40] Running MetalDisconnector
[16:18:40] Running LargestFragmentChooser
[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:18:40] Initializing MetalDisconnector
[16:18:40] Running MetalDisconnector
[16:18:40] Running LargestFragmentChooser
[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:18:40] Initializing MetalDisconnector
[16:18:40] Running MetalDisconnector
[16:18:40] Running LargestFragmentChooser
[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:18:40] Initializing MetalDisconnector
[16:18:40] Running MetalDisconnector
[16:18:40] Running LargestFragmentChooser
[16:18:40] Initializing Normalizer
[16:18:40] Running Normalizer
[16:1

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Unnamed: 0,RMSE,R2
Challenge Data #1,1.133438,-0.377182
Challenge Data #2,3.677916,-25.864541
