In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("D2_GPCR.csv", index_col=0)
df.head()

Unnamed: 0_level_0,Smiles,Activity Type,p-value (-log)
Common name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dopamine,NCCc1ccc(O)c(O)c1,IC50,4.0
CHEMBL1259071,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,Ki,4.0
CHEMBL407818,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1,Ki,4.06
CHEMBL392401,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1,Ki,4.06
procaterol,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12,Ki,4.07


In [3]:
df_ki = df[df["Activity Type"].isin(["Ki", "pKi"])].copy()
df_ki = df_ki.rename(columns={"p-value (-log)": "pKi"})
print(df_ki.shape)

(9320, 3)


In [4]:
from rdkit import Chem

# helper: take a SMILES string → return sanitized canonical SMILES or None if invalid
def sanitize_smiles(smiles_str):
    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return None  # invalid / can't parse
    Chem.SanitizeMol(mol)  # ensures valence, aromaticity, etc.
    return Chem.MolToSmiles(mol, canonical=True)  # write back a clean canonical SMILES

# helper: apply to a whole dataframe
def sanitize_dataframe(df, smiles_col="SMILES"):
    df = df.copy()

    # generate sanitized SMILES
    df["SMILES_sanitized"] = df[smiles_col].apply(sanitize_smiles)

    # drop rows where SMILES failed to sanitize (None)
    df = df.dropna(subset=["SMILES_sanitized"]).reset_index(drop=True)

    return df

df_clean = sanitize_dataframe(df_ki, smiles_col="Smiles")

# ---- (optional) sanity check ----
print("| D2_GPCR:", len(df_ki), "→", len(df_clean))

df_clean.head()


| D2_GPCR: 9320 → 9320


Unnamed: 0,Smiles,Activity Type,pKi,SMILES_sanitized
0,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,Ki,4.0,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1
1,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1,Ki,4.06,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1
2,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1,Ki,4.06,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1
3,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12,Ki,4.07,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12
4,COc1ccsc1CNCC[C@@]1(c2ccccn2)CCOC2(CCCC2)C1,Ki,4.08,COc1ccsc1CNCC[C@@]1(c2ccccn2)CCOC2(CCCC2)C1


In [5]:
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize_smiles(smiles_str):
    if pd.isna(smiles_str):
        return None

    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return None

    # 1. Normalize functional groups etc.
    normalizer = rdMolStandardize.Normalizer()
    mol = normalizer.normalize(mol)

    # 2. Disconnect metals
    metal_disconnector = rdMolStandardize.MetalDisconnector()
    mol = metal_disconnector.Disconnect(mol)

    # 3. Remove fragments, keep largest organic
    fl = rdMolStandardize.LargestFragmentChooser()
    mol = fl.choose(mol)

    # 4. Reionize
    reionizer = rdMolStandardize.Reionizer()
    mol = reionizer.reionize(mol)

    # 5. Canonical SMILES
    return Chem.MolToSmiles(mol, canonical=True)


def add_standardized_smiles(df, smiles_col="SMILES"):
    df = df.copy()
    df["SMILES_standardized"] = df[smiles_col].apply(standardize_smiles)
    df = df.dropna(subset=["SMILES_standardized"]).reset_index(drop=True)
    return df

df_std = add_standardized_smiles(df_clean, smiles_col="SMILES_sanitized")

print("| D2 GPCR std rows:", len(df_std))

df_std.head()


[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:14:15] Initializing MetalDisconnector
[16:14:15] Running MetalDisconnector
[16:14:15] Running LargestFragmentChooser
[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:14:15] Initializing MetalDisconnector
[16:14:15] Running MetalDisconnector
[16:14:15] Running LargestFragmentChooser
[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:14:15] Initializing MetalDisconnector
[16:14:15] Running MetalDisconnector
[16:14:15] Running LargestFragmentChooser
[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:14:15] Initializing MetalDisconnector
[16:14:15] Running MetalDisconnector
[16:14:15] Running LargestFragmentChooser
[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:14:15] Initializing MetalDisconnector
[16:14:15] Running MetalDisconnector
[16:14:15] Running LargestFragmentChooser
[16:14:15] Initializing Normalizer
[16:14:15] Running Normalizer
[16:1

| D2 GPCR std rows: 9320


[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:14:22] Initializing MetalDisconnector
[16:14:22] Running MetalDisconnector
[16:14:22] Running LargestFragmentChooser
[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:14:22] Initializing MetalDisconnector
[16:14:22] Running MetalDisconnector
[16:14:22] Running LargestFragmentChooser
[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:14:22] Initializing MetalDisconnector
[16:14:22] Running MetalDisconnector
[16:14:22] Running LargestFragmentChooser
[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:14:22] Initializing MetalDisconnector
[16:14:22] Running MetalDisconnector
[16:14:22] Running LargestFragmentChooser
[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:14:22] Initializing MetalDisconnector
[16:14:22] Running MetalDisconnector
[16:14:22] Running LargestFragmentChooser
[16:14:22] Initializing Normalizer
[16:14:22] Running Normalizer
[16:1

Unnamed: 0,Smiles,Activity Type,pKi,SMILES_sanitized,SMILES_standardized
0,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,Ki,4.0,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1
1,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1,Ki,4.06,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1
2,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1,Ki,4.06,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1,Oc1cccc(C2NCCc3c2[nH]c2ccccc32)c1
3,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12,Ki,4.07,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12,CCC(NC(C)C)C(O)c1ccc(O)c2[nH]c(=O)ccc12
4,COc1ccsc1CNCC[C@@]1(c2ccccn2)CCOC2(CCCC2)C1,Ki,4.08,COc1ccsc1CNCC[C@@]1(c2ccccn2)CCOC2(CCCC2)C1,COc1ccsc1CNCC[C@@]1(c2ccccn2)CCOC2(CCCC2)C1


In [6]:
# 2. Count how many times each standardized molecule appears
counts = (
    df_std.groupby("SMILES_standardized")
    .size()
    .reset_index(name="n_measurements")
)

# 3. Keep only molecules that show up more than once
dupes = counts[counts["n_measurements"] > 1]

print("Number of unique standardized SMILES with multiple measurements:",
      len(dupes))

display(dupes.head())

# 4. Join back to see all measurements per duplicated molecule
dupe_details = (
    df_std.merge(dupes, on="SMILES_standardized", how="inner")
)

# Optional: summary stats per molecule (mean, std, etc.)
summary_stats = (
    dupe_details.groupby("SMILES_standardized")
    .agg(
        n_measurements=("pKi", "count"),
        values=("pKi", list),
        mean_pKi=("pKi", "mean"),
        std_pKi=("pKi", "std"),
        min_pKi=("pKi", "min"),
        max_pKi=("pKi", "max"),
    )
    .reset_index()
    .sort_values("n_measurements", ascending=False)
)

print("Per-molecule measurement summary (first few rows):")
display(summary_stats.head())

# If you want to inspect one molecule’s disagreement manually:
example_smiles = summary_stats.iloc[0]["SMILES_standardized"]
print("Example SMILES with repeats:", example_smiles)

display(
    dupe_details[dupe_details["SMILES_standardized"] == example_smiles]
)

Number of unique standardized SMILES with multiple measurements: 1221


Unnamed: 0,SMILES_standardized,n_measurements
2,Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1,2
4,Brc1ccc2c(c1)C1CN(Cc3ccccc3)CC1CO2,2
5,Brc1cccc(N2CCN(Cc3cc4ccccn4n3)CC2)n1,2
6,Brc1cccc(N2CCN(Cc3cnn4ccccc34)CC2)n1,2
8,C#CC1=CCC(N(CCC)CCC)CC1,10


Per-molecule measurement summary (first few rows):


Unnamed: 0,SMILES_standardized,n_measurements,values,mean_pKi,std_pKi,min_pKi,max_pKi
988,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69,"[5.19, 7.75, 8.02, 8.04, 8.07, 8.1, 8.13, 8.2,...",8.723913,0.638395,5.19,9.92
97,CCCCN(CCCC)N=O,64,"[6.09, 6.23, 6.37, 6.45, 6.55, 6.56, 6.57, 6.5...",7.094062,0.459481,6.09,8.72
381,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,52,"[6.0, 6.06, 6.35, 6.37, 6.38, 6.41, 6.43, 6.48...",6.922115,0.4325,6.0,8.16
951,NCCc1ccc(O)c(O)c1,44,"[4.82, 4.94, 5.0, 5.1, 5.25, 5.43, 5.68, 5.72,...",6.65,1.027526,4.82,8.72
780,Cc1cc2c(s1)=Nc1ccccc1NC=2N1CCN(C)CC1,31,"[6.76, 6.97, 7.0, 7.11, 7.14, 7.24, 7.28, 7.35...",7.68871,0.489964,6.76,8.68


Example SMILES with repeats: O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1


Unnamed: 0,Smiles,Activity Type,pKi,SMILES_sanitized,SMILES_standardized,n_measurements
254,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,5.19,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
2381,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,7.75,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
2580,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,8.02,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
2605,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,pKi,8.04,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
2649,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,8.07,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
...,...,...,...,...,...,...
3456,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,pKi,9.52,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
3460,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,9.58,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
3465,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,9.60,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69
3488,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,Ki,9.80,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,69


In [7]:
# build Mol objects from your standardized SMILES
df_std["Mol"] = df_std["SMILES_standardized"].apply(lambda s: Chem.MolFromSmiles(s) if pd.notna(s) else None)

# drop rows with invalid molecules
df_std = df_std.dropna(subset=["Mol"]).reset_index(drop=True)

print(f"Kept {len(df_std)} molecules after Mol creation")

Kept 9320 molecules after Mol creation


In [8]:
from sklearn.feature_selection import VarianceThreshold

def clean_descriptor_df(features_matrix):
    # 1) force numeric; non-numeric (like error strings) → NaN
    fm_n = features_matrix.apply(pd.to_numeric, errors="coerce")

    # 2) drop columns with too many NaNs
    fm_n = fm_n.dropna(axis=1)

    # 4) drop zero-variance columns
    sel = VarianceThreshold(0.0)
    fm_n2 = pd.DataFrame(
        sel.fit_transform(fm_n),
        columns=fm_n.columns[sel.get_support()],
        index=fm_n.index,
    )
    return fm_n2

In [9]:
from rdkit.Chem import Descriptors, AllChem

# ---------- RDKit descriptors (exclude Ipc) ----------
desc_list = [(n, f) for (n, f) in Descriptors._descList if n != "Ipc"]
rd_names = [n for (n, _) in desc_list]

def rdkit_desc_from_mol(mol):
    vals = []
    for name, func in desc_list:
        try:
            v = func(mol)
        except Exception:
            v = np.nan
        vals.append(v)
    return vals

X_rd = pd.DataFrame(
    [rdkit_desc_from_mol(m) for m in df_std["Mol"]],
    columns=rd_names,
    index=df_std.index,
).replace([np.inf, -np.inf], np.nan)


# drop all-NaN columns (rare but safe)
X_rd = X_rd.dropna(axis=1, how="all")
print(X_rd.shape)



(9320, 209)


In [10]:
X_rd_clean = clean_descriptor_df(X_rd)
print(X_rd_clean.shape)

(9320, 185)


In [11]:
X_rd_clean.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,14.956975,14.956975,0.005033,-1.266359,0.787883,16.84,426.355,405.187,425.057274,134.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,9.392714,9.392714,0.319881,0.319881,0.806581,16.086957,308.385,288.225,308.163711,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.69351,9.69351,0.12338,0.12338,0.632215,18.1,264.328,248.2,264.126263,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.410826,11.410826,0.003257,-0.716294,0.678675,14.52381,290.363,268.187,290.163043,114.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.335622,6.335622,0.090024,0.090024,0.701081,24.333333,386.561,356.321,386.202799,146.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
from rdkit import DataStructs

# ---------- Morgan fingerprints (radius=2, nBits=2048) ----------
def morgan_bits(mol, radius=2, nBits=2048):
    arr = np.zeros((nBits,), dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

X_morgan = pd.DataFrame(
    np.vstack(df_std["Mol"].apply(morgan_bits)),
    index=df_std.index
)
print(X_morgan.shape)



(9320, 2048)




In [13]:
X_morgan_clean = clean_descriptor_df(X_morgan)
print(X_morgan_clean.shape)

(9320, 2042)


In [14]:
X_morgan_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from sklearn.model_selection import train_test_split

y = df_std["pKi"]  # target

X_train_rd, X_test_rd, y_train, y_test = train_test_split(
    X_rd, y, test_size=0.2, random_state=42
)

X_train_morgan, X_test_morgan, _, _ = train_test_split(
    X_morgan_clean, y, test_size=0.2, random_state=42
)

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

def train_test_model(model, X_train, X_test, y_train):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, y_pred

def evaluate(preds, y_test):
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    return {"RMSE": rmse, "R2": r2}

In [17]:
from xgboost import XGBRegressor

xgb_rd, preds = train_test_model(XGBRegressor(n_jobs=-1, random_state=42), X_train_rd, X_test_rd, y_train)
xgb_rd_preds = evaluate(preds, y_test)

xgb_morgan, preds = train_test_model(XGBRegressor(n_jobs=-1, random_state=42), X_train_morgan, X_test_morgan, y_train)
xgb_morgan_preds = evaluate(preds, y_test)

print("RDKit + XGBoost:", xgb_rd_preds)
print("Morgan + XGBoost:", xgb_morgan_preds)

RDKit + XGBoost: {'RMSE': 0.7155708324917749, 'R2': 0.6052472206443201}
Morgan + XGBoost: {'RMSE': 0.7033898316130899, 'R2': 0.6185724062635563}


In [18]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# MLP benefits from scaling on continuous descriptors (RDKit)
mlp_rd = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("mlp", MLPRegressor(
        hidden_layer_sizes=(256,128),
        activation="relu",
        alpha=1e-4,
        batch_size=256,
        learning_rate_init=1e-3,
        max_iter=300,
        early_stopping=True,
        n_iter_no_change=20,
        random_state=42
    ))
])

mlp_rd, preds = train_test_model(mlp_rd, X_train_rd, X_test_rd, y_train)
mlp_rd_preds = evaluate(preds, y_test)

# For binary fingerprints, no scaling is recommended
mlp_mg = MLPRegressor(
    hidden_layer_sizes=(512,256),
    activation="relu",
    alpha=1e-4,
    batch_size=512,
    learning_rate_init=1e-3,
    max_iter=300,
    early_stopping=True,
    n_iter_no_change=20,
    random_state=42
)
mlp_morgan, preds = train_test_model(mlp_mg, X_train_morgan, X_test_morgan, y_train)
mlp_morgan_preds = evaluate(preds, y_test)

print("RDKit + MLP:", mlp_rd_preds)
print("Morgan + MLP:", mlp_morgan_preds)

RDKit + MLP: {'RMSE': 0.7608362191309296, 'R2': 0.5537252718388266}
Morgan + MLP: {'RMSE': 0.7117498182868424, 'R2': 0.6094517766151921}


In [19]:
results = {"RDKit + XGBoost": xgb_rd_preds, "RDKit + MLP": mlp_rd_preds, "Morgan FPs + XGBoost": xgb_morgan_preds, "Morgan FPs + MLP": mlp_morgan_preds}
pd.DataFrame(results).T

Unnamed: 0,RMSE,R2
RDKit + XGBoost,0.715571,0.605247
RDKit + MLP,0.760836,0.553725
Morgan FPs + XGBoost,0.70339,0.618572
Morgan FPs + MLP,0.71175,0.609452


In [20]:
challenge = pd.read_csv("challenge_data.csv")
print(challenge.shape)
challenge.head()

(469, 4)


Unnamed: 0,Common name,Smiles,Activity Type,p-value (-log)
0,α-ergocryptine,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,IC50,8.71
1,α-ergocryptine,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,Ki,9.19
2,α-ergocryptine,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,pKi,8.04
3,(+)-sulpiride,CCN1CCC[C@@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC,AC50,6.09
4,(+)-sulpiride,CCN1CCC[C@@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC,IC50,6.89


In [21]:
challenge_ki = challenge[challenge["Activity Type"].isin(["Ki", "pKi"])].copy()
challenge_ki = challenge_ki.rename(columns={"p-value (-log)": "pKi"})
print(challenge_ki.shape)
challenge_ki.head()

(225, 4)


Unnamed: 0,Common name,Smiles,Activity Type,pKi
1,α-ergocryptine,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,Ki,9.19
2,α-ergocryptine,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,pKi,8.04
6,(+)-sulpiride,CCN1CCC[C@@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC,Ki,7.17
7,(+)-sulpiride,CCN1CCC[C@@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC,Ki,7.37
12,amiodarone,CCCCc1oc2ccccc2c1C(=O)c1cc(I)c(OCCN(CC)CC)c(I)c1,Ki,5.18


In [22]:
# --- prepare (same sanitize → standardize → Mol) ---
def prepare(df):
    df = sanitize_dataframe(df, smiles_col="Smiles")
    df = add_standardized_smiles(df, smiles_col="SMILES_sanitized")
    df["Mol"] = df["SMILES_standardized"].apply(lambda s: Chem.MolFromSmiles(s) if pd.notna(s) else None)
    df = df.dropna(subset=["Mol"]).reset_index(drop=True)
    return df

# Prepare challenge sets
challenge_prep = prepare(challenge_ki)

# RDKit featurization (same calc/map you already use)
X_challenge = pd.DataFrame(
    [rdkit_desc_from_mol(m) for m in challenge_prep["Mol"]],
    columns=rd_names,
    index=challenge_prep.index,
).replace([np.inf, -np.inf], np.nan)

# drop all-NaN columns (rare but safe)
X_challenge = X_challenge.dropna(axis=1, how="all")
print(X_rd.shape)

X_challenge_clean = clean_descriptor_df(X_challenge)
print(X_challenge_clean.shape)

# Align to training columns
X_challenge_clean = X_challenge_clean.reindex(columns=X_train_rd.columns)

# Predict with your trained RF (Mordred)
pred_challenge_xgb_rd = xgb_rd.predict(X_challenge_clean)

challenge_metrics = evaluate(pred_challenge_xgb_rd, challenge_prep["pKi"])

challenge_results = {"Challenge Data": challenge_metrics}
pd.DataFrame(challenge_results).T

[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:18:55] Initializing MetalDisconnector
[16:18:55] Running MetalDisconnector
[16:18:55] Running LargestFragmentChooser
[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:18:55] Initializing MetalDisconnector
[16:18:55] Running MetalDisconnector
[16:18:55] Running LargestFragmentChooser
[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:18:55] Initializing MetalDisconnector
[16:18:55] Running MetalDisconnector
[16:18:55] Running LargestFragmentChooser
[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:18:55] Initializing MetalDisconnector
[16:18:55] Running MetalDisconnector
[16:18:55] Running LargestFragmentChooser
[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:18:55] Initializing MetalDisconnector
[16:18:55] Running MetalDisconnector
[16:18:55] Running LargestFragmentChooser
[16:18:55] Initializing Normalizer
[16:18:55] Running Normalizer
[16:1

(9320, 209)
(225, 175)


Unnamed: 0,RMSE,R2
Challenge Data,1.043199,-0.154292
