In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
import numpy as np
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import os
import joblib
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rdkit.Chem import Descriptors
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
# Descriptor names in the order of Descriptors.descList; used as column labels for descriptor tables.
DESC_NAMES = [entry[0] for entry in Descriptors.descList]

def compute_rdkit_descriptors(smiles):
    """Compute every descriptor in ``Descriptors.descList`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        One value per entry of ``DESC_NAMES``, in the same order. Every entry
        is NaN when ``smiles`` is not a string or cannot be parsed; a single
        entry is NaN when that descriptor function raises for the molecule.
    """
    # A non-string input (e.g. a float NaN from a missing table cell) would make
    # MolFromSmiles raise; treat it the same way as an unparseable SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [np.nan] * len(DESC_NAMES)

    values = []
    for _, func in Descriptors.descList:
        try:
            values.append(func(mol))
        except Exception:
            # One failing descriptor should not discard the rest of the row;
            # NaN is already this function's marker for "no value".
            values.append(np.nan)
    return values


def compute_descriptor_df(df, smiles_col="smiles"):
    """Build a descriptor table with one row per SMILES in ``df[smiles_col]``.

    Rows follow the order of ``df``; columns are ``DESC_NAMES``. The returned
    frame gets a default index (``df``'s own index is not carried over).
    """
    rows = [compute_rdkit_descriptors(smi) for smi in df[smiles_col]]
    return pd.DataFrame(rows, columns=DESC_NAMES)
def select_correlated_descriptors(X, y, threshold=0.05, min_samples=30):
    """
    Select descriptors with |Pearson r| >= threshold using pairwise complete data.

    Parameters
    ----------
    X : pandas.DataFrame
        One column per descriptor, one row per sample.
    y : array-like
        Numeric target with one entry per row of ``X`` (matched by position).
    threshold : float
        Minimum absolute Pearson correlation for a column to be kept.
    min_samples : int
        Columns with fewer finite (x, y) pairs than this are skipped.

    Returns
    -------
    selected : list
        Labels of the columns that passed, in ``X``'s column order.
    correlations : pandas.Series
        Pearson r of each selected column, sorted by decreasing |r|. Always
        float dtype, and empty when no column passes.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly one entry per row of ``X``.
    """
    selected = []
    correlations = {}

    # Flat float array: NaN-compatible, and rules out accidental (n, 1) broadcasting.
    y = np.asarray(y, dtype=float).ravel()
    if y.shape[0] != len(X):
        raise ValueError(
            f"y has {y.shape[0]} entries but X has {len(X)} rows"
        )

    # The target's validity does not depend on the column, so compute it once.
    y_finite = np.isfinite(y)

    for col in X.columns:
        x = X[col].values

        # Mask valid pairs
        mask = np.isfinite(x) & y_finite

        if mask.sum() < min_samples:
            continue

        x_valid = x[mask]
        y_valid = y[mask]

        # Skip zero-variance descriptors
        if np.std(x_valid) == 0:
            continue

        r, _ = pearsonr(x_valid, y_valid)

        # r can be NaN/inf (constant target, overflow); such columns are dropped.
        if np.isfinite(r) and abs(r) >= threshold:
            selected.append(col)
            correlations[col] = r

    print(f"âœ… Selected {len(selected)} descriptors with |r| â‰¥ {threshold}")
    # Explicit dtype keeps the result numeric even when nothing was selected.
    ranked = pd.Series(correlations, dtype=float).sort_values(key=np.abs, ascending=False)
    return selected, ranked
# Base model 1: gradient boosting, 300 stages of depth-3 trees.
gb_model = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
)

# Base model 2: histogram-based gradient boosting, up to 300 iterations of depth-6 trees.
hgb_model = HistGradientBoostingClassifier(
    random_state=42,
    max_depth=6,
    max_iter=300,
    learning_rate=0.05,
)
def _load_split(csv_path):
    """Read one split CSV and return its 'smiles' and 'label' columns without incomplete rows.

    The 'LD50_class' column is renamed to 'label'. The index is reset after
    dropping rows so that row labels match the default index of the frames
    built by compute_descriptor_df.
    """
    split = pd.read_csv(csv_path).rename(columns={"LD50_class": "label"})
    return split[["smiles", "label"]].dropna().reset_index(drop=True)


def _finite_or_nan(frame):
    """Return a copy of ``frame`` with +/-inf replaced by NaN."""
    return frame.replace([np.inf, -np.inf], np.nan)


df_train = _load_split('./NIHDataset/train_df_class.csv')
df_test = _load_split('./NIHDataset/test_df_class.csv')
df_val = _load_split('./NIHDataset/val_df_class.csv')

print("ðŸ”¹ Computing RDKit descriptors...")
X_train_desc = compute_descriptor_df(df_train)
X_val_desc   = compute_descriptor_df(df_val)
X_test_desc  = compute_descriptor_df(df_test)

y_train = df_train["label"].values
y_val   = df_val["label"].values
y_test  = df_test["label"].values

# Feature selection is driven by the training split only; the same columns
# are then taken from validation and test.
selected_desc, corr_series = select_correlated_descriptors(X_train_desc, y_train, threshold=0.05)

# KNNImputer fills NaN but rejects infinite values, so infinities are turned
# into NaN first and imputed like any other missing entry.
X_train_sel = _finite_or_nan(X_train_desc[selected_desc])
X_val_sel   = _finite_or_nan(X_val_desc[selected_desc])
X_test_sel  = _finite_or_nan(X_test_desc[selected_desc])

# Imputer and scaler are fitted on the training split and only applied to the others.
imputer = KNNImputer(n_neighbors=5, weights="distance")

X_train_imp = imputer.fit_transform(X_train_sel)
X_val_imp   = imputer.transform(X_val_sel)
X_test_imp  = imputer.transform(X_test_sel)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train_imp)
X_val   = scaler.transform(X_val_imp)
X_test  = scaler.transform(X_test_imp)

print("âœ… Final feature shapes:")
print("Train:", X_train.shape)
print("Val  :", X_val.shape)
print("Test :", X_test.shape)


print("\nðŸš€ Training base models...")

gb_model.fit(X_train, y_train)
hgb_model.fit(X_train, y_train)

# Validation predictions and class probabilities for each base model.
val_pred_gb  = gb_model.predict(X_val)
val_pred_hgb = hgb_model.predict(X_val)

val_prob_gb  = gb_model.predict_proba(X_val)
val_prob_hgb = hgb_model.predict_proba(X_val)


ðŸ”¹ Computing RDKit descriptors...
âœ… Selected 97 descriptors with |r| â‰¥ 0.05
âœ… Final feature shapes:
Train: (7117, 97)
Val  : (890, 97)
Test : (890, 97)

ðŸš€ Training base models...


In [2]:
def evaluate(y_true, y_pred, name):
    """Print accuracy, macro-averaged F1 and MCC for one set of predictions.

    The block is headed by ``name``. Returns the Matthews correlation
    coefficient.
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    mcc = matthews_corrcoef(y_true, y_pred)

    print(f"\n{name}")
    for tag, value in (("ACC :", accuracy), ("F1  :", macro_f1), ("MCC :", mcc)):
        print(tag, value)
    return mcc

mcc_gb  = evaluate(y_val, val_pred_gb,  "GradientBoosting")
mcc_hgb = evaluate(y_val, val_pred_hgb, "HistGradientBoosting")

# Blend weights are proportional to validation MCC; a negative MCC is clipped to zero.
weights = np.clip(np.array([mcc_gb, mcc_hgb], dtype=float), 0, None)
weight_total = weights.sum()
if weight_total > 0:
    weights = weights / weight_total
else:
    # No model has a positive MCC: normalising would divide by zero and turn
    # every probability into NaN, so fall back to an equal blend.
    weights = np.full(weights.shape, 1.0 / weights.size)

print("\nðŸ“Š Ensemble Weights:")
print("GB  :", weights[0])
print("HGB :", weights[1])

prob_gb  = gb_model.predict_proba(X_test)
prob_hgb = hgb_model.predict_proba(X_test)

# predict_proba columns are ordered like each estimator's classes_, so the
# two matrices can only be added when both models use the same class order.
class_labels = gb_model.classes_
if not np.array_equal(class_labels, hgb_model.classes_):
    raise ValueError("Base models were fitted on different class sets; cannot blend probabilities")

ensemble_probs = (
    weights[0] * prob_gb +
    weights[1] * prob_hgb
)

# argmax yields a column position; translate it back to the actual class label
# so the predictions stay correct when labels are not exactly 0..K-1.
ensemble_preds = class_labels[np.argmax(ensemble_probs, axis=1)]

print("\n================ FINAL TEST PERFORMANCE ================")

print("Accuracy:", accuracy_score(y_test, ensemble_preds))
print("F1 macro:", f1_score(y_test, ensemble_preds, average="macro"))
print("MCC:", matthews_corrcoef(y_test, ensemble_preds))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, ensemble_preds))

print("\nClassification Report:")
print(classification_report(y_test, ensemble_preds, digits=4))



GradientBoosting
ACC : 0.648314606741573
F1  : 0.4878039013569899
MCC : 0.3598138917176483

HistGradientBoosting
ACC : 0.6550561797752809
F1  : 0.5404026705844274
MCC : 0.3868917994865379

ðŸ“Š Ensemble Weights:
GB  : 0.4818684201233139
HGB : 0.518131579876686

Accuracy: 0.6393258426966292
F1 macro: 0.4852115485239922
MCC: 0.329954952428251

Confusion Matrix:
[[ 31  14  25   0]
 [ 12  54 124   1]
 [  5  32 462   9]
 [  0   3  96  22]]

Classification Report:
              precision    recall  f1-score   support

           0     0.6458    0.4429    0.5254        70
           1     0.5243    0.2827    0.3673       191
           2     0.6535    0.9094    0.7605       508
           3     0.6875    0.1818    0.2876       121

    accuracy                         0.6393       890
   macro avg     0.6278    0.4542    0.4852       890
weighted avg     0.6298    0.6393    0.5933       890

