In [22]:
import json
import subprocess
import sys

def ensure_package(import_name: str, install_name: str | None = None) -> None:
    """Install a pip package if missing (best-effort)."""
    install_name = install_name or import_name
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'list', '--format=json'],
        check=True,
        capture_output=True,
        text=True,
    )
    installed = {pkg['name'].lower() for pkg in json.loads(result.stdout)}
    # Some packages have different import vs distribution names (e.g., rdkit-pypi -> rdkit)
    if import_name.lower() in installed or (install_name and install_name.lower() in installed):
        print(f'{import_name} already installed.')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', install_name])
    print(f'Installed {install_name} (import as {import_name}).')
    
for pkg in [
    ('scikit-learn', 'scikit-learn'),
    ('lightgbm', 'lightgbm'),
    ('optuna', 'optuna'),
]:
    try:
        ensure_package(pkg[0], pkg[1])
    except Exception as exc:
        print(f"Package install check failed for {pkg}: {exc}")

import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
#stratifed kfold 
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scikit-learn already installed.
lightgbm already installed.
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.17.2-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.45-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting PyYAML (from optuna)
  Downloading pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->

In [23]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Tuple
import json
import multiprocessing as mp
import numpy as np
import pandas as pd

from joblib import Parallel, delayed

from rdkit import Chem, rdBase, RDLogger
from rdkit.Chem import AllChem, Crippen, Descriptors, Fragments, Lipinski, rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem.EState import AtomTypes as EAtomTypes

try:
    from rdkit.Chem.Scaffolds import MurckoScaffold
except Exception:
    MurckoScaffold = None

RDLogger.DisableLog('rdApp.*')
rdBase.DisableLog('rdApp.*')

DATA_PATH = Path('result/data')


_MODEL_DIR = Path('result/model')
_MODEL_DIR.mkdir(parents=True, exist_ok=True)

PLOT_DIR = Path('result/plot')
PLOT_DIR.mkdir(parents=True, exist_ok=True)

MORGAN_BITS = 512
MORGAN_RADIUS = 2
USE_MACCS = True
COMPUTE_3D = True
MAX_ITERS_3D = 0  # 0 = no optimization, >0 enables a short UFF optimize

In [24]:
import json
from pathlib import Path
import joblib
def save_results(timestamp: str = '', params: dict = None, best_iteration: int = None, valid_score: float = None, additional_score: dict = None, name: str = '', feature_list: list = None) -> None:
    results = {
        'model_params': params,
        'best_iteration': best_iteration,
        'valid_score': valid_score,
        'additional_score': additional_score or {},
        'Feature_list': feature_list or [],
    }
    if timestamp:
        results['timestamp'] = timestamp
    results_path = _MODEL_DIR / f'{name}_results{("_" + timestamp) if timestamp else "_unknown"}.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)


def save_model(timestamp: str = '', model: object = None, name: str = '') -> None:
    model_path = _MODEL_DIR / f'{name}{("_" + timestamp) if timestamp else "_unknown"}.joblib'
    import joblib
    joblib.dump(model, model_path)


def create_timestamp() -> str:
    from datetime import datetime
    return datetime.now().strftime('%Y%m%d_%H%M%S')


    
    


In [25]:
# get newest ga features from results/eda-GA folder

def get_ga_features() -> list:
    eda_ga_path = Path('results/eda-GA')
    if not eda_ga_path.exists():
        return []
    ga_files = list(eda_ga_path.glob('GAFeatureSelectionCV_results_*.json'))
    if not ga_files:
        return []
    latest_file = max(ga_files, key=lambda x: x.stat().st_mtime)
    with open(latest_file, 'r') as f:
        ga_data = json.load(f)
    return ga_data.get('Feature_list', [])

get_ga_features()

print("GA features loaded:", get_ga_features())

GA features loaded: ['MaxAbsEStateIndex', 'MaxEStateIndex', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MinAbsPartialCharge', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0n', 'Chi0v', 'Chi1n', 'Chi3n', 'Chi4n', 'HallKierAlpha', 'Kappa1', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA12', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA7', 'PEOE_VSA8', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA6', 'SMR_VSA8', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA9', 'EState_VSA1', 'EState_VSA3', 'EState_VSA4', 'EState_VSA8', 'EState_VSA9', 'VSA_EState10', 'VSA_EState2', 'VSA_EState4', 'VSA_EState5', 'VSA_EState8', 'FractionCSP3', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAromaticRings', 'NumAtomStereoCenters', 'NumHAcceptors', 'NumHeteroatoms', 'NumHeterocycles', 'NumSaturatedCarbocy

In [None]:
df_train_feat = pd.read_csv(DATA_PATH / 'train_organic_heteroatom_containing.csv')

# Build X/y from the featurized training frame
assert 'Tm' in df_train_feat.columns, "Expected target column 'Tm' in df_train_feat"
assert 'SMILES' in df_train_feat.columns, "Expected 'SMILES' column in df_train_feat"

work_df = df_train_feat.copy()
work_df = work_df.dropna(subset=['Tm']).reset_index(drop=True)
# select only GA features
feature_cols = [col for col in get_ga_features() if col != 'SMILES']
work_df = work_df[feature_cols+ ['Tm']]

#save the working dataframe for later use
work_df.to_csv(DATA_PATH / 'train_organic_heteroatom_containing_GA_features.csv', index=False)

# Use numeric features only (exclude identifiers/strings)
feature_cols = [col for col in work_df.columns if col != 'Tm' and col != 'SMILES']
X = work_df[feature_cols].select_dtypes(include=['number'])
y = work_df['Tm'].astype(float)

print('Rows used:', len(work_df))
print('Numeric features:', X.shape[1])
print('Target stats:', float(y.min()), float(y.mean()), float(y.max()))

Rows used: 9246
Numeric features: 491
Target stats: -698.8 242.31646441704524 4892.0


In [27]:
# Outlier removal 
Tm = work_df['Tm'].astype(float)
 
lower = -250
upper = 750

mask = (Tm >= lower) & (Tm <= upper)
removed = int((~mask).sum())
total = int(len(work_df))
pct_removed = 100.0 * removed / total if total else 0.0

print(f"IQR bounds for Tm: [{lower:.3f}, {upper:.3f}]")
print(f"Removed outliers: {removed}/{total} ({pct_removed:.2f}%)")

# Apply filter
work_df = work_df.loc[mask].reset_index(drop=True)
feature_cols = [c for c in work_df.columns if c not in ('Tm', 'SMILES')]
X = work_df[feature_cols].select_dtypes(include=['number'])
y = work_df['Tm'].astype(float)
#y_transformed = np.sign(y) * np.log1p(np.abs(y)) add log transform

print('After outlier removal -> Rows:', len(work_df), 'Numeric features:', X.shape[1])

IQR bounds for Tm: [-250.000, 750.000]
Removed outliers: 332/9246 (3.59%)
After outlier removal -> Rows: 8914 Numeric features: 491


In [28]:
# Check for inf and NaN values
print("NaN count per feature:")
print(X.isna().sum())
print("\nInf count per feature:")
print(np.isinf(X).sum())

# Replace inf with NaN, then impute
X = X.replace([np.inf, -np.inf], np.nan)

NaN count per feature:
MaxAbsEStateIndex    0
MaxEStateIndex       0
SPS                  0
MolWt                0
HeavyAtomMolWt       0
                    ..
MurckoAtoms          0
SideChainAtoms       0
SMI_len              0
SMI_ringDigits       0
SMI_ezSlashes        0
Length: 491, dtype: int64

Inf count per feature:
MaxAbsEStateIndex    0
MaxEStateIndex       0
SPS                  0
MolWt                0
HeavyAtomMolWt       0
                    ..
MurckoAtoms          0
SideChainAtoms       0
SMI_len              0
SMI_ringDigits       0
SMI_ezSlashes        0
Length: 491, dtype: int64


In [29]:

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# imputer = SimpleImputer(strategy='median')
# X_train_imp = imputer.fit_transform(X_train)
# X_valid_imp = imputer.transform(X_valid)

model = lgb.LGBMRegressor(
    objective='quantile',
    alpha=0.5,
    n_estimators=5000,
    learning_rate=0.01,
    num_leaves=31,
    min_child_samples=100,
    reg_alpha=0.1,
    reg_lambda=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.661386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16999
[LightGBM] [Info] Number of data points in the train set: 7131, number of used features: 449
[LightGBM] [Info] Start training from score 291.200012


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.01
,n_estimators,5000
,subsample_for_bin,200000
,objective,'quantile'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [32]:
pred_valid = model.predict(X_valid, num_iteration=model.best_iteration_)
mae = mean_absolute_error(y_valid, pred_valid)
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
r2 = r2_score(y_valid, pred_valid)

print(f'Validation MAE: {mae:.4f}')
print(f'Validation RMSE: {rmse:.4f}')
print(f'Validation R2: {r2:.4f}') 
print('Best iteration:', model.best_iteration_)

save_model(
    timestamp=create_timestamp(),
    model=model,
    name='lgbm_model'
)

Validation MAE: 68.3894
Validation RMSE: 108.7255
Validation R2: 0.6685
Best iteration: 5000


In [None]:
#shap plot
import shap

explainer = shap.Explainer(model)
shap_values = explainer(X_valid)
shap.plots.beeswarm(shap_values, max_display=20)
import json
from pathlib import Path


In [None]:
OUTPUT_DIR = Path('result/baseline_model')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Save baseline outputs
baseline_metrics = {
    'mae': mae,
    'rmse': rmse,
    'r2': r2,
    'n_rows': int(len(work_df)),
    'n_features': int(X.shape[1]),
    'best_iteration': int(getattr(model, 'best_iteration_', 0) or 0),
}
import json
metrics_path = OUTPUT_DIR / 'baseline_lgbm_metrics.json'
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(baseline_metrics, f, indent=2)
print('Saved metrics to', metrics_path)

pred_path = OUTPUT_DIR / 'baseline_lgbm_predictions.csv'
pred_df = pd.DataFrame({
    'y_true': y_valid.reset_index(drop=True),
    'y_pred': pred_valid,
})
pred_df.to_csv(pred_path, index=False)
print('Saved predictions to', pred_path)

In [30]:
import optuna
from sklearn.model_selection import cross_val_score, KFold

def objective(trial):
    params = {
        'objective': trial.suggest_categorical('objective', ['regression', 'quantile']),
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 127),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': -1,
    }
    
    if params['objective'] == 'quantile':
        params['alpha'] = trial.suggest_float('alpha', 0.1, 0.9)
    
    model = lgb.LGBMRegressor(**params)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error', n_jobs=-1)
    return -scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Best parameters:", study.best_params)
print("Best CV MAE:", study.best_value)

# save model with best parameters
saved_timestamp = create_timestamp()
best_model = lgb.LGBMRegressor(**study.best_params)
best_model.fit(X, y)

save_model(timestamp=saved_timestamp, model=best_model, name='lgbm_best_model')

save_results(
    timestamp=saved_timestamp,
    params=study.best_params,
    best_iteration=best_model.best_iteration_,
    valid_score=None,
    additional_score=None,
    name='lgbm_best_model',
    feature_list=feature_cols
)



  from .autonotebook import tqdm as notebook_tqdm
[I 2025-12-25 08:00:34,861] A new study created in memory with name: no-name-a6e48d19-3dca-4d87-b258-005957c72bfe
  0%|          | 0/100 [2:50:37<?, ?it/s]


[W 2025-12-25 10:51:12,502] Trial 0 failed with parameters: {'objective': 'quantile', 'n_estimators': 1433, 'learning_rate': 0.049134945135553304, 'num_leaves': 96, 'min_child_samples': 42, 'reg_alpha': 0.8794945859469068, 'reg_lambda': 0.6252563817344834, 'subsample': 0.9029868379400904, 'colsample_bytree': 0.638650738249063, 'alpha': 0.2363945718427144} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/tp_ubuntu/project/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_80959/700107376.py", line 25, in objective
    scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error', n_jobs=-1)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tp_ubuntu/project/.venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 21

KeyboardInterrupt: 

In [None]:

pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)
mae = mean_absolute_error(y_valid, pred_valid)
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
r2 = r2_score(y_valid, pred_valid)

print(f'Validation MAE: {mae:.4f}')
print(f'Validation RMSE: {rmse:.4f}')
print(f'Validation R2: {r2:.4f}') 
print('Best iteration:', best_model.best_iteration_)

Validation MAE: 67.7339
Validation RMSE: 107.4369
Validation R2: 0.6763
Best iteration: 4997
