In [31]:
# Install baseline modeling deps if missing (best-effort)
for pkg in [
    ('scikit-learn', 'scikit-learn'),
    ('lightgbm', 'lightgbm'),
]:
    try:
        ensure_package(pkg[0], pkg[1])
    except Exception as exc:
        print(f"Package install check failed for {pkg}: {exc}")

import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
#stratifed kfold 
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Package install check failed for ('scikit-learn', 'scikit-learn'): name 'ensure_package' is not defined
Package install check failed for ('lightgbm', 'lightgbm'): name 'ensure_package' is not defined


In [32]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Tuple
import json
import multiprocessing as mp
import numpy as np
import pandas as pd

from joblib import Parallel, delayed

from rdkit import Chem, rdBase, RDLogger
from rdkit.Chem import AllChem, Crippen, Descriptors, Fragments, Lipinski, rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem.EState import AtomTypes as EAtomTypes

try:
    from rdkit.Chem.Scaffolds import MurckoScaffold
except Exception:
    MurckoScaffold = None

RDLogger.DisableLog('rdApp.*')
rdBase.DisableLog('rdApp.*')

DATA_DIR = Path('../../main-data')
TRAIN_PATH = DATA_DIR / 'train.csv'
PUBCHEM_PATH = Path('result/data/melting_point_features.csv')
TEST_PATH = DATA_DIR / 'test.csv'  # optional
OUTPUT_DIR = Path('result/data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

MORGAN_BITS = 512
MORGAN_RADIUS = 2
USE_MACCS = True
COMPUTE_3D = True
MAX_ITERS_3D = 0  # 0 = no optimization, >0 enables a short UFF optimize

In [33]:
df_train_feat = pd.read_csv(PUBCHEM_PATH)

In [34]:
# Build X/y from the featurized training frame
assert 'Tm' in df_train_feat.columns, "Expected target column 'Tm' in df_train_feat"
assert 'SMILES' in df_train_feat.columns, "Expected 'SMILES' column in df_train_feat"

work_df = df_train_feat.copy()
work_df = work_df.dropna(subset=['Tm']).reset_index(drop=True)

# Use numeric features only (exclude identifiers/strings)
feature_cols = [c for c in work_df.columns if c not in ('Tm', 'SMILES')]
X = work_df[feature_cols].select_dtypes(include=['number'])
y = work_df['Tm'].astype(float)

print('Rows used:', len(work_df))
print('Numeric features:', X.shape[1])
print('Target stats:', float(y.min()), float(y.mean()), float(y.max()))

Rows used: 10520
Numeric features: 937
Target stats: -5126.0 273.41996007604564 6332.0


In [None]:
# Outlier removal 
Tm = work_df['Tm'].astype(float)
 
lower = -250
upper = 750

mask = (Tm >= lower) & (Tm <= upper)
removed = int((~mask).sum())
total = int(len(work_df))
pct_removed = 100.0 * removed / total if total else 0.0

print(f"IQR bounds for Tm: [{lower:.3f}, {upper:.3f}]")
print(f"Removed outliers: {removed}/{total} ({pct_removed:.2f}%)")

# Apply filter
work_df = work_df.loc[mask].reset_index(drop=True)
feature_cols = [c for c in work_df.columns if c not in ('Tm', 'SMILES')]
X = work_df[feature_cols].select_dtypes(include=['number'])
y = work_df['Tm'].astype(float)
#y_transformed = np.sign(y) * np.log1p(np.abs(y)) add log transform

print('After outlier removal -> Rows:', len(work_df), 'Numeric features:', X.shape[1])

IQR bounds for Tm: [-250.000, 750.000] (Q1=97.700, Q3=406.400, IQR=308.700)
Removed outliers: 640/10520 (6.08%)
After outlier removal -> Rows: 9880 Numeric features: 937


In [36]:
# Check for inf and NaN values
print("NaN count per feature:")
print(X.isna().sum())
print("\nInf count per feature:")
print(np.isinf(X).sum())

# Replace inf with NaN, then impute
X = X.replace([np.inf, -np.inf], np.nan)

NaN count per feature:
MaxAbsEStateIndex    0
MaxEStateIndex       0
MinAbsEStateIndex    0
MinEStateIndex       0
qed                  0
                    ..
SMI_len              0
SMI_branches         0
SMI_ringDigits       0
SMI_stereoAt         0
SMI_ezSlashes        0
Length: 937, dtype: int64

Inf count per feature:
MaxAbsEStateIndex    0
MaxEStateIndex       0
MinAbsEStateIndex    0
MinEStateIndex       0
qed                  0
                    ..
SMI_len              0
SMI_branches         0
SMI_ringDigits       0
SMI_stereoAt         0
SMI_ezSlashes        0
Length: 937, dtype: int64


In [None]:

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X_train)
X_valid_imp = imputer.transform(X_valid)

model = lgb.LGBMRegressor(
    objective='quantile',
    alpha=0.5,
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=100,
    reg_alpha=0.1,
    reg_lambda=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

model.fit(
    X_train_imp, y_train,
    eval_set=[(X_valid_imp, y_valid)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7c1ae0605120>
Traceback (most recent call last):
  File "/home/tp_ubuntu/project/.venv/lib/python3.12/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


Auto-choosing col-wise multi-threading, the overhead of testing was 3.201664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31486
[LightGBM] [Info] Number of data points in the train set: 7904, number of used features: 868
[LightGBM] [Info] Start training from score 280.399994


In [None]:

pred_valid = model.predict(X_valid_imp, num_iteration=model.best_iteration_)
mae = mean_absolute_error(y_valid, pred_valid)
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
r2 = r2_score(y_valid, pred_valid)



In [None]:
# Save baseline outputs
baseline_metrics = {
    'mae': mae,
    'rmse': rmse,
    'r2': r2,
    'n_rows': int(len(work_df)),
    'n_features': int(X.shape[1]),
    'best_iteration': int(getattr(model, 'best_iteration_', 0) or 0),
}
import json
metrics_path = OUTPUT_DIR / 'baseline_lgbm_metrics.json'
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(baseline_metrics, f, indent=2)
print('Saved metrics to', metrics_path)

pred_path = OUTPUT_DIR / 'baseline_lgbm_predictions.csv'
pred_df = pd.DataFrame({
    'y_true': y_valid.reset_index(drop=True),
    'y_pred': pred_valid,
})
pred_df.to_csv(pred_path, index=False)
print('Saved predictions to', pred_path)

Saved metrics to result/data/baseline_lgbm_metrics.json
Saved predictions to result/data/baseline_lgbm_predictions.csv
