# Feature Optimization (Genetic Algorithm) — v0.5.1

Runs GA feature selection on the feature table produced by `preprocessing-v0.5.1.ipynb`.

- Input: `result/data/melting_point_features.csv`
- Outputs (in `result/data/`): selected feature list + run metadata + reduced dataset

Fitness = negative CV MAE from a LightGBM regressor. Uses scaffold CV if RDKit is available; otherwise KFold.


In [7]:
import json
import subprocess
import sys

def ensure_package(import_name: str, install_name: str | None = None) -> None:
    install_name = install_name or import_name
    r = subprocess.run([sys.executable, '-m', 'pip', 'list', '--format=json'], check=True, capture_output=True, text=True)
    installed = {p['name'].lower() for p in json.loads(r.stdout)}
    if import_name.lower() in installed or install_name.lower() in installed:
        print(f'{import_name} already installed')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', install_name])
    print(f'Installed {install_name}')

for name, pip_name in [
    ('numpy', None),
    ('pandas', None),
    ('joblib', None),
    ('scikit-learn', 'scikit-learn'),
    ('lightgbm', 'lightgbm'),
    ('rdkit', 'rdkit'),  # optional; enables scaffold CV
]:
    try:
        ensure_package(name, pip_name or name)
    except Exception as exc:
        print('Package check failed:', name, '->', exc)


numpy already installed
pandas already installed
joblib already installed
scikit-learn already installed
lightgbm already installed
rdkit already installed


In [8]:
from __future__ import annotations



from pathlib import Path

from typing import Dict, List, Sequence, Tuple



import numpy as np

import pandas as pd

from joblib import Parallel, delayed



import lightgbm as lgb

from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import KFold



try:

    from rdkit import Chem

    from rdkit.Chem.Scaffolds import MurckoScaffold

except Exception:

    Chem = None

    MurckoScaffold = None



NAME = 'v0.5.1'



# Candidate reduction (GA over thousands of columns is expensive)

CANDIDATE_TOP_N = 800



# GA params

RANDOM_SEED = 42

POP_SIZE = 40

N_GEN = 25

TOURNAMENT_K = 3

CROSSOVER_PROB = 0.7

MUTATION_BIT_PROB = 0.02

MIN_FEATURES = 60

MAX_FEATURES = 280

PATIENCE = 8



# CV

USE_SCAFFOLD_CV = True

N_SPLITS = 5



# Parallelism: for a single GPU (Colab), keep this low to avoid multiple processes

# competing for the same GPU.

N_JOBS_FITNESS = -1  # Use all available CPU cores



# LightGBM: GPU + looser split constraints to reduce "best gain: -inf" on sparse/bit features.

LGBM_PARAMS = dict(

    objective='regression',

    n_estimators=5000,

    learning_rate=0.03,

    num_leaves=63,

    subsample=0.8,

    colsample_bytree=0.8,

    random_state=RANDOM_SEED,

    n_jobs=1,



    # GPU

 


    # Split constraints (defaults can block splits on sparse fingerprints)

    min_child_samples=5,

    min_data_in_bin=1,

    min_sum_hessian_in_leaf=1e-3,

    min_split_gain=0.0,

    feature_pre_filter=False,

)





def find_features_csv() -> tuple[Path, Path]:

    candidates = [

        Path('result/data'),

        Path('work/version-0.5/result/data'),

        Path('/content/result/data'),

        Path('/content/drive/MyDrive/result/data'),

    ]

    for d in candidates:

        p = d / 'melting_point_features.csv'

        if p.exists():

            return d, p

    return candidates[0], candidates[0] / 'melting_point_features.csv'





OUTPUT_DIR, FEATURES_CSV = find_features_csv()

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print('Using FEATURES_CSV:', FEATURES_CSV)



df = pd.read_csv(FEATURES_CSV)

print('Loaded shape', df.shape)

assert 'Tm' in df.columns

assert 'SMILES' in df.columns

df = df.dropna(subset=['Tm']).reset_index(drop=True)

print('After dropna(Tm):', df.shape)


Using FEATURES_CSV: result/data/melting_point_features.csv
Loaded shape (2660, 1347)
After dropna(Tm): (2660, 1347)


In [9]:
def numeric_feature_columns(frame: pd.DataFrame) -> List[str]:
    exclude = {'Tm', 'SMILES', 'id'}
    numeric_cols = frame.select_dtypes(include=['number']).columns.tolist()
    return [c for c in numeric_cols if c not in exclude]

def safe_abs_corr(frame: pd.DataFrame, cols: Sequence[str], y: np.ndarray) -> pd.Series:
    y = np.asarray(y, dtype=float)
    y = np.nan_to_num(y, nan=np.nanmean(y))
    y = y - y.mean()
    y_den = float(np.sqrt(np.sum(y * y)) + 1e-12)
    out: Dict[str, float] = {}
    for c in cols:
        x = frame[c].to_numpy(dtype=float, copy=False)
        if not np.isfinite(x).any():
            out[c] = 0.0
            continue
        x = np.nan_to_num(x, nan=np.nanmean(x))
        x = x - x.mean()
        x_den = float(np.sqrt(np.sum(x * x)) + 1e-12)
        out[c] = float(abs(np.sum(x * y) / (x_den * y_den)))
    return pd.Series(out).sort_values(ascending=False)

y_all = df['Tm'].astype(float).to_numpy()
all_numeric = numeric_feature_columns(df)
print('Numeric pool:', len(all_numeric))

# Keep these if present (prevent GA from dropping key chemistry signals)
LOCKED_IN = [
    'MolWt','TPSA','MolLogP','NumRings','NumAromaticRings',
    'Gasteiger_q_abs_sum','has_halogen','num_halogen_atoms',
    'has_phenyl_group','topological_diameter','topological_radius',
    'longest_aliphatic_chain_length'
]
locked_in = [c for c in LOCKED_IN if c in all_numeric]

scores = safe_abs_corr(df, all_numeric, y_all)
top = scores.head(CANDIDATE_TOP_N).index.tolist()
candidate_cols = sorted(set(top).union(set(locked_in)))
print('Candidates:', len(candidate_cols), '| locked_in:', len(locked_in))

smiles = df['SMILES'].astype(str).tolist()


Numeric pool: 1344
Candidates: 801 | locked_in: 10


In [10]:
def murcko_scaffold_smiles(smi: str) -> str:
    if Chem is None or MurckoScaffold is None:
        return ''
    try:
        m = Chem.MolFromSmiles(smi)
        if m is None:
            return ''
        scaf = MurckoScaffold.GetScaffoldForMol(m)
        if scaf is None or scaf.GetNumAtoms() == 0:
            return ''
        return Chem.MolToSmiles(scaf, canonical=True)
    except Exception:
        return ''

def make_scaffold_folds(smiles_list: Sequence[str], n_splits: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    scaffolds: Dict[str, List[int]] = {}
    for i, s in enumerate(smiles_list):
        scaffolds.setdefault(murcko_scaffold_smiles(s), []).append(i)
    groups = list(scaffolds.values())
    groups.sort(key=len, reverse=True)
    fold_bins: List[List[int]] = [[] for _ in range(n_splits)]
    fold_sizes = [0] * n_splits
    for g in groups:
        j = int(np.argmin(fold_sizes))
        fold_bins[j].extend(g)
        fold_sizes[j] += len(g)
    all_idx = np.arange(len(smiles_list))
    folds: List[Tuple[np.ndarray, np.ndarray]] = []
    for k in range(n_splits):
        valid_idx = np.array(sorted(fold_bins[k]), dtype=int)
        train_mask = np.ones(len(smiles_list), dtype=bool)
        train_mask[valid_idx] = False
        folds.append((all_idx[train_mask], valid_idx))
    return folds

def make_folds(smiles_list: Sequence[str], n_splits: int, use_scaffold: bool, seed: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    if use_scaffold and Chem is not None and MurckoScaffold is not None:
        try:
            return make_scaffold_folds(smiles_list, n_splits=n_splits)
        except Exception:
            pass
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    idx = np.arange(len(smiles_list))
    return [(tr, va) for tr, va in kf.split(idx)]

folds = make_folds(smiles, n_splits=N_SPLITS, use_scaffold=USE_SCAFFOLD_CV, seed=RANDOM_SEED)
print('Folds:', len(folds), 'valid sizes:', [len(v) for _, v in folds])


Folds: 5 valid sizes: [1120, 752, 263, 263, 262]


In [11]:
fitness_cache: Dict[bytes, float] = {}



def mask_key(mask: np.ndarray) -> bytes:

    return np.packbits(mask.astype(np.uint8)).tobytes()



def eval_subset_mae_cv(mask: np.ndarray) -> float:

    cols = [c for c, on in zip(candidate_cols, mask.tolist()) if on]

    if not cols:

        return float('inf')



    X_all = df[cols]

    y = df['Tm'].to_numpy(dtype=float)



    maes = []

    for tr, va in folds:

        imp = SimpleImputer(strategy='median')



        # Keep feature names consistent (prevents sklearn "feature names" warning).

        X_tr = pd.DataFrame(

            imp.fit_transform(X_all.iloc[tr]),

            columns=cols,

            index=np.asarray(tr),

        )

        X_va = pd.DataFrame(

            imp.transform(X_all.iloc[va]),

            columns=cols,

            index=np.asarray(va),

        )



        # Drop constant / near-constant columns within the fold

        var = X_tr.to_numpy().std(axis=0)

        keep = var > 1e-12

        if not np.any(keep):

            return float('inf')

        kept_cols = [c for c, k in zip(cols, keep.tolist()) if k]

        if len(kept_cols) < 2:

            return float('inf')

        X_tr = X_tr[kept_cols]

        X_va = X_va[kept_cols]



        y_tr = y[tr]

        y_va = y[va]



        model = lgb.LGBMRegressor(**LGBM_PARAMS)

        model.fit(

            X_tr,

            y_tr,

            eval_set=[(X_va, y_va)],

            eval_metric='l1',

            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],

        )

        pred = model.predict(X_va, num_iteration=getattr(model, 'best_iteration_', None))

        maes.append(float(mean_absolute_error(y_va, pred)))



    return float(np.mean(maes))



def fitness(mask: np.ndarray) -> float:

    k = mask_key(mask)

    if k in fitness_cache:

        return -fitness_cache[k]

    mae = eval_subset_mae_cv(mask)

    fitness_cache[k] = mae

    return -mae



locked_idx = np.array([candidate_cols.index(c) for c in locked_in], dtype=int) if locked_in else np.array([], dtype=int)


In [12]:
rng = np.random.default_rng(RANDOM_SEED)
n_feat = len(candidate_cols)

def repair(mask: np.ndarray) -> np.ndarray:
    m = mask.astype(bool, copy=True)
    if locked_idx.size:
        m[locked_idx] = True
    cnt = int(m.sum())
    if cnt > MAX_FEATURES:
        on = np.where(m)[0]
        if locked_idx.size:
            lock = set(locked_idx.tolist())
            on = np.array([i for i in on if i not in lock], dtype=int)
        drop_n = cnt - MAX_FEATURES
        if on.size and drop_n > 0:
            drop = rng.choice(on, size=min(drop_n, on.size), replace=False)
            m[drop] = False
    cnt = int(m.sum())
    if cnt < MIN_FEATURES:
        off = np.where(~m)[0]
        add_n = MIN_FEATURES - cnt
        if off.size and add_n > 0:
            add = rng.choice(off, size=min(add_n, off.size), replace=False)
            m[add] = True
    return m

def random_individual() -> np.ndarray:
    k = int(rng.integers(MIN_FEATURES, min(MAX_FEATURES, n_feat) + 1))
    m = np.zeros(n_feat, dtype=bool)
    idx = rng.choice(np.arange(n_feat), size=k, replace=False)
    m[idx] = True
    return repair(m)

def tournament_select(pop: List[np.ndarray], fit: np.ndarray, k: int) -> np.ndarray:
    idx = rng.choice(np.arange(len(pop)), size=k, replace=False)
    best = int(idx[np.argmax(fit[idx])])
    return pop[best].copy()

def crossover(a: np.ndarray, b: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    if rng.random() >= CROSSOVER_PROB:
        return a.copy(), b.copy()
    m = rng.random(n_feat) < 0.5
    c1 = np.where(m, a, b)
    c2 = np.where(m, b, a)
    return repair(c1), repair(c2)

def mutate(mask: np.ndarray) -> np.ndarray:
    m = mask.copy()
    flips = rng.random(n_feat) < MUTATION_BIT_PROB
    m[flips] = ~m[flips]
    return repair(m)

def eval_population(pop: List[np.ndarray]) -> np.ndarray:
    vals = Parallel(n_jobs=N_JOBS_FITNESS)(delayed(fitness)(ind) for ind in pop)
    return np.asarray(vals, dtype=float)

population: List[np.ndarray] = [random_individual() for _ in range(POP_SIZE)]
history = []
best_fit = -np.inf
best_mask = None
no_improve = 0

for gen in range(N_GEN):
    fit_vals = eval_population(population)
    best_idx = int(np.argmax(fit_vals))
    gen_best = float(fit_vals[best_idx])
    gen_mean = float(np.mean(fit_vals))

    if gen_best > best_fit:
        best_fit = gen_best
        best_mask = population[best_idx].copy()
        no_improve = 0
    else:
        no_improve += 1

    history.append({'gen': gen, 'best_mae': float(-gen_best), 'mean_mae': float(-gen_mean), 'cache_size': int(len(fitness_cache))})
    print(f'Gen {gen:02d} | best MAE={-gen_best:.5f} | mean MAE={-gen_mean:.5f} | cache={len(fitness_cache)}')
    if no_improve >= PATIENCE:
        print('Early stop: no improvement')
        break

    elite_n = 2
    elite_idx = np.argsort(-fit_vals)[:elite_n]
    elites = [population[int(i)].copy() for i in elite_idx]

    new_pop: List[np.ndarray] = []
    new_pop.extend(elites)
    while len(new_pop) < POP_SIZE:
        p1 = tournament_select(population, fit_vals, TOURNAMENT_K)
        p2 = tournament_select(population, fit_vals, TOURNAMENT_K)
        c1, c2 = crossover(p1, p2)
        new_pop.append(mutate(c1))
        if len(new_pop) < POP_SIZE:
            new_pop.append(mutate(c2))
    population = new_pop

assert best_mask is not None
print('Best CV MAE:', float(-best_fit))
print('Selected feature count:', int(best_mask.sum()))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3778
[LightGBM] [Info] Number of data points in the train set: 1540, number of used features: 81
[LightGBM] [Info] Number of data points in the train set: 1540, number of used features: 91
[LightGBM] [Info] Start training from score 306.040516
[LightGBM] [Info] Start training from score 306.040516
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=

In [13]:
selected_cols = sorted([c for c, on in zip(candidate_cols, best_mask.tolist()) if on])
print('Selected cols (first 25):', selected_cols[:25])

sel_json = OUTPUT_DIR / f'ga_selected_features_{NAME}.json'
sel_csv = OUTPUT_DIR / f'ga_selected_features_{NAME}.csv'
run_json = OUTPUT_DIR / f'ga_run_{NAME}.json'
reduced_parquet = OUTPUT_DIR / f'melting_point_features_ga_{NAME}.parquet'

with open(sel_json, 'w', encoding='utf-8') as f:
    json.dump({'name': NAME, 'selected_features': selected_cols, 'locked_in': locked_in, 'candidate_top_n': int(CANDIDATE_TOP_N)}, f, indent=2)
pd.DataFrame({'feature': selected_cols}).to_csv(sel_csv, index=False)

run_meta = {
    'name': NAME,
    'random_seed': int(RANDOM_SEED),
    'pop_size': int(POP_SIZE),
    'n_gen_requested': int(N_GEN),
    'n_gen_ran': int(len(history)),
    'best_cv_mae': float(-best_fit),
    'n_selected': int(len(selected_cols)),
    'candidate_cols': int(len(candidate_cols)),
    'use_scaffold_cv': bool(USE_SCAFFOLD_CV and Chem is not None and MurckoScaffold is not None),
    'history': history,
    'lgbm_params': LGBM_PARAMS,
}
with open(run_json, 'w', encoding='utf-8') as f:
    json.dump(run_meta, f, indent=2)

reduced = df[['SMILES', 'Tm']].copy()
for c in selected_cols:
    reduced[c] = df[c]
try:
    reduced.to_parquet(reduced_parquet, index=False)
    print('Saved reduced parquet to', reduced_parquet)
except Exception as exc:
    print('Parquet export skipped:', exc)

print('Saved:', sel_json)
print('Saved:', sel_csv)
print('Saved:', run_json)


Selected cols (first 25): ['Chi0n', 'Chi3v', 'Count_Cl', 'Count_N', 'EState_VSA3', 'EState_VSA6', 'EState_VSA7', 'Flexibility_Score', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FracTriple', 'Gasteiger_q_abs_sum', 'Gasteiger_q_min', 'Group 100', 'Group 109', 'Group 110', 'Group 117', 'Group 118', 'Group 120', 'Group 127', 'Group 14', 'Group 15', 'Group 17', 'Group 170', 'Group 176']
Parquet export skipped: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.
Saved: result/data/ga_selected_features_v0.5.1.json
Saved: result/data/ga_selected_features_v0.5.1.csv
Saved: result/data/ga_run_v0.5.1.jso

  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] = df[c]
  reduced[c] 