In [2]:
# ==========================================================
# Phase 4C (FAST): Optimized ML Training per Element
# ==========================================================
# Author: NMR Pipeline Optimization
# Purpose: Accelerated hyperparameter tuning with per-element models
# ==========================================================

from pathlib import Path
import pandas as pd
import numpy as np
import time
from math import sqrt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ==========================================================
# Load data
# ==========================================================
# Name of the column the models regress on.
TARGET_COL = "shift_ppm_y"

# Columns the training loop must never use as model inputs
# (it filters these names out when building the feature matrix).
ban_cols = [
    "mol_idx",
    "atom_index",
    "element_x",
    "element_y",
    "shift_ppm_x",
    "shift_ppm_y",
]

# Project root; the merged input CSV and the results CSV are both resolved against it.
ROOT = Path("C:/DOCTORAL HUB/nmr_pipeline_project")
merged_csv = ROOT / "data/merged/merged_phase3_clean.csv"
df = pd.read_csv(merged_csv)

# Quick size report so a wrong or truncated input file is noticed immediately.
row_count, col_count = df.shape
print(f" Loaded dataset with {row_count:,} rows and {col_count} columns")

# ==========================================================
# FAST GRID CONFIGURATION
# ==========================================================
# Ridge is tuned inside a Pipeline whose step is named "ridge", so the grid key
# needs the "ridge__" step prefix.
# NOTE(review): the recorded run picked alpha=100.0 (the largest candidate) for
# C, F, H and N, so the optimum may lie beyond this grid - consider extending it.
ridge_params = {
    "ridge__alpha": [0.1, 1.0, 10.0, 100.0],  # smaller grid
}

# The random forest is searched as a bare estimator, so its parameter names are
# unprefixed. 2 x 2 x 2 = 8 candidate settings per element.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, None],
    min_samples_split=[2, 5],
)

# ==========================================================
# TRAINING LOOP
# ==========================================================
# Tunables for the loop, named here instead of being buried inline.
MIN_SAMPLES_PER_ELEMENT = 100  # elements with fewer rows than this are skipped
TEST_FRACTION = 0.2            # share of each element's rows held out for scoring
CV_FOLDS = 2                   # folds used inside every grid search
RANDOM_STATE = 42              # seed for the train/test split and the forest


def _fit_and_score(estimator, param_grid, X_train, y_train, X_test, y_test, cv=CV_FOLDS):
    """Grid-search ``estimator`` on the training split and score it on the test split.

    Parameters
    ----------
    estimator : scikit-learn estimator or Pipeline
        Unfitted model handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid in the form ``GridSearchCV`` expects.
    X_train, y_train : array-like
        Data the grid search is fitted on.
    X_test, y_test : array-like
        Held-out data used only for the returned metrics.
    cv : int, default ``CV_FOLDS``
        Number of cross-validation folds.

    Returns
    -------
    dict
        Keys ``"R2"``, ``"RMSE"``, ``"MAE"`` (test-split metrics) and
        ``"BestParams"`` (the grid search's ``best_params_``).
    """
    search = GridSearchCV(estimator, param_grid, cv=cv, n_jobs=-1)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)
    return {
        "R2": r2_score(y_test, y_pred),
        "RMSE": sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
        "BestParams": search.best_params_,
    }


results = []

# Filtering rows by element changes neither column names nor dtypes, so the
# feature list is identical for every element: compute it once, up front.
# select_dtypes also copes with pandas extension dtypes, on which a raw
# np.issubdtype(...) check raises TypeError.
feature_cols = [c for c in df.select_dtypes(include=np.number).columns if c not in ban_cols]
if not feature_cols:
    raise ValueError("No numeric feature columns remain after removing ban_cols")

elements = sorted(df["element_y"].dropna().unique())

for nuc in elements:
    print("\n==========================")
    print(f" Optimizing models for element: {nuc}")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start_time = time.perf_counter()

    # Boolean indexing already returns a new frame and it is only read below,
    # so no extra .copy() is needed.
    sub = df[df["element_y"] == nuc]
    if len(sub) < MIN_SAMPLES_PER_ELEMENT:
        print(f"Skipping {nuc} (too few samples: {len(sub)})")
        continue

    # Prepare data
    X = sub[feature_cols].to_numpy()
    y = sub[TARGET_COL].to_numpy()

    # NOTE(review): this is a purely row-wise random split. If several rows share
    # a mol_idx (presumably atoms of the same molecule - confirm), one molecule can
    # land in both train and test and inflate the scores; consider a group-aware
    # split keyed on mol_idx.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
    )

    # ==========================================================
    # Ridge Regression
    # ==========================================================
    # Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
    ridge_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge()),
    ])
    scores = _fit_and_score(ridge_pipe, ridge_params, X_train, y_train, X_test, y_test)
    results.append({"Element": nuc, "Model": "Ridge", **scores})
    r2, rmse, mae = scores["R2"], scores["RMSE"], scores["MAE"]
    best_alpha = scores["BestParams"]["ridge__alpha"]
    print(f"[{nuc}] Ridge → R²={r2:.3f} | RMSE={rmse:.3f} | MAE={mae:.3f} | α={best_alpha}")

    # ==========================================================
    # Random Forest (FAST GRID)
    # ==========================================================
    rf = RandomForestRegressor(random_state=RANDOM_STATE)
    scores = _fit_and_score(rf, rf_params, X_train, y_train, X_test, y_test)
    results.append({"Element": nuc, "Model": "RandomForest", **scores})
    r2, rmse, mae = scores["R2"], scores["RMSE"], scores["MAE"]
    best_rf = scores["BestParams"]
    print(f"[{nuc}] RandomForest → R²={r2:.3f} | RMSE={rmse:.3f} | MAE={mae:.3f} | best={best_rf}")

    elapsed = (time.perf_counter() - start_time) / 60
    print(f" Completed {nuc} in {elapsed:.2f} min")

# ==========================================================
# SAVE RESULTS
# ==========================================================
# Pin the column order (same as the keys of each result record) so that a run in
# which every element was skipped still writes a CSV with a header row.
result_columns = ["Element", "Model", "R2", "RMSE", "MAE", "BestParams"]
df_results = pd.DataFrame(results, columns=result_columns)

out_path = ROOT / "results/phase4c_model_optimization_FAST.csv"
# Create the output folder if needed; otherwise to_csv raises OSError and the
# metrics from a long training run are lost.
out_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE: BestParams holds dicts, which the CSV stores as their string representation.
df_results.to_csv(out_path, index=False)

print("\n Optimization (FAST) complete!")
print(f" Results saved to: {out_path}")
# Last expression of the cell: rendered as the results table.
df_results


 Loaded dataset with 434,147 rows and 138 columns

 Optimizing models for element: B
[B] Ridge → R²=0.122 | RMSE=23.162 | MAE=18.948 | α=10.0
[B] RandomForest → R²=0.089 | RMSE=23.594 | MAE=18.731 | best={'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
 Completed B in 0.14 min

 Optimizing models for element: C
[C] Ridge → R²=0.085 | RMSE=49.486 | MAE=42.445 | α=100.0
[C] RandomForest → R²=0.093 | RMSE=49.285 | MAE=42.021 | best={'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
 Completed C in 10.19 min

 Optimizing models for element: Cl
Skipping Cl (too few samples: 1)

 Optimizing models for element: F
[F] Ridge → R²=0.067 | RMSE=75.163 | MAE=55.946 | α=100.0
[F] RandomForest → R²=0.159 | RMSE=71.365 | MAE=50.265 | best={'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
 Completed F in 0.16 min

 Optimizing models for element: Ge
Skipping Ge (too few samples: 6)

 Optimizing models for element: H
[H] Ridge → R²=0.263 | RMSE=27.869 | MAE=12.504 

Unnamed: 0,Element,Model,R2,RMSE,MAE,BestParams
0,B,Ridge,0.121838,23.162206,18.947847,{'ridge__alpha': 10.0}
1,B,RandomForest,0.088759,23.594425,18.730588,"{'max_depth': 10, 'min_samples_split': 5, 'n_e..."
2,C,Ridge,0.085225,49.486368,42.444857,{'ridge__alpha': 100.0}
3,C,RandomForest,0.092652,49.285087,42.020782,"{'max_depth': 10, 'min_samples_split': 5, 'n_e..."
4,F,Ridge,0.067292,75.162945,55.946314,{'ridge__alpha': 100.0}
5,F,RandomForest,0.159166,71.365125,50.265416,"{'max_depth': 10, 'min_samples_split': 2, 'n_e..."
6,H,Ridge,0.263476,27.86861,12.503645,{'ridge__alpha': 100.0}
7,H,RandomForest,0.472776,23.578688,9.459818,"{'max_depth': 10, 'min_samples_split': 2, 'n_e..."
8,N,Ridge,0.052364,60.486743,46.074813,{'ridge__alpha': 100.0}
9,N,RandomForest,-0.157488,66.849407,50.045515,"{'max_depth': 10, 'min_samples_split': 5, 'n_e..."
