In [1]:
import os
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Project root. Defaults to the original workstation path; set the
# NMR_PIPELINE_ROOT environment variable to run this notebook from another
# checkout without editing the cell.
ROOT = Path(os.environ.get("NMR_PIPELINE_ROOT", r"C:\DOCTORAL HUB\nmr_pipeline_project"))

# Load enhanced dataset
df = pd.read_csv(ROOT / "data/merged/merged_phase3_clean.csv")

# Filter hydrogen atoms only (copy so later edits never touch `df`)
h_df = df[df["element_y"] == "H"].copy()
print("Hydrogen samples:", len(h_df))
if h_df.empty:
    raise ValueError("No rows with element_y == 'H' in merged_phase3_clean.csv")

# Feature & target prep: every numeric column that is not an identifier,
# an element label, or one of the shift columns is used as a feature.
non_features = ["mol_idx", "atom_index", "element_x", "element_y", "shift_ppm_x", "shift_ppm_y"]
feature_cols = [c for c in h_df.columns if c not in non_features and np.issubdtype(h_df[c].dtype, np.number)]
if not feature_cols:
    raise ValueError("No numeric feature columns left after excluding non_features")
X = h_df[feature_cols].fillna(0).values  # missing feature values are replaced by 0
y = h_df["shift_ppm_y"].values

# Split data
# NOTE(review): this is a random per-row split. If `mol_idx` identifies the
# parent molecule, atoms of one molecule can end up in both train and test,
# which would inflate the test scores -- consider a group-aware split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Helper function
def evaluate(model, label):
    """Train ``model`` and report its scores on the held-out split.

    Uses the notebook-level ``X_train``/``y_train`` for ``model.fit`` and
    ``X_test``/``y_test`` for scoring. Prints a one-line summary and returns
    the same numbers as a dict with keys ``Model``, ``R2``, ``RMSE``, ``MAE``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    scores = {
        "Model": label,
        "R2": r2_score(y_test, predicted),
        "RMSE": sqrt(mean_squared_error(y_test, predicted)),
        "MAE": mean_absolute_error(y_test, predicted),
    }
    print(f"{label:15} | R²={scores['R2']:.3f} | RMSE={scores['RMSE']:.2f} | MAE={scores['MAE']:.2f}")
    return scores

results = []

# Each sweep keeps (R2, fitted estimator) pairs. Without this, `ridge` and `rf`
# would stay bound to the *last* candidate tried (alpha=500 / n=500) rather than
# the best one, and any later cell that saves them would persist the wrong model.
# NOTE(review): candidates are ranked by the R2 that evaluate() computes on the
# test split, so the score of the selected model is optimistic; a separate
# validation split or cross-validation would be needed for an unbiased estimate.

# Ridge test with stronger regularization
ridge_runs = []
for alpha in [10, 100, 500]:
    candidate = Ridge(alpha=alpha)
    metrics = evaluate(candidate, f"Ridge α={alpha}")  # evaluate() calls candidate.fit
    results.append(metrics)
    ridge_runs.append((metrics["R2"], candidate))

# Random Forest test
rf_runs = []
for n in [300, 500]:
    candidate = RandomForestRegressor(n_estimators=n, random_state=42, n_jobs=-1)
    metrics = evaluate(candidate, f"RF n={n}")
    results.append(metrics)
    rf_runs.append((metrics["R2"], candidate))

# Bind the familiar names to the best-scoring estimator of each family
# (on ties, max() keeps the first candidate tried).
ridge = max(ridge_runs, key=lambda run: run[0])[1]
rf = max(rf_runs, key=lambda run: run[0])[1]

pd.DataFrame(results)


Hydrogen samples: 286
Ridge α=10      | R²=0.257 | RMSE=27.99 | MAE=13.10
Ridge α=100     | R²=0.085 | RMSE=31.06 | MAE=13.41
Ridge α=500     | R²=0.021 | RMSE=32.13 | MAE=14.24
RF n=300        | R²=0.515 | RMSE=22.62 | MAE=9.13
RF n=500        | R²=0.513 | RMSE=22.65 | MAE=9.15


Unnamed: 0,Model,R2,RMSE,MAE
0,Ridge α=10,0.257097,27.989045,13.100787
1,Ridge α=100,0.08538,31.055762,13.405729
2,Ridge α=500,0.020821,32.133128,14.237205
3,RF n=300,0.514967,22.615564,9.131445
4,RF n=500,0.513398,22.65212,9.151254


In [2]:
import joblib
# Persist whichever estimators `rf` and `ridge` are bound to when this cell runs.
# NOTE(review): confirm these are the intended best-scoring models and not just
# the last candidates left over from a hyper-parameter loop.
results_dir = ROOT / "results"
results_dir.mkdir(parents=True, exist_ok=True)  # joblib.dump does not create missing directories
joblib.dump(rf, results_dir / "model_H_RF_tuned.joblib")
joblib.dump(ridge, results_dir / "model_H_Ridge_tuned.joblib")


['C:\\DOCTORAL HUB\\nmr_pipeline_project\\results\\model_H_Ridge_tuned.joblib']