# Version 1

In [None]:
# Paths & logging setup (from inside notebooks/)
import json
import os
import warnings
from pathlib import Path

import lightgbm as lgb
import mlflow
import numpy as np
import optuna
import pandas as pd
warnings.filterwarnings("ignore")

# Resolve the project root; this notebook is run from notebooks/, one level below it.
PROJ_ROOT = Path("..").resolve()

# Directory layout shared by the cells below. Output folders are created up front.
DATA_DIR = PROJ_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
ARTIFACTS_DIR = PROJ_ROOT / "artifacts"
OOF_DIR = ARTIFACTS_DIR / "oof"
SUB_DIR = ARTIFACTS_DIR / "submissions"
for out_dir in (ARTIFACTS_DIR, OOF_DIR, SUB_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# ---- MLflow: file-based tracking store at ROOT/mlruns ----
mlruns_dir = PROJ_ROOT / "mlruns"
MLFLOW_URI = f"file:{mlruns_dir.as_posix()}"
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_URI
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment("houseprices_lgbm")   # change per notebook/family if you like

# ---- Optuna: SQLite-backed study at ROOT/optuna_studies.db ----
# load_if_exists=True reuses the stored study of the same name instead of failing.
optuna_db = PROJ_ROOT / "optuna_studies.db"
OPTUNA_URI = f"sqlite:///{optuna_db.as_posix()}"
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(
    storage=OPTUNA_URI,
    study_name="houseprices_lgbm_v01",
    direction="minimize",
    pruner=median_pruner,
    load_if_exists=True,
)

# Show the resolved locations as the cell output
pd.Series(
    {
        "mlflow_uri": MLFLOW_URI,
        "optuna_uri": OPTUNA_URI,
        "data_dir": str(DATA_DIR),
        "artifacts_dir": str(ARTIFACTS_DIR),
    }
)

In [None]:
# Load engineered data + metadata + selected CV folds
df_tr = pd.read_parquet(PROCESSED_DIR / "hp_train_feat_v01.parquet")
df_te = pd.read_parquet(PROCESSED_DIR / "hp_test_feat_v01.parquet")
with open(PROCESSED_DIR / "hp_clean_meta_v02.json", "r") as f:
    meta = json.load(f)
folds_df = pd.read_csv(PROCESSED_DIR / "cv_folds_selected.csv")  # stratified on log target

# Per-row fold ids read by objective(), the refit loop and the OOF table below.
# Without this assignment those cells fail on a fresh kernel (NameError: fold).
# NOTE(review): assumes the rows of cv_folds_selected.csv are in the same order
# as df_tr — confirm against the notebook that wrote the file.
fold = folds_df["fold"].values if "fold" in folds_df.columns else folds_df.iloc[:, 1].values
assert len(fold) == len(df_tr), (
    f"fold assignments ({len(fold)}) do not match training rows ({len(df_tr)})"
)

# Basic columns
id_col = "Id"
target_col = "SalePrice"
feature_cols = [c for c in df_tr.columns if c not in [id_col, target_col]]

pd.DataFrame({"train_shape":[df_tr.shape], "test_shape":[df_te.shape], "n_features":[len(feature_cols)]})

In [None]:
# 2) Categorical & numeric feature lists (LightGBM native cats)
# Ensure dtypes for categoricals are 'category'
nominal_cols_final = meta.get("nominal_cols_final", [])
engineered_nominal = meta.get("engineered_nominal", [])
for c in nominal_cols_final + engineered_nominal:
    if c in df_tr.columns:
        df_tr[c] = df_tr[c].astype("category")
        df_te[c] = df_te[c].astype("category")

# cat features = category dtype columns
cat_features = [c for c in feature_cols if str(df_tr[c].dtype) == "category"]
num_features = [c for c in feature_cols if c not in cat_features]

# target on log scale
y_log = np.log1p(df_tr[target_col])

# Model matrices read by objective() and the refit loop. They are built after
# the cast above so the copies carry the 'category' dtypes. Without these two
# assignments the tuning/refit cells fail on a fresh kernel (NameError: X / T).
X = df_tr[feature_cols].copy()
T = df_te[feature_cols].copy()

pd.DataFrame({
    "n_cat":[len(cat_features)],
    "n_num":[len(num_features)],
    "sample_cats":[cat_features[:10]],
})

In [None]:
# 4) Optuna objective — Balanced-centered, safe ranges, no alias conflicts
def objective(trial):
    """Optuna objective: mean per-fold RMSE (log target) of a LightGBM regressor.

    Samples one hyper-parameter set from ``trial``, fits one model per CV fold
    with early stopping on the held-out fold, reports each fold's RMSE to the
    trial (step = fold id) and returns the mean of the fold RMSEs.

    Reads the notebook-level names ``X``, ``y_log``, ``fold`` and
    ``cat_features``.

    Raises:
        optuna.TrialPruned: if ``trial.should_prune()`` is true after a fold.
    """
    # --- Sampled search space (suggest calls kept in their original order) ---
    # LR: allow small but not glacial; early stopping will pick trees
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.2, log=True)
    # Capacity: avoid ultra-shallow, cap huge leaves
    num_leaves = trial.suggest_int("num_leaves", 16, 256, log=True)
    max_depth = trial.suggest_categorical("max_depth", [-1, 6, 7, 8, 9, 10])
    # Regularization via min samples per leaf
    min_child_samples = trial.suggest_int("min_child_samples", 10, 160, log=True)
    # Variance control
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    subsample_freq = trial.suggest_categorical("subsample_freq", [0, 1, 2])
    # L1/L2 (log spaces)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True)
    # Gentle split penalty (near zero so we don't starve splits)
    min_split_gain = trial.suggest_float("min_split_gain", 0.0, 1e-2)
    # Binning: modest range for tabular FE
    max_bin = trial.suggest_int("max_bin", 128, 255)

    # Keep leaves consistent with depth, if depth is bounded
    if max_depth != -1:
        num_leaves = int(min(num_leaves, 2 ** max_depth))

    params = dict(
        objective="regression",
        metric="rmse",
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        subsample_freq=subsample_freq,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_split_gain=min_split_gain,
        max_bin=max_bin,
        # Fixed settings
        n_estimators=10000,      # rely on early stopping
        random_state=42,
        n_jobs=-1,
        force_row_wise=True,
    )

    fold_scores = []
    for fold_id in np.unique(fold):
        held_out = fold == fold_id
        va_idx = np.flatnonzero(held_out)
        tr_idx = np.flatnonzero(~held_out)
        X_va, y_va = X.iloc[va_idx], y_log.iloc[va_idx]

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X.iloc[tr_idx], y_log.iloc[tr_idx],
            eval_set=[(X_va, y_va)],
            categorical_feature=cat_features,
            callbacks=[
                lgb.early_stopping(200, verbose=False),
                lgb.log_evaluation(0),
            ],
        )
        pred_va = model.predict(X_va, num_iteration=model.best_iteration_)
        squared_err = (y_va - pred_va) ** 2
        rmse = float(np.sqrt(squared_err.mean()))
        fold_scores.append(rmse)

        # Intermediate value for the study's pruner, keyed by fold id
        trial.report(rmse, step=int(fold_id))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

pd.Series({"objective_ready": True, "study_name": study.study_name})

In [None]:
# 5) Run Optuna optimization (adjust n_trials as you like) — silent training via callbacks above
n_trials = 30
study.optimize(objective, gc_after_trial=True, n_trials=n_trials)

# Study summary (best trial) plus a count of trials per state
best = study.best_trial
all_trials = study.trials
trial_states = [t.state for t in all_trials]
state_counts = {s: trial_states.count(s) for s in set(trial_states)}
pd.DataFrame(
    {
        "best_value": [best.value],
        "n_trials": [len(all_trials)],
        "state_counts": [str(state_counts)],
    }
)

In [None]:
# 6) Best params snapshot (sorted) — keep for configs/lgbm.yaml later
best_params = dict(study.best_params)
# study.best_params holds the raw sampled values. objective() capped num_leaves
# at 2**max_depth before fitting, so apply the same cap here; otherwise the
# snapshot (and the config file written from it) differs from what was tuned.
if best_params.get("max_depth", -1) != -1:
    best_params["num_leaves"] = int(min(best_params["num_leaves"], 2 ** best_params["max_depth"]))
# Add every fixed field objective() used during tuning (including force_row_wise)
best_params.update({
    "objective": "regression",
    "n_estimators": 10000,
    "metric": "rmse",
    "random_state": 42,
    "n_jobs": -1,
    "force_row_wise": True,
})
pd.Series(best_params).sort_index()

In [None]:
# 7) Refit with best params to produce OOF & test predictions, and feature importance
from sklearn.metrics import mean_squared_error

# Accumulators: OOF predictions (log scale), per-fold test predictions,
# per-fold validation RMSE and per-fold gain importances.
oof_best = np.zeros(len(df_tr), dtype=float)
test_folds_best, fold_scores_best, feat_imps = [], [], []

for k in np.unique(fold):
    held_out = fold == k
    va_idx = np.flatnonzero(held_out)
    tr_idx = np.flatnonzero(~held_out)
    X_va, y_va = X.iloc[va_idx], y_log.iloc[va_idx]

    model = lgb.LGBMRegressor(**best_params)
    model.fit(
        X.iloc[tr_idx], y_log.iloc[tr_idx],
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        categorical_feature=cat_features,
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    # Predict with the iteration count selected by early stopping
    best_iter = model.best_iteration_
    pred_va = model.predict(X_va, num_iteration=best_iter)
    oof_best[va_idx] = pred_va
    fold_rmse = np.sqrt(mean_squared_error(y_va, pred_va))
    fold_scores_best.append(float(fold_rmse))
    test_folds_best.append(model.predict(T, num_iteration=best_iter))

    # feature importance (gain), tagged with the fold it came from
    booster = model.booster_
    feat_imps.append(
        pd.DataFrame(
            {
                "feature": booster.feature_name(),
                "gain": booster.feature_importance(importance_type="gain"),
                "fold": int(k),
            }
        )
    )

# Per-fold scores; the displayed copy also carries the CV mean/std
cv_best = pd.DataFrame({"fold": sorted(np.unique(fold)), "rmse": fold_scores_best})
cv_best.assign(
    cv_mean=float(np.mean(fold_scores_best)),
    cv_std=float(np.std(fold_scores_best)),
)

In [None]:
# 8) Save artifacts: OOF (log), submission (expm1), feature importance
run_tag = "lgbm_optuna_v01"
oof_path = OOF_DIR / f"{run_tag}_oof.csv"
sub_path = SUB_DIR / f"{run_tag}.csv"
imp_path = ARTIFACTS_DIR / f"{run_tag}_feat_importance.csv"
cfg_path = Path("../configs/lgbm.yaml")
# The setup cell creates the artifact folders but not configs/; create it here
# so write_text() below cannot fail with FileNotFoundError on a fresh checkout.
cfg_path.parent.mkdir(parents=True, exist_ok=True)

# OOF (predictions stay on the log1p scale)
pd.DataFrame({id_col: df_tr[id_col].values, "pred_log": oof_best}).to_csv(oof_path, index=False)

# Test submission: average the per-fold predictions, then undo the log1p transform
test_mean = np.column_stack(test_folds_best).mean(axis=1)
pd.DataFrame({id_col: df_te[id_col].values, "SalePrice": np.expm1(test_mean)}).to_csv(sub_path, index=False)

# Importance (average across folds)
feat_imps_df = pd.concat(feat_imps, ignore_index=True)
feat_imps_agg = (
    feat_imps_df.groupby("feature", as_index=False)["gain"].mean().sort_values("gain", ascending=False)
)
feat_imps_agg.to_csv(imp_path, index=False)

# Config snapshot: one "key: value" line per parameter
cfg_series = pd.Series(best_params)
cfg_text = "\n".join(f"{k}: {v}" for k, v in cfg_series.items())
cfg_path.write_text(cfg_text)

pd.DataFrame({"oof":[str(oof_path)], "submission":[str(sub_path)], "feat_importance":[str(imp_path)], "config":[str(cfg_path)]})

In [None]:
# 9) Log best run to MLflow (params, metrics, artifacts)
cv_mean = float(np.mean(fold_scores_best))
cv_std  = float(np.std(fold_scores_best))

# NumPy floats are converted to plain Python floats before logging
loggable_params = {
    name: (float(value) if isinstance(value, np.floating) else value)
    for name, value in best_params.items()
}

with mlflow.start_run(run_name=run_tag):
    # params
    mlflow.log_params(loggable_params)
    mlflow.log_param("folds_file", str(PROCESSED_DIR / "cv_folds_selected.csv"))
    mlflow.log_param("features_version", "feat_v01_meta_v02")

    # metrics: CV summary first, then one metric per fold
    mlflow.log_metric("cv_rmse_mean_log", cv_mean)
    mlflow.log_metric("cv_rmse_std_log", cv_std)
    for fold_id, fold_rmse in zip(cv_best["fold"], cv_best["rmse"]):
        mlflow.log_metric(f"fold{int(fold_id)}_rmse_log", float(fold_rmse))

    # artifacts
    for artifact in (oof_path, sub_path, imp_path, cfg_path):
        mlflow.log_artifact(str(artifact))

    # tags
    mlflow.set_tag("notebook", "notebooks/04b_lgbm.ipynb")
    mlflow.set_tag("study_name", study.study_name)

pd.DataFrame({"cv_rmse_mean_log": [cv_mean], "cv_rmse_std_log": [cv_std]})

In [None]:
# Mean±Std feature-importance table (all features)
import pandas as pd

# Per-feature mean, std and fold count of the gain importance, highest mean first
gain_by_feature = feat_imps_df.groupby("feature", as_index=False)["gain"]
feat_imps_stats = gain_by_feature.agg(mean="mean", std="std", n_folds="count")
feat_imps_stats = feat_imps_stats.sort_values("mean", ascending=False).reset_index(drop=True)
feat_imps_stats.head(30)  # quick peek

In [None]:
# OOF dataframe with predictions + fold assignments
# One row per training example: truth, OOF prediction, fold id and neighborhood,
# followed by residuals on the log scale and on the back-transformed scale.
oof_df = pd.DataFrame(
    {
        "Id": df_tr[id_col].values,
        "y_true_log": y_log.values,
        "y_pred_log": oof_best,
        "fold": fold,
        "Neighborhood": df_tr["Neighborhood"].astype(str).values,
    }
).assign(
    # Residuals on log scale
    resid_log=lambda d: d["y_true_log"] - d["y_pred_log"],
    # Back-transform to original scale (inverse of log1p)
    y_true=lambda d: np.expm1(d["y_true_log"]),
    y_pred=lambda d: np.expm1(d["y_pred_log"]),
    resid_orig=lambda d: d["y_true"] - d["y_pred"],
)

oof_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of log-scale residuals per CV fold; the dashed line marks zero error
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=oof_df, x="fold", y="resid_log", ax=ax)
ax.axhline(0, color="red", linestyle="--")
ax.set_title("Residuals (log scale) by CV fold")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE per neighborhood (original scale)
# RMSE per neighborhood (original scale), worst first
per_neighborhood = oof_df.groupby("Neighborhood")
nbhd_rmse = per_neighborhood.apply(
    lambda g: np.sqrt(mean_squared_error(g["y_true"], g["y_pred"]))
)
nbhd_rmse = nbhd_rmse.sort_values(ascending=False)

nbhd_rmse.head(10)  # neighborhoods with worst RMSE

In [None]:
# Distribution of original-scale residuals per neighborhood; dashed line marks zero error
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=oof_df, x="Neighborhood", y="resid_orig", ax=ax)
ax.axhline(0, color="red", linestyle="--")
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Residuals (original $ scale) by Neighborhood")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# mean residuals (original $) by Neighborhood × Fold
# Mean original-scale residual per (Neighborhood, fold), laid out as a
# Neighborhood x fold grid; combinations with no rows stay NaN.
heat_df = (
    oof_df.groupby(["Neighborhood", "fold"])["resid_orig"]
    .mean()
    .unstack("fold")
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    heat_df,
    annot=True,
    fmt=".0f",
    cmap="RdBu_r",
    center=0,
    cbar_kws={"label": "Mean Residual ($)"},
    ax=ax,
)
ax.set_title("Mean Residuals by Neighborhood × Fold")
ax.set_ylabel("Neighborhood")
ax.set_xlabel("Fold")
fig.tight_layout()
plt.show()

heat_df.head()

# Version 2

In [None]:
# A) Setup: load v2 data/meta/folds, MLflow
import numpy as np, pandas as pd, json, os, warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import mlflow
# Track to ROOT/mlruns, the same file store the Version 1 setup cell uses.
# A bare "file:./mlruns" resolves against the working directory (notebooks/),
# which would split runs of the "houseprices_lgbm" experiment across two stores.
PROJ_ROOT = Path("..").resolve()
MLFLOW_URI = f"file:{(PROJ_ROOT / 'mlruns').as_posix()}"
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_URI
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment("houseprices_lgbm")

DATA_DIR = Path("../data/")
PROCESSED_DIR = DATA_DIR / "processed"
ARTIFACTS_DIR = Path("../artifacts")
OOF_DIR = ARTIFACTS_DIR / "oof"
SUB_DIR = ARTIFACTS_DIR / "submissions"
for d in [ARTIFACTS_DIR, OOF_DIR, SUB_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# v2 feature data
df_tr = pd.read_csv(PROCESSED_DIR / "hp_train_feat_v02.csv")
df_te = pd.read_csv(PROCESSED_DIR / "hp_test_feat_v02.csv")

# updated metadata
with open(PROCESSED_DIR / "hp_clean_meta_v03.json","r") as f:
    meta = json.load(f)

# folds (same file as before); use the "fold" column, else the second column
folds_df = pd.read_csv(PROCESSED_DIR / "cv_folds_selected.csv")
fold = folds_df["fold"].values if "fold" in folds_df.columns else folds_df.iloc[:,1].values
# A shorter fold array would silently leave rows out of every validation fold.
# NOTE(review): row order of the folds file is assumed to match df_tr — confirm.
assert len(fold) == len(df_tr), (
    f"fold assignments ({len(fold)}) do not match training rows ({len(df_tr)})"
)

id_col = "Id"
target_col = "SalePrice"
feature_cols = [c for c in df_tr.columns if c not in [id_col, target_col]]

pd.DataFrame({"train_shape":[df_tr.shape], "test_shape":[df_te.shape], "n_features":[len(feature_cols)]})

In [None]:
# B) Cast categoricals for LightGBM (native cat handling)
# Combine any nominal lists + new v2 cats; only keep columns that exist
nominal_cols = set(meta.get("nominal_cols_final", [])) \
               | set(meta.get("engineered_nominal", [])) \
               | {"Nbhd_Qual_cat_v2","Nbhd_Decade_cat_v2","NbhdCluster4_v2"}

# Walk feature_cols (a list) instead of the set: set iteration order for strings
# can change between kernel sessions, which made cat_features non-reproducible.
# This also keeps cat_features a subset of the columns actually given to the model.
cat_features = [c for c in feature_cols if c in nominal_cols]
for c in cat_features:
    df_tr[c] = df_tr[c].astype("category")
    df_te[c] = df_te[c].astype("category")

num_features = [c for c in feature_cols if c not in cat_features]
# target on log scale
y_log = np.log1p(df_tr[target_col])

pd.DataFrame({"n_cat":[len(cat_features)], "n_num":[len(num_features)], "sample_cat":[cat_features[:10]]})

In [None]:
# C) Load previous best params (../configs/lgbm.yaml), or fall back to a safe baseline
from pathlib import Path

def _smart_cast(v):
    s = str(v).strip()
    if s.lower() in {"true","false"}: return s.lower()=="true"
    try:
        if "." in s or "e" in s.lower(): return float(s)
        return int(s)
    except Exception:
        return s

cfg_path = Path("../configs/lgbm.yaml")

# Parse "key: value" lines; lines without a colon are ignored and a repeated
# key keeps its last value.
best_params_prev = {}
if cfg_path.exists():
    best_params_prev = {
        key.strip(): _smart_cast(raw)
        for key, _, raw in (
            ln.partition(":") for ln in cfg_path.read_text().splitlines() if ":" in ln
        )
    }

# Use the previous best params when the file yielded any; otherwise a baseline
if best_params_prev:
    best_params = dict(best_params_prev)
else:
    best_params = {
        "objective": "regression",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "min_data_in_leaf": 30,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "min_gain_to_split": 0.0,
    }

# Fields forced regardless of where the params came from
best_params.update(metric="rmse", n_estimators=10000, random_state=42, n_jobs=-1)
pd.Series(best_params).sort_index()

In [None]:
# D) Refit on v2 features with the loaded params → OOF/test/importance
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Model matrices (copies, so later edits to df_tr/df_te do not leak in)
X = df_tr[feature_cols].copy()
T = df_te[feature_cols].copy()

# Accumulators: OOF predictions (log scale), per-fold test predictions,
# per-fold validation RMSE and per-fold gain importances.
n_train = len(df_tr)
oof_v2 = np.zeros(n_train, dtype=float)
test_folds_v2 = []
fold_scores_v2 = []
feat_imps = []

fold_ids = np.unique(fold)
for k in fold_ids:
    va_mask = fold == k
    tr_idx, va_idx = np.flatnonzero(~va_mask), np.flatnonzero(va_mask)
    X_tr, y_tr = X.iloc[tr_idx], y_log.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y_log.iloc[va_idx]

    model = lgb.LGBMRegressor(**best_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        categorical_feature=cat_features,
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    # Predict with the iteration count selected by early stopping
    n_best = model.best_iteration_
    pred_va = model.predict(X_va, num_iteration=n_best)
    oof_v2[va_idx] = pred_va
    fold_scores_v2.append(float(np.sqrt(mean_squared_error(y_va, pred_va))))
    test_folds_v2.append(model.predict(T, num_iteration=n_best))

    # Gain importance for this fold
    booster = model.booster_
    imp = pd.DataFrame(
        {
            "feature": booster.feature_name(),
            "gain": booster.feature_importance(importance_type="gain"),
            "fold": int(k),
        }
    )
    feat_imps.append(imp)

# Per-fold scores; the displayed copy also carries the CV mean/std
cv_v2 = pd.DataFrame({"fold": sorted(fold_ids), "rmse": fold_scores_v2})
cv_v2.assign(
    cv_mean=float(np.mean(fold_scores_v2)),
    cv_std=float(np.std(fold_scores_v2)),
)

In [None]:
# E) Save artifacts under a new v02 tag + log to MLflow
run_tag = "lgbm_v02_refit_prevbest"
from pathlib import Path

# Output locations, all derived from the run tag
oof_path = OOF_DIR / f"{run_tag}_oof.csv"
sub_path = SUB_DIR / f"{run_tag}.csv"
imp_path = ARTIFACTS_DIR / f"{run_tag}_feat_importance.csv"

# OOF (log)
oof_out = pd.DataFrame({id_col: df_tr[id_col].values, "pred_log": oof_v2})
oof_out.to_csv(oof_path, index=False)

# Test submission (expm1 of mean over folds)
test_mean = np.column_stack(test_folds_v2).mean(axis=1)
submission = pd.DataFrame({id_col: df_te[id_col].values, "SalePrice": np.expm1(test_mean)})
submission.to_csv(sub_path, index=False)

# Importance (mean over folds), highest gain first
feat_imps_df = pd.concat(feat_imps, ignore_index=True)
mean_gain = feat_imps_df.groupby("feature", as_index=False)["gain"].mean()
feat_imps_agg = mean_gain.sort_values("gain", ascending=False)
feat_imps_agg.to_csv(imp_path, index=False)

# MLflow logging
cv_mean = float(np.mean(fold_scores_v2))
cv_std  = float(np.std(fold_scores_v2))

# NumPy floats are converted to plain Python floats before logging
loggable_params = {
    name: (float(value) if isinstance(value, (np.floating,)) else value)
    for name, value in best_params.items()
}

with mlflow.start_run(run_name=run_tag):
    mlflow.log_params(loggable_params)
    mlflow.log_param("feature_version", "v02")
    mlflow.log_param("folds_file", str(PROCESSED_DIR / "cv_folds_selected.csv"))
    mlflow.log_metric("cv_rmse_mean_log", cv_mean)
    mlflow.log_metric("cv_rmse_std_log", cv_std)
    for fold_id, fold_rmse in zip(cv_v2["fold"], cv_v2["rmse"]):
        mlflow.log_metric(f"fold{int(fold_id)}_rmse_log", float(fold_rmse))
    for artifact in (oof_path, sub_path, imp_path):
        mlflow.log_artifact(str(artifact))

pd.DataFrame(
    {
        "cv_rmse_mean_log": [cv_mean],
        "cv_rmse_std_log": [cv_std],
        "oof": [str(oof_path)],
        "submission": [str(sub_path)],
    }
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# --- Step 1: Ensure target log column exists
df_tr["SalePrice_log"] = np.log1p(df_tr["SalePrice"])

# --- Step 2: Build stratification strata
# Neighborhoods are ranked into 4 tiers by their median log price
nbhd_price = df_tr.groupby("Neighborhood")["SalePrice_log"].median()
nbhd_tier  = pd.qcut(nbhd_price, q=4, labels=False)

df_tr["nbhd_tier"] = df_tr["Neighborhood"].map(nbhd_tier)
df_tr["price_bin"] = pd.qcut(df_tr["SalePrice_log"], q=10, labels=False)

# Combine neighborhood tier + price bin
df_tr["strata"] = df_tr["nbhd_tier"].astype(str) + "_" + df_tr["price_bin"].astype(str)

# --- Step 3: StratifiedKFold on strata
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The loop variable is named fold_id, not `fold`: `fold` is the notebook-level
# array of fold assignments that the refit cells read, and reusing the name
# here replaced it with a plain int.
# skf.split() yields positional indices, so they are written into a NumPy array
# by position rather than through label-based .loc.
strat_fold = np.full(len(df_tr), -1, dtype="int64")
for fold_id, (_train_pos, val_pos) in enumerate(skf.split(df_tr, df_tr["strata"])):
    strat_fold[val_pos] = fold_id
df_tr["fold"] = strat_fold

# --- Step 4: Check balance
print(df_tr["fold"].value_counts())
print(df_tr.groupby("fold")["SalePrice_log"].mean())

In [None]:
from pathlib import Path

# --- Make sure the processed-data folder is present ---
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# --- Write the Id -> fold mapping ---
folds_path = PROCESSED_DIR / "cv_folds_strat_nbhd_price_v01.csv"
fold_table = df_tr[["Id", "fold"]]
fold_table.to_csv(folds_path, index=False)

print(f"Saved folds to: {folds_path}")