In [1]:
# 1) Setup: paths, load FE data + metadata
import numpy as np
import pandas as pd
from pathlib import Path
import json, warnings
warnings.filterwarnings("ignore")

DATA_DIR = Path("../data/")
PROCESSED_DIR = DATA_DIR / "processed"
ARTIFACTS_DIR = Path("../artifacts")
OOF_DIR = ARTIFACTS_DIR / "oof"
SUB_DIR = ARTIFACTS_DIR / "submissions"
for d in [ARTIFACTS_DIR, OOF_DIR, SUB_DIR]:
    d.mkdir(parents=True, exist_ok=True)

df_tr = pd.read_parquet(PROCESSED_DIR / "hp_train_feat_v01.parquet")
df_te = pd.read_parquet(PROCESSED_DIR / "hp_test_feat_v01.parquet")
with open(PROCESSED_DIR / "hp_clean_meta_v02.json", "r") as f:
    meta = json.load(f)

id_col = "Id"
target_col = "SalePrice"
feature_cols = [c for c in df_tr.columns if c not in [id_col, target_col]]

pd.DataFrame({"train_shape":[df_tr.shape], "test_shape":[df_te.shape]})

Unnamed: 0,train_shape,test_shape
0,"(1458, 117)","(1459, 117)"


In [2]:
# 2) Column groups + dtypes (string cats vs numeric-coded cats)
ordinal_int_cols   = meta.get("ordinal_int_cols", [])
cont_num_cols      = meta.get("cont_num_cols", [])
nominal_cols_final = meta.get("nominal_cols_final", [])
engineered_nominal = meta.get("engineered_nominal", [])
engineered_continuous = meta.get("engineered_continuous", [])

log1p_cols      = meta.get("log1p_cols", [])
yeojohnson_cols = meta.get("yeojohnson_cols", [])

for c in nominal_cols_final + engineered_nominal:
    if c in df_tr.columns:
        df_tr[c] = df_tr[c].astype("category")
        df_te[c] = df_te[c].astype("category")

def _string_cat_cols(df, cols):
    out = []
    for c in cols:
        if c in df.columns and str(df[c].dtype) == "category":
            cats = list(df[c].cat.categories)
            if all(isinstance(x, str) for x in cats):
                out.append(c)
    return out

string_cat_cols = _string_cat_cols(df_tr, nominal_cols_final + engineered_nominal)
cont_all = [c for c in (cont_num_cols + engineered_continuous) if c in df_tr.columns]
cont_log1p = [c for c in log1p_cols if c in cont_all]
cont_yeoj  = [c for c in yeojohnson_cols if c in cont_all and c not in cont_log1p]
cont_rest  = [c for c in cont_all if c not in cont_log1p + cont_yeoj]

pd.DataFrame({
    "n_features":[len(feature_cols)],
    "n_string_cats":[len(string_cat_cols)],
    "n_ordinals":[len([c for c in ordinal_int_cols if c in df_tr.columns])],
    "n_cont_log1p":[len(cont_log1p)],
    "n_cont_yeoj":[len(cont_yeoj)],
    "n_cont_rest":[len(cont_rest)]
})

Unnamed: 0,n_features,n_string_cats,n_ordinals,n_cont_log1p,n_cont_yeoj,n_cont_rest
0,115,30,15,4,15,38


In [3]:
# 3) Define three CV schemes: RandomKFold, StratifiedKFold(log target), GroupKFold(Neighborhood)
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

y_log = np.log1p(df_tr[target_col])
bins10 = pd.qcut(y_log, q=10, labels=False, duplicates="drop")
groups_nbhd = df_tr["Neighborhood"].astype(str)

n_splits = 5
seed = 42

folds_random = np.full(len(df_tr), -1, dtype=int)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
for k, (_, val_idx) in enumerate(kf.split(df_tr)):
    folds_random[val_idx] = k

folds_strat = np.full(len(df_tr), -1, dtype=int)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
for k, (_, val_idx) in enumerate(skf.split(y_log, bins10)):
    folds_strat[val_idx] = k

folds_group = np.full(len(df_tr), -1, dtype=int)
gkf = GroupKFold(n_splits=n_splits)
for k, (_, val_idx) in enumerate(gkf.split(df_tr, y_log, groups=groups_nbhd)):
    folds_group[val_idx] = k

cv_schemes = pd.DataFrame({
    "Id": df_tr[id_col].values,
    "fold_random": folds_random,
    "fold_stratlog": folds_strat,
    "fold_group_nbhd": folds_group
})
cv_schemes.head(10)

Unnamed: 0,Id,fold_random,fold_stratlog,fold_group_nbhd
0,1,2,2,1
1,2,4,0,3
2,3,2,0,1
3,4,1,1,2
4,5,3,3,4
5,6,1,2,1
6,7,2,0,4
7,8,2,3,1
8,9,4,1,2
9,10,2,3,3


In [4]:
# 4) Build the ElasticNet preprocessing & model (same across all CV schemes)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet

preproc_linear = ColumnTransformer(
    transformers=[
        ("log1p", FunctionTransformer(np.log1p, validate=False), cont_log1p),
        ("yeo",   PowerTransformer(method="yeo-johnson", standardize=False), cont_yeoj),
        ("cont",  "passthrough", cont_rest),
        ("ord",   "passthrough", [c for c in ordinal_int_cols if c in df_tr.columns]),
        ("ohe",   OneHotEncoder(handle_unknown="ignore", min_frequency=10, sparse_output=True), string_cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

elastic_pipe = Pipeline([
    ("pre", preproc_linear),
    ("scaler", StandardScaler(with_mean=False)),
    ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=seed, max_iter=2000)),
])

pd.Series({"pipeline_ready": True})

pipeline_ready    True
dtype: bool

In [5]:
# 5) Run ElasticNet on all CV schemes → OOF + test + scores
from sklearn.metrics import mean_squared_error

X = df_tr[feature_cols].copy()
T = df_te[feature_cols].copy()

rows_elastic = []
for name, folds in [("random", folds_random), ("stratlog", folds_strat), ("group_nbhd", folds_group)]:
    oof = np.zeros(len(df_tr), dtype=float)
    test_folds = []
    fold_scores = []
    for k in range(n_splits):
        tr_idx = np.where(folds != k)[0]
        va_idx = np.where(folds == k)[0]
        _ = elastic_pipe.fit(X.iloc[tr_idx], y_log.iloc[tr_idx])
        y_va_pred = elastic_pipe.predict(X.iloc[va_idx])
        oof[va_idx] = y_va_pred
        fold_scores.append(float(np.sqrt(mean_squared_error(y_log.iloc[va_idx], y_va_pred))))
        test_folds.append(elastic_pipe.predict(T))
    oof_df = pd.DataFrame({id_col: df_tr[id_col].values, "pred_log": oof})
    oof_path = OOF_DIR / f"elasticnet_{name}_v01_oof.csv"
    oof_df.to_csv(oof_path, index=False)

    test_pred = np.column_stack(test_folds).mean(axis=1)
    sub_df = pd.DataFrame({id_col: df_te[id_col].values, "SalePrice": np.expm1(test_pred)})
    sub_path = SUB_DIR / f"elasticnet_{name}_v01.csv"
    sub_df.to_csv(sub_path, index=False)

    rows_elastic.append({
        "model":"ElasticNet",
        "cv_scheme":name,
        "cv_rmse_mean": float(np.mean(fold_scores)),
        "cv_rmse_std":  float(np.std(fold_scores)),
        "oof_path": str(oof_path),
        "sub_path": str(sub_path)
    })

elastic_scoreboard = pd.DataFrame(rows_elastic).sort_values(["model","cv_rmse_mean"])
elastic_scoreboard

Unnamed: 0,model,cv_scheme,cv_rmse_mean,cv_rmse_std,oof_path,sub_path
1,ElasticNet,stratlog,0.151643,0.01055,../artifacts/oof/elasticnet_stratlog_v01_oof.csv,../artifacts/submissions/elasticnet_stratlog_v...
0,ElasticNet,random,0.152449,0.008862,../artifacts/oof/elasticnet_random_v01_oof.csv,../artifacts/submissions/elasticnet_random_v01...
2,ElasticNet,group_nbhd,0.154661,0.046646,../artifacts/oof/elasticnet_group_nbhd_v01_oof...,../artifacts/submissions/elasticnet_group_nbhd...


In [6]:
# 6) LightGBM baseline across CV schemes → OOF + test + scores
import lightgbm as lgb

lgb_params = dict(
    objective="regression",
    learning_rate=0.05,
    n_estimators=5000,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=0.0,
    random_state=seed,
    n_jobs=-1,
    force_row_wise=True
)

rows_lgb = []
for name, folds in [("random", folds_random), ("stratlog", folds_strat), ("group_nbhd", folds_group)]:
    oof = np.zeros(len(df_tr), dtype=float)
    test_folds = []
    fold_scores = []
    for k in range(n_splits):
        tr_idx = np.where(folds != k)[0]
        va_idx = np.where(folds == k)[0]
        lgbm = lgb.LGBMRegressor(**lgb_params)
        _ = lgbm.fit(
            X.iloc[tr_idx], y_log.iloc[tr_idx],
            eval_set=[(X.iloc[va_idx], y_log.iloc[va_idx])],
            eval_metric="rmse",
            callbacks=[lgb.early_stopping(200, verbose=False)],
        )
        y_va_pred = lgbm.predict(X.iloc[va_idx], num_iteration=lgbm.best_iteration_)
        oof[va_idx] = y_va_pred
        fold_scores.append(float(np.sqrt(mean_squared_error(y_log.iloc[va_idx], y_va_pred))))
        test_folds.append(lgbm.predict(T, num_iteration=lgbm.best_iteration_))

    oof_df = pd.DataFrame({id_col: df_tr[id_col].values, "pred_log": oof})
    oof_path = OOF_DIR / f"lgbm_{name}_v01_oof.csv"
    oof_df.to_csv(oof_path, index=False)

    test_pred = np.column_stack(test_folds).mean(axis=1)
    sub_df = pd.DataFrame({id_col: df_te[id_col].values, "SalePrice": np.expm1(test_pred)})
    sub_path = SUB_DIR / f"lgbm_{name}_v01.csv"
    sub_df.to_csv(sub_path, index=False)

    rows_lgb.append({
        "model":"LGBM",
        "cv_scheme":name,
        "cv_rmse_mean": float(np.mean(fold_scores)),
        "cv_rmse_std":  float(np.std(fold_scores)),
        "oof_path": str(oof_path),
        "sub_path": str(sub_path)
    })

lgb_scoreboard = pd.DataFrame(rows_lgb).sort_values(["model","cv_rmse_mean"])
lgb_scoreboard

[LightGBM] [Info] Total Bins 5598
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 107
[LightGBM] [Info] Start training from score 12.023362
[LightGBM] [Info] Total Bins 5602
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 107
[LightGBM] [Info] Start training from score 12.026498
[LightGBM] [Info] Total Bins 5607
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 107
[LightGBM] [Info] Start training from score 12.025399
[LightGBM] [Info] Total Bins 5634
[LightGBM] [Info] Number of data points in the train set: 1167, number of used features: 108
[LightGBM] [Info] Start training from score 12.024512
[LightGBM] [Info] Total Bins 5525
[LightGBM] [Info] Number of data points in the train set: 1167, number of used features: 108
[LightGBM] [Info] Start training from score 12.020308
[LightGBM] [Info] Total Bins 5627
[LightGBM] [Info] Number of data points in the train set: 116

Unnamed: 0,model,cv_scheme,cv_rmse_mean,cv_rmse_std,oof_path,sub_path
0,LGBM,random,0.121848,0.00619,../artifacts/oof/lgbm_random_v01_oof.csv,../artifacts/submissions/lgbm_random_v01.csv
1,LGBM,stratlog,0.125161,0.014441,../artifacts/oof/lgbm_stratlog_v01_oof.csv,../artifacts/submissions/lgbm_stratlog_v01.csv
2,LGBM,group_nbhd,0.135301,0.025543,../artifacts/oof/lgbm_group_nbhd_v01_oof.csv,../artifacts/submissions/lgbm_group_nbhd_v01.csv


In [8]:
# 7) Combined scoreboard (CV on log target). Lower is better.
scoreboard = pd.concat([elastic_scoreboard, lgb_scoreboard], ignore_index=True)
scoreboard.sort_values(["model","cv_rmse_mean"])

Unnamed: 0,model,cv_scheme,cv_rmse_mean,cv_rmse_std,oof_path,sub_path
0,ElasticNet,stratlog,0.151643,0.01055,../artifacts/oof/elasticnet_stratlog_v01_oof.csv,../artifacts/submissions/elasticnet_stratlog_v...
1,ElasticNet,random,0.152449,0.008862,../artifacts/oof/elasticnet_random_v01_oof.csv,../artifacts/submissions/elasticnet_random_v01...
2,ElasticNet,group_nbhd,0.154661,0.046646,../artifacts/oof/elasticnet_group_nbhd_v01_oof...,../artifacts/submissions/elasticnet_group_nbhd...
3,LGBM,random,0.121848,0.00619,../artifacts/oof/lgbm_random_v01_oof.csv,../artifacts/submissions/lgbm_random_v01.csv
4,LGBM,stratlog,0.125161,0.014441,../artifacts/oof/lgbm_stratlog_v01_oof.csv,../artifacts/submissions/lgbm_stratlog_v01.csv
5,LGBM,group_nbhd,0.135301,0.025543,../artifacts/oof/lgbm_group_nbhd_v01_oof.csv,../artifacts/submissions/lgbm_group_nbhd_v01.csv


In [9]:
# 8) (Optional) Compare on ORIGINAL Kaggle scale using OOFs (helps interpretability)
from sklearn.metrics import mean_squared_error

rows_orig = []
for r in scoreboard.itertuples():
    oof_file = Path(r.oof_path)
    df_oof = pd.read_csv(oof_file)
    merged = df_tr[[id_col, target_col]].merge(df_oof, on=id_col, how="left")
    rmse_orig = float(np.sqrt(mean_squared_error(merged[target_col], np.expm1(merged["pred_log"]))))
    rows_orig.append({"model":r.model, "cv_scheme":r.cv_scheme, "cv_rmse_original": rmse_orig})

cv_original_scale = pd.DataFrame(rows_orig).sort_values(["model","cv_rmse_original"])
cv_original_scale

Unnamed: 0,model,cv_scheme,cv_rmse_original
0,ElasticNet,stratlog,28884.249111
1,ElasticNet,random,28976.855384
2,ElasticNet,group_nbhd,30138.403729
3,LGBM,random,24145.068535
4,LGBM,stratlog,25374.575672
5,LGBM,group_nbhd,29045.457888


In [11]:
# Use the stratified folds you already built as the project-default
import pandas as pd
from pathlib import Path

PROCESSED_DIR = Path("../data/processed")

# Create the selected folds DataFrame from the stratified folds you already have
folds_selected = pd.DataFrame({
    "Id": df_tr[id_col].values,
    "fold": folds_strat  # Use the stratified folds from earlier
})

# Save as the project default
folds_selected.to_csv(PROCESSED_DIR / "cv_folds_selected.csv", index=False)

print(f"Saved selected folds to: {PROCESSED_DIR / 'cv_folds_selected.csv'}")
print(f"Using folds: stratified on log target")
print(f"Folds distribution:")
print(folds_selected['fold'].value_counts().sort_index())

Saved selected folds to: ../data/processed/cv_folds_selected.csv
Using folds: stratified on log target
Folds distribution:
fold
0    292
1    292
2    292
3    291
4    291
Name: count, dtype: int64
