In [1]:
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.base import clone

# --- Paths ---------------------------------------------------------------
DATA_DIR = Path("../data")
PROCESSED_DIR = DATA_DIR / "processed"
ART = Path("../artifacts")
OOF_DIR = ART / "oof"
SUB_DIR = ART / "submissions"
for _out_dir in (OOF_DIR, SUB_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# --- Load engineered features and the precomputed CV fold assignment ------
df_tr = pd.read_csv(PROCESSED_DIR / "hp_train_feat_v04.csv")
df_te = pd.read_csv(PROCESSED_DIR / "hp_test_feat_v04.csv")
folds = pd.read_csv(PROCESSED_DIR / "cv_folds_strat_nbhd_price_v01.csv")

# Bring in only the fold label: any other column in the folds file would
# otherwise end up in X as a feature. validate= rejects duplicated Ids in the
# folds file, which would silently duplicate training rows.
df_tr = df_tr.merge(folds[["Id", "fold"]], on="Id", how="left", validate="many_to_one")

# A left merge leaves NaN for Ids absent from the folds file. Such rows would
# never be validated, so stop here instead of failing later inside a CV loop.
_n_missing_fold = int(df_tr["fold"].isna().sum())
if _n_missing_fold:
    raise ValueError(f"{_n_missing_fold} training rows have no fold assignment")

# Target on the log1p scale; predictions are mapped back with expm1 downstream.
y_log = np.log1p(df_tr["SalePrice"]).astype(float)
drop_cols = ["Id", "SalePrice", "fold"]
X = df_tr.drop(columns=drop_cols)
X_te = df_te.drop(columns=["Id"])

# Every training feature is selected by name at predict time, so each one
# must also exist in the test frame.
_missing_in_test = [c for c in X.columns if c not in X_te.columns]
if _missing_in_test:
    raise ValueError(f"test features are missing columns: {_missing_in_test}")

# Categorical = text-typed columns plus anything tagged with the "_cat_v4"
# suffix. The string-dtype check covers pandas versions that infer a dedicated
# string dtype instead of object when reading CSV text columns.
cat_cols = [
    c for c in X.columns
    if pd.api.types.is_object_dtype(X[c])
    or pd.api.types.is_string_dtype(X[c])
    or c.endswith("_cat_v4")
]
num_cols = [c for c in X.columns if c not in cat_cols]
fold = df_tr["fold"].values

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.base import clone
import numpy as np
import pandas as pd

# Inputs expected from earlier cells: X, X_te, y_log, num_cols, cat_cols, fold.

# Preprocessing for gradient boosting: imputation only. The model is
# tree-based, so numeric features are left unscaled and categoricals are
# turned into integer codes.
pre_gbr = ColumnTransformer(
    [
        (
            "num",
            Pipeline(steps=[("imp", SimpleImputer(strategy="median"))]),
            num_cols,
        ),
        (
            "cat",
            Pipeline(steps=[
                ("imp", SimpleImputer(strategy="most_frequent")),
                # categories not seen during fit are encoded as -1
                ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ]),
            cat_cols,
        ),
    ],
    remainder="drop",
    sparse_threshold=0.0,  # keep the stacked output dense for GBR
)

# Many small steps; validation_fraction + n_iter_no_change enable early stopping.
gbr_params = {
    "n_estimators": 4000,
    "learning_rate": 0.03,
    "max_depth": 3,
    "subsample": 0.8,
    "random_state": 42,
    "validation_fraction": 0.1,
    "n_iter_no_change": 80,
}

oof_gbr = np.zeros(X.shape[0])
gbr_models = []
gbr_fold_rmse = []

# One fitted pipeline per fold: train on every other fold, predict the held-out one.
for k in sorted(np.unique(fold)):
    (va_idx,) = np.where(fold == k)
    (tr_idx,) = np.where(fold != k)

    pipe = Pipeline(steps=[
        ("prep", clone(pre_gbr)),
        ("model", GradientBoostingRegressor(**gbr_params)),
    ]).fit(X.iloc[tr_idx], y_log.iloc[tr_idx])

    pred_va = pipe.predict(X.iloc[va_idx])
    oof_gbr[va_idx] = pred_va
    gbr_models.append(pipe)
    gbr_fold_rmse.append(float(root_mean_squared_error(y_log.iloc[va_idx], pred_va)))


# Test prediction = per-row average of the fold models (still on the log scale).
gbr_test_pred = np.mean(
    np.column_stack([fold_model.predict(X_te) for fold_model in gbr_models]),
    axis=1,
)
gbr_oof = pd.DataFrame({
    "Id": df_tr["Id"],
    "pred_log": oof_gbr,
})
gbr_sub = pd.DataFrame({
    "Id": df_te["Id"],
    "SalePrice": np.expm1(gbr_test_pred),  # back to the price scale
})

gbr_oof.to_csv(OOF_DIR / "gbr_v04_nbhdstrat_oof.csv", index=False)
gbr_sub.to_csv(SUB_DIR / "gbr_v04_nbhdstrat.csv", index=False)

# Cell output: one-row CV summary (mean / std of the per-fold log-scale RMSE).
pd.DataFrame([{
    "model": "GBR_v04_nbhdstrat",
    "cv_rmse_mean_log": float(np.mean(gbr_fold_rmse)),
    "cv_rmse_std_log": float(np.std(gbr_fold_rmse)),
}])

Unnamed: 0,model,cv_rmse_mean_log,cv_rmse_std_log
0,GBR_v04_nbhdstrat,0.116887,0.006192


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# SVR (RBF kernel) on an SVD-compressed, standardized feature space.
# Relies on names from earlier cells: X, X_te, y_log, num_cols, cat_cols, fold,
# df_tr, df_te, OOF_DIR, SUB_DIR, ColumnTransformer, Pipeline, clone and
# mean_squared_error.

ohe_sparse = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
pre_svr = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),  # sparse-friendly
        ("cat", ohe_sparse, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,
)

# Modest grid for speed.
# gamma="auto" (1 / n_features) is intentionally not searched: the SVR step
# receives StandardScaler output, whose overall variance is 1, so gamma="scale"
# (1 / (n_features * X.var())) resolves to the same value. Listing both only
# repeated identical fits in every inner and outer fold.
param_grid = {
    "svr__C": [1.0, 3.0, 10.0],
    "svr__gamma": ["scale", 0.03, 0.1],
}

oof_svr = np.zeros(len(X))
svr_models, svr_fold_rmse = [], []

# Outer loop: the precomputed folds give the out-of-fold predictions.
# Inner loop: GridSearchCV tunes C / gamma using the outer-training rows only.
for k in sorted(np.unique(fold)):
    tr_idx = np.where(fold != k)[0]; va_idx = np.where(fold == k)[0]

    pipe = Pipeline([
        ("prep", clone(pre_svr)),                # sparse
        ("svd", TruncatedSVD(n_components=128, random_state=42)),
        ("scaler", StandardScaler(with_mean=True)),  # centre + scale the dense SVD components
        ("svr", SVR(kernel="rbf")),
    ])

    gscv = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring="neg_root_mean_squared_error",
        cv=5,
        n_jobs=-1,
        refit=True,   # best_estimator_ is refit on all outer-training rows
    )
    gscv.fit(X.iloc[tr_idx], y_log.iloc[tr_idx])

    best = gscv.best_estimator_
    pred_va = best.predict(X.iloc[va_idx])
    oof_svr[va_idx] = pred_va
    svr_fold_rmse.append(float(np.sqrt(mean_squared_error(y_log.iloc[va_idx], pred_va))))
    svr_models.append(best)

# Test prediction = per-row average of the fold models (still on the log scale).
svr_test_pred = np.column_stack([m.predict(X_te) for m in svr_models]).mean(axis=1)
svr_oof = pd.DataFrame({"Id": df_tr["Id"], "pred_log": oof_svr})
svr_sub = pd.DataFrame({"Id": df_te["Id"], "SalePrice": np.expm1(svr_test_pred)})

svr_oof.to_csv(OOF_DIR / "svr_rbf_v04_nbhdstrat_oof.csv", index=False)
svr_sub.to_csv(SUB_DIR / "svr_rbf_v04_nbhdstrat.csv", index=False)

# Cell output: one-row CV summary (mean / std of the per-fold log-scale RMSE).
pd.DataFrame({"model":["SVR_RBF_v04_nbhdstrat"],
              "cv_rmse_mean_log":[float(np.mean(svr_fold_rmse))],
              "cv_rmse_std_log":[float(np.std(svr_fold_rmse))]})

Unnamed: 0,model,cv_rmse_mean_log,cv_rmse_std_log
0,SVR_RBF_v04_nbhdstrat,0.201059,0.010247


In [10]:
from sklearn.neural_network import MLPRegressor

# Dense preprocessing for the MLP: standardized numerics next to one-hot
# encoded categoricals (categories unseen at fit time are ignored).
pre_mlp = ColumnTransformer(
    [
        ("num", StandardScaler(with_mean=True), num_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_cols,
        ),
    ],
    remainder="drop",
    sparse_threshold=0.0,  # never hand the network a sparse matrix
)

# Small two-layer network trained with Adam and early stopping.
mlp_cfg = {
    "hidden_layer_sizes": (16, 8),
    "activation": "relu",
    "alpha": 1e-4,  # L2 penalty
    "learning_rate_init": 3e-3,
    "batch_size": 64,
    "solver": "adam",
    "early_stopping": True,
    "n_iter_no_change": 30,
    "max_iter": 500,
    "random_state": 42,
    "verbose": False,
}

oof_mlp = np.zeros(X.shape[0])
mlp_models = []
mlp_fold_rmse = []

# One fitted pipeline per fold: train on every other fold, predict the held-out one.
for k in sorted(np.unique(fold)):
    (va_idx,) = np.where(fold == k)
    (tr_idx,) = np.where(fold != k)

    pipe = Pipeline(steps=[
        ("prep", clone(pre_mlp)),
        ("mlp", MLPRegressor(**mlp_cfg)),
    ]).fit(X.iloc[tr_idx], y_log.iloc[tr_idx])

    pred_va = pipe.predict(X.iloc[va_idx])
    oof_mlp[va_idx] = pred_va
    mlp_models.append(pipe)
    mlp_fold_rmse.append(
        float(np.sqrt(mean_squared_error(y_log.iloc[va_idx], pred_va)))
    )

# Test prediction = per-row average of the fold models (still on the log scale).
mlp_test_pred = np.mean(
    np.column_stack([fold_model.predict(X_te) for fold_model in mlp_models]),
    axis=1,
)
mlp_oof = pd.DataFrame({
    "Id": df_tr["Id"],
    "pred_log": oof_mlp,
})
mlp_sub = pd.DataFrame({
    "Id": df_te["Id"],
    "SalePrice": np.expm1(mlp_test_pred),  # back to the price scale
})

mlp_oof.to_csv(OOF_DIR / "mlp_v04_nbhdstrat_oof.csv", index=False)
mlp_sub.to_csv(SUB_DIR / "mlp_v04_nbhdstrat.csv", index=False)

# Cell output: one-row CV summary (mean / std of the per-fold log-scale RMSE).
pd.DataFrame([{
    "model": "MLP_v04_nbhdstrat",
    "cv_rmse_mean_log": float(np.mean(mlp_fold_rmse)),
    "cv_rmse_std_log": float(np.std(mlp_fold_rmse)),
}])

Unnamed: 0,model,cv_rmse_mean_log,cv_rmse_std_log
0,MLP_v04_nbhdstrat,0.303995,0.086877
