
# House Prices — Minimal XGBoost Pipeline (No folders, minimal I/O)

This notebook is **Kaggle-friendly**:
- No directory creation, no intermediate files.
- In-memory cleaning (no one-hot), training, validation, and final prediction.
- Writes a single `submission.csv` in the current working directory.

It uses:
- A lightweight cleaner that keeps categorical columns as `category` (for `XGBRegressor(enable_categorical=True)`).
- Simple feature engineering (`TotalSF`, `AgeSinceBuilt`, `AgeSinceRemod`), optional cap on `GrLivArea`.
- Rare-category grouping (maps infrequent levels to `'Other'`).

> Input paths are controlled by `DATA_DIR` in the config cell. On Kaggle, the competition data lives under `/kaggle/input/house-prices-advanced-regression-techniques/`.


In [1]:

import os, sys, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBRegressor

# ---- Config ----
# Use the Kaggle competition input directory when it exists (i.e. when the
# notebook runs on Kaggle); otherwise fall back to the local ../data folder.
KAGGLE_DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
DATA_DIR = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else "../data"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV  = f"{DATA_DIR}/test.csv"

SEED = 42
TEST_SIZE = 0.2
LOG_TARGET = False  # True if you want to train on log1p(y) and inverse-transform when evaluating/predicting
CAP_GRLIVAREA = 4000.0  # set to None to disable
RARE_THRESH = 0.02      # combine categories occurring < 2% into 'Other'

# XGB params (sensible defaults for this dataset)
XGB_PARAMS = dict(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    tree_method="hist",
    enable_categorical=True,  # lets XGBoost consume pandas `category` columns directly
    random_state=SEED,
    n_jobs=-1
)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions as a Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))


## Cleaner (no one-hot, keep categorical dtype)

In [2]:

# Columns where a missing value means "the house does not have this feature",
# mapped to the explicit level used in place of the missing value.
# Key order is kept stable because the cleaner iterates this mapping when it
# assembles its list of categorical columns.
MISSING_MEANS_NONE = {
    "PoolQC": "NoPool",
    "Alley": "NoAlley",
    "Fence": "NoFence",
    "FireplaceQu": "NoFireplace",
    **dict.fromkeys(
        ("GarageType", "GarageFinish", "GarageQual", "GarageCond"),
        "NoGarage",
    ),
    **dict.fromkeys(
        ("BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"),
        "NoBasement",
    ),
    "MiscFeature": "None",
    "MasVnrType": "None",
}

class RareCategoryGrouper(BaseEstimator, TransformerMixin):
    """Replace infrequent levels of every column with the single level ``'Other'``.

    ``fit`` records, per column, the set of levels (compared as strings) whose
    relative frequency among non-missing values is at least ``rare_thresh``.
    ``transform`` rewrites every non-missing value outside that set to
    ``'Other'``; missing values are left untouched.
    """

    def __init__(self, rare_thresh: float = 0.02):
        self.rare_thresh = rare_thresh
        # column name -> set of string levels considered frequent
        self.frequent_levels_ = {}

    def fit(self, X, y=None):
        """Learn the frequent levels of each column of ``X``; returns ``self``."""
        frame = X.copy()
        self.frequent_levels_.clear()
        for name in frame.columns:
            # Relative frequency of each level, ignoring missing values.
            shares = frame[name].astype("string").value_counts(normalize=True, dropna=True)
            frequent = shares[shares >= self.rare_thresh]
            self.frequent_levels_[name] = set(frequent.index.tolist())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-frequent, non-missing values set to ``'Other'``."""
        result = X.copy()
        for name in result.columns:
            # Columns never seen in fit have no frequent levels: everything becomes 'Other'.
            keep = self.frequent_levels_.get(name, set())
            present = result[name].notna()
            as_text = result.loc[present, name].astype("string")
            result.loc[present, name] = as_text.where(as_text.isin(keep), other="Other")
        return result

class SpecialRulesAndFeatures(BaseEstimator, TransformerMixin):
    """Dataset-specific fixes plus a few engineered features.

    Parameters
    ----------
    cap_grlivarea : float or None
        Upper bound applied to ``GrLivArea``; ``None`` disables the cap.
    create_features : bool
        When True, add ``TotalSF``, ``AgeSinceBuilt`` and ``AgeSinceRemod``.

    Attributes
    ----------
    group_median_ : pandas.Series or None
        Median ``LotFrontage`` per ``Neighborhood`` learned in ``fit``; ``None``
        when either column is absent from the fitting frame.
    """

    def __init__(self, cap_grlivarea=4000.0, create_features=True):
        # Stored under the constructor-argument name so that scikit-learn's
        # get_params() / clone() / repr() can look it up.
        self.cap_grlivarea = cap_grlivarea
        self.create_features = create_features
        self.group_median_ = None

    @property
    def cap(self):
        """Backward-compatible alias for ``cap_grlivarea``."""
        return self.cap_grlivarea

    @cap.setter
    def cap(self, value):
        self.cap_grlivarea = value

    def fit(self, X, y=None):
        """Learn per-neighborhood ``LotFrontage`` medians; returns ``self``."""
        if {"LotFrontage", "Neighborhood"}.issubset(X.columns):
            self.group_median_ = X.groupby("Neighborhood")["LotFrontage"].median()
        else:
            self.group_median_ = None
        return self

    def transform(self, X):
        """Return a modified copy of ``X``; the input frame is not mutated."""
        Xw = X.copy()

        # MSSubClass -> string (categorical later)
        if "MSSubClass" in Xw.columns:
            Xw["MSSubClass"] = Xw["MSSubClass"].astype("Int64").astype("string")

        # No garage -> GarageYrBlt = 0 (only fills missing years on those rows)
        if "GarageType" in Xw.columns and "GarageYrBlt" in Xw.columns:
            garage_type = Xw["GarageType"]
            no_garage = garage_type.isna() | garage_type.astype("string").str.lower().isin(["nan", "none"])
            Xw.loc[no_garage, "GarageYrBlt"] = Xw.loc[no_garage, "GarageYrBlt"].fillna(0)

        # LotFrontage by Neighborhood median (neighborhoods unseen in fit remain NaN)
        if self.group_median_ is not None and "LotFrontage" in Xw.columns and "Neighborhood" in Xw.columns:
            need = Xw["LotFrontage"].isna()
            Xw.loc[need, "LotFrontage"] = Xw.loc[need, "Neighborhood"].map(self.group_median_)

        # Cap GrLivArea (missing values stay missing)
        if self.cap_grlivarea is not None and "GrLivArea" in Xw.columns:
            Xw["GrLivArea"] = np.where(
                Xw["GrLivArea"].notna(), np.minimum(Xw["GrLivArea"], self.cap_grlivarea), Xw["GrLivArea"]
            )

        # Simple features; any absent source column is created as all-NaN so the
        # derived feature exists (as NaN) instead of raising.
        if self.create_features:
            for req in ["1stFlrSF", "2ndFlrSF", "TotalBsmtSF"]:
                if req not in Xw.columns:
                    Xw[req] = np.nan
            Xw["TotalSF"] = Xw["1stFlrSF"] + Xw["2ndFlrSF"] + Xw["TotalBsmtSF"]

            for req in ["YrSold", "YearBuilt", "YearRemodAdd"]:
                if req not in Xw.columns:
                    Xw[req] = np.nan
            Xw["AgeSinceBuilt"] = Xw["YrSold"] - Xw["YearBuilt"]
            Xw["AgeSinceRemod"] = Xw["YrSold"] - Xw["YearRemodAdd"]

        return Xw

class XGBCleaner(BaseEstimator, TransformerMixin):
    """Turn the raw house-prices frame into numeric + ``category`` columns for XGBoost.

    Pipeline: ``SpecialRulesAndFeatures`` -> fill the ``MISSING_MEANS_NONE``
    columns with their explicit level -> rare-level grouping -> cast every
    categorical column to a ``pd.Categorical`` with the levels seen in ``fit``.

    The column schema, category levels and numeric medians are learned in
    ``fit`` and reused unchanged by ``transform``, so every transformed frame
    has the same columns in the same order.

    Parameters
    ----------
    rare_thresh : float
        Levels rarer than this share are grouped into ``'Other'``.
    cap_grlivarea : float or None
        Passed to ``SpecialRulesAndFeatures``.
    create_features : bool
        Passed to ``SpecialRulesAndFeatures``.
    impute_numeric : bool
        When True, numeric NaNs are filled with the medians learned in ``fit``.
    """

    def __init__(self, rare_thresh=0.02, cap_grlivarea=4000.0, create_features=True, impute_numeric=False):
        self.rare_thresh = rare_thresh
        # Stored under the constructor-argument name so that scikit-learn's
        # get_params() / clone() / repr() can look it up.
        self.cap_grlivarea = cap_grlivarea
        self.create_features = create_features
        self.impute_numeric = impute_numeric

        self._make_steps()

        self.cat_cols_ = []
        self.num_cols_ = []
        self.cat_levels_ = {}
        self.num_medians_ = {}

    @property
    def cap(self):
        """Backward-compatible alias for ``cap_grlivarea``."""
        return self.cap_grlivarea

    @cap.setter
    def cap(self, value):
        self.cap_grlivarea = value

    def _make_steps(self):
        """(Re)create the two sub-transformers from the current parameters."""
        self.special_ = SpecialRulesAndFeatures(
            cap_grlivarea=self.cap_grlivarea, create_features=self.create_features
        )
        self.rare_ = RareCategoryGrouper(rare_thresh=self.rare_thresh)

    @staticmethod
    def _fill_structural_missing(Xw):
        """Replace missing values in the ``MISSING_MEANS_NONE`` columns, in place; returns ``Xw``."""
        for col, level in MISSING_MEANS_NONE.items():
            if col in Xw.columns:
                as_text = Xw[col].astype("string")
                # Literal "nan"/"None" strings are treated like missing values.
                Xw[col] = as_text.fillna(level).replace({"nan": level, "None": level})
        return Xw

    def _detect_cols(self, X):
        """Set ``cat_cols_`` / ``num_cols_`` from ``X`` and return the feature list (all columns except ``id``)."""
        all_features = [c for c in X.columns if c.lower() != "id"]
        likely_cat = [
            "MSSubClass","MSZoning","Street","LotShape","LandContour","Utilities","LotConfig","LandSlope",
            "Neighborhood","Condition1","Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl",
            "Exterior1st","Exterior2nd","ExterQual","ExterCond","Foundation","Heating","HeatingQC",
            "CentralAir","Electrical","KitchenQual","Functional","PavedDrive","SaleType","SaleCondition"
        ]
        obj_cols = X[all_features].select_dtypes(include=["object"]).columns.tolist()
        cat_none_cols = [c for c in MISSING_MEANS_NONE.keys() if c in all_features]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        cat = list(dict.fromkeys([c for c in likely_cat if c in all_features] + obj_cols + cat_none_cols))
        num = [c for c in X[all_features].select_dtypes(include=["number"]).columns if c not in cat]
        self.cat_cols_, self.num_cols_ = cat, num
        return all_features

    def fit(self, X, y=None):
        """Learn column roles, category levels and numeric medians from ``X``; returns ``self``."""
        # Rebuild the steps so parameter changes made via set_params() take effect.
        self._make_steps()

        Xw = self._fill_structural_missing(self.special_.fit_transform(X))
        self._detect_cols(Xw)

        cat_df = Xw[self.cat_cols_].copy()
        cat_df = self.rare_.fit(cat_df).transform(cat_df)

        # 'Other' is always a valid level so unseen values have somewhere to go.
        self.cat_levels_ = {
            c: sorted({str(v) for v in cat_df[c].dropna().astype("string").unique()} | {"Other"})
            for c in cat_df.columns
        }

        # Medians come from the fitting data so that transform() never derives
        # imputation values from the frame being transformed.
        self.num_medians_ = Xw[self.num_cols_].median().dropna().to_dict()
        return self

    def transform(self, X):
        """Return the cleaned frame: fitted numeric columns followed by fitted categorical columns.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        KeyError
            If ``X`` lacks a column that was present when fitting.
        """
        if not self.cat_cols_ and not self.num_cols_:
            raise RuntimeError("XGBCleaner.transform() called before fit().")

        Xw = self._fill_structural_missing(self.special_.transform(X))

        # Use the schema learned in fit(); do not re-detect columns here.
        grouped = self.rare_.transform(Xw[self.cat_cols_])
        cat_out = {}
        for c in self.cat_cols_:
            levels = self.cat_levels_.get(c, ["Other"])
            s = grouped[c].astype("string")
            # Anything outside the fitted levels, including missing values,
            # is mapped to 'Other'.
            cat_out[c] = pd.Categorical(s.where(s.isin(levels), other="Other"), categories=levels)
        cat_df = pd.DataFrame(cat_out, index=Xw.index)

        num_df = Xw[self.num_cols_].copy()
        if self.impute_numeric and self.num_medians_:
            num_df = num_df.fillna(value=self.num_medians_)

        return pd.concat([num_df, cat_df], axis=1)


## Load train/test, clean, and split

In [3]:

# Read the raw competition tables.
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Separate the target from the predictors.
TARGET = "SalePrice"
y = train_df[TARGET].to_numpy()
X_raw = train_df.drop(columns=[TARGET], errors="ignore")

# NOTE: the cleaner is fitted on every training row before the split below,
# so the validation rows also contribute to what it learns.
cleaner = XGBCleaner(
    rare_thresh=RARE_THRESH,
    cap_grlivarea=CAP_GRLIVAREA,
    create_features=True,
    impute_numeric=False,  # XGB can handle NaNs
)
X_clean = cleaner.fit(X_raw, y).transform(X_raw)
X_submit = cleaner.transform(test_df)

# Hold out a validation fold from the cleaned training frame.
X_tr, X_va, y_tr, y_va = train_test_split(
    X_clean, y, test_size=TEST_SIZE, random_state=SEED
)

(X_clean.shape, X_submit.shape, X_tr.shape, X_va.shape)


((1460, 82), (1459, 82), (1168, 82), (292, 82))

## Train & validate XGBoost

In [4]:

# Fit on the training fold, in log1p space when LOG_TARGET is set.
if LOG_TARGET:
    y_tr_fit = np.log1p(y_tr)
else:
    y_tr_fit = y_tr

model = XGBRegressor(**XGB_PARAMS)
model.fit(X_tr, y_tr_fit)

# Predict the hold-out fold; undo the log transform if it was applied.
pred_log = model.predict(X_va)
if LOG_TARGET:
    pred = np.expm1(pred_log)
else:
    pred = pred_log

metrics = dict(
    rmse=rmse(y_va, pred),
    r2=float(r2_score(y_va, pred)),
    log_target=bool(LOG_TARGET),
)
metrics


{'rmse': 25522.873819380136, 'r2': 0.9150730967521667, 'log_target': False}

## Train on full data & create submission

In [5]:

# Refit on all training rows, in log1p space when LOG_TARGET is set.
if LOG_TARGET:
    y_fit_full = np.log1p(y)
else:
    y_fit_full = y

final_model = XGBRegressor(**XGB_PARAMS)
final_model.fit(X_clean, y_fit_full)

# Predict the competition test set; undo the log transform if it was applied.
test_pred_log = final_model.predict(X_submit)
if LOG_TARGET:
    test_pred = np.expm1(test_pred_log)
else:
    test_pred = test_pred_log

# The only file this notebook writes.
submission = test_df[["Id"]].assign(SalePrice=test_pred)
submission.to_csv("submission.csv", index=False)
print("[OK] Saved submission.csv")
submission.head()


[OK] Saved submission.csv


Unnamed: 0,Id,SalePrice
0,1461,122834.570312
1,1462,166305.734375
2,1463,183328.203125
3,1464,193722.140625
4,1465,184686.0
