## CV

In [6]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


# ---------- Custom transformer: outlier trimming learned on each training fold ----------
class IQRTrimmer(BaseEstimator, TransformerMixin):
    """
    Trims rows based on IQR bounds learned from X (and optionally y).
    Applied only during fit_transform; during transform it filters rows in X,
    and if y is passed via fit(X, y) it will also align y.
    """
    def __init__(self, cols=("living_area",), y_name="price", y_iqr=True, room_col="number_rooms", room_max=12):
        self.cols = cols
        self.y_name = y_name
        self.y_iqr = y_iqr
        self.room_col = room_col
        self.room_max = room_max

    def fit(self, X, y=None):
        X_ = X.copy()
        self.bounds_ = {}

        for c in self.cols:
            if c in X_.columns:
                q1 = X_[c].quantile(0.25)
                q3 = X_[c].quantile(0.75)
                iqr = q3 - q1
                self.bounds_[c] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        if self.y_iqr and y is not None:
            y_series = pd.Series(y)
            q1 = y_series.quantile(0.25)
            q3 = y_series.quantile(0.75)
            iqr = q3 - q1
            self.y_bounds_ = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        else:
            self.y_bounds_ = None

        return self

    def transform(self, X):
        # During inference, do NOT drop rows. Return X unchanged.
        # (Dropping at inference can surprise downstream consumers.)
        return X

    def fit_transform(self, X, y=None, **fit_params):
        X_ = X.copy()
        mask = pd.Series(True, index=X_.index)

        for c, (lo, hi) in getattr(self, "bounds_", {}).items():
            mask &= X_[c].between(lo, hi) | X_[c].isna()

        if self.room_col in X_.columns:
            mask &= (X_[self.room_col].fillna(0) <= self.room_max)

        if getattr(self, "y_bounds_", None) is not None and y is not None:
            y_series = pd.Series(y, index=X_.index)
            lo, hi = self.y_bounds_
            mask &= y_series.between(lo, hi)

            return X_.loc[mask].copy(), y_series.loc[mask].to_numpy()

        return X_.loc[mask].copy()


# ---------- Preprocessor ----------
def build_preprocessor(X):
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "string", "category", "bool"]).columns.tolist()

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=20))  # reduces overfitting for rare categories
    ])

    return ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ],
        remainder="drop"
    )


# ---------- CV runner ----------
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

scoring = {
    "mae": make_scorer(mean_absolute_error, greater_is_better=False),
    "rmse": make_scorer(rmse, greater_is_better=False),
    "r2": "r2"
}


def run_cv(df, target="price", test_size=0.2, random_state=42, n_splits=5):
    y = df[target].to_numpy()
    X = df.drop(columns=[target])

    # final untouched test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    preprocessor = build_preprocessor(X_train)

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(
            n_estimators=600,
            min_samples_leaf=5,      # combats your observed overfitting
            min_samples_split=10,
            n_jobs=-1,
            random_state=random_state
        ),
        "XGBoost": XGBRegressor(
            objective="reg:squarederror",
            n_estimators=2000,
            learning_rate=0.03,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=2.0,
            reg_alpha=0.0,
            random_state=random_state,
            n_jobs=-1,
            tree_method="hist"
        ),
    }

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rows = []
    fitted_models = {}

    for name, est in models.items():
        pipe = Pipeline([
            ("trim", IQRTrimmer(cols=("living_area",), y_iqr=True)),
            ("preprocess", preprocessor),
            ("model", est)
        ])

        cv_res = cross_validate(
            pipe, X_train, y_train,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            return_train_score=True
        )

        # summarize (note: mae/rmse are negative because of sklearn convention)
        rows.append({
            "Model": name,
            "CV_MAE_mean": -cv_res["test_mae"].mean(),
            "CV_MAE_std":  cv_res["test_mae"].std(),
            "CV_RMSE_mean": -cv_res["test_rmse"].mean(),
            "CV_RMSE_std":  cv_res["test_rmse"].std(),
            "CV_R2_mean": cv_res["test_r2"].mean(),
            "CV_R2_std":  cv_res["test_r2"].std(),
            "Overfit_R2_gap(mean)": (cv_res["train_r2"].mean() - cv_res["test_r2"].mean())
        })

        # fit on full train set for final test evaluation
        pipe.fit(X_train, y_train)
        fitted_models[name] = pipe

    summary = pd.DataFrame(rows).sort_values(by="CV_RMSE_mean")
    return summary, fitted_models, (X_test, y_test)


# Usage:
# summary, fitted, (X_test, y_test) = run_cv(df)
# print(summary)
# best = min(fitted, key=lambda k: summary.set_index("Model").loc[k, "CV_RMSE_mean"])
# y_pred = fitted[best].predict(X_test)


In [7]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path(r"C:\Users\welde\Desktop\immo-eliza-ml")
csv_path = PROJECT_ROOT / "data" / "processed" / "cleaned_v2.csv"

df = pd.read_csv(csv_path)

# safety: ensure target exists
assert "price" in df.columns, "Expected a 'price' column in df"
print(df.shape)

(14374, 12)


In [8]:
summary, fitted_models, (X_test, y_test) = run_cv(df, target="price", n_splits=5)

# Pretty display
summary_display = summary.copy()
for c in ["CV_MAE_mean", "CV_MAE_std", "CV_RMSE_mean", "CV_RMSE_std"]:
    summary_display[c] = summary_display[c].round(2)
for c in ["CV_R2_mean", "CV_R2_std", "Overfit_R2_gap(mean)"]:
    summary_display[c] = summary_display[c].round(4)

print(summary_display)


              Model  CV_MAE_mean  CV_MAE_std  CV_RMSE_mean  CV_RMSE_std  \
2           XGBoost     75022.55     3164.69     146751.00     14218.64   
1      RandomForest     79590.46     2997.51     164603.72     15679.82   
0  LinearRegression    109072.60     3607.96     199464.60     21399.84   

   CV_R2_mean  CV_R2_std  Overfit_R2_gap(mean)  
2      0.6960     0.0166                0.2329  
1      0.6175     0.0238                0.1466  
0      0.4389     0.0447                0.0017  
