## 1. Load the data

In [47]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import os
os.chdir(r"C:\Users\welde\Desktop\immo-eliza-ml")  # Change Python's working directory so that all relative paths are resolved from your project root


# load the data set

df = pd.read_csv("data/raw/raw.csv")
df.head()      # 
df.shape       # Check dataset shape
df.info()       # Inspect column data types
df.describe()    #Summary statistics
df.isna().sum()   #missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15232 entries, 0 to 15231
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   build_year          6226 non-null   object 
 1   facades             10088 non-null  float64
 2   garden              15232 non-null  object 
 3   living_area         13504 non-null  object 
 4   locality_name       15012 non-null  object 
 5   number_rooms        13980 non-null  object 
 6   postal_code         15008 non-null  float64
 7   price               14389 non-null  float64
 8   property_id         15232 non-null  object 
 9   property_type       14236 non-null  object 
 10  property_url        15232 non-null  object 
 11  state               11116 non-null  object 
 12  swimming_pool       15232 non-null  object 
 13  terrace             13832 non-null  object 
 14  province            15008 non-null  object 
 15  property_type_name  14232 non-null  object 
 16  stat

build_year            9006
facades               5144
garden                   0
living_area           1728
locality_name          220
number_rooms          1252
postal_code            224
price                  843
property_id              0
property_type          996
property_url             0
state                 4116
swimming_pool            0
terrace               1400
province               224
property_type_name    1000
state_mapped          4120
dtype: int64

## DATA CLEANING FUNCTION

In [48]:
import pandas as pd
import numpy as np
import os

def enhanced_clean(
        path="data/raw/raw.csv",
        save_path="data/processed/cleaned_v2.csv"
    ):
    """
    FINAL unified cleaning pipeline for ImmoEliza.
    - Merges basic cleaning + enhanced cleaning
    - Handles numeric sanity
    - Normalizes booleans
    - Converts numeric-like columns
    - Removes locality_name completely
    """

    # 1. Load raw-cleaned data
    df = pd.read_csv(path, dtype={"postal_code": "string"}).copy()

    # 2. Drop useless columns
    drop_cols = [
        "property_id",
        "property_url",
        "property_type_name",
        "state_mapped",
        "locality_name"      
    ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

    # 3. Normalize boolean-like columns
    bool_cols = ["garden", "terrace", "swimming_pool"]
    for col in bool_cols:
        if col in df.columns:
            df[col] = (
                df[col].astype(str).str.lower().str.strip()
                .replace({
                    "1": "yes", "true": "yes", "yes": "yes",
                    "0": "no", "false": "no", "no": "no"
                })
            )

    # 4. Convert numeric-like columns (comma → dot)
    numeric_cols = ["build_year", "number_rooms", "facades", "living_area"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(",", ".", regex=False)
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Ensure price numeric
    df["price"] = pd.to_numeric(df["price"], errors="coerce")

    # Postal code remains categorical
    df["postal_code"] = df["postal_code"].astype("string")

    # 5. Numeric sanity constraints
    df.loc[df["build_year"] < 1800, "build_year"] = np.nan
    df.loc[df["build_year"] > 2025, "build_year"] = np.nan

    df.loc[df["number_rooms"] <= 0, "number_rooms"] = np.nan
    df.loc[df["number_rooms"] > 12, "number_rooms"] = np.nan

    df.loc[df["living_area"] < 10, "living_area"] = np.nan
    df.loc[df["living_area"] > 500, "living_area"] = np.nan

    df = df[df["price"] >= 10000]
    df.loc[df["price"] > 7_500_000, "price"] = np.nan
    df = df[df["price"].notna()]

    df["province"] = df["province"].astype(str).str.strip().replace("nan", np.nan)

    # 6. LOCALITY REMOVED — nothing here

    # 7. Save
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"Enhanced Cleaned dataset saved to: {save_path}")
    print("Final shape:", df.shape)
    return df


In [49]:
# run
cleaned_df = enhanced_clean(
    path="data/raw/raw.csv",
    save_path="data/processed/cleaned_v2.csv"
)

cleaned_df.head()



Enhanced Cleaned dataset saved to: data/processed/cleaned_v2.csv
Final shape: (14374, 12)


Unnamed: 0,build_year,facades,garden,living_area,number_rooms,postal_code,price,property_type,state,swimming_pool,terrace,province
0,1996.0,2.0,yes,270.0,4.0,1853.0,580000.0,Residence,Excellent,no,yes,Flemish Brabant
1,1991.0,4.0,yes,218.0,5.0,1341.0,695000.0,Residence,Excellent,yes,yes,Walloon Brabant
2,1970.0,4.0,no,135.0,3.0,1300.0,249000.0,Apartment,To be renovated,no,yes,Walloon Brabant
3,1959.0,3.0,yes,176.0,3.0,1853.0,499000.0,Residence,Normal,no,yes,Flemish Brabant
4,2007.0,4.0,yes,200.0,4.0,1341.0,650000.0,Residence,Excellent,no,yes,Walloon Brabant


## feature engineering

In [77]:
import pandas as pd
import numpy as np
import os
from datetime import datetime


def feature_engineering(
        path="data/processed/cleaned_v2.csv",
        save_path="data/processed/feature_engineered.csv"
    ):
    """
    Feature Engineering for ImmoEliza.

    Adds:
    - build_year features (age, decade, age flags)
    - region mapping from province (province is uppercase NL)
    - boolean flags (garden/terrace/swimming_pool)
    """

    df = pd.read_csv(path, dtype={"postal_code": "string"}).copy()

    # Postal code: clean 4-digit string or "unknown"
    df["postal_code"] = (
        df["postal_code"]
        .astype("string")
        .str.strip()
        .str.extract(r"(\d{4})", expand=False)
        .fillna("unknown")
        .astype("string")
    )

    # Build-year features
    if "build_year" in df.columns:
        build_year_num = pd.to_numeric(df["build_year"], errors="coerce")
        current_year = datetime.now().year

        df["house_age"] = current_year - build_year_num
        df.loc[df["house_age"] < 0, "house_age"] = np.nan

        df["is_new_build"] = (df["house_age"] <= 5).astype("Int64")
        df["is_recent"]    = (df["house_age"] <= 20).astype("Int64")
        df["is_old"]       = (df["house_age"] >= 50).astype("Int64")

        df["build_decade"] = (build_year_num // 10 * 10).astype("Int64")
    else:
        df["house_age"] = np.nan
        df["is_new_build"] = np.nan
        df["is_recent"] = np.nan
        df["is_old"] = np.nan
        df["build_decade"] = np.nan

    # Region from province (province is uppercase NL in your pipeline)
    region_map_nl_upper = {
        "ANTWERPEN": "Flanders",
        "OOST-VLAANDEREN": "Flanders",
        "WEST-VLAANDEREN": "Flanders",
        "LIMBURG": "Flanders",
        "VLAAMS-BRABANT": "Flanders",
        "WAALS-BRABANT": "Wallonia",
        "HENEGOUWEN": "Wallonia",
        "LUIK": "Wallonia",
        "LUXEMBURG": "Wallonia",
        "NAMEN": "Wallonia",
        "BRUSSEL": "Brussels",
    }

    if "province" in df.columns:
        df["region"] = (
            df["province"]
            .astype("string")
            .str.strip()
            .str.upper()
            .map(region_map_nl_upper)
            .fillna("unknown")
            .astype("string")
        )
    else:
        df["region"] = "unknown"

    # Boolean flags
    bool_map = {"yes": 1, "no": 0}
    for col in ["garden", "terrace", "swimming_pool"]:
        if col in df.columns:
            df[col + "_flag"] = (
                df[col]
                .astype("string")
                .str.strip()
                .str.lower()
                .map(bool_map)
                .astype("Int64")
            )
        else:
            df[col + "_flag"] = np.nan

    # Save
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"Feature-engineered dataset saved to: {save_path}")
    print("Final shape:", df.shape)
    print("Region counts:\n", df["region"].value_counts(dropna=False))
    print("\nColumns:", df.columns.tolist())

    return df


In [78]:
# RUN IT

df_fe = feature_engineering(
    path="data/processed/cleaned_v2.csv",
    save_path="data/processed/feature_engineered.csv"
)

df_fe[["postal_code", "province", "region"]].head()




Feature-engineered dataset saved to: data/processed/feature_engineered.csv
Final shape: (14374, 21)
Region counts:
 region
Flanders    7040
Wallonia    5925
Brussels    1409
Name: count, dtype: Int64

Columns: ['build_year', 'facades', 'garden', 'living_area', 'number_rooms', 'postal_code', 'price', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'region', 'garden_flag', 'terrace_flag', 'swimming_pool_flag']


Unnamed: 0,postal_code,province,region
0,1853,VLAAMS-BRABANT,Flanders
1,1341,WAALS-BRABANT,Wallonia
2,1300,WAALS-BRABANT,Wallonia
3,1853,VLAAMS-BRABANT,Flanders
4,1341,WAALS-BRABANT,Wallonia


In [54]:
df_fe["postal_code"].value_counts().head(10)


postal_code
5000    356
4000    298
7000    187
1180    174
1140    166
2500    164
1000    149
3970    148
5100    147
4020    146
Name: count, dtype: int64

## PREPROCESS DATA (Pipelines)

In [83]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


def split_data(df: pd.DataFrame, target: str = "price", test_size: float = 0.20, random_state: int = 42):
    # Separate target (y) from features (X), then split into train/test
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)
    return X_train, X_test, y_train, y_test


def remove_outliers_from_train(X_train: pd.DataFrame, y_train: pd.Series):
    # Remove outliers ONLY from training data (avoid leakage into test/validation)
    df_train = X_train.copy()
    df_train["price"] = y_train

    cols_to_filter = [c for c in ["price", "living_area"] if c in df_train.columns]

    def iqr_filter(df_: pd.DataFrame, col: str) -> pd.DataFrame:
        # Standard IQR rule: keep values in [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        q1 = df_[col].quantile(0.25)
        q3 = df_[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        return df_[(df_[col] >= lower) & (df_[col] <= upper)]

    for col in cols_to_filter:
        df_train = iqr_filter(df_train, col)

    # Optional hard cap on room counts (domain sanity rule)
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Training after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


def build_preprocessor(X_train: pd.DataFrame, drop_postal_code: bool = False) -> ColumnTransformer:
    # Detect numeric and categorical columns by dtype
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=["object", "string", "category"]).columns.tolist()

    # Ensure postal_code is treated as categorical (unless we explicitly drop it)
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    # For linear models, dropping high-cardinality postal_code is often more stable
    if drop_postal_code and "postal_code" in categorical_cols:
        categorical_cols.remove("postal_code")

    # Numeric: impute missing values then scale
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # Categorical: impute missing values then one-hot encode
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Combine numeric + categorical transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_cols),
            ("cat", categorical_pipeline, categorical_cols),
        ],
        remainder="drop",
    )

    print("\nNumeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)
    print("drop_postal_code:", drop_postal_code)

    return preprocessor


def run_preprocessing_pipeline(
    path: str = "data/processed/feature_engineered.csv",
    target: str = "price",
    drop_postal_code: bool = False,
    remove_outliers: bool = False,
    test_size: float = 0.20,
    random_state: int = 42,
):
    # Load engineered dataset (postal_code kept as string so it can be categorical)
    df = pd.read_csv(path, dtype={"postal_code": "string"})

    # Train/test split
    X_train, X_test, y_train, y_test = split_data(
        df, target=target, test_size=test_size, random_state=random_state
    )

    # Optional: outlier removal on TRAIN only
    if remove_outliers:
        X_train, y_train = remove_outliers_from_train(X_train, y_train)

    # Build preprocessor using training data only (prevents leakage)
    preprocessor = build_preprocessor(X_train, drop_postal_code=drop_postal_code)

    return X_train, X_test, y_train, y_test, preprocessor


In [84]:
# ------------------------------------------------------------
# RUN THE PIPELINE
# ------------------------------------------------------------
X_train_clean, X_test, y_train_clean, y_test, preprocessor = run_preprocessing_pipeline(
    "data/processed/feature_engineered.csv"
)



Train: (11499, 20)
Test: (2875, 20)

Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'garden_flag', 'terrace_flag', 'swimming_pool_flag']
Categorical columns: ['garden', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']
drop_postal_code: False


## Tuned Linear Models: Ridge, Lasso, ElasticNet

In [81]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import joblib


# -------------------------------------------------------------
# Evaluate a single split
# -------------------------------------------------------------
def evaluate_split(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# Train tuned linear models (Ridge, Lasso, ElasticNet)
# -------------------------------------------------------------
def train_tuned_linear_models(X_train, y_train, preprocessor):

    models = {
        "Ridge": Ridge(),
        "Lasso": Lasso(max_iter=10000),
        "ElasticNet": ElasticNet(max_iter=10000)
    }

    param_grids = {
        "Ridge": {"model__alpha": [0.001, 0.01, 0.1, 1, 10, 100]},
        "Lasso": {"model__alpha": [0.001, 0.01, 0.1, 1, 10]},
        "ElasticNet": {
            "model__alpha": [0.001, 0.01, 0.1, 1],
            "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    }

    best_models = {}

    for name, model in models.items():
        print(f"\nTuning {name}...")

        pipe = Pipeline([
            ("preprocessor", preprocessor),
            ("model", model)
        ])

        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_grids[name],
            n_iter=10,
            scoring="neg_mean_absolute_error",
            cv=3,
            n_jobs=-1,
            random_state=42
        )

        search.fit(X_train, y_train)

        print(f"Best params for {name}: {search.best_params_}")
        best_models[name] = search.best_estimator_

    return best_models


# -------------------------------------------------------------
# Full workflow: Train/Test only (NO validation)
# -------------------------------------------------------------
def run_tuned_linear_models(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
):

    best_models = train_tuned_linear_models(X_train_clean, y_train_clean, preprocessor)

    all_results = []

    for name, model in best_models.items():
        print(f"\n===== {name} Results =====")

        # Train performance
        all_results.append(evaluate_split(
            model, X_train_clean, y_train_clean, f"{name} - Train"
        ))

        # Test performance
        all_results.append(evaluate_split(
            model, X_test, y_test, f"{name} - Test"
        ))

        # Save model
        model_path = f"models/{name.lower()}_tuned.pkl"
        joblib.dump(model, model_path)
        print(f"{name} model saved to {model_path}")

    return pd.DataFrame(all_results)


In [82]:
tuned_linear_results = run_tuned_linear_models(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
)

tuned_linear_results




Tuning Ridge...




Best params for Ridge: {'model__alpha': 10}

Tuning Lasso...




Best params for Lasso: {'model__alpha': 10}

Tuning ElasticNet...
Best params for ElasticNet: {'model__l1_ratio': 0.1, 'model__alpha': 0.001}

===== Ridge Results =====

--- Ridge - Train Evaluation ---
MAE:  92,755.73
RMSE: 180,320.66
R²:   0.5436

--- Ridge - Test Evaluation ---
MAE:  93,414.97
RMSE: 170,091.12
R²:   0.5461
Ridge model saved to models/ridge_tuned.pkl

===== Lasso Results =====

--- Lasso - Train Evaluation ---
MAE:  91,136.79
RMSE: 176,515.45
R²:   0.5627

--- Lasso - Test Evaluation ---
MAE:  93,969.48
RMSE: 170,339.97
R²:   0.5448
Lasso model saved to models/lasso_tuned.pkl

===== ElasticNet Results =====

--- ElasticNet - Train Evaluation ---
MAE:  92,818.28
RMSE: 180,433.99
R²:   0.5430

--- ElasticNet - Test Evaluation ---
MAE:  93,449.67
RMSE: 170,144.60
R²:   0.5458
ElasticNet model saved to models/elasticnet_tuned.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Ridge - Train,92755.72763,180320.661479,0.543601
1,Ridge - Test,93414.965592,170091.124611,0.546088
2,Lasso - Train,91136.786345,176515.452572,0.56266
3,Lasso - Test,93969.476856,170339.972944,0.544758
4,ElasticNet - Train,92818.275128,180433.994784,0.543027
5,ElasticNet - Test,93449.669673,170144.600766,0.545802


## Train LR

In [85]:
import os
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline


# ============================================================
# 1. Train Linear Regression (Pipeline = Preprocessor + LR)
# ============================================================
def train_linear_regression(X_train, y_train, preprocessor):
    """
    Build and train a full pipeline:
    - Preprocessor: ColumnTransformer
    - Linear Regression model
    """

    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", LinearRegression())
    ])

    model.fit(X_train, y_train)

    print("\nLinear Regression model trained successfully.")
    return model



# ============================================================
# 2. Evaluate one split (Train or Test)
# ============================================================
def evaluate_single_split(model, X, y, split_name=""):
    """
    Evaluate the model on a given split.
    """

    preds = model.predict(X)

    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5   # manual RMSE
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return mae, rmse, r2



# ============================================================
# 3. Evaluate Train + Test (NO validation)
# ============================================================
def evaluate_all_splits(model,
                        X_train, y_train,
                        X_test, y_test,
                        model_name="Linear Regression"):

    results = []

    # Train
    tr_mae, tr_rmse, tr_r2 = evaluate_single_split(
        model, X_train, y_train, "Train"
    )

    # Test
    te_mae, te_rmse, te_r2 = evaluate_single_split(
        model, X_test, y_test, "Test"
    )

    # Summary DataFrame
    df_results = pd.DataFrame([
        {"Model": model_name, "Split": "Train", "MAE": tr_mae, "RMSE": tr_rmse, "R2": tr_r2},
        {"Model": model_name, "Split": "Test", "MAE": te_mae, "RMSE": te_rmse, "R2": te_r2},
    ])

    print("\n--- Summary Performance Table ---")
    print(df_results)

    return df_results



# ============================================================
# 4. Save model to disk
# ============================================================
def save_linear_regression(model, path="models/linear_regression.pkl"):
    """
    Save the full model pipeline (Preprocessor + Linear Regression).
    """

    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(model, path)

    print(f"\nModel pipeline saved to: {path}")



# ============================================================
# 5. Full Linear Regression Workflow (Train/Test Only)
# ============================================================
def run_linear_regression(X_train_clean, X_test,
                          y_train_clean, y_test,
                          preprocessor):
    """
    End-to-end workflow:
    - Train LR model
    - Evaluate on train/test (no validation)
    - Save model
    - Return model + performance table
    """

    # Train model
    model = train_linear_regression(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    scores = evaluate_all_splits(
        model,
        X_train_clean, y_train_clean,
        X_test, y_test,
        model_name="Linear Regression"
    )

    # Save
    save_linear_regression(model)

    return model, scores


In [86]:
lr_model, lr_results = run_linear_regression(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
)

lr_results



Linear Regression model trained successfully.

--- Train Evaluation ---
MAE:  90,395.48
RMSE: 175,974.52
R²:   0.5653

--- Test Evaluation ---
MAE:  95,222.92
RMSE: 171,256.19
R²:   0.5398

--- Summary Performance Table ---
               Model  Split           MAE           RMSE        R2
0  Linear Regression  Train  90395.478718  175974.520523  0.565337
1  Linear Regression   Test  95222.922735  171256.188651  0.539848

Model pipeline saved to: models/linear_regression.pkl


Unnamed: 0,Model,Split,MAE,RMSE,R2
0,Linear Regression,Train,90395.478718,175974.520523,0.565337
1,Linear Regression,Test,95222.922735,171256.188651,0.539848


## leakage testing 

In [34]:
import pandas as pd
from scipy.stats import ks_2samp
import numpy as np

def drift_check(X_train_clean, X_test, y_train_clean, y_test):
    print("\n==============================")
    print(" DRIFT ANALYSIS STARTING")
    print("==============================\n")

    # --------------------------------------------------------
    # 1. Reconstruct full train/test DataFrames
    # --------------------------------------------------------
    train = X_train_clean.copy()
    test = X_test.copy()

    train["price"] = y_train_clean.values
    test["price"] = y_test.values

    # --------------------------------------------------------
    # 2. Identify columns
    # --------------------------------------------------------
    numeric_cols = train.select_dtypes(include=["float64","int64","Int64"]).columns.tolist()
    categorical_cols = train.select_dtypes(include=["object","string"]).columns.tolist()

    # Ensure price treated as numeric
    if "price" in categorical_cols:
        categorical_cols.remove("price")
        numeric_cols.append("price")

    print("Numeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)

    # --------------------------------------------------------
    # 3. Category Coverage Checks
    # --------------------------------------------------------
    def category_coverage(feature):
        train_set = set(train[feature].dropna())
        test_set = set(test[feature].dropna())

        missing_in_train = test_set - train_set
        missing_in_test = train_set - test_set

        print(f"\n=== CATEGORY COVERAGE: {feature} ===")
        print(f"Unique in train: {len(train_set)}")
        print(f"Unique in test:  {len(test_set)}")
        print(f"Present in TEST but NOT in train (unseen categories): {len(missing_in_train)}")
        if len(missing_in_train) > 0:
            print("Examples:", list(missing_in_train)[:15])
        print(f"Present in TRAIN but NOT in test: {len(missing_in_test)}")

    print("\n========== CHECKING CATEGORY COVERAGE ==========\n")
    for col in categorical_cols:
        category_coverage(col)

    # --------------------------------------------------------
    # 4. KS Tests for Numerical Drift
    # --------------------------------------------------------
    def ks_test(feature):
        a = train[feature].dropna()
        b = test[feature].dropna()

        stat, p = ks_2samp(a, b)

        print(f"\n=== KS TEST: {feature} ===")
        print(f"KS statistic: {stat:.4f}")
        print(f"P-value     : {p:.6f}")

        if p < 0.05:
            print(" -> SIGNIFICANT DRIFT DETECTED")
        else:
            print(" -> No significant drift")

    print("\n========== KS-TEST FOR NUMERICAL FEATURES ==========\n")

    # You normally do NOT test price for KS, but here we include it to see target drift
    for col in numeric_cols:
        ks_test(col)

    # --------------------------------------------------------
    # 5. Target Drift Summary
    # --------------------------------------------------------
    print("\n========== TARGET DRIFT (PRICE DISTRIBUTION) ==========\n")
    print("Train price stats:")
    print(train["price"].describe())

    print("\nTest price stats:")
    print(test["price"].describe())

    # --------------------------------------------------------
    # 6. Interaction Drift: Locality-level mean price
    # --------------------------------------------------------
    if "locality_name" in train.columns:
        print("\n========== LOCALITY-LEVEL PRICE DRIFT ==========\n")

        train_loc_means = train.groupby("locality_name")["price"].mean()
        test_loc_means = test.groupby("locality_name")["price"].mean()

        print("\nTrain locality price mean distribution:")
        print(train_loc_means.describe())

        print("\nTest locality price mean distribution:")
        print(test_loc_means.describe())

        # common localities only
        common = list(set(train_loc_means.index) & set(test_loc_means.index))

        paired_train = train_loc_means.loc[common]
        paired_test = test_loc_means.loc[common]

        stat, p = ks_2samp(paired_train, paired_test)

        print("\nLocality-level KS test (common localities):")
        print(f"KS statistic: {stat:.4f}")
        print(f"P-value     : {p:.6f}")

    print("\n==============================")
    print(" DRIFT ANALYSIS COMPLETE")
    print("==============================\n")


In [35]:
X_train_clean, X_test, y_train_clean, y_test, preprocessor = run_preprocessing_pipeline(
    path="data/processed/cleaned_v2.csv"
)

drift_check(X_train_clean, X_test, y_train_clean, y_test)


Train: (11499, 23)
Test: (2875, 23)
Train after outlier removal: (9169, 23)

 DRIFT ANALYSIS STARTING

Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'garden_flag', 'terrace_flag', 'swimming_pool_flag', 'postal_prefix', 'postal_code_num', 'price']
Categorical columns: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']



=== CATEGORY COVERAGE: garden ===
Unique in train: 2
Unique in test:  2
Present in TEST but NOT in train (unseen categories): 0
Present in TRAIN but NOT in test: 0

=== CATEGORY COVERAGE: locality_name ===
Unique in train: 51
Unique in test:  51
Present in TEST but NOT in train (unseen categories): 0
Present in TRAIN but NOT in test: 0

=== CATEGORY COVERAGE: postal_code ===
Unique in train: 779
Unique in test:  573
Present in TEST but NOT in train (unseen categories): 41
Examples: ['9112.0', '6010.0', '8820.

## Random forest

In [87]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# -------------------------------------------------------------
# 1. Train/Test Split (80/20)
# -------------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    return X_train, X_test, y_train, y_test


# -------------------------------------------------------------
# 2. Outlier removal (train only)
# -------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    # IQR filter on price and living_area
    def iqr_filter(df, col):
        if col not in df.columns:
            return df
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return df[(df[col] >= lower) & (df[col] <= upper)]

    for col in ["price", "living_area"]:
        df_train = iqr_filter(df_train, col)

    # Rooms sanity filter
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


# -------------------------------------------------------------
# 3. Preprocessor
# -------------------------------------------------------------
def build_preprocessor(X):
    numeric_cols = X.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must be categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    print("Numeric:", numeric_cols)
    print("Categorical:", categorical_cols)

    return preprocessor


# -------------------------------------------------------------
# 4. Evaluate helper
# -------------------------------------------------------------
def evaluate_rf(model, X, y, split_name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# 5. Train RF
# -------------------------------------------------------------
def train_random_forest(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            random_state=42,
            n_jobs=-1
        ))
    ])

    model.fit(X_train, y_train)
    print("\nRandom Forest model trained successfully.")
    return model


# -------------------------------------------------------------
# 6. Full RF workflow (Train/Test only)
# -------------------------------------------------------------
def run_random_forest_no_val(df):

    # Split DF
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Outliers from train only
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    # Preprocessor
    preprocessor = build_preprocessor(X_train_clean)

    # Train model
    model = train_random_forest(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_rf(model, X_train_clean, y_train_clean, "Train"))
    results.append(evaluate_rf(model, X_test, y_test, "Test"))

    # Save
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_no_val.pkl")
    print("\nModel saved to models/random_forest_no_val.pkl")

    return model, pd.DataFrame(results)


In [88]:
#Run
df = pd.read_csv("data/processed/feature_engineered.csv", dtype={"postal_code": "string"})

rf_model, rf_results = run_random_forest_no_val(df)
rf_results



Train: (11499, 20)
Test: (2875, 20)
Train after outlier removal: (9169, 20)
Numeric: ['build_year', 'facades', 'living_area', 'number_rooms', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'garden_flag', 'terrace_flag', 'swimming_pool_flag']
Categorical: ['garden', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']

Random Forest model trained successfully.

--- Train Evaluation ---
MAE:  17,538.70
RMSE: 25,588.66
R²:   0.9587

--- Test Evaluation ---
MAE:  92,433.16
RMSE: 210,496.94
R²:   0.3048

Model saved to models/random_forest_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,17538.695779,25588.659985,0.95868
1,Test,92433.157509,210496.944683,0.304815


### Function to train a Random Forest with preprocessing

In [89]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# -------------------------------------------------------------
# 1. Train/Test Split (80/20)
# -------------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    return X_train, X_test, y_train, y_test


# -------------------------------------------------------------
# 2. Outlier removal (train only)
# -------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    # IQR filter on price and living_area
    def iqr_filter(df, col):
        if col not in df.columns:
            return df
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return df[(df[col] >= lower) & (df[col] <= upper)]

    for col in ["price", "living_area"]:
        df_train = iqr_filter(df_train, col)

    # Rooms sanity filter
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


# -------------------------------------------------------------
# 3. Preprocessor
# -------------------------------------------------------------
def build_preprocessor(X):
    numeric_cols = X.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must be categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    print("Numeric:", numeric_cols)
    print("Categorical:", categorical_cols)

    return preprocessor


# -------------------------------------------------------------
# 4. Evaluate helper
# -------------------------------------------------------------
def evaluate_rf(model, X, y, split_name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# 5. Train RF
# -------------------------------------------------------------
def train_random_forest(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            random_state=42,
            n_jobs=-1
        ))
    ])

    model.fit(X_train, y_train)
    print("\nRandom Forest model trained successfully.")
    return model


# -------------------------------------------------------------
# 6. Full RF workflow (Train/Test only)
# -------------------------------------------------------------
def run_random_forest_no_val(df):

    # Split DF
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Outliers from train only
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    # Preprocessor
    preprocessor = build_preprocessor(X_train_clean)

    # Train model
    model = train_random_forest(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_rf(model, X_train_clean, y_train_clean, "Train"))
    results.append(evaluate_rf(model, X_test, y_test, "Test"))

    # Save
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_no_val.pkl")
    print("\nModel saved to models/random_forest_no_val.pkl")

    return model, pd.DataFrame(results)


### RUN random forest

In [90]:
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

rf_model, rf_results = run_random_forest_no_val(df)
rf_results


Train: (11499, 11)
Test: (2875, 11)
Train after outlier removal: (9169, 11)
Numeric: ['build_year', 'facades', 'living_area', 'number_rooms']
Categorical: ['garden', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province']

Random Forest model trained successfully.

--- Train Evaluation ---
MAE:  17,368.61
RMSE: 25,312.89
R²:   0.9596

--- Test Evaluation ---
MAE:  90,827.02
RMSE: 208,240.82
R²:   0.3196

Model saved to models/random_forest_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,17368.608609,25312.891212,0.959566
1,Test,90827.018322,208240.824147,0.319637


## Tuned Random forest

In [96]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os


#----------------------------------------------------------
# 1) Quick train/test split (80/20)
#----------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test :", X_test.shape)

    return X_train, X_test, y_train, y_test


#----------------------------------------------------------
# 2) Outlier removal (train only)
#----------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    if "living_area" in df_train.columns:
        Q1, Q3 = df_train["living_area"].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_train = df_train[(df_train["living_area"] >= lower) & (df_train["living_area"] <= upper)]

    df_train = df_train[df_train["price"] >= 10000]

    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y = df_train["price"]
    X = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X.shape)
    return X, y


#----------------------------------------------------------
# 3) Safe preprocessor
#----------------------------------------------------------
def build_preprocessor(X_train):

    numeric_cols = X_train.select_dtypes(include=["float64","int64","Int64"]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=["object","string"]).columns.tolist()

    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    # Limit one-hot explosion
    encoder = OneHotEncoder(handle_unknown="ignore", min_frequency=50)

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoder)
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    return preprocessor


#----------------------------------------------------------
# 4) Safe evaluation
#----------------------------------------------------------
def evaluate_split(model, X, y, split_name):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


#----------------------------------------------------------
# 5) Random Forest tuning (safe)
#----------------------------------------------------------
def train_random_forest_tuned(X_train, y_train, preprocessor):

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42))
    ])

    # SAFE small grid (won’t crash)
    param_grid = {
        "model__n_estimators": [200, 300],
        "model__max_depth": [10, 20, 30],
        "model__max_features": [0.3, 0.5, "sqrt"],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2]
    }

    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter=10,          # small search
        scoring="neg_mean_absolute_error",
        cv=3,
        n_jobs=1,           # IMPORTANT: prevents kernel crash
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)

    print("\nBest params:", search.best_params_)
    return search.best_estimator_


#----------------------------------------------------------
# 6) Full workflow (No validation)
#----------------------------------------------------------
def run_random_forest_tuned_no_val(df):

    X_train, X_test, y_train, y_test = split_train_test(df)

    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    preprocessor = build_preprocessor(X_train_clean)

    model = train_random_forest_tuned(X_train_clean, y_train_clean, preprocessor)

    train_res = evaluate_split(model, X_train_clean, y_train_clean, "Train")
    test_res = evaluate_split(model, X_test, y_test, "Test")

    results = pd.DataFrame([train_res, test_res])

    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_tuned_no_val.pkl")

    print("\nModel saved to models/random_forest_tuned_no_val.pkl")

    return model, results


## Run random forest tuned

In [97]:
df = pd.read_csv("data/processed/feature_engineered.csv", dtype={"postal_code": "string"})

rf_tuned_model, rf_tuned_results = run_random_forest_tuned_no_val(df)
rf_tuned_results




Train: (11499, 24)
Test : (2875, 24)
Train after outlier removal: (9675, 24)
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best params: {'model__n_estimators': 200, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 0.3, 'model__max_depth': 30}

--- Train ---
MAE:  30,099.19
RMSE: 53,884.69
R2:   0.9153

--- Test ---
MAE:  81,187.41
RMSE: 180,581.62
R2:   0.4884

Model saved to models/random_forest_tuned_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,30099.191414,53884.68512,0.91534
1,Test,81187.409103,180581.623286,0.48837


## XGBoost

In [91]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# ============================================================
# 1. BUILD PREPROCESSOR
# ============================================================
def build_preprocessor(df):
    numeric_cols = df.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must remain categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])


# ============================================================
# 2. TRAIN XGBOOST (NO VALIDATION SET)
# ============================================================
def train_xgboost_no_val(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            random_state=42
        ))
    ])

    model.fit(X_train, y_train)
    print("XGBoost model trained successfully.")
    return model


# ============================================================
# 3. EVALUATE MODEL
# ============================================================
def evaluate(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}


# ============================================================
# 4. FULL WORKFLOW (80/20 TRAIN-TEST)
# ============================================================
def run_xgboost_no_val(df):

    # --- enforce correct dtype ---
    df["postal_code"] = df["postal_code"].astype("string")

    # --- split 80/20 ---
    X = df.drop(columns=["price"])
    y = df["price"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    # --- build preprocessor ---
    preprocessor = build_preprocessor(X_train)

    # --- train model ---
    model = train_xgboost_no_val(X_train, y_train, preprocessor)

    # --- evaluate ---
    results = []
    results.append(evaluate(model, X_train, y_train, "Train"))
    results.append(evaluate(model, X_test, y_test, "Test"))

    results_df = pd.DataFrame(results)

    # --- save model ---
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/xgboost_no_val.pkl")

    print("\nXGBoost (No Validation) saved to models/xgboost_no_val.pkl")

    return model, results_df


In [92]:
df = pd.read_csv("data/processed/feature_engineered.csv", dtype={"postal_code": "string"})

xgb_model, xgb_results = run_xgboost_no_val(df)
xgb_results


Train: (11499, 20)
Test: (2875, 20)
XGBoost model trained successfully.

--- Train Evaluation ---
MAE:  60,808.20
RMSE: 94,950.97
R²:   0.8735

--- Test Evaluation ---
MAE:  76,492.30
RMSE: 143,854.95
R²:   0.6753

XGBoost (No Validation) saved to models/xgboost_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,60808.203236,94950.972736,0.873453
1,Test,76492.303783,143854.949868,0.675318


### Run XGBOOST

## Tuned XGBoost Model

In [93]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ---------------------------------------------------------
# 1. Split into 80% train / 20% test
# ---------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)
    return X_train, X_test, y_train, y_test



# ---------------------------------------------------------
# 2. Evaluate helper
# ---------------------------------------------------------
def evaluate_split(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}



# ---------------------------------------------------------
# 3. Train XGBoost with Hyperparameter Tuning (NO VAL)
# ---------------------------------------------------------
def train_xgboost_tuned_no_val(X_train, y_train, preprocessor):

    # Define default model
    base_xgb = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        booster="gbtree",
        eval_metric="rmse",
        n_estimators=400,
        learning_rate=0.05
    )

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", base_xgb)
    ])

    # Parameter grid (lightweight for Windows)
    param_dist = {
    "model__n_estimators": [200, 300],
    "model__learning_rate": [0.03, 0.07],
    "model__max_depth": [3, 4],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8]
    }

    print("\nStarting RandomizedSearchCV for XGBoost...")
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=10,
        scoring="neg_mean_absolute_error",
        cv=3,
        n_jobs=1,             # IMPORTANT: fixes BrokenProcessPool
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)

    print("\nBest parameters found:")
    print(search.best_params_)

    return search.best_estimator_



# ---------------------------------------------------------
# 4. Full workflow (train + test)
# ---------------------------------------------------------
def run_xgboost_tuned_no_val(df, preprocessor):

    # Split
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Train tuned model
    model = train_xgboost_tuned_no_val(X_train, y_train, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_split(model, X_train, y_train, "Train"))
    results.append(evaluate_split(model, X_test, y_test, "Test"))

    # Save
    joblib.dump(model, "models/xgboost_tuned_no_val.pkl")
    print("\nXGBoost (Tuned, No Validation) saved to models/xgboost_tuned_no_val.pkl")

    return model, pd.DataFrame(results)


In [94]:
df = pd.read_csv("data/processed/feature_engineered.csv", dtype={"postal_code": "string"})

# Build preprocessor correctly
preprocessor = build_preprocessor(df.drop(columns=["price"]))

# Run tuned XGBoost
xgb_model, xgb_results = run_xgboost_tuned_no_val(df, preprocessor)

xgb_results



Train: (11499, 20)
Test: (2875, 20)

Starting RandomizedSearchCV for XGBoost...
Fitting 3 folds for each of 8 candidates, totalling 24 fits





Best parameters found:
{'model__subsample': 0.8, 'model__n_estimators': 300, 'model__max_depth': 4, 'model__learning_rate': 0.07, 'model__colsample_bytree': 0.8}

--- Train Evaluation ---
MAE:  73,800.37
RMSE: 121,079.57
R²:   0.7942

--- Test Evaluation ---
MAE:  82,378.67
RMSE: 151,429.23
R²:   0.6402

XGBoost (Tuned, No Validation) saved to models/xgboost_tuned_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,73800.365681,121079.568821,0.794224
1,Test,82378.667252,151429.229191,0.640227


## LOG-TRANSFORMED XGBOOST TRAINING PIPELINE

In [95]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import joblib
import os

def train_xgboost_log_target(
        df,
        target="price",
        save_path="models/xgboost_log_model.pkl"
    ):
    """
    XGBoost Pipeline with Log-Transformed Target.
    
    Steps:
    - removes missing price
    - train/test split
    - log1p transform on target
    - OneHotEncoder (categorical) + passthrough (numerical)
    - XGBoost model
    - exponentiate predictions back to EUR
    """

    # ------------------------------------------
    # 1. Remove missing target
    # ------------------------------------------
    df = df[df[target].notna()].copy()

    # ------------------------------------------
    # 2. Split dataset
    # ------------------------------------------
    X = df.drop(columns=[target])
    y = np.log1p(df[target])   # LOG transform

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # ------------------------------------------
    # 3. Column types
    # ------------------------------------------
    cat_cols = X_train.select_dtypes(include=["object", "string"]).columns.tolist()
    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()

    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ])

    # ------------------------------------------
    # 4. XGBoost Model
    # ------------------------------------------
    model = XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
        tree_method="hist"
    )

    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])

    # ------------------------------------------
    # 5. Fit Model
    # ------------------------------------------
    pipe.fit(X_train, y_train)

    # ------------------------------------------
    # 6. Predict (exponentiate back to EUR)
    # ------------------------------------------
    train_preds = np.expm1(pipe.predict(X_train))
    test_preds  = np.expm1(pipe.predict(X_test))

    y_train_real = np.expm1(y_train)
    y_test_real  = np.expm1(y_test)

    # ------------------------------------------
    # 7. Metrics
    # ------------------------------------------
    results = {
        "Train MAE": mean_absolute_error(y_train_real, train_preds),
        "Train RMSE": np.sqrt(mean_squared_error(y_train_real, train_preds)),
        "Train R2": r2_score(y_train_real, train_preds),

        "Test MAE": mean_absolute_error(y_test_real, test_preds),
        "Test RMSE": np.sqrt(mean_squared_error(y_test_real, test_preds)),
        "Test R2": r2_score(y_test_real, test_preds)
    }

    # ------------------------------------------
    # 8. Save model
    # ------------------------------------------
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    joblib.dump(pipe, save_path)

    print("Model saved to:", save_path)
    print("\n===== XGBoost (Log Target) Results =====")
    for k, v in results.items():
        print(f"{k}: {v:,.2f}")

    return pipe, results


In [96]:
df = pd.read_csv("data/processed/feature_engineered.csv", dtype={"postal_code": "string"})

model, metrics = train_xgboost_log_target(df)
metrics


Model saved to: models/xgboost_log_model.pkl

===== XGBoost (Log Target) Results =====
Train MAE: 71,153.50
Train RMSE: 145,360.82
Train R2: 0.70
Test MAE: 75,870.23
Test RMSE: 149,351.96
Test R2: 0.65


{'Train MAE': 71153.50031694332,
 'Train RMSE': np.float64(145360.82230481753),
 'Train R2': 0.7034156400423359,
 'Test MAE': 75870.22748641304,
 'Test RMSE': np.float64(149351.96133388623),
 'Test R2': 0.6500301466738249}

## cross validation

In [77]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor


# ------------------------------------------------------------
# BUILD PREPROCESSOR
# ------------------------------------------------------------
def build_preprocessor(df):
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "string"]).columns.tolist()

    # Make sure postal_code is categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    preprocessor = ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_cols),

        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols)
    ])

    return preprocessor


# ------------------------------------------------------------
# CROSS VALIDATION FUNCTION
# ------------------------------------------------------------
def run_cross_validation(df, target="price"):
    # Split features/target
    y = df[target]
    X = df.drop(columns=[target])

    # Preprocessor
    preprocessor = build_preprocessor(X)

    # Model pipeline
    model = Pipeline([
        ("preprocess", preprocessor),
        ("xgb", XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            random_state=42
        ))
    ])

    # Metrics
    scorers = {
        "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
        "RMSE": make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)),
                            greater_is_better=False),
        "R2": make_scorer(r2_score)
    }

    # CV split
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate
    results = cross_validate(
        model,
        X, y,
        scoring=scorers,
        cv=cv,
        n_jobs=-1,
        return_train_score=False
    )

    print("\nCROSS VALIDATION RESULTS (5-FOLD)")
    print("----------------------------------")
    print("MAE :", -results["test_MAE"].mean())
    print("RMSE:", -results["test_RMSE"].mean())
    print("R²  :", results["test_R2"].mean())

    return results


In [78]:


# Load your cleaned dataset
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

# Run CV
cv_results = run_cross_validation(df)



CROSS VALIDATION RESULTS (5-FOLD)
----------------------------------
MAE : 75769.87637568748
RMSE: 152483.7902251448
R²  : 0.6661841873221889


## XGboost

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib


# ------------------------------------------------------------
# Train-only outlier removal
# ------------------------------------------------------------
def remove_train_outliers(X_train, y_train, columns=None, multiplier=1.5):
    """
    Removes outliers from TRAIN ONLY (IQR filtering).
    Avoids test leakage.
    """
    df = X_train.copy()
    df["price"] = y_train

    if columns is None:
        columns = ["price", "living_area", "number_rooms"]

    for col in columns:
        if col not in df.columns:
            continue

        series = df[col].dropna()
        if len(series) < 100:
            continue

        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - multiplier * IQR
        upper = Q3 + multiplier * IQR

        df = df[(df[col] >= lower) & (df[col] <= upper)]

    y_clean = df["price"]
    X_clean = df.drop(columns=["price"])
    return X_clean, y_clean


# ------------------------------------------------------------
# Group-based split
# ------------------------------------------------------------
def group_split(df, group_col="locality_name", test_size=0.20):
    splitter = GroupShuffleSplit(
        test_size=test_size,
        n_splits=1,
        random_state=42
    )
    groups = df[group_col]
    train_idx, test_idx = next(splitter.split(df, groups=groups))

    train = df.iloc[train_idx].copy()
    test = df.iloc[test_idx].copy()

    print("Train:", train.shape)
    print("Test :", test.shape)
    return train, test


# ------------------------------------------------------------
# FINAL XGBoost Training Function
# ------------------------------------------------------------
def train_xgboost(df, target="price", save_path="models/xgboost_geo_tuned.pkl"):

    # 1. Split (geospatial)
    train_df, test_df = group_split(df, "locality_name")

    # 2. Log target
    y_train = np.log1p(train_df[target])
    y_test = np.log1p(test_df[target])

    X_train = train_df.drop(columns=[target])
    X_test = test_df.drop(columns=[target])

    # 3. TRAIN-ONLY outlier removal
    X_train, y_train = remove_train_outliers(
        X_train, y_train,
        columns=["price", "living_area", "number_rooms"]
    )
    print("Train after outlier removal:", X_train.shape)

    # 4. Identify column types
    cat_cols = X_train.select_dtypes(include=["object", "string"]).columns.tolist()
    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()

    print("\nCategorical:", cat_cols)
    print("Numeric:", num_cols)

    # 5. Preprocessing
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse=True), cat_cols),
        ("num", "passthrough", num_cols)
    ])

    # 6. XGBoost model
    model = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        random_state=42
    )

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    param_dist = {
        "model__max_depth": [4, 6, 8],
        "model__learning_rate": [0.03, 0.05, 0.1],
        "model__n_estimators": [300, 600, 900],
        "model__subsample": [0.7, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0],
        "model__min_child_weight": [1, 5, 10],
        "model__gamma": [0, 1, 5],
        "model__reg_lambda": [1, 3, 5]
    }

    print("\nTuning XGBoost hyperparameters...")
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=20,
        cv=3,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest hyperparameters:")
    print(search.best_params_)

    # 7. Evaluate
    preds_train = np.expm1(best_model.predict(X_train))
    preds_test = np.expm1(best_model.predict(X_test))

    mae_train = mean_absolute_error(train_df[target], preds_train)
    rmse_train = np.sqrt(mean_squared_error(train_df[target], preds_train))
    r2_train = r2_score(train_df[target], preds_train)

    mae_test = mean_absolute_error(test_df[target], preds_test)
    rmse_test = np.sqrt(mean_squared_error(test_df[target], preds_test))
    r2_test = r2_score(test_df[target], preds_test)

    print("\n===== FINAL XGBOOST RESULTS =====")
    print("\n--- Train ---")
    print(f"MAE:  {mae_train:,.2f}")
    print(f"RMSE: {rmse_train:,.2f}")
    print(f"R²:   {r2_train:.4f}")

    print("\n--- Test ---")
    print(f"MAE:  {mae_test:,.2f}")
    print(f"RMSE: {rmse_test:,.2f}")
    print(f"R²:   {r2_test:.4f}")

    # 8. Save
    joblib.dump(best_model, save_path)
    print(f"\nSaved model to: {save_path}")

    return best_model


In [36]:
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

xgb_model, xgb_results = run_xgboost_no_val(df)
xgb_results


Train: (11511, 18)
Test: (2878, 18)

Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'swimming_pool', 'terrace', 'has_garden']
Categorical columns: ['locality_name', 'postal_code', 'property_id', 'property_type', 'property_url', 'state', 'province', 'property_type_name', 'state_mapped', 'region']
XGBoost model trained successfully.

--- Train Evaluation ---
MAE:  60,358.44
RMSE: 86,530.08
R²:   0.9030

--- Test Evaluation ---
MAE:  77,210.48
RMSE: 139,893.22
R²:   0.6928

XGBoost (No Validation) saved to models/xgboost_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,60358.439647,86530.081949,0.903011
1,Test,77210.479633,139893.2156,0.692796
