## 1. Load the data

In [109]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import os
os.chdir(r"C:\Users\welde\Desktop\immo-eliza-ml")  # Change Python's working directory so that all relative paths are resolved from your project root


# load the data set

df = pd.read_csv("data/raw/raw.csv")
df.head()      # 
df.shape       # Check dataset shape
df.info()       # Inspect column data types
df.describe()    #Summary statistics
df.isna().sum()   #missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15232 entries, 0 to 15231
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   build_year          6226 non-null   object 
 1   facades             10088 non-null  float64
 2   garden              15232 non-null  object 
 3   living_area         13504 non-null  object 
 4   locality_name       15012 non-null  object 
 5   number_rooms        13980 non-null  object 
 6   postal_code         15008 non-null  float64
 7   price               14389 non-null  float64
 8   property_id         15232 non-null  object 
 9   property_type       14236 non-null  object 
 10  property_url        15232 non-null  object 
 11  state               11116 non-null  object 
 12  swimming_pool       15232 non-null  object 
 13  terrace             13832 non-null  object 
 14  province            15008 non-null  object 
 15  property_type_name  14232 non-null  object 
 16  stat

build_year            9006
facades               5144
garden                   0
living_area           1728
locality_name          220
number_rooms          1252
postal_code            224
price                  843
property_id              0
property_type          996
property_url             0
state                 4116
swimming_pool            0
terrace               1400
province               224
property_type_name    1000
state_mapped          4120
dtype: int64

## DATA CLEANING FUNCTION

In [110]:
import pandas as pd
import numpy as np
import os

def enhanced_cleaning(
        path="data/raw/raw.csv",
        save_path="data/processed/cleaned_v2.csv"
    ):
    """
    FINAL unified cleaning pipeline for ImmoEliza.
    - Merges basic cleaning + enhanced cleaning
    - Handles numeric sanity, boolean normalization
    - Reduces locality noise (top 200)
    - Prepares data for feature engineering
    """

    # 1. Load raw-cleaned data
    df = pd.read_csv(path, dtype={"postal_code": "string"}).copy()

    # 2. Drop useless columns
    drop_cols = [
        "property_id",
        "property_url",
        "property_type_name",
        "state_mapped"
    ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

    # 3. Normalize boolean-like columns
    bool_cols = ["garden", "terrace", "swimming_pool"]
    for col in bool_cols:
        if col in df.columns:
            df[col] = (
                df[col].astype(str).str.lower().str.strip()
                .replace({
                    "1": "yes", "true": "yes", "yes": "yes",
                    "0": "no", "false": "no", "no": "no"
                })
            )

    # 4. Convert numeric-like columns (comma → dot)
    numeric_cols = ["build_year", "number_rooms", "facades", "living_area"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(",", ".", regex=False)
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Ensure price numeric
    df["price"] = pd.to_numeric(df["price"], errors="coerce")

    # Postal code stays categorical
    df["postal_code"] = df["postal_code"].astype("string")

    # 5. Numeric sanity constraints
    df.loc[df["build_year"] < 1800, "build_year"] = np.nan
    df.loc[df["build_year"] > 2025, "build_year"] = np.nan

    df.loc[df["number_rooms"] <= 0, "number_rooms"] = np.nan
    df.loc[df["number_rooms"] > 12, "number_rooms"] = np.nan

    df.loc[df["living_area"] < 10, "living_area"] = np.nan
    df.loc[df["living_area"] > 500, "living_area"] = np.nan

    df = df[df["price"] >= 10000]
    df.loc[df["price"] > 7_500_000, "price"] = np.nan
    df = df[df["price"].notna()]

    df["province"] = df["province"].astype(str).str.strip().replace("nan", np.nan)

    # 6. Reduce locality high-cardinality noise (top 200)
    if "locality_name" in df.columns:
        top_localities = df["locality_name"].value_counts().head(200).index
        df["locality_name"] = df["locality_name"].where(
            df["locality_name"].isin(top_localities),
            "Other"
        )

    # 7. Save
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"Enhanced Cleaned dataset saved to: {save_path}")
    print("Final shape:", df.shape)
    return df


In [112]:
# run
cleaned_df = enhanced_cleaning(
    path="data/raw/raw.csv",
    save_path="data/processed/cleaned_v2.csv"
)

cleaned_df.head()



Enhanced Cleaned dataset saved to: data/processed/cleaned_v2.csv
Final shape: (14374, 13)


Unnamed: 0,build_year,facades,garden,living_area,locality_name,number_rooms,postal_code,price,property_type,state,swimming_pool,terrace,province
0,1996.0,2.0,yes,270.0,Other,4.0,1853.0,580000.0,Residence,Excellent,no,yes,Flemish Brabant
1,1991.0,4.0,yes,218.0,Other,5.0,1341.0,695000.0,Residence,Excellent,yes,yes,Walloon Brabant
2,1970.0,4.0,no,135.0,Other,3.0,1300.0,249000.0,Apartment,To be renovated,no,yes,Walloon Brabant
3,1959.0,3.0,yes,176.0,Other,3.0,1853.0,499000.0,Residence,Normal,no,yes,Flemish Brabant
4,2007.0,4.0,yes,200.0,Other,4.0,1341.0,650000.0,Residence,Excellent,no,yes,Walloon Brabant


## feature engineering

In [113]:
import pandas as pd
import numpy as np
import os

def feature_engineering(
        path="data/processed/cleaned_v2.csv",
        save_path="data/processed/feature_engineered.csv"
    ):
    """
    FINAL Feature Engineering for ImmoEliza.
    Must be run AFTER the unified enhanced cleaning step.
    
    Adds:
    - postal_prefix
    - house_age, build_decade
    - is_new_build / is_recent / is_old
    - boolean flags (garden_flag, etc.)
    - region (Flanders/Wallonia/Brussels)
    - normalized locality_name
    """

    df = pd.read_csv(path, dtype={"postal_code": "string"}).copy()

    # ------------------------------------------------------------
    # 1. Postal code remains categorical
    # ------------------------------------------------------------
    df["postal_code"] = df["postal_code"].fillna("unknown").astype("string")

    # ------------------------------------------------------------
    # 2. Build-year engineering
    # ------------------------------------------------------------
    if "build_year" in df.columns:
        current_year = 2024

        df["house_age"] = current_year - df["build_year"]
        df.loc[df["house_age"] < 0, "house_age"] = np.nan

        df["is_new_build"] = (df["house_age"] <= 5).astype("Int64")
        df["is_recent"]     = (df["house_age"] <= 20).astype("Int64")
        df["is_old"]        = (df["house_age"] >= 50).astype("Int64")

        df["build_decade"] = (df["build_year"] // 10 * 10).astype("Int64")
    else:
        df["house_age"] = np.nan
        df["is_new_build"] = np.nan
        df["is_recent"] = np.nan
        df["is_old"] = np.nan
        df["build_decade"] = np.nan

    # ------------------------------------------------------------
    # 3. Region mapping (province → region)
    # ------------------------------------------------------------
    region_map = {
        "Antwerp": "Flanders",
        "East Flanders": "Flanders",
        "West Flanders": "Flanders",
        "Limburg": "Flanders",
        "Flemish Brabant": "Flanders",

        "Walloon Brabant": "Wallonia",
        "Hainaut": "Wallonia",
        "Liège": "Wallonia",
        "Luxembourg": "Wallonia",
        "Namur": "Wallonia",

        "Brussels": "Brussels"
    }

    df["region"] = df["province"].map(region_map).fillna("unknown").astype("string")

    # ------------------------------------------------------------
    # 4. Boolean flags
    # ------------------------------------------------------------
    bool_map = {"yes": 1, "no": 0}

    for col in ["garden", "terrace", "swimming_pool"]:
        if col in df.columns:
            df[col + "_flag"] = df[col].map(bool_map).astype("Int64")
        else:
            df[col + "_flag"] = np.nan

    # ------------------------------------------------------------
    # 5. Locality normalization
    # ------------------------------------------------------------
    df["locality_name"] = df["locality_name"].astype(str).str.strip().str.lower()

    # ------------------------------------------------------------
    # 6. Postal prefix (categorical location smoothing)
    # ------------------------------------------------------------
    df["postal_prefix"] = df["postal_code"].str[:2].astype("string")

    # ------------------------------------------------------------
    # 7. Save engineered dataset
    # ------------------------------------------------------------
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"Feature-engineered dataset saved to: {save_path}")
    print("Final shape:", df.shape)
    print("\nColumns:", df.columns.tolist())

    return df


In [114]:
# RUN IT

fe_df = feature_engineering()
fe_df.head()

Feature-engineered dataset saved to: data/processed/feature_engineered.csv
Final shape: (14374, 23)

Columns: ['build_year', 'facades', 'garden', 'living_area', 'locality_name', 'number_rooms', 'postal_code', 'price', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'region', 'garden_flag', 'terrace_flag', 'swimming_pool_flag', 'postal_prefix']


Unnamed: 0,build_year,facades,garden,living_area,locality_name,number_rooms,postal_code,price,property_type,state,...,house_age,is_new_build,is_recent,is_old,build_decade,region,garden_flag,terrace_flag,swimming_pool_flag,postal_prefix
0,1996.0,2.0,yes,270.0,other,4.0,1853.0,580000.0,Residence,Excellent,...,28.0,0,0,0,1990,Flanders,1,1,0,18
1,1991.0,4.0,yes,218.0,other,5.0,1341.0,695000.0,Residence,Excellent,...,33.0,0,0,0,1990,Wallonia,1,1,1,13
2,1970.0,4.0,no,135.0,other,3.0,1300.0,249000.0,Apartment,To be renovated,...,54.0,0,0,1,1970,Wallonia,0,1,0,13
3,1959.0,3.0,yes,176.0,other,3.0,1853.0,499000.0,Residence,Normal,...,65.0,0,0,1,1950,Flanders,1,1,0,18
4,2007.0,4.0,yes,200.0,other,4.0,1341.0,650000.0,Residence,Excellent,...,17.0,0,1,0,2000,Wallonia,1,1,0,13


## PREPROCESS DATA (Pipelines)

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# ------------------------------------------------------------
# 1. TRAIN / TEST SPLIT (80 / 20)
# ------------------------------------------------------------
def split_data(df, target="price"):
    """
    Split dataset into train and test sets (80/20).
    """
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    return X_train, X_test, y_train, y_test



# ------------------------------------------------------------
# 2. OUTLIER REMOVAL FROM TRAINING SET ONLY
# ------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    """
    Remove outliers ONLY from the training dataset
    (NO validation/test leakage).
    """
    df_train = X_train.copy()
    df_train["price"] = y_train

    cols_to_filter = [col for col in ["price", "living_area"] if col in df_train.columns]

    def iqr_filter(df, col):
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return df[(df[col] >= lower) & (df[col] <= upper)]

    for col in cols_to_filter:
        df_train = iqr_filter(df_train, col)

    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Training after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean



# ------------------------------------------------------------
# 3. BUILD PREPROCESSOR (numeric + categorical)
# ------------------------------------------------------------
def build_preprocessor(X_train):
    """
    Build preprocessing transformer for numeric and categorical features.
    """

    numeric_cols = X_train.select_dtypes(
        include=["float64", "int64", "Int64"]
    ).columns.tolist()

    categorical_cols = X_train.select_dtypes(
        include=["object", "string"]
    ).columns.tolist()

    # Ensure postal_code stays categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    print("\nNumeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)

    return preprocessor



# ------------------------------------------------------------
# 4. MAIN PIPELINE FUNCTION (AFTER FEATURE ENGINEERING)
# ------------------------------------------------------------
def run_preprocessing_pipeline(path="data/processed/cleaned_v2.csv"):
    """
    Load engineered dataset, enforce postal_code dtype,
    split 80/20, remove outliers, and build preprocessing pipeline.
    """

    df = pd.read_csv(path, dtype={"postal_code": "string"})

    # 80/20 split
    X_train, X_test, y_train, y_test = split_data(df)

    # Remove outliers ONLY from train
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    # Build final preprocessor
    preprocessor = build_preprocessor(X_train_clean)

    return X_train_clean, X_test, y_train_clean, y_test, preprocessor


In [83]:
# ------------------------------------------------------------
# RUN THE PIPELINE
# ------------------------------------------------------------
X_train_clean, X_test, y_train_clean, y_test, preprocessor = run_preprocessing_pipeline(
    "data/processed/cleaned_v2.csv"
)



Train: (11499, 15)
Test: (2875, 15)
Training after outlier removal: (9169, 15)

Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'postal_code_num', 'postal_prefix']
Categorical columns: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']


## Tuned Linear Models: Ridge, Lasso, ElasticNet

In [47]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import joblib


# -------------------------------------------------------------
# Evaluate a single split
# -------------------------------------------------------------
def evaluate_split(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# Train tuned linear models (Ridge, Lasso, ElasticNet)
# -------------------------------------------------------------
def train_tuned_linear_models(X_train, y_train, preprocessor):

    models = {
        "Ridge": Ridge(),
        "Lasso": Lasso(max_iter=10000),
        "ElasticNet": ElasticNet(max_iter=10000)
    }

    param_grids = {
        "Ridge": {"model__alpha": [0.001, 0.01, 0.1, 1, 10, 100]},
        "Lasso": {"model__alpha": [0.001, 0.01, 0.1, 1, 10]},
        "ElasticNet": {
            "model__alpha": [0.001, 0.01, 0.1, 1],
            "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    }

    best_models = {}

    for name, model in models.items():
        print(f"\nTuning {name}...")

        pipe = Pipeline([
            ("preprocessor", preprocessor),
            ("model", model)
        ])

        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_grids[name],
            n_iter=10,
            scoring="neg_mean_absolute_error",
            cv=3,
            n_jobs=-1,
            random_state=42
        )

        search.fit(X_train, y_train)

        print(f"Best params for {name}: {search.best_params_}")
        best_models[name] = search.best_estimator_

    return best_models


# -------------------------------------------------------------
# Full workflow: Train/Test only (NO validation)
# -------------------------------------------------------------
def run_tuned_linear_models(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
):

    best_models = train_tuned_linear_models(X_train_clean, y_train_clean, preprocessor)

    all_results = []

    for name, model in best_models.items():
        print(f"\n===== {name} Results =====")

        # Train performance
        all_results.append(evaluate_split(
            model, X_train_clean, y_train_clean, f"{name} - Train"
        ))

        # Test performance
        all_results.append(evaluate_split(
            model, X_test, y_test, f"{name} - Test"
        ))

        # Save model
        model_path = f"models/{name.lower()}_tuned.pkl"
        joblib.dump(model, model_path)
        print(f"{name} model saved to {model_path}")

    return pd.DataFrame(all_results)


In [84]:
tuned_linear_results = run_tuned_linear_models(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
)

tuned_linear_results




Tuning Ridge...




Best params for Ridge: {'model__alpha': 1}

Tuning Lasso...




Best params for Lasso: {'model__alpha': 10}

Tuning ElasticNet...
Best params for ElasticNet: {'model__l1_ratio': 0.7, 'model__alpha': 0.001}

===== Ridge Results =====

--- Ridge - Train Evaluation ---
MAE:  49,130.67
RMSE: 66,751.69
R²:   0.7188

--- Ridge - Test Evaluation ---
MAE:  89,479.28
RMSE: 189,515.28
R²:   0.4365
Ridge model saved to models/ridge_tuned.pkl

===== Lasso Results =====

--- Lasso - Train Evaluation ---
MAE:  49,788.06
RMSE: 67,395.84
R²:   0.7134

--- Lasso - Test Evaluation ---
MAE:  89,432.08
RMSE: 189,757.50
R²:   0.4351
Lasso model saved to models/lasso_tuned.pkl

===== ElasticNet Results =====

--- ElasticNet - Train Evaluation ---
MAE:  49,992.56
RMSE: 67,832.86
R²:   0.7096

--- ElasticNet - Test Evaluation ---
MAE:  89,489.81
RMSE: 190,006.94
R²:   0.4336
ElasticNet model saved to models/elasticnet_tuned.pkl


  model = cd_fast.sparse_enet_coordinate_descent(


Unnamed: 0,Split,MAE,RMSE,R2
0,Ridge - Train,49130.667544,66751.68921,0.718816
1,Ridge - Test,89479.280125,189515.281336,0.436496
2,Lasso - Train,49788.057104,67395.840517,0.713363
3,Lasso - Test,89432.078331,189757.502933,0.435054
4,ElasticNet - Train,49992.561744,67832.861072,0.709634
5,ElasticNet - Test,89489.80592,190006.937159,0.433568


## Train LR

In [63]:
import os
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline


# ============================================================
# 1. Train Linear Regression (Pipeline = Preprocessor + LR)
# ============================================================
def train_linear_regression(X_train, y_train, preprocessor):
    """
    Build and train a full pipeline:
    - Preprocessor: ColumnTransformer
    - Linear Regression model
    """

    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", LinearRegression())
    ])

    model.fit(X_train, y_train)

    print("\nLinear Regression model trained successfully.")
    return model



# ============================================================
# 2. Evaluate one split (Train or Test)
# ============================================================
def evaluate_single_split(model, X, y, split_name=""):
    """
    Evaluate the model on a given split.
    """

    preds = model.predict(X)

    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5   # manual RMSE
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return mae, rmse, r2



# ============================================================
# 3. Evaluate Train + Test (NO validation)
# ============================================================
def evaluate_all_splits(model,
                        X_train, y_train,
                        X_test, y_test,
                        model_name="Linear Regression"):

    results = []

    # Train
    tr_mae, tr_rmse, tr_r2 = evaluate_single_split(
        model, X_train, y_train, "Train"
    )

    # Test
    te_mae, te_rmse, te_r2 = evaluate_single_split(
        model, X_test, y_test, "Test"
    )

    # Summary DataFrame
    df_results = pd.DataFrame([
        {"Model": model_name, "Split": "Train", "MAE": tr_mae, "RMSE": tr_rmse, "R2": tr_r2},
        {"Model": model_name, "Split": "Test", "MAE": te_mae, "RMSE": te_rmse, "R2": te_r2},
    ])

    print("\n--- Summary Performance Table ---")
    print(df_results)

    return df_results



# ============================================================
# 4. Save model to disk
# ============================================================
def save_linear_regression(model, path="models/linear_regression.pkl"):
    """
    Save the full model pipeline (Preprocessor + Linear Regression).
    """

    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(model, path)

    print(f"\nModel pipeline saved to: {path}")



# ============================================================
# 5. Full Linear Regression Workflow (Train/Test Only)
# ============================================================
def run_linear_regression(X_train_clean, X_test,
                          y_train_clean, y_test,
                          preprocessor):
    """
    End-to-end workflow:
    - Train LR model
    - Evaluate on train/test (no validation)
    - Save model
    - Return model + performance table
    """

    # Train model
    model = train_linear_regression(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    scores = evaluate_all_splits(
        model,
        X_train_clean, y_train_clean,
        X_test, y_test,
        model_name="Linear Regression"
    )

    # Save
    save_linear_regression(model)

    return model, scores


In [65]:
lr_model, lr_results = run_linear_regression(
    X_train_clean, X_test,
    y_train_clean, y_test,
    preprocessor
)

lr_results



Linear Regression model trained successfully.

--- Train Evaluation ---
MAE:  48,289.47
RMSE: 66,177.99
R²:   0.7236

--- Test Evaluation ---
MAE:  90,186.10
RMSE: 189,667.52
R²:   0.4356

--- Summary Performance Table ---
               Model  Split           MAE           RMSE        R2
0  Linear Regression  Train  48289.470068   66177.990337  0.723629
1  Linear Regression   Test  90186.101096  189667.515591  0.435590

Model pipeline saved to: models/linear_regression.pkl


Unnamed: 0,Model,Split,MAE,RMSE,R2
0,Linear Regression,Train,48289.470068,66177.990337,0.723629
1,Linear Regression,Test,90186.101096,189667.515591,0.43559


## leakage testing 

In [34]:
import pandas as pd
from scipy.stats import ks_2samp
import numpy as np

def drift_check(X_train_clean, X_test, y_train_clean, y_test):
    print("\n==============================")
    print(" DRIFT ANALYSIS STARTING")
    print("==============================\n")

    # --------------------------------------------------------
    # 1. Reconstruct full train/test DataFrames
    # --------------------------------------------------------
    train = X_train_clean.copy()
    test = X_test.copy()

    train["price"] = y_train_clean.values
    test["price"] = y_test.values

    # --------------------------------------------------------
    # 2. Identify columns
    # --------------------------------------------------------
    numeric_cols = train.select_dtypes(include=["float64","int64","Int64"]).columns.tolist()
    categorical_cols = train.select_dtypes(include=["object","string"]).columns.tolist()

    # Ensure price treated as numeric
    if "price" in categorical_cols:
        categorical_cols.remove("price")
        numeric_cols.append("price")

    print("Numeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)

    # --------------------------------------------------------
    # 3. Category Coverage Checks
    # --------------------------------------------------------
    def category_coverage(feature):
        train_set = set(train[feature].dropna())
        test_set = set(test[feature].dropna())

        missing_in_train = test_set - train_set
        missing_in_test = train_set - test_set

        print(f"\n=== CATEGORY COVERAGE: {feature} ===")
        print(f"Unique in train: {len(train_set)}")
        print(f"Unique in test:  {len(test_set)}")
        print(f"Present in TEST but NOT in train (unseen categories): {len(missing_in_train)}")
        if len(missing_in_train) > 0:
            print("Examples:", list(missing_in_train)[:15])
        print(f"Present in TRAIN but NOT in test: {len(missing_in_test)}")

    print("\n========== CHECKING CATEGORY COVERAGE ==========\n")
    for col in categorical_cols:
        category_coverage(col)

    # --------------------------------------------------------
    # 4. KS Tests for Numerical Drift
    # --------------------------------------------------------
    def ks_test(feature):
        a = train[feature].dropna()
        b = test[feature].dropna()

        stat, p = ks_2samp(a, b)

        print(f"\n=== KS TEST: {feature} ===")
        print(f"KS statistic: {stat:.4f}")
        print(f"P-value     : {p:.6f}")

        if p < 0.05:
            print(" -> SIGNIFICANT DRIFT DETECTED")
        else:
            print(" -> No significant drift")

    print("\n========== KS-TEST FOR NUMERICAL FEATURES ==========\n")

    # You normally do NOT test price for KS, but here we include it to see target drift
    for col in numeric_cols:
        ks_test(col)

    # --------------------------------------------------------
    # 5. Target Drift Summary
    # --------------------------------------------------------
    print("\n========== TARGET DRIFT (PRICE DISTRIBUTION) ==========\n")
    print("Train price stats:")
    print(train["price"].describe())

    print("\nTest price stats:")
    print(test["price"].describe())

    # --------------------------------------------------------
    # 6. Interaction Drift: Locality-level mean price
    # --------------------------------------------------------
    if "locality_name" in train.columns:
        print("\n========== LOCALITY-LEVEL PRICE DRIFT ==========\n")

        train_loc_means = train.groupby("locality_name")["price"].mean()
        test_loc_means = test.groupby("locality_name")["price"].mean()

        print("\nTrain locality price mean distribution:")
        print(train_loc_means.describe())

        print("\nTest locality price mean distribution:")
        print(test_loc_means.describe())

        # common localities only
        common = list(set(train_loc_means.index) & set(test_loc_means.index))

        paired_train = train_loc_means.loc[common]
        paired_test = test_loc_means.loc[common]

        stat, p = ks_2samp(paired_train, paired_test)

        print("\nLocality-level KS test (common localities):")
        print(f"KS statistic: {stat:.4f}")
        print(f"P-value     : {p:.6f}")

    print("\n==============================")
    print(" DRIFT ANALYSIS COMPLETE")
    print("==============================\n")


In [35]:
X_train_clean, X_test, y_train_clean, y_test, preprocessor = run_preprocessing_pipeline(
    path="data/processed/cleaned_v2.csv"
)

drift_check(X_train_clean, X_test, y_train_clean, y_test)


Train: (11499, 23)
Test: (2875, 23)
Train after outlier removal: (9169, 23)

 DRIFT ANALYSIS STARTING

Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'garden_flag', 'terrace_flag', 'swimming_pool_flag', 'postal_prefix', 'postal_code_num', 'price']
Categorical columns: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']



=== CATEGORY COVERAGE: garden ===
Unique in train: 2
Unique in test:  2
Present in TEST but NOT in train (unseen categories): 0
Present in TRAIN but NOT in test: 0

=== CATEGORY COVERAGE: locality_name ===
Unique in train: 51
Unique in test:  51
Present in TEST but NOT in train (unseen categories): 0
Present in TRAIN but NOT in test: 0

=== CATEGORY COVERAGE: postal_code ===
Unique in train: 779
Unique in test:  573
Present in TEST but NOT in train (unseen categories): 41
Examples: ['9112.0', '6010.0', '8820.

## Random forest

In [66]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# -------------------------------------------------------------
# 1. Train/Test Split (80/20)
# -------------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    return X_train, X_test, y_train, y_test


# -------------------------------------------------------------
# 2. Outlier removal (train only)
# -------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    # IQR filter on price and living_area
    def iqr_filter(df, col):
        if col not in df.columns:
            return df
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return df[(df[col] >= lower) & (df[col] <= upper)]

    for col in ["price", "living_area"]:
        df_train = iqr_filter(df_train, col)

    # Rooms sanity filter
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


# -------------------------------------------------------------
# 3. Preprocessor
# -------------------------------------------------------------
def build_preprocessor(X):
    numeric_cols = X.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must be categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    print("Numeric:", numeric_cols)
    print("Categorical:", categorical_cols)

    return preprocessor


# -------------------------------------------------------------
# 4. Evaluate helper
# -------------------------------------------------------------
def evaluate_rf(model, X, y, split_name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# 5. Train RF
# -------------------------------------------------------------
def train_random_forest(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            random_state=42,
            n_jobs=-1
        ))
    ])

    model.fit(X_train, y_train)
    print("\nRandom Forest model trained successfully.")
    return model


# -------------------------------------------------------------
# 6. Full RF workflow (Train/Test only)
# -------------------------------------------------------------
def run_random_forest_no_val(df):

    # Split DF
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Outliers from train only
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    # Preprocessor
    preprocessor = build_preprocessor(X_train_clean)

    # Train model
    model = train_random_forest(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_rf(model, X_train_clean, y_train_clean, "Train"))
    results.append(evaluate_rf(model, X_test, y_test, "Test"))

    # Save
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_no_val.pkl")
    print("\nModel saved to models/random_forest_no_val.pkl")

    return model, pd.DataFrame(results)


In [67]:
#Run
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

rf_model, rf_results = run_random_forest_no_val(df)
rf_results



Train: (11499, 15)
Test: (2875, 15)
Train after outlier removal: (9169, 15)
Numeric: ['build_year', 'facades', 'living_area', 'number_rooms', 'postal_code_num', 'postal_prefix']
Categorical: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']

Random Forest model trained successfully.

--- Train Evaluation ---
MAE:  16,987.36
RMSE: 24,782.33
R²:   0.9612

--- Test Evaluation ---
MAE:  90,049.99
RMSE: 206,236.78
R²:   0.3327

Model saved to models/random_forest_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,16987.35546,24782.328777,0.961243
1,Test,90049.993575,206236.783811,0.332669


### Function to train a Random Forest with preprocessing

In [68]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# -------------------------------------------------------------
# 1. Train/Test Split (80/20)
# -------------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    return X_train, X_test, y_train, y_test


# -------------------------------------------------------------
# 2. Outlier removal (train only)
# -------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    # IQR filter on price and living_area
    def iqr_filter(df, col):
        if col not in df.columns:
            return df
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return df[(df[col] >= lower) & (df[col] <= upper)]

    for col in ["price", "living_area"]:
        df_train = iqr_filter(df_train, col)

    # Rooms sanity filter
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


# -------------------------------------------------------------
# 3. Preprocessor
# -------------------------------------------------------------
def build_preprocessor(X):
    numeric_cols = X.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must be categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    print("Numeric:", numeric_cols)
    print("Categorical:", categorical_cols)

    return preprocessor


# -------------------------------------------------------------
# 4. Evaluate helper
# -------------------------------------------------------------
def evaluate_rf(model, X, y, split_name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


# -------------------------------------------------------------
# 5. Train RF
# -------------------------------------------------------------
def train_random_forest(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            random_state=42,
            n_jobs=-1
        ))
    ])

    model.fit(X_train, y_train)
    print("\nRandom Forest model trained successfully.")
    return model


# -------------------------------------------------------------
# 6. Full RF workflow (Train/Test only)
# -------------------------------------------------------------
def run_random_forest_no_val(df):

    # Split DF
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Outliers from train only
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    # Preprocessor
    preprocessor = build_preprocessor(X_train_clean)

    # Train model
    model = train_random_forest(X_train_clean, y_train_clean, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_rf(model, X_train_clean, y_train_clean, "Train"))
    results.append(evaluate_rf(model, X_test, y_test, "Test"))

    # Save
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_no_val.pkl")
    print("\nModel saved to models/random_forest_no_val.pkl")

    return model, pd.DataFrame(results)


### RUN random forest

In [69]:
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

rf_model, rf_results = run_random_forest_no_val(df)
rf_results


Train: (11499, 15)
Test: (2875, 15)
Train after outlier removal: (9169, 15)
Numeric: ['build_year', 'facades', 'living_area', 'number_rooms', 'postal_code_num', 'postal_prefix']
Categorical: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region']

Random Forest model trained successfully.

--- Train Evaluation ---
MAE:  16,987.36
RMSE: 24,782.33
R²:   0.9612

--- Test Evaluation ---
MAE:  90,049.99
RMSE: 206,236.78
R²:   0.3327

Model saved to models/random_forest_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,16987.35546,24782.328777,0.961243
1,Test,90049.993575,206236.783811,0.332669


## Tuned Random forest

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os


#----------------------------------------------------------
# 1) Quick train/test split (80/20)
#----------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test :", X_test.shape)

    return X_train, X_test, y_train, y_test


#----------------------------------------------------------
# 2) Outlier removal (train only)
#----------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    df_train = X_train.copy()
    df_train["price"] = y_train

    if "living_area" in df_train.columns:
        Q1, Q3 = df_train["living_area"].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_train = df_train[(df_train["living_area"] >= lower) & (df_train["living_area"] <= upper)]

    df_train = df_train[df_train["price"] >= 10000]

    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    y = df_train["price"]
    X = df_train.drop(columns=["price"])

    print("Train after outlier removal:", X.shape)
    return X, y


#----------------------------------------------------------
# 3) Safe preprocessor
#----------------------------------------------------------
def build_preprocessor(X_train):

    numeric_cols = X_train.select_dtypes(include=["float64","int64","Int64"]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=["object","string"]).columns.tolist()

    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    # Limit one-hot explosion
    encoder = OneHotEncoder(handle_unknown="ignore", min_frequency=50)

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoder)
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    return preprocessor


#----------------------------------------------------------
# 4) Safe evaluation
#----------------------------------------------------------
def evaluate_split(model, X, y, split_name):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"\n--- {split_name} ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2:   {r2:.4f}")

    return {"Split": split_name, "MAE": mae, "RMSE": rmse, "R2": r2}


#----------------------------------------------------------
# 5) Random Forest tuning (safe)
#----------------------------------------------------------
def train_random_forest_tuned(X_train, y_train, preprocessor):

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42))
    ])

    # SAFE small grid (won’t crash)
    param_grid = {
        "model__n_estimators": [200, 300],
        "model__max_depth": [10, 20, 30],
        "model__max_features": [0.3, 0.5, "sqrt"],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2]
    }

    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter=10,          # small search
        scoring="neg_mean_absolute_error",
        cv=3,
        n_jobs=1,           # IMPORTANT: prevents kernel crash
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)

    print("\nBest params:", search.best_params_)
    return search.best_estimator_


#----------------------------------------------------------
# 6) Full workflow (No validation)
#----------------------------------------------------------
def run_random_forest_tuned_no_val(df):

    X_train, X_test, y_train, y_test = split_train_test(df)

    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)

    preprocessor = build_preprocessor(X_train_clean)

    model = train_random_forest_tuned(X_train_clean, y_train_clean, preprocessor)

    train_res = evaluate_split(model, X_train_clean, y_train_clean, "Train")
    test_res = evaluate_split(model, X_test, y_test, "Test")

    results = pd.DataFrame([train_res, test_res])

    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/random_forest_tuned_no_val.pkl")

    print("\nModel saved to models/random_forest_tuned_no_val.pkl")

    return model, results


## Run random forest tuned

In [10]:
df = pd.read_csv("data/processed/dtype_cleaned.csv", dtype={"postal_code": "string"})

rf_tuned_model, rf_tuned_results = run_random_forest_tuned_no_val(df)
rf_tuned_results




Train: (11500, 14)
Test : (2875, 14)
Train after outlier removal: (9672, 14)
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best params: {'model__n_estimators': 200, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 0.3, 'model__max_depth': 30}

--- Train ---
MAE:  32,774.53
RMSE: 57,269.74
R2:   0.9029

--- Test ---
MAE:  85,311.63
RMSE: 185,057.98
R2:   0.4609

Model saved to models/random_forest_tuned_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,32774.531804,57269.741781,0.90295
1,Test,85311.629638,185057.984081,0.460938


## XGBoost

In [70]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# ============================================================
# 1. BUILD PREPROCESSOR
# ============================================================
def build_preprocessor(df):
    numeric_cols = df.select_dtypes(include=["float64", "int64", "Int64"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "string"]).columns.tolist()

    # Postal code must remain categorical
    if "postal_code" in numeric_cols:
        numeric_cols.remove("postal_code")
        categorical_cols.append("postal_code")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])


# ============================================================
# 2. TRAIN XGBOOST (NO VALIDATION SET)
# ============================================================
def train_xgboost_no_val(X_train, y_train, preprocessor):
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            random_state=42
        ))
    ])

    model.fit(X_train, y_train)
    print("XGBoost model trained successfully.")
    return model


# ============================================================
# 3. EVALUATE MODEL
# ============================================================
def evaluate(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}


# ============================================================
# 4. FULL WORKFLOW (80/20 TRAIN-TEST)
# ============================================================
def run_xgboost_no_val(df):

    # --- enforce correct dtype ---
    df["postal_code"] = df["postal_code"].astype("string")

    # --- split 80/20 ---
    X = df.drop(columns=["price"])
    y = df["price"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)

    # --- build preprocessor ---
    preprocessor = build_preprocessor(X_train)

    # --- train model ---
    model = train_xgboost_no_val(X_train, y_train, preprocessor)

    # --- evaluate ---
    results = []
    results.append(evaluate(model, X_train, y_train, "Train"))
    results.append(evaluate(model, X_test, y_test, "Test"))

    results_df = pd.DataFrame(results)

    # --- save model ---
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, "models/xgboost_no_val.pkl")

    print("\nXGBoost (No Validation) saved to models/xgboost_no_val.pkl")

    return model, results_df


### Run XGBOOST

In [73]:
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

xgb_model, xgb_results = run_xgboost_no_val(df)
xgb_results


Train: (11499, 15)
Test: (2875, 15)
XGBoost model trained successfully.

--- Train Evaluation ---
MAE:  59,607.46
RMSE: 92,920.92
R²:   0.8788

--- Test Evaluation ---
MAE:  74,591.29
RMSE: 144,435.84
R²:   0.6727

XGBoost (No Validation) saved to models/xgboost_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,59607.455568,92920.923819,0.878806
1,Test,74591.290916,144435.842582,0.67269


## Tuned XGBoost Model

In [74]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ---------------------------------------------------------
# 1. Split into 80% train / 20% test
# ---------------------------------------------------------
def split_train_test(df, target="price"):
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    print("Train:", X_train.shape)
    print("Test:", X_test.shape)
    return X_train, X_test, y_train, y_test



# ---------------------------------------------------------
# 2. Evaluate helper
# ---------------------------------------------------------
def evaluate_split(model, X, y, name=""):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n--- {name} Evaluation ---")
    print(f"MAE:  {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")

    return {"Split": name, "MAE": mae, "RMSE": rmse, "R2": r2}



# ---------------------------------------------------------
# 3. Train XGBoost with Hyperparameter Tuning (NO VAL)
# ---------------------------------------------------------
def train_xgboost_tuned_no_val(X_train, y_train, preprocessor):

    # Define default model
    base_xgb = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        booster="gbtree",
        eval_metric="rmse",
        n_estimators=400,
        learning_rate=0.05
    )

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", base_xgb)
    ])

    # Parameter grid (lightweight for Windows)
    param_dist = {
        "model__n_estimators": [300, 400, 500],
        "model__learning_rate": [0.02, 0.05, 0.1],
        "model__max_depth": [3, 4, 5, 6],
        "model__subsample": [0.6, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0]
    }

    print("\nStarting RandomizedSearchCV for XGBoost...")
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=10,
        scoring="neg_mean_absolute_error",
        cv=3,
        n_jobs=1,             # IMPORTANT: fixes BrokenProcessPool
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)

    print("\nBest parameters found:")
    print(search.best_params_)

    return search.best_estimator_



# ---------------------------------------------------------
# 4. Full workflow (train + test)
# ---------------------------------------------------------
def run_xgboost_tuned_no_val(df, preprocessor):

    # Split
    X_train, X_test, y_train, y_test = split_train_test(df)

    # Train tuned model
    model = train_xgboost_tuned_no_val(X_train, y_train, preprocessor)

    # Evaluate
    results = []
    results.append(evaluate_split(model, X_train, y_train, "Train"))
    results.append(evaluate_split(model, X_test, y_test, "Test"))

    # Save
    joblib.dump(model, "models/xgboost_tuned_no_val.pkl")
    print("\nXGBoost (Tuned, No Validation) saved to models/xgboost_tuned_no_val.pkl")

    return model, pd.DataFrame(results)


In [75]:
df = pd.read_csv("data/processed/cleaned_v2.csv", dtype={"postal_code": "string"})

# Build preprocessor correctly
preprocessor = build_preprocessor(df.drop(columns=["price"]))

# Run tuned XGBoost
xgb_model, xgb_results = run_xgboost_tuned_no_val(df, preprocessor)

xgb_results



Train: (11499, 15)
Test: (2875, 15)

Starting RandomizedSearchCV for XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best parameters found:
{'model__subsample': 0.8, 'model__n_estimators': 500, 'model__max_depth': 4, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.8}

--- Train Evaluation ---
MAE:  65,148.98
RMSE: 103,988.05
R²:   0.8482

--- Test Evaluation ---
MAE:  77,759.73
RMSE: 148,361.96
R²:   0.6547

XGBoost (Tuned, No Validation) saved to models/xgboost_tuned_no_val.pkl


Unnamed: 0,Split,MAE,RMSE,R2
0,Train,65148.975305,103988.04565,0.848218
1,Test,77759.731711,148361.963398,0.654654


## XGboot 

In [118]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib


# ------------------------------------------------------------
# Train-only outlier removal
# ------------------------------------------------------------
def remove_train_outliers(X_train, y_train, columns=None, multiplier=1.5):
    """
    Removes outliers from TRAIN ONLY (IQR filtering).
    Avoids test leakage.
    """
    df = X_train.copy()
    df["price"] = y_train

    if columns is None:
        columns = ["price", "living_area", "number_rooms"]

    for col in columns:
        if col not in df.columns:
            continue

        series = df[col].dropna()
        if len(series) < 100:
            continue

        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - multiplier * IQR
        upper = Q3 + multiplier * IQR

        df = df[(df[col] >= lower) & (df[col] <= upper)]

    y_clean = df["price"]
    X_clean = df.drop(columns=["price"])
    return X_clean, y_clean


# ------------------------------------------------------------
# Group-based split
# ------------------------------------------------------------
def group_split(df, group_col="locality_name", test_size=0.20):
    splitter = GroupShuffleSplit(
        test_size=test_size,
        n_splits=1,
        random_state=42
    )
    groups = df[group_col]
    train_idx, test_idx = next(splitter.split(df, groups=groups))

    train = df.iloc[train_idx].copy()
    test = df.iloc[test_idx].copy()

    print("Train:", train.shape)
    print("Test :", test.shape)
    return train, test


# ------------------------------------------------------------
# FINAL XGBoost Training Function
# ------------------------------------------------------------
def train_xgboost(df, target="price", save_path="models/xgboost_geo_tuned.pkl"):

    # 1. Split (geospatial)
    train_df, test_df = group_split(df, "locality_name")

    # 2. Log target
    y_train = np.log1p(train_df[target])
    y_test = np.log1p(test_df[target])

    X_train = train_df.drop(columns=[target])
    X_test = test_df.drop(columns=[target])

    # 3. TRAIN-ONLY outlier removal
    X_train, y_train = remove_train_outliers(
        X_train, y_train,
        columns=["price", "living_area", "number_rooms"]
    )
    print("Train after outlier removal:", X_train.shape)

    # 4. Identify column types
    cat_cols = X_train.select_dtypes(include=["object", "string"]).columns.tolist()
    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()

    print("\nCategorical:", cat_cols)
    print("Numeric:", num_cols)

    # 5. Preprocessing
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse=True), cat_cols),
        ("num", "passthrough", num_cols)
    ])

    # 6. XGBoost model
    model = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        random_state=42
    )

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    param_dist = {
        "model__max_depth": [4, 6, 8],
        "model__learning_rate": [0.03, 0.05, 0.1],
        "model__n_estimators": [300, 600, 900],
        "model__subsample": [0.7, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0],
        "model__min_child_weight": [1, 5, 10],
        "model__gamma": [0, 1, 5],
        "model__reg_lambda": [1, 3, 5]
    }

    print("\nTuning XGBoost hyperparameters...")
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=20,
        cv=3,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest hyperparameters:")
    print(search.best_params_)

    # 7. Evaluate
    preds_train = np.expm1(best_model.predict(X_train))
    preds_test = np.expm1(best_model.predict(X_test))

    mae_train = mean_absolute_error(train_df[target], preds_train)
    rmse_train = np.sqrt(mean_squared_error(train_df[target], preds_train))
    r2_train = r2_score(train_df[target], preds_train)

    mae_test = mean_absolute_error(test_df[target], preds_test)
    rmse_test = np.sqrt(mean_squared_error(test_df[target], preds_test))
    r2_test = r2_score(test_df[target], preds_test)

    print("\n===== FINAL XGBOOST RESULTS =====")
    print("\n--- Train ---")
    print(f"MAE:  {mae_train:,.2f}")
    print(f"RMSE: {rmse_train:,.2f}")
    print(f"R²:   {r2_train:.4f}")

    print("\n--- Test ---")
    print(f"MAE:  {mae_test:,.2f}")
    print(f"RMSE: {rmse_test:,.2f}")
    print(f"R²:   {r2_test:.4f}")

    # 8. Save
    joblib.dump(best_model, save_path)
    print(f"\nSaved model to: {save_path}")

    return best_model


In [120]:
df = feature_engineering()
model = train_xgboost(df)


Feature-engineered dataset saved to: data/processed/feature_engineered.csv
Final shape: (14374, 23)

Columns: ['build_year', 'facades', 'garden', 'living_area', 'locality_name', 'number_rooms', 'postal_code', 'price', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'region', 'garden_flag', 'terrace_flag', 'swimming_pool_flag', 'postal_prefix']
Train: (12371, 23)
Test : (2003, 23)
Train after outlier removal: (9491, 22)

Categorical: ['garden', 'locality_name', 'postal_code', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'region', 'postal_prefix']
Numeric: ['build_year', 'facades', 'living_area', 'number_rooms', 'house_age', 'is_new_build', 'is_recent', 'is_old', 'build_decade', 'garden_flag', 'terrace_flag', 'swimming_pool_flag']


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [100]:
df = pd.read_csv("data/processed/feature_engineered.csv")
model = train_xgboost(df)




KeyError: 'postal_prefix'