In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Work from the project root so the relative data path below resolves.
# NOTE: this is a machine-specific absolute path; the script only runs
# unchanged on a machine where this directory exists.
project_root = r"C:\Users\welde\Desktop\immo-eliza-ml"
os.chdir(project_root)


# ------------------------------------------------------------
# 1. Load the dtype-cleaned dataset produced by the cleaning step
# ------------------------------------------------------------
cleaned_csv_path = "data/processed/dtype_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
print("Loaded cleaned data:", df.shape)


# ------------------------------------------------------------
# 2. Split into train / validation / test (60 / 20 / 20)
# ------------------------------------------------------------
def split_data(df, target="price"):
    """
    Partition ``df`` into train / validation / test sets (60 / 20 / 20).

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str, default "price"
        Name of the column to predict; it is removed from the feature frames.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """

    labels = df[target]
    features = df.drop(columns=[target])

    # Stage 1: hold out 40% of the rows; the remaining 60% is the training set.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.40, random_state=42
    )

    # Stage 2: cut the hold-out in half -> 20% validation, 20% test overall.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.50, random_state=42
    )

    # Report the shape of each feature frame.
    for label, frame in (
        ("Train:", X_train),
        ("Validation:", X_val),
        ("Test:", X_test),
    ):
        print(label, frame.shape)

    return X_train, X_val, X_test, y_train, y_val, y_test


# ------------------------------------------------------------
# 3. Remove outliers ONLY from training set
# ------------------------------------------------------------
def remove_outliers_from_train(X_train, y_train):
    """
    Remove outliers from training data only, to avoid data leakage.

    Filters applied, in order:
    - IQR-based filtering on price (the target)
    - IQR-based filtering on living_area
    - Logical cap on number_rooms (<= 12)

    Missing *feature* values are not treated as outliers: rows whose
    living_area or number_rooms is NaN are kept, because the downstream
    preprocessing imputes them. Rows whose price is missing are dropped,
    since a row without a target cannot be used for training.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Not modified; a copy is filtered.
    y_train : pandas.Series
        Training target, aligned with ``X_train`` by index.

    Returns
    -------
    tuple
        ``(X_train_clean, y_train_clean)`` restricted to the surviving rows.
    """

    # Work on a copy with the target attached so that features and target
    # are filtered together and stay index-aligned.
    df_train = X_train.copy()
    df_train["price"] = y_train

    def iqr_mask(series, keep_missing):
        """Boolean mask of values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        inside = (series >= lower) & (series <= upper)
        if keep_missing:
            # Comparisons against NaN are False, which would silently drop
            # every row with a missing value; keep those rows explicitly.
            inside = inside | series.isna()
        return inside

    # Target: rows with a missing price are intentionally discarded.
    df_train = df_train[iqr_mask(df_train["price"], keep_missing=False)]

    # Feature: filter real outliers but keep rows with a missing value.
    if "living_area" in df_train.columns:
        df_train = df_train[iqr_mask(df_train["living_area"], keep_missing=True)]

    # Logical constraint on number of rooms; fillna(0) keeps missing values.
    if "number_rooms" in df_train.columns:
        df_train = df_train[df_train["number_rooms"].fillna(0) <= 12]

    # Separate back X and y
    y_train_clean = df_train["price"]
    X_train_clean = df_train.drop(columns=["price"])

    print("Training after outlier removal:", X_train_clean.shape)
    return X_train_clean, y_train_clean


# ------------------------------------------------------------
# 4. Build preprocessing transformer
# ------------------------------------------------------------
def build_preprocessor(X_train):
    """
    Assemble the (unfitted) feature-preprocessing ColumnTransformer.

    Columns are routed by dtype:
    - float64 / int64 / Int64 columns: median imputation, then standard scaling
    - object / string columns: most-frequent imputation, then one-hot
      encoding (categories unseen at fit time are ignored at transform time)

    Columns of any other dtype are not listed in either group.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only its column dtypes are inspected here.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The unfitted transformer.
    """

    numeric_dtypes = ["float64", "int64", "Int64"]
    categorical_dtypes = ["object", "string"]

    numeric_cols = X_train.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=categorical_dtypes).columns.tolist()

    print("Numeric columns:", numeric_cols)
    print("Categorical columns:", categorical_cols)

    # One sub-pipeline per column group.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numeric_cols),
            ("cat", Pipeline(steps=categorical_steps), categorical_cols),
        ]
    )


# ------------------------------------------------------------
# 5. Evaluation helper
# ------------------------------------------------------------
def evaluate_model(model, X, y):
    """
    Score a fitted model on one dataset.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : features passed to ``model.predict``
    y : true target values for ``X``

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        and R² of the predictions against ``y``.
    """

    predictions = model.predict(X)

    return (
        mean_absolute_error(y, predictions),
        # RMSE is the square root of the mean squared error.
        np.sqrt(mean_squared_error(y, predictions)),
        r2_score(y, predictions),
    )


# ------------------------------------------------------------
# 6. Train and evaluate all 3 models
# ------------------------------------------------------------
def compare_models(df):
    """
    Run the whole benchmark and return a metrics table.

    Steps:
      1. split ``df`` into train / validation / test
      2. remove outliers from the training split only
      3. build the preprocessing transformer from the cleaned training features
      4. fit Linear Regression, Random Forest and XGBoost, each wrapped in a
         pipeline with that preprocessor
      5. score every fitted pipeline on the cleaned train split and on the
         untouched validation and test splits

    Returns
    -------
    pandas.DataFrame
        One row per (model, split) with columns Model, Split, MAE, RMSE, R2,
        sorted by Split and then Model.
    """

    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)
    X_train_clean, y_train_clean = remove_outliers_from_train(X_train, y_train)
    preprocessor = build_preprocessor(X_train_clean)

    # Candidate estimators, in the order they are trained.
    linear_model = LinearRegression()
    forest_model = RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42,
    )
    boosted_model = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        n_jobs=-1,
        random_state=42,
        tree_method="hist",
    )
    candidates = (
        ("LinearRegression", linear_model),
        ("RandomForest", forest_model),
        ("XGBoost", boosted_model),
    )

    # Splits every fitted pipeline is scored on. Only the training split has
    # had outliers removed; validation and test are used as produced by the
    # split.
    scoring_sets = (
        ("Train", X_train_clean, y_train_clean),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )

    rows = []
    for name, estimator in candidates:
        print(f"\nTraining model: {name}")

        # The same preprocessor object is placed in every pipeline and is
        # therefore refit on each pass; scoring happens before the next refit.
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", estimator),
        ])
        pipeline.fit(X_train_clean, y_train_clean)

        for split_name, X_split, y_split in scoring_sets:
            mae, rmse, r2 = evaluate_model(pipeline, X_split, y_split)
            rows.append({
                "Model": name,
                "Split": split_name,
                "MAE": mae,
                "RMSE": rmse,
                "R2": r2,
            })

    # Group the table by split first, then by model name.
    return pd.DataFrame(rows).sort_values(by=["Split", "Model"])


# ------------------------------------------------------------
# 7. Run comparison and inspect results
# ------------------------------------------------------------
comparison_df = compare_models(df)

# Display copy with rounded metrics; comparison_df keeps full precision.
comparison_display = comparison_df.assign(
    MAE=comparison_df["MAE"].round(2),
    RMSE=comparison_df["RMSE"].round(2),
    R2=comparison_df["R2"].round(4),
)

print("\nModel comparison (MAE, RMSE, R2):")
print(comparison_display)


Loaded cleaned data: (14375, 15)
Train: (8625, 14)
Validation: (2875, 14)
Test: (2875, 14)
Training after outlier removal: (6915, 14)
Numeric columns: ['build_year', 'facades', 'living_area', 'number_rooms', 'postal_code']
Categorical columns: ['garden', 'locality_name', 'property_type', 'state', 'swimming_pool', 'terrace', 'province', 'property_type_name', 'state_mapped']

Training model: LinearRegression

Training model: RandomForest

Training model: XGBoost

Model comparison (MAE, RMSE, R2):
              Model       Split       MAE       RMSE      R2
2  LinearRegression        Test  95775.96  213772.65  0.3824
5      RandomForest        Test  92281.81  226754.82  0.3052
8           XGBoost        Test  90599.93  218969.94  0.3520
0  LinearRegression       Train  45057.90   63861.97  0.7489
3      RandomForest       Train  17319.19   25256.01  0.9607
6           XGBoost       Train  37818.86   50996.55  0.8399
1  LinearRegression  Validation  87943.62  170357.72  0.4631
4      Rando