In [185]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.inspection import permutation_importance

In [186]:
df = pd.read_csv('../../data/processed/realestates_kh_SelectKBest_v1.csv')  

In [187]:
X = df.drop(['price'], axis=1)
y = df['price']

In [188]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
)

In [189]:
print("Max value in X_train:", X_train.max().max())
print("Max value in X_test:", X_test.max().max())
print("Max value in y_train:", y_train.max())
print("Max value in y_test:", y_test.max())


Max value in X_train: 104635
Max value in X_test: 55811
Max value in y_train: 188343000
Max value in y_test: 276264450


In [190]:
linear_model = LinearRegression()

In [191]:
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=63,
    learning_rate=0.05,
    n_estimators=1000,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1  # Silence LightGBM output
)

In [192]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=800,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [193]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

In [194]:
stacked_model = StackingRegressor(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=Ridge(),
    cv=KFold(n_splits=3, shuffle=True, random_state=42)
)

In [195]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

# Define parameter grids for each model type
param_grids = {
    "Linear Regression": {
        'fit_intercept': [True, False],
        'positive': [True, False]  # For constrained regression
    },
    "LightGBM": {
        'num_leaves': [15, 31, 63],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [500, 1000, 1500],
        'min_child_samples': [5, 20, 50],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [0, 0.1, 1]
    },
    "XGBoost": {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [500, 1000, 1500],
        'gamma': [0, 0.1, 0.5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    },
    "Random Forest": {
        'n_estimators': [300, 500, 800],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 0.8, None]
    },
    "Stacked Ensemble": {
        'final_estimator__alpha': [0.1, 1.0, 10.0],
        'final_estimator__fit_intercept': [True, False]
    }
}

In [196]:
tuning_method = {
    "Linear Regression": GridSearchCV,
    "LightGBM": RandomizedSearchCV,
    "XGBoost": RandomizedSearchCV,
    "Random Forest": RandomizedSearchCV,
    "Stacked Ensemble": GridSearchCV
}
search_params = {
    'cv': 5,
    'scoring': 'r2',
    'n_jobs': -1,
    'verbose': 1
}

randomized_settings = {
    'n_iter': 20  # Number of parameter settings sampled
}

In [197]:
models = {
    "Linear Regression": linear_model,
    "LightGBM": lgb_model,
    "XGBoost": xgb_model,
    "Random Forest": rf_model,
    "Stacked Ensemble": stacked_model
}

In [198]:
from sklearn.preprocessing import StandardScaler

# 1. Scale the target variable (y)
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)).flatten()
y_test_scaled = y_scaler.transform(y_test.to_numpy().reshape(-1, 1)).flatten()

In [199]:
import time
tuned_models = {}
best_params = {}

for name, model in models.items():
    print(f"\n{'='*40}\nTuning {name}\n{'='*40}")
    
    start_time = time.time()
    
    # Initialize search object
    if tuning_method[name] == RandomizedSearchCV:
        search = RandomizedSearchCV(
            model,
            param_grids[name],
            **search_params,
            **randomized_settings,
            random_state=42
        )
    else:
        search = GridSearchCV(
            model,
            param_grids[name],
            **search_params
        )
    
    # Run search - use LOG-TRANSFORMED TARGETS
    search.fit(X_train, y_train_scaled)
    
    # Store results
    tuned_models[name] = search.best_estimator_
    best_params[name] = search.best_params_
    
    print(f"Tuning completed in {time.time()-start_time:.1f}s")
    print(f"Best R²: {search.best_score_:.4f}")
    print(f"Best params: {search.best_params_}")


Tuning Linear Regression
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Tuning completed in 6.8s
Best R²: 0.0762
Best params: {'fit_intercept': True, 'positive': True}

Tuning LightGBM
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuning completed in 143.8s
Best R²: 0.4281
Best params: {'reg_lambda': 0.1, 'reg_alpha': 0, 'num_leaves': 15, 'n_estimators': 1000, 'min_child_samples': 50, 'learning_rate': 0.05}

Tuning XGBoost
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuning completed in 31.5s
Best R²: 0.6617
Best params: {'subsample': 1.0, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.8}

Tuning Random Forest
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuning completed in 243.0s
Best R²: 0.7079
Best params: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20}

Tuning Stacked Ensemble
Fitting 5 folds for each of 6 candi