In [92]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.inspection import permutation_importance

In [93]:
df = pd.read_csv('../../data/preprocessed/realestates_kh_Pearson_v2.csv')  

In [94]:
df

Unnamed: 0,land_area,is_parent,latitude,longitude,population,nearest_cafe,n_cafe_in_1km,n_cafe_in_3km_to_5km,nearest_gas_station,n_gas_station_in_1km,...,h_id_88658478a3fffff,h_id_88658478b3fffff,h_id_88658478b7fffff,h_id_88658478bbfffff,h_id_8865847993fffff,h_id_886586a691fffff,h_id_886586a693fffff,h_id_886586a699fffff,h_id_886586a69bfffff,price_per_m2
0,124.0,0.0,11.0,104.0,16252.0,11.0,10.0,76.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8870.967742
1,80.0,0.0,11.0,104.0,7658.0,7.0,14.0,62.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8500.000000
2,66.0,0.0,11.0,104.0,16252.0,11.0,10.0,76.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8333.333333
3,116.0,0.0,11.0,104.0,23239.0,7.0,12.0,85.0,1.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6465.517241
4,65.0,0.0,11.0,104.0,5351.0,0.0,6.0,87.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6461.538462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,,,,,,,,,,,...,,,,,,,,,,550.000000
3338,,,,,,,,,,,...,,,,,,,,,,390.000000
3339,,,,,,,,,,,...,,,,,,,,,,357.692308
3340,,,,,,,,,,,...,,,,,,,,,,250.000000


In [95]:
X = df.drop(['price_per_m2'], axis=1)
y = df['price_per_m2']

In [96]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=42,
)

In [97]:
print("Max value in X_train:", X_train.max().max())
print("Max value in X_test:", X_test.max().max())
print("Max value in y_train:", y_train.max())
print("Max value in y_test:", y_test.max())


Max value in X_train: 104635.0
Max value in X_test: 84000.0
Max value in y_train: 8500.0
Max value in y_test: 8870.967741935483


In [98]:
linear_model = LinearRegression()

In [99]:
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=63,
    learning_rate=0.05,
    n_estimators=1000,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1  # Silence LightGBM output
)

In [100]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=800,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [101]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

In [102]:
stacked_model = StackingRegressor(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=Ridge(),
    cv=KFold(n_splits=3, shuffle=True, random_state=42)
)

In [103]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

# Define parameter grids for each model type
param_grids = {
    "Linear Regression": {
        'fit_intercept': [True, False],
        'positive': [True, False]  # For constrained regression
    },
    "LightGBM": {
        'num_leaves': [15, 31, 63],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [500, 1000, 1500],
        'min_child_samples': [5, 20, 50],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [0, 0.1, 1]
    },
    "XGBoost": {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [500, 1000, 1500],
        'gamma': [0, 0.1, 0.5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    },
    "Random Forest": {
        'n_estimators': [300, 500, 800],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 0.8, None]
    },
    "Stacked Ensemble": {
        'final_estimator__alpha': [0.1, 1.0, 10.0],
        'final_estimator__fit_intercept': [True, False]
    }
}

In [104]:
tuning_method = {
    "Linear Regression": GridSearchCV,
    "LightGBM": RandomizedSearchCV,
    "XGBoost": RandomizedSearchCV,
    "Random Forest": RandomizedSearchCV,
    "Stacked Ensemble": GridSearchCV
}
search_params = {
    'cv': 5,
    'scoring': 'r2',
    'n_jobs': -1,
    'verbose': 1
}

randomized_settings = {
    'n_iter': 20  # Number of parameter settings sampled
}

In [105]:
models = {
    "Linear Regression": linear_model,
    "LightGBM": lgb_model,
    "XGBoost": xgb_model,
    "Random Forest": rf_model,
    "Stacked Ensemble": stacked_model
}

In [106]:
# from sklearn.preprocessing import StandardScaler

# # 1. Scale the target variable (y)
# y_scaler = StandardScaler()
# y_train_scaled = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)).flatten()
# y_test_scaled = y_scaler.transform(y_test.to_numpy().reshape(-1, 1)).flatten()

In [107]:
df.dropna(inplace=True)

In [108]:
df.isnull().sum()

land_area               0
is_parent               0
latitude                0
longitude               0
population              0
                       ..
h_id_886586a691fffff    0
h_id_886586a693fffff    0
h_id_886586a699fffff    0
h_id_886586a69bfffff    0
price_per_m2            0
Length: 484, dtype: int64

In [109]:
import time
tuned_models = {}
best_params = {}

for name, model in models.items():
    print(f"\n{'='*40}\nTuning {name}\n{'='*40}")
    
    start_time = time.time()
    
    # Initialize search object
    if tuning_method[name] == RandomizedSearchCV:
        search = RandomizedSearchCV(
            model,
            param_grids[name],
            **search_params,
            **randomized_settings,
            random_state=42
        )
    else:
        search = GridSearchCV(
            model,
            param_grids[name],
            **search_params
        )
    
    # Run search - use LOG-TRANSFORMED TARGETS
    search.fit(X_train, y_train)
    
    # Store results
    tuned_models[name] = search.best_estimator_
    best_params[name] = search.best_params_
    
    print(f"Tuning completed in {time.time()-start_time:.1f}s")
    print(f"Best R²: {search.best_score_:.4f}")
    print(f"Best params: {search.best_params_}")


Tuning Linear Regression
Fitting 5 folds for each of 4 candidates, totalling 20 fits


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\linear_model\_base.py", line 601, in fit
    X, y = validate_data(
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\utils\validation.py", line 1107, in check_array
    _assert_all_finite(
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "d:\anaconda\envs\env_v3.10\lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
