# Model training and selection

In [1]:
import pandas as pd
import numpy as np
import toml
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import sys
import os
import json
import time
from pathlib import Path
import joblib

# Put the parent of the current working directory on sys.path so the
# project-local `src.*` packages imported below can be resolved.
# The membership check keeps re-runs of this cell from adding duplicates.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.eda.data_reader import DataReader
from src.data_preprocessing.data_processor import DataProcessor
from src.data_preprocessing.feature_engineer import FeatureEngineer

sns.set_theme(style="whitegrid")

## Data prep

In [2]:
# Seed NumPy's global RNG so NumPy-based randomness in this notebook is repeatable.
np.random.seed(14)
# DataReader is project-local.
# NOTE(review): read() appears to return the raw listings as a DataFrame — confirm.
reader = DataReader()
data = reader.read()

# Peek at the first rows; at this stage 'price' can still hold non-numeric text.
print(data.head(5))

2025-08-10 17:05:24,470 - RealEstateProject - INFO - DataReader: Readed files.
2025-08-10 17:05:24,472 - RealEstateProject - INFO - DataRows to train: 22028
   source           price  price_per_meter     area rooms     floor  \
0  otodom          944000       19667.0000  48.0000     2  parter/4   
1  otodom  Zapytaj o cenę              NaN  41.7900     2       2/6   
2  otodom          799000       12292.0000  65.0000     4      3/10   
3  otodom         1993000       42513.0000  46.8800     2       5/8   
4  otodom         1333000        9948.0000 134.0000     5       1/2   

  market_type furnished                                        description  \
0      wtórny       Nie  <p>      Z przyjemnością przedstawiam Państwu ...   
1   pierwotny       Nie  <ul><li>2-pokojowe mieszkanie<strong> numer B2...   
2      wtórny       Nie  Mam przyjemność zaprezentować Państwu, mieszka...   
3   pierwotny       Nie  <ul><li>2-pokojowe mieszkanie<strong> numer A....   
4      wtórny       Nie  <

In [3]:
def target_cleaner(y) -> pd.DataFrame:
    """Return a cleaned copy of ``y`` with a numeric, non-null ``price`` column.

    Values in ``price`` that cannot be parsed as numbers (for example
    placeholder text) are coerced to NaN, and the rows holding them are
    removed. The input frame is left untouched, so re-running the calling
    cell or keeping a reference to the raw data is safe.

    Parameters
    ----------
    y : pd.DataFrame
        Frame containing a ``price`` column.

    Returns
    -------
    pd.DataFrame
        New frame with ``price`` converted to a numeric dtype and rows with
        an unparseable or missing price dropped. The original index labels of
        the surviving rows are preserved.

    Raises
    ------
    KeyError
        If ``y`` has no ``price`` column.
    """
    # Work on a copy: mutating the caller's frame creates hidden notebook state
    # and can raise SettingWithCopyWarning when `y` is a slice of another frame.
    cleaned = y.copy()
    cleaned['price'] = pd.to_numeric(cleaned['price'], errors='coerce')

    return cleaned.dropna(subset=['price'])

# Keep only listings whose price parses as a number (drops rows such as 'Zapytaj o cenę').
data = target_cleaner(data)

In [4]:
# Separate the regression target from the predictor columns.
y = data['price']
X = data.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Resolve the config relative to the project root instead of a machine-specific
# absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes config.toml sits directly in `project_root` — confirm.
config_path = Path(project_root) / "config.toml"

# TOML documents are UTF-8; be explicit so the platform default encoding
# (e.g. a Windows code page) is never used to decode the file.
with open(config_path, 'r', encoding='utf-8') as f:
    config = toml.load(f)

# Round-trip through JSON so the config consists of plain dicts/lists/scalars.
config = json.loads(json.dumps(config))

# Per-stage parameter sections; a missing section falls back to an empty dict.
dp_params = config.get('data_processing', {})
fe_params = config.get('feature_engineering', {})
fe_params["columns_to_scale"]

['area',
 'rooms',
 'floor',
 'year_built',
 'rent',
 'building_max_floor',
 'rooms_per_area',
 'district_freq',
 'heating_freq',
 'ownership_freq',
 'building_type_freq',
 'finish_status_freq']

In [6]:
# Fit the project-local DataProcessor on the training split only, then apply
# the fitted transformation to the test split.
preprocessor = DataProcessor(config)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Re-align the targets with the rows that are still present after preprocessing
# (the processor's log reports dropping rows with missing values, so X can be
# shorter than y at this point).
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]
X_test

2025-08-10 17:05:24,575 - RealEstateProject - INFO - Fit
2025-08-10 17:05:24,584 - RealEstateProject - INFO - DataProcessor: Learned median for 'year_built'.
2025-08-10 17:05:24,587 - RealEstateProject - INFO - DataProcessor: Learned median for 'rent'.
2025-08-10 17:05:24,589 - RealEstateProject - INFO - Transform
2025-08-10 17:05:24,604 - RealEstateProject - INFO - DataProcessor: 'rooms' transformed.
2025-08-10 17:05:24,607 - RealEstateProject - INFO - DataProcessor: 'building_type' filled.
2025-08-10 17:05:24,609 - RealEstateProject - INFO - DataProcessor: 'year_built' filled.
2025-08-10 17:05:24,609 - RealEstateProject - INFO - DataProcessor: 'rent' filled.
2025-08-10 17:05:24,617 - RealEstateProject - INFO - DataProcessor: 'na' i columns: ['market_type', 'furnished', 'district', 'finish_status', 'ownership', 'elevator', 'year_built']  dropped.
2025-08-10 17:05:24,619 - RealEstateProject - INFO - DataProcessor: 'heating' filled.
2025-08-10 17:05:24,645 - RealEstateProject - INFO - D

Unnamed: 0,area,rooms,floor,market_type,furnished,description,district,building_type,year_built,rent,finish_status,ownership,heating,elevator,building_max_floor,is_above_10_floor
21108,65.5000,4,3.0000,wtórny,Nie,"Prezentowane mieszkanie ma powierzchnię 65,5 m...",mokotów,block,1969.0000,1215.0000,to_renovation,limited_ownership,miejskie,Nie,4.0000,0
9324,64.3000,3,4.0000,wtórny,Nie,<p> Z przyjemnością prezentujemy Państwu ofe...,wola,unknown,2013.0000,1100.0000,ready_to_use,full_ownership,unknown,Nie,6.0000,0
13468,36.3400,2,0.0000,pierwotny,Nie,<strong>— 0% prowizji</strong><br><strong>— Be...,praga-południe,apartment,2009.0000,0.0000,to_completion,full_ownership,miejskie,Nie,5.0000,0
8250,117.2300,4,3.0000,pierwotny,Nie,<ul><li>4-pokojowe mieszkanie<strong> numer A....,mokotów,unknown,2027.0000,700.0000,to_completion,full_ownership,unknown,Nie,3.0000,0
20400,68.4800,3,8.0000,wtórny,Nie,<p>Sprzedam mieszkanie na Bielanach. Lokal got...,bielany,block,2007.0000,1053.0000,ready_to_use,full_ownership,miejskie,Nie,9.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17621,56.0300,3,7.0000,pierwotny,Nie,<p><p>Mieszkanie w tej prestiżowej realizacji ...,wola,apartment,2026.0000,0.0000,to_completion,full_ownership,miejskie,Nie,9.0000,0
20055,54.0000,2,8.0000,wtórny,Nie,"<p>Oferuję do sprzedaży atrakcyjne, ciche, dwu...",ursynów,block,2004.0000,700.0000,ready_to_use,full_ownership,miejskie,Nie,9.0000,0
3282,41.0000,3,0.0000,wtórny,Nie,"<p>Bezpośrednio, bez prowizji.</p>\r\n<p>Pośre...",żoliborz,block,1970.0000,680.0000,ready_to_use,limited_ownership,miejskie,Nie,11.0000,0
9854,43.0000,2,3.0000,wtórny,Nie,"<p>***Sprzedaż bezpośrednia, agencjom dziękuję...",wola,block,1961.0000,590.0000,ready_to_use,full_ownership,miejskie,Nie,3.0000,0


In [7]:
# Project-specific feature engineering: learned on the training split and
# replayed unchanged on the test split.
engineer = FeatureEngineer(config)
X_train = engineer.fit_transform(X_train)
X_test = engineer.transform(X_test)

# Standardise the configured columns using statistics from the training split only.
scaled_columns = fe_params["columns_to_scale"]
feature_scaler = StandardScaler()
X_train[scaled_columns] = feature_scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = feature_scaler.transform(X_test[scaled_columns])

# Standardise the target too. StandardScaler needs 2-D input, hence to_frame();
# from here on both targets are NumPy arrays of shape (n_samples, 1).
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.to_frame())
y_test = y_scaler.transform(y_test.to_frame())

X_test.head()

2025-08-10 17:05:24,714 - RealEstateProject - INFO - Fit
2025-08-10 17:05:24,721 - RealEstateProject - INFO - FeatureEngineer: Learned frequency map for 'district'.
2025-08-10 17:05:24,724 - RealEstateProject - INFO - FeatureEngineer: Learned frequency map for 'heating'.
2025-08-10 17:05:24,728 - RealEstateProject - INFO - FeatureEngineer: Learned frequency map for 'ownership'.
2025-08-10 17:05:24,730 - RealEstateProject - INFO - FeatureEngineer: Learned frequency map for 'building_type'.
2025-08-10 17:05:24,735 - RealEstateProject - INFO - FeatureEngineer: Learned frequency map for 'finish_status'.
2025-08-10 17:05:24,736 - RealEstateProject - INFO - Transform
2025-08-10 17:05:25,189 - RealEstateProject - INFO - FeatureEngineer: 'elevator' feature created.
2025-08-10 17:05:25,510 - RealEstateProject - INFO - FeatureEngineer: 'balcony' feature created.
2025-08-10 17:05:25,859 - RealEstateProject - INFO - FeatureEngineer: 'garage' feature created.
2025-08-10 17:05:26,301 - RealEstatePro

Unnamed: 0,area,rooms,floor,market_type,year_built,rent,building_max_floor,is_above_10_floor,garage,rooms_per_area,district_freq,heating_freq,ownership_freq,building_type_freq,finish_status_freq
21108,0.2356,1.4712,0.0243,0,-0.2967,0.3356,-0.5044,0,0,1.4313,1.6915,0.5971,-2.525,0.9153,-1.7773
9324,0.194,0.4378,0.4144,0,0.1459,0.2479,-0.0348,0,1,0.0731,0.672,-1.4732,0.3932,-1.6609,0.7054
13468,-0.7766,-0.5956,-1.1462,1,0.1056,-0.5907,-0.2696,0,1,0.8628,0.1884,0.5971,0.3932,-0.4061,-1.2075
8250,2.0313,1.4712,0.0243,1,0.2867,-0.057,-0.7392,0,0,-1.1082,1.6915,-1.4732,0.3932,-1.6609,-1.2075
20400,0.3391,0.4378,1.975,0,0.0855,0.2121,0.6696,0,1,-0.1953,-0.7223,0.5971,0.3932,0.9153,0.7054


In [8]:
X_test.shape

(2722, 15)

In [9]:
y_test.shape

(2722, 1)

## Model training

In [10]:
# One seed shared by every stochastic estimator so the comparison is reproducible.
model_seed = 42

# Candidate regressors, keyed by the label shown in the results table.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=model_seed)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=model_seed, n_jobs=-1)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=model_seed)),
    ('XGBoost Regressor', xgb.XGBRegressor(random_state=model_seed, n_jobs=-1)),
])

# Per-model metric records, filled in by the training loop.
results_list = []

In [11]:
# Start from an empty list so re-running this cell does not append duplicate rows.
results_list = []

# The scaled targets are (n_samples, 1) column vectors; scikit-learn regressors
# expect a 1-D target and some of them emit DataConversionWarning otherwise.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

for name, model in models.items():
    print(f"--- Model: {name} ---")

    # Time only the fit call, not prediction or scoring.
    start_time = time.time()
    model.fit(X_train, y_train_1d)
    end_time = time.time()
    training_time = end_time - start_time

    y_pred = model.predict(X_test)

    # Metrics are in standardised-target units because y was scaled beforehand.
    mae = mean_absolute_error(y_test_1d, y_pred)
    mse = mean_squared_error(y_test_1d, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_1d, y_pred)

    # Save results
    results_list.append({
        'Model': name,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2 Score': r2,
        'Training time (s)': training_time
    })

--- Model: Linear Regression ---
--- Model: Decision Tree Regressor ---
--- Model: Random Forest Regressor ---
--- Model: Gradient Boosting Regressor ---
--- Model: XGBoost Regressor ---


In [12]:
# Collect the per-model metric records into one table.
results_df = pd.DataFrame(results_list)

# Rank models best-first by R2 and renumber the rows from zero.
results_df_sorted = (
    results_df
    .sort_values(by='R2 Score', ascending=False)
    .reset_index(drop=True)
)

print("\n--- Results comparison ---")
results_df_sorted


--- Results comparison ---


Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,Training time (s)
0,Random Forest Regressor,0.1516,0.1522,0.3901,0.8587,0.9981
1,XGBoost Regressor,0.1553,0.1615,0.4018,0.8501,0.1491
2,Gradient Boosting Regressor,0.1961,0.1769,0.4206,0.8357,1.0172
3,Decision Tree Regressor,0.195,0.3134,0.5598,0.709,0.0672
4,Linear Regression,0.2972,0.3471,0.5891,0.6777,0.0096


## GridSearchCV for RF Regressor and XGB Regressor

In [13]:
print("--- Random Forest Regressor ---")

# 1. Param Grid (2 * 3 * 2 * 2 = 24 candidates)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 2. GridSearchCV instance
# NOTE(review): both the forest and the search request n_jobs=-1; check that
# this nested parallelism is intended on the target machine.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid_rf,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

# 3. Run
# The scaled target is a (n_samples, 1) column vector; flatten it so the forest
# receives the 1-D target it expects and does not emit DataConversionWarning on
# every fit.
grid_search_rf.fit(X_train, np.ravel(y_train))

# 4. Show best results
print("\nBest params for Random Forest Regressor:")
print(grid_search_rf.best_params_)
print(f"Najlepszy wynik R2 (cross-validation): {grid_search_rf.best_score_:.4f}")

--- Random Forest Regressor ---
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best params for Random Forest Regressor:
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Najlepszy wynik R2 (cross-validation): 0.8607


In [14]:
print("\n--- XGBoost Regressor ---")

# 1. Param Grid (2 * 3 * 3 * 2 * 2 = 72 candidates)
param_grid_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 1.0],
    colsample_bytree=[0.7, 1.0],
)

# 2. GridSearchCV instance: 5-fold cross-validation scored by R2.
xgb_base_estimator = xgb.XGBRegressor(random_state=42, n_jobs=-1)
grid_search_xgb = GridSearchCV(
    xgb_base_estimator,
    param_grid_xgb,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# 3. Run
grid_search_xgb.fit(X_train, y_train)

# 4. Show best results
best_cv_r2_xgb = grid_search_xgb.best_score_
print("\nBest params for XGBoost:")
print(grid_search_xgb.best_params_)
print(f"Najlepszy wynik R2 (cross-validation): {best_cv_r2_xgb:.4f}")


--- XGBoost Regressor ---
Fitting 5 folds for each of 72 candidates, totalling 360 fits

Best params for XGBoost:
{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Najlepszy wynik R2 (cross-validation): 0.8709


In [15]:
# Best estimators found by each grid search.
best_rf_model, best_xgb_model = (
    grid_search_rf.best_estimator_,
    grid_search_xgb.best_estimator_,
)

# Score each tuned model on the held-out test split.
y_pred_rf = best_rf_model.predict(X_test)
r2_rf_test = r2_score(y_test, y_pred_rf)

y_pred_xgb = best_xgb_model.predict(X_test)
r2_xgb_test = r2_score(y_test, y_pred_xgb)

print("\n--- Final results ---")
print(f"Random Forest - R2 Score: {r2_rf_test:.4f}")
print(f"XGBoost - R2 Score: {r2_xgb_test:.4f}")



--- Final results ---
Random Forest - R2 Score: 0.8578
XGBoost - R2 Score: 0.8595


## Feature importance

In [16]:
# Pair each training column with the importance the tuned XGBoost model assigned to it.
feature_names = X_train.columns
importances = best_xgb_model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Most important features first, rows renumbered from zero.
feature_importance_df_sorted = (
    feature_importance_df
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

print("Top 10 features for XGBoost:")
print(feature_importance_df_sorted.head(10))

Top 10 features for XGBoost:
              Feature  Importance
0               rooms      0.1836
1                area      0.1650
2  building_max_floor      0.1243
3      rooms_per_area      0.0958
4  building_type_freq      0.0734
5       district_freq      0.0687
6                rent      0.0570
7              garage      0.0544
8   is_above_10_floor      0.0421
9        heating_freq      0.0357
