In [39]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
import toml

pd.set_option('display.float_format', lambda x: f'{x:.4f}')
pd.set_option('future.no_silent_downcasting', True)

In [62]:
with open(r"C:\Users\Jakub\Real Estate Price Prediction\config.toml", 'r') as f:
    config = toml.load(f)

In [23]:
data = pd.read_csv(r"C:\Users\Jakub\Real Estate Price Prediction\data\cleaned\v1_2025_04_30.csv", sep=';')
data

Unnamed: 0,price,price_per_meter,area,rooms,floor,market_type,furnished,description,district,building_type,year_built,rent,finish_status,ownership,heating,elevator
0,944000.0,19667.0,48.00,2.0,parter/4,wtórny,Nie,<p> Z przyjemnością przedstawiam Państwu ...,śródmieście,tenement,1957.0,700.0,ready_to_use,full_ownership,miejskie,Nie
1,799000.0,12292.0,65.00,4.0,3/10,wtórny,Nie,"Mam przyjemność zaprezentować Państwu, mieszka...",praga-północ,block,1978.0,520.0,to_renovation,limited_ownership,miejskie,Nie
2,1993000.0,29011.2,46.88,2.0,5/8,pierwotny,Nie,<ul><li>2-pokojowe mieszkanie<strong> numer A....,wola,unknown,2026.0,700.0,to_completion,full_ownership,unknown,Nie
3,1333000.0,12250.9,134.00,5.0,1/2,wtórny,Nie,"<p>Na sprzedaż piękne, <strong>5- pokojowe, be...",ursynów,block,2016.0,700.0,ready_to_use,full_ownership,gazowe,Nie
4,1301000.0,23560.0,55.22,3.0,5/6,pierwotny,Nie,<ul><li>3-pokojowe mieszkanie<strong> numer AA...,wola,unknown,2026.0,700.0,to_completion,full_ownership,unknown,Nie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11040,890000.0,24205.0,36.77,2.0,3/7,wtórny,Nie,<p>Sprzedam bezpośrednio.</p>\r\n<p>2-pokojowe...,wola,block,2016.0,341.0,ready_to_use,full_ownership,miejskie,Nie
11041,525000.0,17500.0,30.00,1.0,5/7,wtórny,Nie,<p>Ładne mieszkanie do zamieszkania od zaraz ....,ursus,block,2008.0,700.0,ready_to_use,full_ownership,gazowe,Nie
11042,525000.0,14045.0,37.38,2.0,3/3,wtórny,Nie,<p>Sprzedaż bezpośrednia.</p>\r\n<p>2-pokojowe...,praga-północ,tenement,1950.0,700.0,ready_to_use,full_ownership,miejskie,Nie
11043,748467.0,17698.0,42.29,2.0,5/5,wtórny,Nie,<p>Oferta bezpośrednia nie współpracuje z biur...,bemowo,block,2025.0,700.0,to_renovation,full_ownership,unknown,Nie


In [89]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, config_feature = None):
        self.config = config_feature if config_feature is not None else {}

        self.freq_encode_cols = self.config['feature_engineering']['frequency_encode_cols']
        self.encodings = {}

        self.one_hot_encode_cols = self.config['feature_engineering']['one_hot_encode_cols']
        self.drop_original_onehot = self.config['feature_engineering']['drop_original_onehot']
        self.categories_ = {}
        
        self.luxury_quantile = self.config['feature_engineering']['luxury_quantile']
        self.price_per_meter_quantile_threshold = None

        self.binary_reduce_cols = self.config['feature_engineering']['binary_reduce_cols']

        self.cols_to_drop_final = self.config['feature_engineering']['columns_to_drop_final']

        self.floor_map = self.config['feature_engineering']['floor_map']

        self.top_building_types = self.config['feature_engineering']['top_building_types']

    def fit(self, X: pd.DataFrame, y=None):
        temp_X_for_fit = X.copy()
        
        #Preprocessing
        if 'building_type' in self.one_hot_encode_cols and 'building_type' in temp_X_for_fit.columns:
            temp_X_for_fit['building_type'] = temp_X_for_fit['building_type'].apply(
                lambda x: x if x in self.top_building_types else 'other'
            )
            print("FeatureEngineer: 'building_type' reduced for fit purposes (to learn correct OHE categories).")

        #Frequency encoding
        for col in self.freq_encode_cols:
            if col in X.columns:
                freq_map = X[col].value_counts(normalize=True).to_dict()
                self.encodings[col] = freq_map
                print(f"FeatureEngineer: Learned frequency map for '{col}'.")

        #One hot encoding
        for col in self.one_hot_encode_cols:
            if col in temp_X_for_fit.columns:
                self.categories_[col] = pd.Categorical(temp_X_for_fit[col]).categories.tolist()
                print(f"FeatureEngineer: Learned categories for '{col}': {self.categories_[col]}.")

        #Is_luxury threshold
        if 'price_per_meter' in X.columns:
            self.price_per_meter_quantile_threshold = X['price_per_meter'].quantile(self.luxury_quantile)
            print(f"FeatureEngineer: Learned luxury threshold for 'price_per_meter': {self.price_per_meter_quantile_threshold:.2f}")
        else:
            self.price_per_meter_quantile_threshold = 0

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df_transformed = X.copy()

        #--- static columns
        #1. process floor
        if 'floor' in df_transformed.columns:
            if not df_transformed['floor'].isnull().all():
                df_transformed[['floor', 'building_max_floor']] = df_transformed['floor'].astype(str).str.split('/', expand=True)

            df_transformed['floor'] = df_transformed['floor'].replace(self.floor_map)
            df_transformed['building_max_floor'] = pd.to_numeric(df_transformed['building_max_floor'], errors='coerce')

            df_transformed.loc[df_transformed['floor'] == 'poddasze', 'floor'] = df_transformed['building_max_floor'] + 1

            df_transformed['is_above_10_floor'] = df_transformed['floor'].astype(str).str.contains('>').astype(int)

            df_transformed['floor'] = df_transformed['floor'].astype(str).str.replace('>', '', regex=False)
            df_transformed['floor'] = pd.to_numeric(df_transformed['floor'], errors='coerce')

            df_transformed.loc[df_transformed['building_max_floor'] > 60, 'building_max_floor'] = np.nan

            if 'floor' in df_transformed.columns and 'building_max_floor' in df_transformed.columns:
                df_transformed = df_transformed.dropna(subset=['floor', 'building_max_floor'])

            print("FeatureEngineer: 'floor' and 'building_max_floor' processed.")

        #2. has_elevator_in_desc
        if 'description' in df_transformed.columns:
            has_elevator = df_transformed['description'].str.contains(r'\bwinda\w*\b', case=False, na=False)
            df_transformed.loc[has_elevator, 'elevator'] = 1
            df_transformed['elevator'] = df_transformed['elevator'].replace({'Tak': 1, 'Nie': 0})
            df_transformed['elevator'] = df_transformed['elevator'].fillna(0).astype(int)
            print("FeatureEngineer: 'elevator' feature created.")

        #3. has_balcony_in_desc
        if 'description' in df_transformed.columns:
            has_balcony = df_transformed['description'].str.contains(r'\b(balkon\w*|taras\w*)\b', case=False, na=False)
            df_transformed.loc[has_balcony, 'balcony'] = 1
            df_transformed['balcony'] = df_transformed['balcony'].fillna(0).astype(int) # Change to int
            print("FeatureEngineer: 'balcony' feature created.")

        # 4. has_garage_in_desc
        if 'description' in df_transformed.columns:
            has_garage = df_transformed['description'].str.contains(r'\bgaraż\w*\b', case=False, na=False)
            df_transformed.loc[has_garage, 'garage'] = 1
            df_transformed['garage'] = df_transformed['garage'].fillna(0).astype(int) # Change to int
            print("FeatureEngineer: 'garage' feature created.")

        # 5. has_furniture_in_desc
        if 'description' in df_transformed.columns:
            has_furniture = df_transformed['description'].str.contains(r'\bmeble\w*\b', case=False, na=False)
            df_transformed.loc[has_furniture, 'furnished'] = 1
            df_transformed['furnished'] = df_transformed['furnished'].replace({'Tak': 1, 'Nie': 0})
            df_transformed['furnished'] = df_transformed['furnished'].fillna(0).astype(int)
            print("FeatureEngineer: 'furnished' feature created.")

        # 6. create room_per_area col
        if "rooms" in df_transformed.columns and "area" in df_transformed.columns and not df_transformed['area'].eq(0).any():
            df_transformed["rooms_per_area"] = (df_transformed["rooms"] / df_transformed["area"]).astype(float)
            # Handle potential division by zero results
            df_transformed['rooms_per_area'].replace([np.inf, -np.inf], np.nan, inplace=True)
            df_transformed['rooms_per_area'].fillna(0, inplace=True) # Fill with 0 or median/mean
            print("FeatureEngineer: 'rooms_per_area' feature created.")

        # 7. reduce_to_binary
        for col, positive_val in self.binary_reduce_cols.items():
            if col in df_transformed.columns:
                df_transformed[col] = (df_transformed[col] == positive_val).astype(int)
                print(f"FeatureEngineer: Reduced '{col}' to binary.")

        # 8. reduce_building_type
        if 'building_type' in df_transformed.columns:
            top_types = self.top_building_types
            df_transformed['building_type'] = df_transformed['building_type'].apply(
                lambda x: x if x in top_types else 'other'
            )
            print("FeatureEngineer: 'building_type' reduced.")

        #required 'fit'
        # 9. create_luxury_col (wymaga price_per_meter i nauczonego progu)
        if "price_per_meter" in df_transformed.columns and self.price_per_meter_quantile_threshold is not None:
            df_transformed["is_luxury"] = (df_transformed["price_per_meter"] > self.price_per_meter_quantile_threshold).astype(int)
            print("FeatureEngineer: 'is_luxury' feature created.")

        #10. frequency_encoding
        for col in self.freq_encode_cols:
            if col in df_transformed.columns and col in self.encodings:
                df_transformed[f'{col}_freq'] = df_transformed[col].map(self.encodings[col]).fillna(0)
                print(f"FeatureEngineer: Applied frequency encoding for '{col}'.")

        #11. one_hot_encoding
        for col in self.one_hot_encode_cols:
            if col in df_transformed.columns and col in self.categories_:
                df_transformed[col] = pd.Categorical(df_transformed[col], categories=self.categories_[col])
                
                dummies = pd.get_dummies(df_transformed[col], prefix=col, 
                                         drop_first=self.drop_original_onehot, dtype=int)
                df_transformed = pd.concat([df_transformed, dummies], axis=1)
                
                if self.drop_original_onehot:
                    df_transformed.drop(columns=[col], inplace=True)
                print(f"FeatureEngineer: Applied One-Hot Encoding for '{col}'.")

        #12. drop columns
        cols_to_actually_drop = [c for c in self.cols_to_drop_final if c in df_transformed.columns]
        if cols_to_actually_drop:
            df_transformed = df_transformed.drop(cols_to_actually_drop, axis=1)
            print(f"FeatureEngineer: Dropped columns: {cols_to_actually_drop}")

        df_transformed = df_transformed.reset_index(drop=True)

        print("FeatureEngineer: Transformation complete.")
        return df_transformed


In [90]:
X = data.drop('price', axis=1)
y = data['price']

engineer = FeatureEngineer(config)
engineer.fit(X)
X = engineer.transform(X)

FeatureEngineer: 'building_type' reduced for fit purposes (to learn correct OHE categories).
FeatureEngineer: Learned frequency map for 'district'.
FeatureEngineer: Learned categories for 'finish_status': ['ready_to_use', 'to_completion', 'to_renovation'].
FeatureEngineer: Learned categories for 'building_type': ['apartment', 'block', 'other', 'tenement'].
FeatureEngineer: Learned luxury threshold for 'price_per_meter': 25440.40
FeatureEngineer: 'floor' and 'building_max_floor' processed.
FeatureEngineer: 'elevator' feature created.


  has_balcony = df_transformed['description'].str.contains(r'\b(balkon\w*|taras\w*)\b', case=False, na=False)


FeatureEngineer: 'balcony' feature created.
FeatureEngineer: 'garage' feature created.
FeatureEngineer: 'furnished' feature created.
FeatureEngineer: 'rooms_per_area' feature created.
FeatureEngineer: Reduced 'heating' to binary.
FeatureEngineer: Reduced 'market_type' to binary.
FeatureEngineer: Reduced 'ownership' to binary.
FeatureEngineer: 'building_type' reduced.
FeatureEngineer: 'is_luxury' feature created.
FeatureEngineer: Applied frequency encoding for 'district'.
FeatureEngineer: Applied One-Hot Encoding for 'finish_status'.
FeatureEngineer: Applied One-Hot Encoding for 'building_type'.
FeatureEngineer: Dropped columns: ['description', 'district']
FeatureEngineer: Transformation complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_transformed['rooms_per_area'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_transformed['rooms_per_area'].fillna(0, inplace=True) # Fill with 0 or median/mean


In [91]:
X

Unnamed: 0,price_per_meter,area,rooms,floor,market_type,furnished,year_built,rent,ownership,heating,...,balcony,garage,rooms_per_area,is_luxury,district_freq,finish_status_to_completion,finish_status_to_renovation,building_type_block,building_type_other,building_type_tenement
0,19667.0000,48.0000,2.0000,0.0000,0,1,1957.0000,700.0000,1,1,...,0,0,0.0417,0,0.0856,0,0,0,0,1
1,12292.0000,65.0000,4.0000,3.0000,0,0,1978.0000,520.0000,0,1,...,0,0,0.0615,0,0.0321,0,1,1,0,0
2,29011.2000,46.8800,2.0000,5.0000,1,0,2026.0000,700.0000,1,0,...,1,1,0.0427,1,0.1064,1,0,0,1,0
3,12250.9000,134.0000,5.0000,1.0000,0,0,2016.0000,700.0000,1,0,...,1,0,0.0373,0,0.0488,0,0,1,0,0
4,23560.0000,55.2200,3.0000,5.0000,1,0,2026.0000,700.0000,1,0,...,1,1,0.0543,0,0.1064,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10451,24205.0000,36.7700,2.0000,3.0000,0,0,2016.0000,341.0000,1,1,...,1,0,0.0544,0,0.1064,0,0,1,0,0
10452,17500.0000,30.0000,1.0000,5.0000,0,0,2008.0000,700.0000,1,0,...,0,0,0.0333,0,0.0608,0,0,1,0,0
10453,14045.0000,37.3800,2.0000,3.0000,0,0,1950.0000,700.0000,1,1,...,0,0,0.0535,0,0.0321,0,0,0,0,1
10454,17698.0000,42.2900,2.0000,5.0000,0,0,2025.0000,700.0000,1,0,...,1,1,0.0473,0,0.0457,0,1,1,0,0


In [92]:
X.columns

Index(['price_per_meter', 'area', 'rooms', 'floor', 'market_type', 'furnished',
       'year_built', 'rent', 'ownership', 'heating', 'elevator',
       'building_max_floor', 'is_above_10_floor', 'balcony', 'garage',
       'rooms_per_area', 'is_luxury', 'district_freq',
       'finish_status_to_completion', 'finish_status_to_renovation',
       'building_type_block', 'building_type_other', 'building_type_tenement'],
      dtype='object')

In [82]:
test = pd.read_csv(r"C:\Users\Jakub\Real Estate Price Prediction\data\processed\v1_2025_04_30.csv", sep=';')
test

Unnamed: 0,price,price_per_meter,area,rooms,floor,furnished,year_built,rent,elevator,building_max_floor,...,rooms_per_area,district_heating,full_ownership,primary_market,district_freq,finish_status_to_completion,finish_status_to_renovation,building_type_block,building_type_other,building_type_tenement
0,-0.2112,0.2875,-0.3769,-0.5892,-1.1451,1,-0.4054,-0.0595,0,-0.4993,...,0.0417,1,1,0,0.0876,0,0,0,0,1
1,-0.4864,-1.3168,0.1995,1.4686,0.0161,0,-0.1990,-0.6125,0,0.8673,...,0.0615,1,0,0,0.0322,0,1,1,0,0
2,1.7805,2.3202,-0.4149,-0.5892,0.7903,0,0.2729,-0.0595,0,0.4118,...,0.0427,0,1,1,0.1101,1,0,0,1,0
3,0.5274,-1.3258,2.5389,2.4974,-0.7580,0,0.1746,-0.0595,0,-0.9548,...,0.0373,0,1,0,0.0495,0,0,1,0,0
4,0.4666,1.1344,-0.1321,0.4397,0.7903,0,0.2729,-0.0595,0,-0.0438,...,0.0543,0,1,1,0.1101,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10450,-0.3137,1.2747,-0.7577,-0.5892,0.0161,0,0.1746,-1.1625,0,0.1840,...,0.0544,1,1,0,0.1101,0,0,1,0,0
10451,-1.0067,-0.1839,-0.9872,-1.6180,0.7903,0,0.0959,-0.0595,0,0.1840,...,0.0333,0,1,0,0.0332,0,0,1,0,0
10452,-1.0067,-0.9355,-0.7370,-0.5892,0.0161,0,-0.4743,-0.0595,0,-0.7271,...,0.0535,1,1,0,0.0322,0,0,0,0,1
10453,-0.5824,-0.1408,-0.5705,-0.5892,0.7903,0,0.2630,-0.0595,0,-0.2715,...,0.0473,0,1,0,0.0475,0,1,1,0,0


In [83]:
test.columns

Index(['price', 'price_per_meter', 'area', 'rooms', 'floor', 'furnished',
       'year_built', 'rent', 'elevator', 'building_max_floor',
       'is_above_10_floor', 'balcony', 'garage', 'is_luxury', 'rooms_per_area',
       'district_heating', 'full_ownership', 'primary_market', 'district_freq',
       'finish_status_to_completion', 'finish_status_to_renovation',
       'building_type_block', 'building_type_other', 'building_type_tenement'],
      dtype='object')