In [1]:
import pandas as pd

In [3]:
df = pd.read_parquet('/Users/vancescadinh/Documents/house-price-prediction/01-data-processing/data/data_numerical.parquet', engine='pyarrow')
df.head(2)

Unnamed: 0_level_0,address,price,construction year,building condition,asbestos certificate,living area,bedrooms,bathrooms,toilets,primary energy consumption,...,planning permission obtained,subdivision permit,possible priority purchase right,non-flood zone,g-score,shared building,surface of the plot,sewer network connection,designated land use,double glazing
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bff0933c-8706-450e-be72-df1c836eb396,vrijdagmarkt 61 box 305 ...,765000.0,2015.0,1.0,0.0,171.0,2.0,1.0,2.0,102.0,...,1,0.0,1.0,1,1.0,0.0,,,,
48103edf-d945-4052-a349-31a1bdee8300,gitsestraat 545 8800 — r...,321477.0,,,,,,,,,...,0,,,0,,,,,,


In [4]:
categorical_variables = ['construction year', 'asbestos certificate', 
                         'shared building', 'bedrooms', 'building condition', 
                         'possible priority purchase right', 
                         'inspection report of the electrical installation', 
                         'subdivision permit', 'sewer network connection', 
                         'planning permission obtained', 'non-flood zone', 
                         'g-score', 'double glazing', 'energy class', 
                         'bathrooms', 'toilets', 'designated land use']

for col in categorical_variables:
    df[col] = df[col].astype('category')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 870 entries, bff0933c-8706-450e-be72-df1c836eb396 to 6a3ae3f6-0901-46fa-836e-22e06d444738
Data columns (total 23 columns):
 #   Column                                            Non-Null Count  Dtype   
---  ------                                            --------------  -----   
 0   address                                           870 non-null    object  
 1   price                                             870 non-null    float64 
 2   construction year                                 523 non-null    category
 3   building condition                                686 non-null    category
 4   asbestos certificate                              561 non-null    category
 5   living area                                       749 non-null    float64 
 6   bedrooms                                          773 non-null    category
 7   bathrooms                                         747 non-null    category
 8   toilets                    

In [6]:
# Drop address
df.drop(columns='address', inplace=True)

In [7]:
for col in df:
    if df[col].dtype == 'category':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

# Model: Attempt 1

In [37]:
# feature selection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# algorithms
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRFRegressor
from sklearn.ensemble import RandomForestRegressor


# evaluation metrics
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

In [24]:
df_standardized = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)

In [26]:
X = df_standardized.drop(columns='price')
y = df_standardized['price']

# Standardize
X_normalized = StandardScaler().fit_transform(X)

# Select best features
kbest = SelectKBest(score_func=f_regression, k=12)
X_kbest = kbest.fit_transform(X_normalized, y)
X.columns[kbest.get_support()]

Index(['building condition', 'asbestos certificate', 'living area', 'bedrooms',
       'bathrooms', 'toilets', 'energy class', 'non-flood zone', 'g-score',
       'surface of the plot', 'designated land use', 'double glazing'],
      dtype='object')

In [27]:
kbest_df = pd.DataFrame(X_kbest, columns=X.columns[kbest.get_support()])
kbest_df.head(2)

Unnamed: 0,building condition,asbestos certificate,living area,bedrooms,bathrooms,toilets,energy class,non-flood zone,g-score,surface of the plot,designated land use,double glazing
0,-1.934891,-1.552986,-0.169168,-0.942835,-0.314143,0.170208,-0.646085,0.692526,-0.428355,-0.155072,-0.389972,0.16107
1,0.060788,0.643921,-0.153113,-0.167536,-0.314143,0.170208,-0.646085,-1.44399,-0.428355,-0.155072,-0.389972,0.16107


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    kbest_df, y,
    test_size=0.2,       
    random_state=2025,       
)

In [38]:
models = [LinearRegression(), Lasso(), Ridge(),
        XGBRFRegressor(enable_categorical=True, random_state=2025), RandomForestRegressor()]


for i in range(len(models)):
    models[i].fit(X_train, y_train)
    print(f'{models[i]}: ')

    y_predictions = models[i].predict(X_test)

    rmse = root_mean_squared_error(y_test, y_predictions)
    mae = mean_absolute_error(y_test, y_predictions)
    r2 = r2_score(y_test, y_predictions)

    print(f'RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}')

LinearRegression(): 
RMSE: 0.74, MAE: 0.47, R²: 0.48
Lasso(): 
RMSE: 1.03, MAE: 0.67, R²: -0.00
Ridge(): 
RMSE: 0.74, MAE: 0.47, R²: 0.48
XGBRFRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bytree=None, device=None,
               early_stopping_rounds=None, enable_categorical=True,
               eval_metric=None, feature_types=None, feature_weights=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, objective='reg:squarederror',
               random_state=2025, ...): 
RMSE: 0.66, MAE: 0.41, R²: 0.58
RandomForestRegressor(): 
RMSE: 0.70