In [1]:
import pandas as pd
import plotly.express as px

In [4]:
df = pd.read_parquet('../../01-data-processing/data/data_numerical.parquet', engine='pyarrow')
# Drop address
df.drop(columns='address', inplace=True)
df.head(2)

Unnamed: 0_level_0,price,construction year,building condition,asbestos certificate,living area,bedrooms,bathrooms,toilets,primary energy consumption,energy class,...,planning permission obtained,subdivision permit,possible priority purchase right,non-flood zone,g-score,shared building,surface of the plot,sewer network connection,designated land use,double glazing
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bff0933c-8706-450e-be72-df1c836eb396,765000.0,2015.0,1.0,0.0,171.0,2.0,1.0,2.0,102.0,4.0,...,1,0.0,1.0,1,1.0,0.0,,,,
48103edf-d945-4052-a349-31a1bdee8300,321477.0,,,,,,,,,,...,0,,,0,,,,,,


In [5]:
categorical_variables = ['construction year', 'asbestos certificate', 
                         'shared building', 'bedrooms', 'building condition', 
                         'possible priority purchase right', 
                         'inspection report of the electrical installation', 
                         'subdivision permit', 'sewer network connection', 
                         'planning permission obtained', 'non-flood zone', 
                         'g-score', 'double glazing', 'energy class', 
                         'bathrooms', 'toilets', 'designated land use']

for col in categorical_variables:
    df[col] = df[col].astype('category')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 870 entries, bff0933c-8706-450e-be72-df1c836eb396 to 6a3ae3f6-0901-46fa-836e-22e06d444738
Data columns (total 23 columns):
 #   Column                                            Non-Null Count  Dtype   
---  ------                                            --------------  -----   
 0   address                                           870 non-null    object  
 1   price                                             870 non-null    float64 
 2   construction year                                 523 non-null    category
 3   building condition                                686 non-null    category
 4   asbestos certificate                              561 non-null    category
 5   living area                                       749 non-null    float64 
 6   bedrooms                                          773 non-null    category
 7   bathrooms                                         747 non-null    category
 8   toilets                    

In [6]:
for col in df:
    if df[col].dtype == 'category':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [8]:
df_standardized = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)
features = df_standardized.drop(columns='price')
response = df_standardized['price']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    features, response,
    test_size=0.2,       
    random_state=2025,       
)

In [14]:
features.shape[1]

20

# Determine optimal k (number of features)

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor

In [18]:
results = []
for k in range(1, features.shape[1] + 1):
    selector = SelectKBest(score_func=mutual_info_regression, k=k)
    X_selected = selector.fit_transform(features, response)
    score = cross_val_score(RandomForestRegressor(), X_selected, response, cv=10, scoring='r2').mean()
    results.append((k, score))

# Present results as df for easier manipulation
df = pd.DataFrame(results, columns=["k", "cv_score"])
df.sort_values("cv_score", ascending=False)

Unnamed: 0,k,cv_score
7,8,0.568193
17,18,0.564655
6,7,0.559205
9,10,0.556292
8,9,0.555449
13,14,0.554295
5,6,0.553022
12,13,0.551629
11,12,0.550106
4,5,0.550025


In [None]:
# import matplotlib.pyplot as plt


# plt.plot(df['k'], df['cv_score'], marker='o')
# plt.xticks(list(range(0,24,2)))
# plt.xlabel("Number of Features (k)")
# plt.ylabel("Cross-Validated R² Score")
# plt.title("Feature Selection Performance vs k")
# plt.axvline(x=6, color='red', linestyle='--', linewidth=1.5) 
# plt.show()

In [19]:
fig = px.line(
    df, x='k', y='cv_score', markers=True,
    title='Feature Selection Performance with Mode and Median Imputation', 
    labels={
        'k': 'Number of Features (k)',
        'cv_score': 'R² Score'
    }
)
fig.add_vline(
    x=5,
    line_dash="dash",
    line_color="#EF553B",
    annotation_text="Best k = 5",
    annotation_position="top right"
)
fig.update_layout(xaxis=dict(showgrid=False), 
                  yaxis=dict(showgrid=False))
# fig.update_traces(line=dict(color='blue'))
fig.show()

# Modelling

In [20]:
# algorithms
from sklearn.linear_model import LinearRegression
from xgboost import XGBRFRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor


# evaluation metrics
from sklearn.model_selection import cross_val_score
import time

In [22]:
# Select best features
kbest = SelectKBest(score_func=mutual_info_regression, k=5)
X_kbest = kbest.fit_transform(features, response)
features.columns[kbest.get_support()]

Index(['living area', 'bedrooms', 'toilets', 'energy class',
       'surface of the plot'],
      dtype='object')

In [24]:
kbest_df = pd.DataFrame(X_kbest, columns=features.columns[kbest.get_support()])
kbest_df.head(2)

Unnamed: 0,living area,bedrooms,toilets,energy class,surface of the plot
0,-0.169168,-0.942835,0.170208,-0.646085,-0.155072
1,-0.153113,-0.167536,0.170208,-0.646085,-0.155072


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    kbest_df, response,
    test_size=0.2,       
    random_state=2025,       
)

In [27]:
models = {'Multiple Linear Regression': LinearRegression(), 
          'XGBoost-Random Forest': XGBRFRegressor(random_state=2025), 
          'Random Forest': RandomForestRegressor(),
          'CatBoost': CatBoostRegressor(verbose=0, random_state=2025)
}

results = []
for name, model in models.items():
    start = time.time()

    cod = cross_val_score(model, X_train, y_train, cv=10, scoring='r2').mean()
    rmse = (-cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())**0.5
    mae = (-cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())
    
    end = time.time()
    duration=end-start

    results.append({
        'Model': name,
        'RMSE': round(rmse,3),
        'MAE': round(mae,3),
        'R²': round(cod,3),
        'Training Time (s)': round(duration, 3)
    })

# Create DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,RMSE,MAE,R²,Training Time (s)
0,Multiple Linear Regression,0.857,0.5,0.246,0.076
1,XGBoost-Random Forest,0.704,0.445,0.51,2.28
2,Random Forest,0.678,0.421,0.529,3.27
3,CatBoost,0.662,0.417,0.563,9.826


# Visualization

In [29]:
import plotly.express as px
import pandas as pd

In [28]:
metrics_df = results_df.melt(id_vars='Model', value_vars=['RMSE', 'MAE', 'R²'],
                             var_name='Metric', value_name='Score')

fig = px.bar(metrics_df, x='Model', y='Score', color='Metric', barmode='group',
             title='Model Performance Comparison with Median and Mode Imputation',
             text='Score')
fig.update_traces(textposition='outside')
fig.update_yaxes(range=[0,1])
# fig.update_layout(width=800, height=500)
fig.show()

In [31]:
fig = px.scatter(results_df,
                 x='Training Time (s)',
                 y='R²',
                 text='Model',
                 title='Training Time vs. R²',
                 labels={'Training Time (s)': 'Training Time (seconds)', 'RMSE': 'RMSE'})
                #  hover_data=['MAE', 'R²'])

fig.update_traces(textposition='top center', marker=dict(size=12))
fig.update_xaxes(range=[-2,12])
fig.update_layout(xaxis=dict(showgrid=False))

fig.show()