In [1]:
import pandas as pd
import plotly.express as px

In [2]:
df = pd.read_parquet('../data/data_numerical.parquet', engine='pyarrow')
# Drop address
df.drop(columns='address', inplace=True)
df.head(2)

Unnamed: 0_level_0,price,construction year,building condition,asbestos certificate,living area,bedrooms,bathrooms,toilets,primary energy consumption,energy class,...,planning permission obtained,subdivision permit,possible priority purchase right,non-flood zone,g-score,shared building,surface of the plot,sewer network connection,designated land use,double glazing
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bff0933c-8706-450e-be72-df1c836eb396,765000.0,2015.0,1.0,0.0,171.0,2.0,1.0,2.0,102.0,4.0,...,1,0.0,1.0,1,1.0,0.0,,,,
48103edf-d945-4052-a349-31a1bdee8300,321477.0,,,,,,,,,,...,0,,,0,,,,,,


# Imputation

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor

In [4]:
df_standardized = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)
X_stand = df_standardized.drop(columns='price')
y_stand = df_standardized['price']

In [5]:
# Build pipeline
pipeline_impute = Pipeline([
    ('imputer', KNNImputer()),
    ('model', XGBRFRegressor())]) # insert model of choice

# Determine optimal k for imputation
param_grid = {'imputer__n_neighbors': list(range(1,5))}

grid = GridSearchCV(pipeline_impute, param_grid, cv=10, scoring='neg_mean_squared_error')
grid.fit(X_stand, y_stand)

print("Best n_neighbors for KNNImputer:", grid.best_params_['imputer__n_neighbors'])
print("Best RMSE:", round((-grid.best_score_)**0.5,3))

Best n_neighbors for KNNImputer: 4
Best RMSE: 0.667


In [6]:
results_df = pd.DataFrame(grid.cv_results_)
results_df = results_df[['param_imputer__n_neighbors', 'mean_test_score']].copy()
results_df['RMSE'] = (-results_df['mean_test_score']) ** 0.5

In [7]:
# Determine number of neigbors based on R^2
fig = px.line(
    results_df,
    x='param_imputer__n_neighbors',
    y='RMSE',
    # markers=True,
    title='Model Performance vs. Number of Neighbors',
    labels={'param_imputer__n_neighbors': 'Number of Neigbors (k)', 'RMSE': 'Cross-validated RMSE'}
)
fig.update_xaxes(tick0=1, dtick=1, range=[0,5])    
fig.show()  

In [8]:
# Impute with KNN and k=4
pipeline = make_pipeline(KNNImputer(n_neighbors=4)
)

df_imputed = pd.DataFrame(pipeline.fit_transform(df_standardized), columns=df.columns)
df_imputed

Unnamed: 0,price,construction year,building condition,asbestos certificate,living area,bedrooms,bathrooms,toilets,primary energy consumption,energy class,...,planning permission obtained,subdivision permit,possible priority purchase right,non-flood zone,g-score,shared building,surface of the plot,sewer network connection,designated land use,double glazing
0,1.236413,1.035587,-1.704509,-1.095445,-0.180256,-0.910142,-0.341810,0.199653,-0.079678,-0.795582,...,0.818453,-0.602303,2.472319,0.692526,-0.524604,-0.263237,-0.237682,0.154471,-0.471694,0.201285
1,-0.503011,0.129029,0.733365,0.410792,-0.163465,0.554041,-0.341810,0.774331,0.025463,1.229257,...,-1.221817,1.094645,-0.404479,-1.443990,-0.266026,0.752289,-0.045844,0.154471,-0.342735,0.201285
2,2.079607,0.063852,0.068490,0.912871,0.133170,1.286132,-0.341810,0.199653,-0.058650,-0.217056,...,0.818453,1.094645,-0.404479,0.692526,-0.524604,3.798865,1.891646,0.154471,-0.471694,0.201285
3,-0.018574,1.201493,-1.704509,0.410792,-0.822033,-0.910142,-0.341810,-0.949702,-0.075766,-1.374107,...,0.818453,-0.602303,2.472319,0.692526,-0.007447,-0.263237,-0.246484,0.154471,-0.471694,0.201285
4,-0.469586,-1.713713,0.068490,-0.593366,-0.575770,-0.178051,-0.341810,-0.949702,-0.069083,-0.795582,...,-1.221817,-0.602303,-0.404479,0.692526,-0.524604,-0.263237,-0.357206,0.154471,0.044145,0.201285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,-0.332321,-0.546445,-1.704509,-0.593366,-0.225031,0.554041,0.928499,1.349008,-0.065660,-0.795582,...,0.818453,-0.602303,-0.404479,0.692526,1.544025,3.798865,-0.318799,0.154471,-0.471694,0.201285
866,-0.685286,-1.263396,0.068490,0.912871,-0.896658,-1.642234,-0.341810,-0.949702,-0.063867,-0.795582,...,0.818453,-0.602303,2.472319,0.692526,-0.524604,-0.263237,-0.342803,0.154471,-0.471694,0.201285
867,-0.669599,-1.144892,1.841490,-1.095445,-0.336969,-0.178051,0.928499,-0.949702,-0.033547,0.361469,...,-1.221817,-0.602303,-0.404479,0.692526,-0.524604,-0.263237,-0.281591,0.154471,-0.471694,0.201285
868,0.534847,0.395664,0.068490,-1.095445,0.103319,-0.178051,-0.341810,0.199653,-0.036807,-0.650950,...,-1.221817,1.094645,-0.404479,-1.443990,-0.266026,-0.263237,-0.127560,0.154471,0.044145,0.201285


In [9]:
# prep
from sklearn.model_selection import train_test_split

# feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import cross_val_score


In [10]:
X_imputed = df_imputed.drop(columns='price')
y_imputed = df_imputed['price']

X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y_imputed,
    test_size=0.2,        
    random_state=2025,       
)

In [11]:
results = []
for k in range(1, X_train.shape[1] + 1):
    selector = SelectKBest(score_func=mutual_info_regression, k=k)
    X_selected = selector.fit_transform(X_imputed, y_imputed)
    score = cross_val_score(RandomForestRegressor(), X_selected, y_stand, cv=10, scoring='r2').mean() # insert model of choice
    results.append((k, score))

# Pass results to df for easier manipulation
df = pd.DataFrame(results, columns=["k", "cv_score"])
df.sort_values("cv_score", ascending=False)

Unnamed: 0,k,cv_score
12,13,0.636337
16,17,0.635753
11,12,0.63301
7,8,0.632796
6,7,0.631413
19,20,0.627061
13,14,0.626919
8,9,0.626518
5,6,0.625687
14,15,0.625032


In [12]:
fig = px.line(
    df, x='k', y='cv_score', markers=True,
    title='Feature Selection with KNN', 
    labels={
        'k': 'Number of Features (k)',
        'cv_score': 'R² Score'
    }
)
fig.update_layout(xaxis=dict(showgrid=False), 
                  yaxis=dict(showgrid=False))
# fig.update_traces(line=dict(color='blue'))
fig.add_vline(
    x=5,
    line_dash="dash",
    line_color="#EF553B",
    annotation_text="Best k = 5",
    annotation_position="top right"
)

fig.show()

In [13]:
# best k=5
kbest = SelectKBest(score_func=mutual_info_regression, k=5)
X_kbest = kbest.fit_transform(X_imputed, y_imputed)
X_stand.columns[kbest.get_support()]

Index(['construction year', 'living area', 'bedrooms', 'energy class',
       'surface of the plot'],
      dtype='object')

In [14]:
kbest_df = pd.DataFrame(X_kbest, columns=X_stand.columns[kbest.get_support()])
kbest_df.head(2)

Unnamed: 0,construction year,living area,bedrooms,energy class,surface of the plot
0,1.035587,-0.180256,-0.910142,-0.795582,-0.237682
1,0.129029,-0.163465,0.554041,1.229257,-0.045844


# Modeling

In [15]:
# algorithms
from sklearn.linear_model import LinearRegression
from xgboost import XGBRFRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

# evaluation
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import time 

from sklearn.model_selection import cross_val_score


In [17]:
models = {'Multiple Linear Regression': LinearRegression(), 
          'XGBoost-Random Forest': XGBRFRegressor(random_state=2025), 
          'Random Forest': RandomForestRegressor(),
          'CatBoost': CatBoostRegressor(verbose=0, random_state=2025)
}

model_results = []
for name, model in models.items():
    start = time.time()

    cod = cross_val_score(model, X_train, y_train, cv=10, scoring='r2').mean()
    rmse = (-cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())**0.5
    mae = (-cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())
    
    end = time.time()
    duration=end-start

    model_results.append({
        'Model': name,
        'RMSE': round(rmse,3),
        'MAE': round(mae,3),
        'R²': round(cod,3),
        'Training Time (s)': round(duration, 5)
    })

# Create DataFrame
model_results_df = pd.DataFrame(model_results)
model_results_df

Unnamed: 0,Model,RMSE,MAE,R²,Training Time (s)
0,Multiple Linear Regression,0.875,0.493,0.164,0.13461
1,XGBoost-Random Forest,0.654,0.41,0.58,4.13647
2,Random Forest,0.622,0.368,0.621,6.95354
3,CatBoost,0.619,0.363,0.617,13.35801


In [18]:
# Assume `results_df` is already created
metrics_df = model_results_df.melt(id_vars='Model', value_vars=['RMSE', 'MAE', 'R²'],
                             var_name='Metric', value_name='Score')

fig = px.bar(metrics_df, x='Model', y='Score', color='Metric', barmode='group',
             title='Model Performance Comparison with KNN Imputation',
             text='Score')
fig.update_traces(textposition='outside')
fig.update_yaxes(range=[0,1])
fig.show()