In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [84]:
df= pd.read_csv(r"data\cleaned\properties.csv")

In [85]:
df.head()

Unnamed: 0,id,price,zip_code,construction_year,total_area_sqm,nbr_bedrooms,equipped_kitchen,fl_furnished,terrace_sqm,garden_sqm,...,province_East Flanders,province_Flemish Brabant,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,price_per_sqm
0,34221000,225000.0,2050,1963.0,-0.463249,2,INSTALLED,0,-0.208716,-0.135719,...,False,False,False,False,False,False,False,False,False,-485700.2
1,58496000,501000.0,2275,2024.0,0.200771,3,MISSING,0,-0.328667,-0.135719,...,False,False,False,False,False,False,False,False,False,2495376.27
2,48727000,982700.0,1410,2022.0,0.063388,2,HYPER_EQUIPPED,0,0.151135,-0.024745,...,False,False,False,False,False,False,False,True,False,15502966.89
3,58028000,3500000.0,8300,1935.0,0.887689,6,MISSING,0,-0.328667,-0.135719,...,False,False,False,False,False,False,False,False,True,3942824.01
4,10079000,549000.0,9240,2001.0,0.185506,4,HYPER_EQUIPPED,0,-0.328667,-0.135719,...,True,False,False,False,False,False,False,False,False,2959465.09


In [86]:
df.isnull().sum()

id                          0
price                       0
zip_code                    0
construction_year           0
total_area_sqm              0
                           ..
province_Luxembourg         0
province_Namur              0
province_Walloon Brabant    0
province_West Flanders      0
price_per_sqm               0
Length: 97, dtype: int64

In [87]:
df.isnull().sum().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 97 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     20758 non-null  int64  
 1   price                                  20758 non-null  float64
 2   zip_code                               20758 non-null  int64  
 3   construction_year                      20758 non-null  float64
 4   total_area_sqm                         20758 non-null  float64
 5   nbr_bedrooms                           20758 non-null  int64  
 6   equipped_kitchen                       20758 non-null  object 
 7   fl_furnished                           20758 non-null  int64  
 8   terrace_sqm                            20758 non-null  float64
 9   garden_sqm                             20758 non-null  float64
 10  fl_floodzone                           20758 non-null  int64  
 11  pr

In [88]:
# Separate the features and target variable
X = df.drop("price", axis=1)  # Drop the target variable 'price'
y = df["price"]  # Set 'price' as the target variable

# Handle missing values
X.fillna("MISSING", inplace=True)  # Fill categorical NaNs with 'MISSING'
y.fillna(y.mean(), inplace=True)  # Fill target variable NaNs with the mean value

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Standardize numeric features (for numerical stability)
scaler = StandardScaler()
numeric_columns = X.select_dtypes(include=["float64", "int64"]).columns
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

In [89]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [90]:
# Initialize the Decision Tree Regressor with optional parameters
regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=5, random_state=42)

# Fit the model
regressor.fit(X_train, y_train)


In [91]:
(regressor.score(X_train, y_train)) * 100

97.9118437734963

In [92]:
(regressor.score(X_test, y_test)) * 100

92.83717323420848

In [93]:
# Make predictions
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 14602172975.310751


In [94]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

14602172975.310751

In [95]:
np.sqrt(mean_squared_error(y_test,y_pred))

120839.45123721288

In [96]:
from sklearn.model_selection import GridSearchCV

# Set up the parameter grid
param_grid = {
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_regressor = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_regressor.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)


Best Model Mean Squared Error: 10586936381.54668
