In [80]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler




In [81]:
data= pd.read_csv(r"data\cleaned\properties.csv")

In [82]:
data.isnull().sum()

id                          0
price                       0
zip_code                    0
construction_year           0
total_area_sqm              0
                           ..
province_Luxembourg         0
province_Namur              0
province_Walloon Brabant    0
province_West Flanders      0
price_per_sqm               0
Length: 97, dtype: int64

In [83]:
data.isnull().sum().sum()

0

In [84]:
# Separate the features and target variable
X = data.drop("price", axis=1)  # Drop the target variable 'price'
y = data["price"]  # Set 'price' as the target variable

# Handle missing values
X.fillna("MISSING", inplace=True)  # Fill categorical NaNs with 'MISSING'
y.fillna(y.mean(), inplace=True)  # Fill target variable NaNs with the mean value

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Standardize numeric features (for numerical stability)
scaler = StandardScaler()
numeric_columns = X.select_dtypes(include=["float64", "int64"]).columns
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [86]:
#Configure and train the model on the training data:
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',  # Specifies regression with squared error
    n_estimators=1000,             # Number of boosting rounds; higher values increase training time but may improve performance
    learning_rate=0.05,            # Step size shrinkage to make the model more robust; 0.01–0.2 is typical
    max_depth=6,                   # Maximum depth of each tree; higher values make the model more complex
    subsample=0.8,                 # Fraction of samples to be used per tree; helps prevent overfitting
    colsample_bytree=0.8,          # Fraction of features to consider per tree; used for regularization
    random_state=42                # Sets the random seed for reproducibility
)



In [87]:
#Train the Model on the Training Data
xgb_regressor.fit(X_train, y_train)


In [88]:

# Instantiate the model
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)

# Fit the model
xg_reg.fit(X_train, y_train)

# Predict and evaluate
preds = xg_reg.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 100968.71257292773




In [89]:
(xg_reg.score(X_train, y_train)) * 100

98.72294635843295

In [90]:
(xg_reg.score(X_test, y_test)) * 100

94.99918721164545