In [5]:
import data_preparation_functions as dpf
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import time
import pickle
import json

### Read DataFrame

In [6]:
# Location of the raw listings snapshot. Directory and file name are kept as
# separate constants so the notebook can be pointed at another location or
# another monthly file by editing one value; the joined path is identical to
# the literal that was used before.
DATA_DIR = "/content/drive/MyDrive/Data Analysis/Apartments-Prices-in-Poland"
DATA_FILE = "apartments_pl_2024_06.csv"
path = f"{DATA_DIR}/{DATA_FILE}"

# Load the raw table into memory.
df = pd.read_csv(path)

In [7]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
0,811891f98a870dfd6e414374a0a85560,szczecin,blockOfFlats,47.0,2.0,6.0,12.0,1981.0,53.428544,14.552812,...,0.085,condominium,concreteSlab,,no,yes,yes,no,yes,449000
1,adaf636d0c44d8d9325bce42403eefee,szczecin,apartmentBuilding,88.22,3.0,1.0,2.0,2000.0,53.449093,14.516844,...,0.668,condominium,brick,premium,yes,yes,no,no,no,950000
2,9b957bd60885a469c96f17b58a914f4b,szczecin,apartmentBuilding,117.0,5.0,4.0,4.0,,53.443096,14.561348,...,0.229,udział,brick,premium,yes,yes,no,no,no,1099000
3,74fef2ff7135bc70797a3fbfd7d44ed6,szczecin,blockOfFlats,33.31,1.0,1.0,4.0,1963.0,53.4361,14.5412,...,0.388,cooperative,brick,,yes,no,no,yes,yes,380000
4,77cc78c75b0d09bf84d6d3124a28803c,szczecin,blockOfFlats,56.0,3.0,7.0,7.0,2018.0,53.447465,14.557811,...,0.178,condominium,brick,premium,yes,yes,yes,yes,yes,799000


In [8]:
# Print column names, non-null counts and dtypes of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21501 entries, 0 to 21500
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    21501 non-null  object 
 1   city                  21501 non-null  object 
 2   type                  17104 non-null  object 
 3   squareMeters          21501 non-null  float64
 4   rooms                 21501 non-null  float64
 5   floor                 17928 non-null  float64
 6   floorCount            21292 non-null  float64
 7   buildYear             18121 non-null  float64
 8   latitude              21501 non-null  float64
 9   longitude             21501 non-null  float64
 10  centreDistance        21501 non-null  float64
 11  poiCount              21501 non-null  float64
 12  schoolDistance        21490 non-null  float64
 13  clinicDistance        21438 non-null  float64
 14  postOfficeDistance    21481 non-null  float64
 15  kindergartenDistanc

### Prepare data

In [9]:
# Run the project's location-categorisation helper on the full table.
# NOTE(review): presumably this is what adds the `locationCategory` column
# visible in the next preview - confirm in data_preparation_functions.
df = df.pipe(dpf.categorize_location)

In [10]:
# Columns that are not carried into the modelling table.
unused_columns = [
    'id',
    'ownership',
    'rooms',
    'hasSecurity',
    'hasStorageRoom',
    'buildingMaterial',
    'schoolDistance',
    'restaurantDistance',
    'postOfficeDistance',
]
df = df.drop(columns=unused_columns)

In [11]:
# Derive the price per square metre, rounded to zero decimal places.
price_per_m2 = df['price'].div(df['squareMeters'])
df['price_per_m2'] = price_per_m2.round()

In [12]:
# Preview the table after the column drop and the new price_per_m2 column.
df.head(n=5)

Unnamed: 0,city,type,squareMeters,floor,floorCount,buildYear,centreDistance,poiCount,clinicDistance,kindergartenDistance,collegeDistance,pharmacyDistance,condition,hasParkingSpace,hasBalcony,hasElevator,price,locationCategory,price_per_m2
0,szczecin,blockOfFlats,47.0,6.0,12.0,1981.0,0.79,67.0,0.285,0.245,0.593,0.085,,no,yes,yes,449000,1,9553.0
1,szczecin,apartmentBuilding,88.22,1.0,2.0,2000.0,4.09,0.0,1.039,0.676,1.192,0.668,premium,yes,yes,no,950000,2,10769.0
2,szczecin,apartmentBuilding,117.0,4.0,4.0,,2.19,10.0,0.611,0.28,1.522,0.229,premium,yes,yes,no,1099000,3,9393.0
3,szczecin,blockOfFlats,33.31,1.0,4.0,1963.0,1.93,39.0,0.326,0.089,0.041,0.388,,yes,no,no,380000,4,11408.0
4,szczecin,blockOfFlats,56.0,7.0,7.0,2018.0,2.68,10.0,0.771,0.26,1.643,0.178,premium,yes,yes,yes,799000,3,14268.0


Manipulate data per city

In [13]:
# Clean every city on its own, then stitch the per-city results back together.
# The cleaned frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previously accumulated rows
# on every iteration.
city_frames = []

for city in df['city'].unique():

  # Explicit copy: any in-place edits made by the helpers cannot reach `df`
  # and do not raise SettingWithCopyWarning.
  city_df = df[df['city'] == city].copy()

  city_df = dpf.fill_na_per_city(city_df)

  city_df = dpf.handle_outliers_per_city(city_df)

  city_frames.append(city_df)

# ignore_index=True yields the same fresh 0..n-1 index that the former
# per-iteration reset_index(drop=True) produced. pd.concat raises on an empty
# list, so fall back to an empty frame as the original loop would have.
df_all = pd.concat(city_frames, ignore_index=True) if city_frames else pd.DataFrame()

Manipulate data for the whole dataset

In [14]:
# Remove columns that must not reach the model, then discard duplicate rows.
columns_to_remove = ['price', 'price_per_m2', 'locationCategory', 'floorCount']
df_all = df_all.drop(columns=columns_to_remove).drop_duplicates()

# Each helper returns the transformed frame together with a second object
# (`bins`, `ohe_dict`) that the following cells write to disk.
binned_result = dpf.split_and_save_bins(df_all)
df_all, bins = binned_result

encoded_result = dpf.encode_and_save_encoder(df_all)
df_all, ohe_dict = encoded_result

Save OneHotEncoders for predictions

In [15]:
# Serialise the encoder dictionary to encoders.pkl in binary mode.
with open('encoders.pkl', mode='wb') as encoder_file:
    pickle.dump(ohe_dict, encoder_file)

Save bins for predictions

In [16]:
# Write the bin definitions to bins.json as JSON text.
with open("bins.json", mode="w") as bins_file:
    json.dump(bins, bins_file)

Save processed data

In [17]:
# Store the processed table as CSV without writing the index column.
df_all.to_csv(path_or_buf='processed_data_all.csv', index=False)

In [18]:
# Print column names, non-null counts and dtypes of the processed table.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21216 entries, 0 to 21500
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   squareMeters                 21216 non-null  float64
 1   floor                        21216 non-null  float64
 2   buildYear                    21216 non-null  float64
 3   poiCount                     21216 non-null  float64
 4   price_per_m2_log             21216 non-null  float64
 5   clinicDistance_binned        21216 non-null  int64  
 6   kindergartenDistance_binned  21216 non-null  int64  
 7   collegeDistance_binned       21216 non-null  int64  
 8   pharmacyDistance_binned      21216 non-null  int64  
 9   centreDistance_binned        21216 non-null  int64  
 10  city_bialystok               21216 non-null  float64
 11  city_bydgoszcz               21216 non-null  float64
 12  city_czestochowa             21216 non-null  float64
 13  city_gdansk          

#  Build model

In [19]:
# Separate the target column from the predictor columns.
target_column = 'price_per_m2_log'
y = df_all[target_column]
X = df_all.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Naive baseline: always predict the mean of the training target.
# NOTE(review): the target column is named `price_per_m2_log`, so the values
# printed below are presumably on a log scale rather than raw prices - confirm.
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value is
# the same as before, but the documented order matches the model-evaluation
# cell and stays correct if the metric is swapped for an asymmetric one.
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

Mean apt price: 9.519120327067927
Baseline MAE: 0.3011460487361257


In [21]:
# Initialize and configure the XGBoost Regressor (fixed seed for reproducibility)
xgb_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Train the model.
# time.perf_counter() is used for the elapsed-time measurements: it is a
# monotonic, high-resolution clock, whereas time.time() follows the system
# clock and can jump if that clock is adjusted mid-run.
start = time.perf_counter()
xgb_model.fit(X_train, y_train)
end = time.perf_counter()
print(f"Training time: {end - start:.4f}s")

# Predict on the held-out test set
start = time.perf_counter()
y_pred = xgb_model.predict(X_test)
end = time.perf_counter()
print(f"Prediction time: {end - start:.4f}s")

# Evaluate the model; both metrics are computed on the same scale as the
# target column (`price_per_m2_log`).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Training time: 2.6831s
Prediction time: 0.0401s
R² Score: 0.8692
Mean Absolute Error (MAE): 0.1017


In [22]:
# Persist the fitted regressor to xgboost_all_cities.pkl via pickle.
with open('xgboost_all_cities.pkl', mode='wb') as model_file:
    pickle.dump(xgb_model, model_file)