In [2]:
from housing_pricer.scraping.utilities.data_manager import DataManager
from housing_pricer.data_processing.data_processing_utils import format_json_to_dataframe
import pandas as pd
import plotly.express as px
import numpy as np
import xgboost as xgb
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

RAW_DATA_STORAGE = "../scraping/data_storage"
LISTINGS_DF_PATH = "listings_dataframe.pickle"
if os.path.exists(LISTINGS_DF_PATH):
    listings = pd.read_pickle(LISTINGS_DF_PATH)
else:
    data = DataManager(base_dir=RAW_DATA_STORAGE, data_filename="scraped_data").load_data()
    listings = format_json_to_dataframe(data)
    listings.to_pickle(LISTINGS_DF_PATH)

listings

Unnamed: 0,url_listing_type,url_listing_id,market_status,booli_id,sold_date,days_listed,residence_type,address,tenure_form,apartment_number,...,energy_class,floor,building_floors,latitude,longitude,has_solar_panels,agency_id,agent_id,booli_ids_of_previous_sales,n_previous_sales
0,bostad,93631,Slutpris,5137876,2023-12-18,6.0,Lägenhet,Havstenavägen 9A,,1202,...,E,3,,58.404068,13.845960,,20,11905,[5137876],1
1,annons,5174850,Slutpris,5174850,2023-12-18,18.0,Lägenhet,Rörstrandsgatan 37,,1102,...,,1,,59.340639,18.028567,,840,5762,[5174850],1
2,annons,5172906,Slutpris,5172906,2023-12-18,27.0,Lägenhet,Sicklastråket 15,,,...,,,,59.305984,18.117877,,26,12358,[5172906],1
3,bostad,720112,Slutpris,5180749,2023-12-18,16.0,Lägenhet,Sandhamnsgatan 21,,1201,...,F,2,,59.345380,18.109405,,947,9277,"[5180749, 3723302]",2
4,bostad,641258,Slutpris,5182218,2023-12-18,10.0,Lägenhet,Kransbindarvägen 12,,1102,...,F,2,,59.302136,18.002433,,840,13737,[5182218],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968633,bostad,1585253,Ej på marknaden,1585253,,,Kedjehus,Frösögatan 92,,,...,,,,55.965969,12.778081,,,,"[-9560136, -7800535, 1583813, 138797]",4
968634,bostad,2261395,Ej på marknaden,2261395,,,Radhus,Carl Thunbergs väg 27,,,...,E,,,57.802823,14.146297,,,,[3063349],1
968635,bostad,3767451,Ej på marknaden,3767451,,,Villa,Björkön 1:152,,,...,,,,62.224093,17.569335,,,,[-9548700],1
968636,bostad,3139535,Ej på marknaden,3139535,,,Villa,Spänstvägen 38,,,...,,,,57.770553,11.847665,,,,"[-9544131, -9402929]",2


# Get viable apartment listings

In [3]:
GREATER_STHLM_AREA_MUNICIPALITIES = ["Stockholm", "Sundbyberg", "Lidingö", "Solna", "Täby", "Huddinge", "Sollentuna", "Danderyd"]

def preprocess_apartment_listings(listings: pd.DataFrame, predictor_cols: list[str], target_cols: list[str]) -> pd.DataFrame:
    viable_listings = (
        listings
        .replace({None: np.NaN})
        .query("sold_price.notna() and municipality.isin(@GREATER_STHLM_AREA_MUNICIPALITIES) and residence_type == 'Lägenhet'")
    )
    processed_listings = viable_listings[predictor_cols + target_cols].dropna().copy()
    return processed_listings

PREDICTOR_COLS = [
    "construction_year", 
    "living_area",
    "latitude", 
    "longitude",
    ]
TARGET_COL = ["sold_price"]
apartment_listings = preprocess_apartment_listings(listings, PREDICTOR_COLS, TARGET_COL)
apartment_listings

Unnamed: 0,construction_year,living_area,latitude,longitude,sold_price
1,1929.0,66.0,59.340639,18.028567,7250000.0
3,1943.0,41.0,59.345380,18.109405,3450000.0
4,1939.0,50.0,59.302136,18.002433,3575000.0
5,1998.0,76.0,59.368137,18.017108,4500000.0
11,2023.0,26.0,59.362120,17.956920,1955000.0
...,...,...,...,...,...
874630,1990.0,95.0,59.237766,17.974410,3000000.0
874651,1989.0,59.0,59.345270,17.976365,3300000.0
874653,2013.0,39.0,59.372982,17.847414,1755000.0
874658,1988.0,57.0,59.312292,18.063453,4110000.0


# Validation-test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(apartment_listings[PREDICTOR_COLS], apartment_listings[TARGET_COL], test_size=0.2)

dtrain_matrix = xgb.DMatrix(data=X_train, label=y_train)
dtest_matrix = xgb.DMatrix(data=X_test)

In [35]:
import json
def derive_training_domain(training_data: pd.DataFrame, save_to_file: bool = False) -> dict[str, dict[str, float]]:
    stats = training_data.describe()

    domain = {}
    for col in stats.columns:
        domain[col] = {'min': stats[col]['min'], 'max': stats[col]['max']}
    
    if save_to_file:
        with open('training_domain.json', 'w') as file:
            json.dump(domain, file, indent=4)

    return domain

derive_training_domain(X_train, save_to_file=True)

{'construction_year': {'min': 1058.0, 'max': 2024.0},
 'living_area': {'min': 11.0, 'max': 264.0},
 'latitude': {'min': 59.2067555, 'max': 59.504883},
 'longitude': {'min': 17.80370072, 'max': 18.2322282}}

In [33]:
stats = X_train.describe()
stats

Unnamed: 0,construction_year,living_area,latitude,longitude
count,15959.0,15959.0,15959.0,15959.0
mean,1960.819914,61.806203,59.336044,18.026241
std,40.823886,25.517923,0.045269,0.061593
min,1058.0,11.0,59.206755,17.803701
25%,1934.0,42.0,59.30879,17.988151
50%,1955.0,58.0,59.337684,18.033225
75%,2000.0,77.0,59.35912,18.072187
max,2024.0,264.0,59.504883,18.232228


In [14]:
X_train.query("construction_year < 1800")

Unnamed: 0,construction_year,living_area,latitude,longitude
149784,1650.0,151.0,59.325969,18.074949
108717,1400.0,61.0,59.324741,18.069861
97869,1623.0,54.0,59.324751,18.074381
117092,1600.0,108.0,59.315475,18.066055
146464,1758.0,45.0,59.334533,18.057861
139232,1797.0,76.0,59.337814,18.065923
138290,1600.0,175.0,59.324393,18.068599
17503,1058.0,75.0,59.354053,17.87921
1565,1650.0,61.0,59.322955,18.070913
404,1600.0,72.0,59.32457,18.06853


# Hyperoptimisation with CV
To be properly implemented

In [6]:
TEST_METRIC: str = "rmse"
MAX_DEPTH: int = 10
N_FOLD: int = 10
EARLY_STOPPING_ROUNDS: int = 10
N_BOOST_ROUND: int = 75

xgb_hyperparams={
    'objective': 'reg:absoluteerror', 
    'max_depth': MAX_DEPTH,
    }

cv_params = {
    "nfold": N_FOLD,
    "metrics":{f'{TEST_METRIC}'},
    "early_stopping_rounds":EARLY_STOPPING_ROUNDS,
}

cv_results: pd.DataFrame = xgb.cv( # type: ignore
    dtrain=dtrain_matrix,
    params=xgb_hyperparams,
    **cv_params,
    as_pandas=True,
    num_boost_round=N_BOOST_ROUND
)

cv_results[f"test-{TEST_METRIC}-mean"].mean()

1072401.7065356076

# Run on test set

In [7]:
def train_and_evaluate(model: Pipeline, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    abs_prediction_error = np.abs(y_pred - y_test)
    med_rel_err = np.median(abs_prediction_error/y_test)
    print(f"Median relative error: {med_rel_err}")

    return model

xgb_model = xgb.train(dtrain=dtrain_matrix, num_boost_round=N_BOOST_ROUND, params=xgb_hyperparams)
y_pred = xgb_model.predict(dtest_matrix)
residuals = y_pred - y_test.sold_price
abs_prediction_error = np.abs(residuals)
med_rel_err = np.median(abs_prediction_error/y_test.sold_price)
print(f"Median relative error: {med_rel_err}")

Median relative error: 0.06666955402989926


In [8]:
apartments = listings.query("residence_type=='Lägenhet'")
predicted = X_test.merge(apartments[["sold_date"]], left_index=True, right_index=True)
predicted["residuals"] = residuals
px.scatter(data_frame=predicted, x='sold_date', y='residuals')

# Saving the model

In [9]:
xgb_model.save_model("xgb.json")

# Loading and predicting

In [10]:
import json
from typing import Any
from housing_pricer.valuation_api._utilities.validation import ApartmentData
def read_request() -> dict[str, Any]:
    with open("test_listing.json", "r") as file:
        data_dict = json.load(file)
    return data_dict

model_input = read_request()
validated_model_input = ApartmentData(**model_input)
valuator = xgb.Booster()
valuator.load_model("xgb.json")


input = xgb.DMatrix(pd.DataFrame([validated_model_input.model_dump()]))
valuator.predict(input)[0]

6317482.0