In [64]:
from housing_pricer.scraping.utilities.data_manager import DataManager
from housing_pricer.data_processing.data_processing_utils import format_json_to_dataframe, get_nested_dict_value, MissingDataError
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
import os

def lookup_raw_listing(data_manager: DataManager, listing_id: str) -> dict:
    for entry in data_manager.load_data():
        if entry["id"] == listing_id:
            return entry
    raise RuntimeError("Missing entry")

pd.set_option('display.max_columns', None)
RAW_DATA_STORAGE = "../scraping/data_storage"
LISTINGS_DF_PATH = "listings_dataframe.pickle"
if os.path.exists(LISTINGS_DF_PATH):
    listings = pd.read_pickle(LISTINGS_DF_PATH)
else:
    data = DataManager(base_dir=RAW_DATA_STORAGE, data_filename="scraped_data_old").load_data()
    listings = format_json_to_dataframe(data)
    listings.to_pickle(LISTINGS_DF_PATH)

In [126]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from typing import Any
GREATER_STHLM_AREA_MUNICIPALITIES = ["Stockholm", "Sundbyberg", "Lidingö", "Solna", "Täby", "Huddinge", "Sollentuna", "Danderyd"]

def preprocess_listings(listings: pd.DataFrame, predictor_cols: list[str], target_cols: list[str]) -> pd.DataFrame:
    viable_listings = (
        listings
        .replace({None: np.NaN})
        .assign(total_monthly_cost = listings.operating_cost + listings.rent)
        .query("sold_price.notna() and municipality.isin(@GREATER_STHLM_AREA_MUNICIPALITIES)")
    )
    processed_listings = viable_listings[predictor_cols + target_cols].dropna().copy()
    return processed_listings

from sklearn.base import BaseEstimator, RegressorMixin
class XGBRegressorDMatrix(BaseEstimator, RegressorMixin):
    def __init__(self, **xgb_params):
        self.xgb_params = xgb_params
        self.model = xgb.XGBRegressor(**self.xgb_params)

    def fit(self, X, y):
        dtrain = xgb.DMatrix(data=X, label=y)
        self.model = xgb.train(self.xgb_params, dtrain)
        return self

    def predict(self, X):
        dtest = xgb.DMatrix(data=X)
        return self.model.predict(dtest)
    
def build_xgb_model(numeric_predictors: list[str], categorical_predictors: list[str], xgb_hyperparams: dict[str, Any] = {}) -> Pipeline:
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_predictors),
            ('cat', OneHotEncoder(), categorical_predictors)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressorDMatrix(**xgb_hyperparams))
    ])
    return model

def train_and_evaluate(model: Pipeline, X: pd.DataFrame, y: pd.DataFrame, test_size: float, random_state: int):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_test = y_test.values.flatten()
    abs_prediction_error = np.abs(y_pred - y_test)
    med_rel_err = np.median(abs_prediction_error/y_test)
    print(f"Median relative error: {med_rel_err}")

    return model

# def plot_feature_importance(pipeline: Pipeline, predictor_cols: list[str]):
#     feature_importances = pipeline.named_steps['regressor'].feature_importances_
#     preprocessor = pipeline.named_steps['preprocessor']
#     all_feature_names = predictor_cols.copy()

#     for transformer in preprocessor.transformers_:
#         if transformer[0] == 'cat':
#             ohe_features = transformer[1].get_feature_names_out()
#             all_feature_names.extend(ohe_features)
#             all_feature_names = [f for f in all_feature_names if f not in transformer[2]]

#     feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
#     feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

#     plt.figure(figsize=(6, 3))
#     plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
#     plt.xlabel('Feature Importance')
#     plt.ylabel('Feature')
#     plt.title('Feature Importance from XGBoost')
#     plt.show()

In [127]:
PREDICTOR_COLS = [
    "residence_type", 
    "construction_year", 
    "living_area",
    "total_monthly_cost",
    "latitude", 
    "longitude",
    ]
TARGET_COL = ["sold_price"]
viable_listings = preprocess_listings(listings, PREDICTOR_COLS, TARGET_COL)

In [129]:
def train_and_evaluate_with_cv(X: pd.DataFrame, y: pd.DataFrame, xgb_params: dict, cv_params: dict):
    dmatrix = xgb.DMatrix(data=X, label=y)
    cv_results = xgb.cv(
        dtrain=dmatrix,
        params=xgb_params,
        **cv_params
    )
    print(f"CV results:\n{cv_results}")

    test_metric = cv_params.get('metrics', 'rmse') #+ '-test'
    mean_test_metric = cv_results[test_metric].mean()
    std_test_metric = cv_results[test_metric].std()
    print(f"Mean {test_metric}: {mean_test_metric:.4f}, Std: {std_test_metric:.4f}")
    return cv_results

xgb_hyperparams={'objective': 'reg:absoluteerror', 'max_depth': 15}
cv_params = {"nfold": 5,
    "metrics":{'rmse'},
    "early_stopping_rounds":10
}
train_and_evaluate_with_cv(X=viable_listings[PREDICTOR_COLS], y=viable_listings[TARGET_COL], xgb_params=xgb_hyperparams, cv_params=cv_params)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:residence_type: object

In [124]:
model = build_xgb_model(
    numeric_predictors=[
    "construction_year", 
    "living_area",
    "total_monthly_cost",
    "latitude", 
    "longitude"], 
    categorical_predictors=["residence_type"],
    xgb_hyperparams={

        'objective': 'reg:absoluteerror', 
        'max_depth': 15}
)
trained_model = train_and_evaluate(model, X=viable_listings[PREDICTOR_COLS], y=viable_listings[TARGET_COL], test_size=0.1, random_state=5)
# plot_feature_importance(model, predictor_cols=PREDICTOR_COLS)

Median relative error: 0.06391958677685951


# Data analysis

In [None]:
viable_listings = listings.query("sold_date.notna()").copy()
px.scatter(x=viable_listings.sold_date.unique(), y=viable_listings.sold_date.value_counts())

In [None]:
viable_listings["sold_YYYY_MM"] = [date[:-3] for date in viable_listings["sold_date"]]

viable_listings

Unnamed: 0,url_listing_type,url_listing_id,booli_id,sold_date,days_listed,residence_type,address,apartment_number,urban_area,municipality,construction_year,list_price,sold_price,sold_price_type,first_price,monthly_payment,rent,operating_cost,energy_class,floor,building_floors,latitude,longitude,has_solar_panels,agency_id,agent_id,booli_ids_of_previous_sales,n_previous_sales,sold_YYYY_MM
0,bostad,2556516,5153109,2023-12-05,25.0,Villa,Västerbyvägen 9,,Norsholm,Norrköping,1979.0,3325000.0,3250000.0,Slutpris,3 325 000,,,4398.0,D,,,58.505988,15.976788,,26,11016,[5153109],1,2023-12
1,bostad,1890105,5147734,2023-12-05,32.0,Villa,Sliparevägen 3,,Rejmyre,Finspång,1978.0,1495000.0,1495000.0,Slutpris,1 495 000,,,4102.0,C,,,58.826413,15.938500,,962,9543,"[5147734, -9035770]",2,2023-12
2,bostad,1706785,5168064,2023-12-05,8.0,Villa,Kronoparksvägen 22,,Ivetofta,Bromölla,1980.0,2095000.0,2400000.0,Slutpris,2 095 000,,,2648.0,C,,,56.091915,14.466163,,20,11931,"[5168064, 984971, 350836]",3,2023-12
3,bostad,3997519,5078126,2023-12-05,102.0,Lägenhet,Borgarfjordsgatan 21B,1303,Kista,Stockholm,2015.0,2495000.0,2450000.0,Slutpris,2 595 000,,4274.0,,D,3,,59.404946,17.941505,,16,6529,[5078126],1,2023-12
4,annons,5119460,5119460,2023-12-05,39.0,Lägenhet,Rålambsvägen 48,,Fredhäll,Stockholm,1931.0,3400000.0,3400000.0,Slutpris,3 195 000,,2213.0,855.0,,3,,59.330127,18.005056,,34,9162,[5119460],1,2023-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309957,bostad,2407266,4818529,2023-06-01,133.0,Villa,Zacharias väg 14,,Kulltorp,Kristianstad,1937.0,2695000.0,2630000.0,Lagfart,2 890 000,,,2373.0,G,,,56.052645,14.185546,,1071,11246,"[4818529, 4547164]",2,2023-06
310141,bostad,518000,4990861,2023-06-01,10.0,Lägenhet,Norra Liden 8,1501,Kungshöjd Och Inom Vallgraven,Göteborg,1947.0,5850000.0,5900000.0,Slutpris,5 850 000,,4575.0,325.0,D,4,,57.704597,11.958324,,294,11605,"[4990861, 4577769]",2,2023-06
310187,bostad,4010918,5111124,2023-12-05,32.0,Lägenhet,Råcksta Gårdsväg 28,1802,Vällingby Parkstad,Stockholm,2014.0,2495000.0,2650000.0,Slutpris,2 495 000,,3583.0,375.0,B,8,,59.355201,17.885914,,32,10216,"[5111124, 4581935]",2,2023-12
310450,bostad,441268,5107654,2023-12-14,61.0,Lägenhet,Karolinagatan 1,1101,Huvudsta,Solna,1951.0,4995000.0,5125000.0,Sista bud,5 395 000,,5189.0,1146.0,G,,,59.352290,17.996105,,12,13693,"[5107654, 4570371]",2,2023-12
