# 02 Modelling (Past Sales)

In this notebook, we test the model performance with past sales in the neighborhood as an additional feature.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

In [2]:
df_train = pd.read_csv("data/past_sales/df_train_2.csv", index_col=0)
df_test = pd.read_csv("data/past_sales/df_test_2.csv", index_col=0)
df_val = pd.read_csv("data/past_sales/df_val_2.csv", index_col=0)

In [3]:
X_train = df_train.drop(columns=["price", "id"])
X_test = df_test.drop(columns=["price", "id"])
X_val = df_val.drop(columns=["price", "id"])

y_train = df_train.loc[:, "price"]
y_test = df_test.loc[:, "price"]
y_val = df_val.loc[:, "price"]

In [4]:
columns = ['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor', "price_level"]

In [5]:
X_train = X_train[columns]
X_test = X_test[columns]
X_val = X_val[columns]

In [6]:
# some loss functions
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def rmspe(y_true, y_pred):
    epsilon = 1e-6  # Small constant to avoid division by zero
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [7]:
# validation performance with price level as additional feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

print(f"MSE: {mean_squared_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_val, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val, y_pred=y_hat)}")

MSE: 39523626576.019554
MAE: 60203.46716044203
MAPE: 27.44796761598017
R2: 0.8110031081102647
RMSE: 198805.49936060508
SMAPE: 15.68204678658603
RMSPE: 1855.800340763175


In [8]:
# test performance with price level as additional feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_test, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_test, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_test, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_test, y_pred=y_hat)}")

MSE: 41930822339.94914
MAE: 60553.97961016542
MAPE: 20.488895900431487
R2: 0.8005052320863466
RMSE: 204770.16955589294
SMAPE: 15.769139190387472
RMSPE: 978.926738316452


In [9]:
# validation set performance without price level as a feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train.drop(columns="price_level"), y_train)
y_hat = model.predict(X_val.drop(columns="price_level"))

print(f"MSE: {mean_squared_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_val, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val, y_pred=y_hat)}")

MSE: 39043538878.83786
MAE: 60097.73400510445
MAPE: 26.56653750326256
R2: 0.8132988256458841
RMSE: 197594.37967421507
SMAPE: 15.722447415301327
RMSPE: 1646.08826421604


In [10]:
# test set performance without price level as a feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train.drop(columns="price_level"), y_train)
y_hat = model.predict(X_test.drop(columns="price_level"))

print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_test, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_test, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_test, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_test, y_pred=y_hat)}")

MSE: 41510431414.36363
MAE: 60407.92143701395
MAPE: 20.231673589848686
R2: 0.8025053309504413
RMSE: 203741.08916554763
SMAPE: 15.781087205235808
RMSPE: 960.1953133189792


In [11]:
# impute missing values for price level (only feature with missing values)
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns=X_test.columns, index=X_test.index)
X_val = pd.DataFrame(imputer.fit_transform(X_val), columns=X_val.columns, index=X_val.index)

In [12]:
# validation set performance with imputed price level feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

print(f"MSE: {mean_squared_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_val, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val, y_pred=y_hat)}")

MSE: 39548013288.3665
MAE: 60096.01831473916
MAPE: 27.68383160189674
R2: 0.8108864939926782
RMSE: 198866.82299560803
SMAPE: 15.691616835526585
RMSPE: 1825.4741667241046


In [13]:
# test set performance with imputed price level feature

parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_test, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_test, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_test, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_test, y_pred=y_hat)}")

MSE: 41982554904.44256
MAE: 60392.5502186213
MAPE: 20.388396771329596
R2: 0.8002591034542029
RMSE: 204896.44922360798
SMAPE: 15.750586034362719
RMSPE: 970.7285574005283
