# 05 Modelling (Different Targets)

In this notebook, we compare different prediction targets for our house price appreciation task.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import plotly.express as px


In [2]:
# Load the train / test / validation splits. The first CSV column becomes the
# index, and rows without a `population` value are discarded immediately.
df_train = (
    pd.read_csv("data/df_train_app.csv", index_col=0)
    .dropna(subset="population")
)
df_test = (
    pd.read_csv("data/df_test_app.csv", index_col=0)
    .dropna(subset="population")
)
df_val = (
    pd.read_csv("data/df_val_app.csv", index_col=0)
    .dropna(subset="population")
)

In [3]:
# Feature frames: each split minus the `appreciation`, `price`, `id` and
# `prior_saledate` columns.
X_train = df_train.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)
X_val = df_val.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)
X_test = df_test.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)

In [4]:
# some loss functions
def rmse(y_true, y_pred):
    """Root mean squared error between predictions and ground truth.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must support element-wise subtraction with `y_true`.

    Returns
    -------
    float
        Square root of the mean of the squared element-wise differences.
    """
    residuals = y_pred - y_true
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each sample contributes ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``.
    A sample where both the true and the predicted value are exactly zero is a
    perfect prediction and contributes 0; without this guard it would evaluate
    0/0 and turn the whole mean into NaN.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must support element-wise arithmetic with `y_true`.

    Returns
    -------
    float
        Mean of the per-sample symmetric percentage errors.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale is zero only when y_true == y_pred == 0, in which case abs_error is
    # zero too; dividing by 1 there yields the intended contribution of 0.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.mean(200 * abs_error / safe_scale)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, returned as a fraction (not x100).

    Each sample's error is divided by its true value. A small constant is added
    to the denominator to avoid division by zero; it carries the sign of the
    true value so that it always moves the denominator away from zero. Always
    adding a positive constant would shrink the denominator for negative
    targets and cancel it exactly at ``y_true == -epsilon``.

    Parameters
    ----------
    y_true : array-like
        Observed values (may be negative).
    y_pred : array-like
        Predicted values; must support element-wise arithmetic with `y_true`.

    Returns
    -------
    float
        Square root of the mean squared relative error.
    """
    epsilon = 1e-6  # Small constant to avoid division by zero
    # Zero and positive targets get +epsilon, negative targets get -epsilon.
    signed_epsilon = np.where(y_true < 0, -epsilon, epsilon)
    relative_error = (y_true - y_pred) / (y_true + signed_epsilon)
    return np.sqrt(np.mean(relative_error ** 2))

### Two Models

In [5]:
# Per-split frames restricted to rows with a known `population`.
# NOTE(review): the splits were already filtered on `population` when they
# were loaded, so this step looks like it only produces copies — confirm.
df_train_2 = df_train.dropna(subset=["population"])
df_test_2 = df_test.dropna(subset=["population"])
df_val_2 = df_val.dropna(subset=["population"])

In [6]:
# Inputs for the price target: every column except `appreciation`, `price`,
# `id` and `prior_saledate` (so `prior_price` stays in as a feature).
X_train = df_train_2.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)
X_val = df_val_2.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)
X_test = df_test_2.drop(["appreciation", "price", "id", "prior_saledate"], axis=1)

# Labels: the `price` column of each split.
y_train = df_train_2["price"]
y_val = df_val_2["price"]
y_test = df_test_2["price"]

In [7]:
# Inputs for the prior-price model: same exclusions as for the price target,
# plus `prior_price` itself, which is the label here.
X_train_prior = df_train_2.drop(
    ["appreciation", "price", "id", "prior_saledate", "prior_price"], axis=1
)
X_val_prior = df_val_2.drop(
    ["appreciation", "price", "id", "prior_saledate", "prior_price"], axis=1
)
X_test_prior = df_test_2.drop(
    ["appreciation", "price", "id", "prior_saledate", "prior_price"], axis=1
)

# Labels: the `prior_price` column of each split.
y_train_prior = df_train_2["prior_price"]
y_val_prior = df_val_2["prior_price"]
y_test_prior = df_test_2["prior_price"]

In [8]:
# Stage 1: fit a gradient-boosted regressor on `prior_price` and score it on
# the validation split.
parameters = {
    "objective": "reg:squarederror",
    "n_estimators": 350,   # number of boosting rounds
    "learning_rate": 0.1,  # step size shrinkage
    "max_depth": 10,       # maximum depth of a tree
}

model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior, y_train_prior)
y_hat_prior = model_prior.predict(X_val_prior)

# Print each validation metric as "<NAME>: <value>", in a fixed order.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_prior, y_pred=y_hat_prior)}")

MSE: 21704888896.82967
MAE: 58299.635753514776
MAPE: 0.3146561731832743
R2: 0.8258882708485414
RMSE: 147325.79168913251
SMAPE: 17.919337958652473
RMSPE: 2.6823619780099146


In [9]:
# Stage-1 prior-price predictions for every split, in train / val / test order.
y_hat_train, y_hat_val, y_hat_test = (
    model_prior.predict(split_features)
    for split_features in (X_train_prior, X_val_prior, X_test_prior)
)

In [10]:
# Stage-2 features: replace the recorded `prior_price` with the stage-1
# prediction, stored as a new trailing column `pred_prior_price`.
X_train_appr = X_train.drop(columns="prior_price").assign(pred_prior_price=y_hat_train)
X_test_appr = X_test.drop(columns="prior_price").assign(pred_prior_price=y_hat_test)
X_val_appr = X_val.drop(columns="prior_price").assign(pred_prior_price=y_hat_val)

In [13]:
# Stage-2 target: relative difference between the sale price and the stage-1
# predicted prior price, i.e. (price - predicted_prior) / predicted_prior.
#
# The price is read from the split frames instead of from the previous y_*
# values, so re-running this cell always yields the same targets rather than
# transforming already-transformed ones.
#
# NOTE(review): y_hat_* are raw regressor outputs; a prediction at or near zero
# would make this ratio explode — confirm that cannot happen.
# NOTE(review): y_hat_train is predicted on the rows the stage-1 model was
# fitted on, while y_hat_val / y_hat_test are out-of-sample, so the training
# target may be distributed differently from the validation target.
y_train = (df_train_2.loc[:, "price"] - y_hat_train) / y_hat_train
y_test = (df_test_2.loc[:, "price"] - y_hat_test) / y_hat_test
y_val = (df_val_2.loc[:, "price"] - y_hat_val) / y_hat_val

In [14]:
# Stage 2: fit a second gradient-boosted regressor on the features that carry
# `pred_prior_price`, using the current `y_train` as target, and score it on
# the validation split.
parameters = {
    "objective": "reg:squarederror",
    "n_estimators": 350,   # number of boosting rounds
    "learning_rate": 0.1,  # step size shrinkage
    "max_depth": 10,       # maximum depth of a tree
}

model = XGBRegressor(**parameters)
model.fit(X_train_appr, y_train)
y_hat = model.predict(X_val_appr)

# Print each validation metric as "<NAME>: <value>", in a fixed order.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 10.520264029386734
MAE: 0.26979589066504883
MAPE: 2.474398654486511
R2: -0.7894539775419616
RMSE: 3.2434956496636054
SMAPE: 58.53926273949068
RMSPE: 73.05214897847561


- As expected from the data analysis — no feature is clearly correlated with the target, and no clear patterns between the features and the target are observable — we observe very poor performance when trying to predict the appreciation directly: the validation R² is negative (about -0.79), i.e. worse than always predicting the mean. Note that the target used here is measured relative to the prior price predicted by the first model, not the recorded prior price. This supports our approach of first predicting the future house price and then calculating the appreciation manually.