# 05 Modelling (Final Performance)

In this notebook, we test our model architecture on unseen data (on the test set).

In [59]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [60]:
df_train = pd.read_csv("data/df_train_app.csv", index_col=0).dropna(subset="population")
df_test = pd.read_csv("data/df_test_app.csv", index_col=0).dropna(subset="population")
df_val = pd.read_csv("data/df_val_app.csv", index_col=0).dropna(subset="population")

In [61]:
# some loss functions
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def rmspe(y_true, y_pred):
    epsilon = 1e-6  # Small constant to avoid division by zero
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [62]:
df_train_2 = df_train.dropna(subset="population")
df_test_2 = df_test.dropna(subset="population")
df_val_2 = df_val.dropna(subset="population")

In [63]:
X_train_prior = df_train_2.drop(columns=["appreciation", "price", "id", "prior_saledate", "prior_price"])
X_val_prior = df_val_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])
X_test_prior = df_test_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])

y_train = df_train_2.loc[:, "price"]
y_val = df_val_2.loc[:, "price"]
y_test = df_test_2.loc[:, "price"]

In [64]:
y_train_prior = df_train_2.loc[:, "prior_price"]
y_val_prior = df_val_2.loc[:, "prior_price"]
y_test_prior = df_test_2.loc[:, "prior_price"]

### Prior Performance

In [52]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior, y_train_prior)
y_hat_prior = model_prior.predict(X_test_prior)

print(f"MSE: {mean_squared_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"MAE: {mean_absolute_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"R2: {r2_score(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"RMSE: {rmse(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"SMAPE: {smape(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"RMSPE: {rmspe(y_true=y_test_prior, y_pred=y_hat_prior)}")

MSE: 21598249699.823746
MAE: 58409.60919126375
MAPE: 0.30268785485725186
R2: 0.8420368393281011
RMSE: 146963.42980423308
SMAPE: 17.822996259874014
RMSPE: 2.247581862665495


In [53]:
X_train_appr_2 = X_train_prior.copy()
X_test_appr_2 = X_test_prior.copy()
X_val_appr_2 = X_val_prior.copy()

In [54]:
y_hat_train = model_prior.predict(X_train_prior)
y_hat_val = model_prior.predict(X_val_prior)
y_hat_test = model_prior.predict(X_test_prior)

In [55]:
X_train_appr_2["pred_prior_price"] = y_hat_train
X_val_appr_2["pred_prior_price"] = y_hat_val
X_test_appr_2["pred_prior_price"] = y_hat_test

In [56]:
# predict future price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_2, y_train)
y_hat = model.predict(X_test_appr_2)

print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_test, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_test, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_test, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_test, y_pred=y_hat)}")

MSE: 26049656228.30359
MAE: 63728.44137860761
MAPE: 0.16560435508216714
R2: 0.8696563205181287
RMSE: 161399.05894491327
SMAPE: 13.0838233078704
RMSPE: 0.5795071196540915


In [57]:
y_hat_appr = (y_hat - y_hat_test)/y_hat_test
y_appr = (y_test - y_hat_test)/y_hat_test

In [58]:
print(f"MSE: {mean_squared_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAE: {mean_absolute_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"R2: {r2_score(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSE: {rmse(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"SMAPE: {smape(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSPE: {rmspe(y_true=y_appr, y_pred=y_hat_appr)}")

MSE: 2.970461533113999
MAE: 0.2177164796731326
MAPE: 2.221913876915849
R2: 0.9983937830178984
RMSE: 1.7235026930974024
SMAPE: 57.45460778662334
RMSPE: 71.25257089970663


### After Performance

In [65]:
columns = ['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor']

In [66]:
X_train_prior = X_train_prior[columns]
X_test_prior = X_test_prior[columns]
X_val_prior = X_val_prior[columns]

In [67]:
pca = PCA(n_components=3)

In [68]:
X_train_prior_2 = pd.concat([X_train_prior, pd.DataFrame(pca.fit_transform(X_train_prior), index=X_train_prior.index)], axis=1)
X_val_prior_2 = pd.concat([X_val_prior, pd.DataFrame(pca.fit_transform(X_val_prior), index=X_val_prior.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior, pd.DataFrame(pca.fit_transform(X_test_prior), index=X_test_prior.index)], axis=1)

In [69]:
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
columns = list(set(X_train_prior_2.columns) & set(columns))
X_train_prior_2 = X_train_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_test_prior_2 = X_test_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_val_prior_2 = X_val_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})


scaler = StandardScaler()

X_train_scaled = X_train_prior_2.drop(columns=columns)
X_test_scaled = X_test_prior_2.drop(columns=columns)
X_val_scaled = X_val_prior_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_scaled), columns=X_train_scaled.columns, index=X_train_scaled.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_scaled), columns=X_test_scaled.columns, index=X_test_scaled.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val_scaled), columns=X_val_scaled.columns, index=X_val_scaled.index)
X_train_scaled = pd.concat([X_train_prior.loc[:, columns], X_train_scaled], axis=1)
X_test_scaled = pd.concat([X_test_prior.loc[:, columns], X_test_scaled], axis=1)
X_val_scaled = pd.concat([X_val_prior.loc[:, columns], X_val_scaled], axis=1)

In [70]:
k = 6 # You can choose the number of clusters based on your analysis
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_train_scaled)



In [71]:
X_train_prior_2 = pd.concat([X_train_prior_2, pd.DataFrame(kmeans.transform(X_train_scaled), index=X_train_prior_2.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior_2, pd.DataFrame(kmeans.transform(X_test_scaled), index=X_test_prior_2.index)], axis=1)
X_val_prior_2= pd.concat([X_val_prior_2, pd.DataFrame(kmeans.transform(X_val_scaled), index=X_val_prior_2.index)], axis=1)

In [72]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior_2, y_train_prior)
y_hat_prior = model_prior.predict(X_test_prior_2)

print(f"MSE: {mean_squared_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"MAE: {mean_absolute_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"R2: {r2_score(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"RMSE: {rmse(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"SMAPE: {smape(y_true=y_test_prior, y_pred=y_hat_prior)}")
print(f"RMSPE: {rmspe(y_true=y_test_prior, y_pred=y_hat_prior)}")

MSE: 21585907914.72414
MAE: 59217.01843082848
MAPE: 0.3081833171670887
R2: 0.8421271034656939
RMSE: 146921.434497231
SMAPE: 18.118420280244898
RMSPE: 2.311008502371336


In [73]:
X_train_appr_2 = X_train_prior_2.copy()
X_test_appr_2 = X_test_prior_2.copy()
X_val_appr_2 = X_val_prior_2.copy()

In [74]:
y_hat_train = model_prior.predict(X_train_prior_2)
y_hat_val = model_prior.predict(X_val_prior_2)
y_hat_test = model_prior.predict(X_test_prior_2)

In [75]:
X_train_appr_2["pred_prior_price"] = y_hat_train
X_val_appr_2["pred_prior_price"] = y_hat_val
X_test_appr_2["pred_prior_price"] = y_hat_test

In [76]:
# predict future price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_2, y_train)
y_hat = model.predict(X_test_appr_2)

print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_test, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_test, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_test, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_test, y_pred=y_hat)}")

MSE: 25829313905.27914
MAE: 64507.9158031307
MAPE: 0.1680123685570303
R2: 0.8707588390648949
RMSE: 160715.00833860892
SMAPE: 13.298007851613349
RMSPE: 0.5903337767030805


In [77]:
y_hat_appr = (y_hat - y_hat_prior)/y_hat_prior
y_appr = (y_test - y_hat_prior)/y_hat_prior

In [92]:
print(f"MSE: {mean_squared_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAE: {mean_absolute_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"R2: {r2_score(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSE: {rmse(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"SMAPE: {smape(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSPE: {rmspe(y_true=y_appr, y_pred=y_hat_appr)}")

MSE: 13.119801404737503
MAE: 0.22118710232231795
MAPE: 3.0158065890305505
R2: 0.711817363311501
RMSE: 3.6221266411788395
SMAPE: 58.2238863280834
RMSPE: 345.35692677144925
