# 02 Modelling (Final Performance)

In this notebook, we evaluate our final model architecture on unseen data, i.e. the held-out test set.

In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Read the train / test / validation splits stored under
# "data/after_pca_clustering"; the first CSV column becomes the row index.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/after_pca_clustering/df_train.csv",
        "data/after_pca_clustering/df_test.csv",
        "data/after_pca_clustering/df_val.csv",
    )
)

In [18]:
# Separate the regression target ("price") from the feature columns of
# every split. The source frames themselves are left unchanged.
y_train = df_train["price"]
y_test = df_test["price"]
y_val = df_val["price"]

X_train = df_train.drop(columns="price")
X_test = df_test.drop(columns="price")
X_val = df_val.drop(columns="price")

In [19]:
# Append the validation feature rows below the training rows.
# NOTE: this cell rebinds X_train, so running it twice appends X_val twice.
X_train = pd.concat(objs=[X_train, X_val], axis=0)

In [20]:
# Append the validation targets below the training targets.
# NOTE: this cell rebinds y_train, so running it twice appends y_val twice.
y_train = pd.concat(objs=[y_train, y_val], axis=0)

In [21]:
# Custom evaluation metrics (used for reporting only, not as training objectives)
def rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Coerce to float arrays so that plain lists are accepted, integer
    # inputs cannot overflow when squared, and two pandas objects are
    # compared position by position instead of being aligned on index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the per-element symmetric percentage errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)

    # scale is 0 only when target and prediction are both 0, i.e. a perfect
    # prediction. Plain division would give 0/0 = NaN and poison the mean,
    # so those positions keep the pre-filled value 0 instead.
    per_element = np.divide(
        200 * abs_error,
        scale,
        out=np.zeros_like(abs_error),
        where=scale != 0,
    )
    return np.mean(per_element)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, returned as a fraction.

    Each element contributes ``((y_true - y_pred) / (|y_true| + epsilon)) ** 2``.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared relative error. Targets at or near
        zero produce very large values because the denominator is then
        close to ``epsilon``.
    """
    epsilon = 1e-6  # Small constant to avoid division by zero
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Guard the magnitude of the target: adding epsilon to a signed value
    # cancels to exactly 0 for targets near -epsilon. The ratio is squared,
    # so the sign of the denominator does not matter, and non-negative
    # targets give the same result as `y_true + epsilon`.
    denominator = np.abs(y_true) + epsilon
    return np.sqrt(np.mean(((y_true - y_pred) / denominator) ** 2))

In [22]:
# Baseline: one gradient-boosted regressor fitted on X_train / y_train
# and scored on the test split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,   # number of boosting rounds
    learning_rate=0.1,  # step size shrinkage
    max_depth=10,       # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train, y_train)
y_hat = model.predict(X_test)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 39282358776.85899
MAE: 59284.11326224095
MAPE: 22.727785816312032
R2: 0.813105858412319
RMSE: 198197.7769220911
SMAPE: 15.676415337325084
RMSPE: 1271.9488852125667


### Clustering of Residual Error

In [23]:
# Signed error (prediction minus actual) of `model` on X_train.
# NOTE(review): these are in-sample residuals if `model` was fitted on this
# same X_train, so they are likely much smaller than out-of-sample errors.
y_hat_train = model.predict(X_train)
residual = y_hat_train - y_train

In [24]:
# Columns that are passed through WITHOUT standardisation: the coordinates
# and the indicator columns for county, state and condition.
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
# Keep only the columns that exist in the data, in the order listed above.
# A set intersection would order them by string hash, which changes between
# kernel sessions and makes the column order of the scaled frames
# non-reproducible.
columns = [col for col in columns if col in X_train.columns]

# Give the numerically named columns (presumably the PCA components, TODO
# confirm) readable names. Headers loaded with pd.read_csv are strings
# ("0", "1", ...), so integer keys alone would never match; both spellings
# are mapped.
component_names = {key: f"c_{i}" for i in range(6) for key in (i, str(i))}
X_train_2 = X_train.rename(columns=component_names)
X_test_2 = X_test.rename(columns=component_names)

scaler = StandardScaler()

# Standardise the remaining columns: statistics are learned on the training
# rows only and then applied unchanged to the test rows.
train_to_scale = X_train_2.drop(columns=columns)
test_to_scale = X_test_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(train_to_scale),
    columns=train_to_scale.columns,
    index=train_to_scale.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(test_to_scale),
    columns=test_to_scale.columns,
    index=test_to_scale.index,
)

# Re-attach the pass-through columns in front of the scaled ones.
X_train_scaled = pd.concat([X_train_2.loc[:, columns], X_train_scaled], axis=1)
X_test_scaled = pd.concat([X_test_2.loc[:, columns], X_test_scaled], axis=1)

In [25]:
# Standardise the residuals with a fresh scaler. This rebinds `scaler`, so
# the previous object stored under that name is no longer reachable.
residual_frame = residual.to_frame()
scaler = StandardScaler().fit(residual_frame)
scaled_residuals = scaler.transform(residual_frame)

In [26]:
# Add the standardised residuals as an extra column "residuals".
X_train_scaled = X_train_scaled.assign(residuals=scaled_residuals)

In [27]:
# Number of clusters; a manual choice that can be revised after analysis.
k = 4
# Fit k-means with a fixed seed and keep the training-row labels.
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_train_scaled)
clusters = kmeans.labels_



In [28]:
# Remove the "residuals" column again so that X_train_scaled no longer
# carries it.
X_train_scaled = X_train_scaled.drop(columns=["residuals"])

In [29]:
# Fit a 6-nearest-neighbour classifier on the scaled training features
# with the k-means labels as classes, then use it to give every test row
# a cluster label. (`model` is rebound to the classifier here.)
model = KNeighborsClassifier(n_neighbors=6).fit(X_train_scaled, clusters)
test_labels = model.predict(X_test_scaled)

In [30]:
# Work on copies so that adding columns does not touch X_train_2 / X_test_2.
X_train_3, X_test_3 = X_train_2.copy(), X_test_2.copy()

In [31]:
# Store each row's cluster label in a "cluster" column: k-means labels for
# the training rows, classifier predictions for the test rows.
X_train_3 = X_train_3.assign(cluster=clusters)
X_test_3 = X_test_3.assign(cluster=test_labels)

In [32]:
def _rows_in_cluster(features, target, cluster_id):
    """Return the rows of one cluster as a ``(features, target)`` pair.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame containing a ``"cluster"`` label column.
    target : pandas.Series
        Target values, row-aligned with ``features`` (same length and order).
    cluster_id : int
        Cluster label to select.

    Returns
    -------
    tuple
        ``features`` restricted to the cluster with the ``"cluster"`` column
        removed, and the matching entries of ``target``.
    """
    # A positional boolean mask keeps features and target in lock-step even
    # when the index holds duplicate labels; a label lookup such as
    # ``target[subset.index]`` would return extra rows in that case.
    in_cluster = (features["cluster"] == cluster_id).to_numpy()
    return features.loc[in_cluster].drop(columns="cluster"), target.loc[in_cluster]


# One train/test subset per cluster label 0..3 (k-means was run with k = 4).
X_train_cl0, y_train_cl0 = _rows_in_cluster(X_train_3, y_train, 0)
X_test_cl0, y_test_cl0 = _rows_in_cluster(X_test_3, y_test, 0)

X_train_cl1, y_train_cl1 = _rows_in_cluster(X_train_3, y_train, 1)
X_test_cl1, y_test_cl1 = _rows_in_cluster(X_test_3, y_test, 1)

X_train_cl2, y_train_cl2 = _rows_in_cluster(X_train_3, y_train, 2)
X_test_cl2, y_test_cl2 = _rows_in_cluster(X_test_3, y_test, 2)

X_train_cl3, y_train_cl3 = _rows_in_cluster(X_train_3, y_train, 3)
X_test_cl3, y_test_cl3 = _rows_in_cluster(X_test_3, y_test, 3)

In [33]:
# Cluster 0: dedicated regressor fitted on the cluster-0 training rows and
# scored on the test rows assigned to cluster 0.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.11,  # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_cl0, y_train_cl0)
y_hat_0 = model.predict(X_test_cl0)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test_cl0, y_pred=y_hat_0)}")

MSE: 52570086096.83933
MAE: 75458.07490421548
MAPE: 40.960817706423676
R2: 0.5123273426697197
RMSE: 229281.67414086833
SMAPE: 25.176234733422266
RMSPE: 1126.339366104452


In [34]:
# Cluster 1: dedicated regressor fitted on the cluster-1 training rows and
# scored on the test rows assigned to cluster 1.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,   # number of boosting rounds
    learning_rate=0.1,  # step size shrinkage
    max_depth=10,       # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_cl1, y_train_cl1)
y_hat_1 = model.predict(X_test_cl1)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test_cl1, y_pred=y_hat_1)}")

MSE: 68049365536.2117
MAE: 78419.31471793672
MAPE: 27.604819756689235
R2: 0.7850959518782324
RMSE: 260862.73313030304
SMAPE: 19.885085566096663
RMSPE: 1169.0654057556467


In [35]:
# Cluster 2: dedicated regressor (shallower trees, more rounds) fitted on
# the cluster-2 training rows and scored on the cluster-2 test rows.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=400,   # number of boosting rounds
    learning_rate=0.1,  # step size shrinkage
    max_depth=7,        # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_cl2, y_train_cl2)
y_hat_2 = model.predict(X_test_cl2)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test_cl2, y_pred=y_hat_2)}")

MSE: 114466412367.38861
MAE: 97387.93432592026
MAPE: 199.78499143117986
R2: 0.7412816244335212
RMSE: 338328.8524016073
SMAPE: 28.507830151730868
RMSPE: 5994.235803674219


In [36]:
# Cluster 3: dedicated regressor (depth-5 trees) fitted on the cluster-3
# training rows and scored on the test rows assigned to cluster 3.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=400,   # number of boosting rounds
    learning_rate=0.1,  # step size shrinkage
    max_depth=5,        # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_cl3, y_train_cl3)
y_hat_3 = model.predict(X_test_cl3)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test_cl3, y_pred=y_hat_3)}")

MSE: 7090395821.474136
MAE: 40203.4389293014
MAPE: 0.10191687443070946
R2: 0.9412278625212546
RMSE: 84204.48813141813
SMAPE: 9.911839084581963
RMSPE: 0.1749490518025319


In [37]:
# Stitch the per-cluster predictions and targets back into single arrays.
# Both use the same cluster order (0, 1, 2, 3), so positions stay paired.
y_hat_cl = np.concatenate((y_hat_0, y_hat_1, y_hat_2, y_hat_3), axis=0)
y_test_cl = np.concatenate((y_test_cl0, y_test_cl1, y_test_cl2, y_test_cl3), axis=0)

In [38]:
# Overall test metrics of the cluster-specific models taken together,
# printed as one "NAME: value" line per metric.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test_cl, y_pred=y_hat_cl)}")

MSE: 39454083894.079544
MAE: 60759.09045494535
MAPE: 22.802638455730882
R2: 0.8122888397970599
RMSE: 198630.52105373822
SMAPE: 16.028900486293487
RMSPE: 1447.657280308212


## Test Regular Model

In [39]:
# Reference run on the "_2" datasets: reload the splits, fold validation
# into training, fit a single regressor and report the test metrics.
# This cell rebinds df_*, X_*, y_*, parameters, model and y_hat.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/df_train_2.csv",
        "data/df_test_2.csv",
        "data/df_val_2.csv",
    )
)

# Features: everything except the target and the "id" column.
X_train = df_train.drop(columns=["price", "id"])
X_val = df_val.drop(columns=["price", "id"])
X_test = df_test.drop(columns=["price", "id"])

y_train = df_train["price"]
y_val = df_val["price"]
y_test = df_test["price"]

# Train on training + validation rows.
X_train = pd.concat(objs=[X_train, X_val], axis=0)
y_train = pd.concat(objs=[y_train, y_val], axis=0)

parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,   # number of boosting rounds
    learning_rate=0.1,  # step size shrinkage
    max_depth=10,       # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train, y_train)
y_hat = model.predict(X_test)

# One "NAME: value" line per metric, all computed on the same pair.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 41315527756.182846
MAE: 59566.71324610917
MAPE: 22.063876907784053
R2: 0.8034326263833584
RMSE: 203262.21428534828
SMAPE: 15.603445444726134
RMSPE: 1230.294791320309
