# 05 Modelling (Hybrid Model)

In this notebook, we apply and test a hybrid model architecture (combining clustering and classification using residual error).

In [86]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier


In [87]:
df_train = pd.read_csv("data/df_train_app.csv", index_col=0).dropna(subset="population")
df_test = pd.read_csv("data/df_test_app.csv", index_col=0).dropna(subset="population")
df_val = pd.read_csv("data/df_val_app.csv", index_col=0).dropna(subset="population")

In [88]:
# some loss functions
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def rmspe(y_true, y_pred):
    epsilon = 1e-6  # Small constant to avoid division by zero
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [89]:
df_train_2 = df_train.dropna(subset="population")
df_test_2 = df_test.dropna(subset="population")
df_val_2 = df_val.dropna(subset="population")

In [90]:
X_train_prior = df_train_2.drop(columns=["appreciation", "price", "id", "prior_saledate", "prior_price"])
X_val_prior = df_val_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])
X_test_prior = df_test_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])

y_train = df_train_2.loc[:, "price"]
y_val = df_val_2.loc[:, "price"]
y_test = df_test_2.loc[:, "price"]

In [91]:
y_train_prior = df_train_2.loc[:, "prior_price"]
y_val_prior = df_val_2.loc[:, "prior_price"]
y_test_prior = df_test_2.loc[:, "prior_price"]

In [92]:
columns = ['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor']

In [93]:
X_train_prior = X_train_prior[columns]
X_test_prior = X_test_prior[columns]
X_val_prior = X_val_prior[columns]

In [94]:
pca = PCA(n_components=3)

In [95]:
X_train_prior_2 = pd.concat([X_train_prior, pd.DataFrame(pca.fit_transform(X_train_prior), index=X_train_prior.index)], axis=1)
X_val_prior_2 = pd.concat([X_val_prior, pd.DataFrame(pca.fit_transform(X_val_prior), index=X_val_prior.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior, pd.DataFrame(pca.fit_transform(X_test_prior), index=X_test_prior.index)], axis=1)

In [96]:
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
columns = list(set(X_train_prior_2.columns) & set(columns))
X_train_prior_2 = X_train_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_test_prior_2 = X_test_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_val_prior_2 = X_val_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})


scaler = StandardScaler()

X_train_scaled = X_train_prior_2.drop(columns=columns)
X_test_scaled = X_test_prior_2.drop(columns=columns)
X_val_scaled = X_val_prior_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_scaled), columns=X_train_scaled.columns, index=X_train_scaled.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_scaled), columns=X_test_scaled.columns, index=X_test_scaled.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val_scaled), columns=X_val_scaled.columns, index=X_val_scaled.index)
X_train_scaled = pd.concat([X_train_prior.loc[:, columns], X_train_scaled], axis=1)
X_test_scaled = pd.concat([X_test_prior.loc[:, columns], X_test_scaled], axis=1)
X_val_scaled = pd.concat([X_val_prior.loc[:, columns], X_val_scaled], axis=1)

In [97]:
k = 6 # You can choose the number of clusters based on your analysis
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_train_scaled)



In [98]:
X_train_prior_2 = pd.concat([X_train_prior_2, pd.DataFrame(kmeans.transform(X_train_scaled), index=X_train_prior_2.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior_2, pd.DataFrame(kmeans.transform(X_test_scaled), index=X_test_prior_2.index)], axis=1)
X_val_prior_2= pd.concat([X_val_prior_2, pd.DataFrame(kmeans.transform(X_val_scaled), index=X_val_prior_2.index)], axis=1)

### Predict Prior Price

In [99]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior_2, y_train_prior)
y_hat_prior = model_prior.predict(X_val_prior_2)

print(f"MSE: {mean_squared_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"R2: {r2_score(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSE: {rmse(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"SMAPE: {smape(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior, y_pred=y_hat_prior)}")

MSE: 21857016793.08298
MAE: 59325.246938719196
MAPE: 0.31774004512544185
R2: 0.8246679351354798
RMSE: 147841.18774239803
SMAPE: 18.22856502266729
RMSPE: 2.6964687697536567


In [100]:
y_hat_train = model_prior.predict(X_train_prior_2)
y_hat_val = model_prior.predict(X_val_prior_2)
y_hat_test = model_prior.predict(X_test_prior_2)

In [101]:
residual = model_prior.predict(X_train_prior_2) - y_train

In [102]:
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
columns = list(set(X_train_prior_2.columns) & set(columns))
X_train_prior_2 = X_train_prior_2.rename(columns={0:"c_0", 1:"c_1", 2:"c_2", 3:"c_3", 4:"c_4",5:"c_5"})
X_test_prior_2 = X_test_prior_2.rename(columns={0:"c_0", 1:"c_1", 2:"c_2", 3:"c_3", 4:"c_4",5:"c_5"})
X_val_prior_2 = X_val_prior_2.rename(columns={0:"c_0", 1:"c_1", 2:"c_2", 3:"c_3", 4:"c_4",5:"c_5"})



scaler = StandardScaler()

X_train_scaled = X_train_prior_2.drop(columns=columns)
X_test_scaled = X_test_prior_2.drop(columns=columns)
X_val_scaled = X_val_prior_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_scaled), columns=X_train_scaled.columns, index=X_train_scaled.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_scaled), columns=X_test_scaled.columns, index=X_test_scaled.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val_scaled), columns=X_val_scaled.columns, index=X_val_scaled.index)
X_train_scaled = pd.concat([X_train_prior_2.loc[:, columns], X_train_scaled], axis=1)
X_test_scaled = pd.concat([X_test_prior_2.loc[:, columns], X_test_scaled], axis=1)
X_val_scaled = pd.concat([X_val_prior_2.loc[:, columns], X_val_scaled], axis=1)

In [103]:
scaler = StandardScaler()
scaled_residuals = scaler.fit_transform(residual.to_frame())

In [104]:
X_train_scaled["residuals"] = scaled_residuals

In [105]:
k = 4 # You can choose the number of clusters based on your analysis
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_train_scaled)



In [106]:
X_train_scaled = X_train_scaled.drop(columns="residuals")

In [107]:
model = KNeighborsClassifier(n_neighbors=6)
model.fit(X_train_scaled, clusters)
test_labels = model.predict(X_test_scaled)
val_labels = model.predict(X_val_scaled)

In [108]:
X_train_3 = X_train_prior_2.copy()
X_test_3 = X_test_prior_2.copy()
X_val_3 = X_val_prior_2.copy()

In [109]:
X_train_3["cluster"] = clusters
X_test_3["cluster"] = test_labels
X_val_3["cluster"] = val_labels

In [110]:
X_train_cl0 = X_train_3[X_train_3.cluster == 0].drop(columns="cluster")
X_test_cl0 = X_test_3[X_test_3.cluster == 0].drop(columns="cluster")
X_val_cl0 = X_val_3[X_val_3.cluster == 0].drop(columns="cluster")

y_train_prior_cl0 = y_train_prior[X_train_cl0.index]
y_test_prior_cl0 = y_test_prior[X_test_cl0.index]
y_val_prior_cl0 = y_val_prior[X_val_cl0.index]

y_train_cl0 = y_train[X_train_cl0.index]
y_test_cl0 = y_test[X_test_cl0.index]
y_val_cl0 = y_val[X_val_cl0.index]

X_train_cl1 = X_train_3[X_train_3.cluster == 1].drop(columns="cluster")
X_test_cl1 = X_test_3[X_test_3.cluster == 1].drop(columns="cluster")
X_val_cl1 = X_val_3[X_val_3.cluster == 1].drop(columns="cluster")

y_train_prior_cl1 = y_train_prior[X_train_cl1.index]
y_test_prior_cl1 = y_test_prior[X_test_cl1.index]
y_val_prior_cl1 = y_val_prior[X_val_cl1.index]

y_train_cl1 = y_train[X_train_cl1.index]
y_test_cl1 = y_test[X_test_cl1.index]
y_val_cl1 = y_val[X_val_cl1.index]


X_train_cl2 = X_train_3[X_train_3.cluster == 2].drop(columns="cluster")
X_test_cl2 = X_test_3[X_test_3.cluster == 2].drop(columns="cluster")
X_val_cl2 = X_val_3[X_val_3.cluster == 2].drop(columns="cluster")

y_train_prior_cl2 = y_train_prior[X_train_cl2.index]
y_test_prior_cl2 = y_test_prior[X_test_cl2.index]
y_val_prior_cl2 = y_val_prior[X_val_cl2.index]

y_train_cl2 = y_train[X_train_cl2.index]
y_test_cl2 = y_test[X_test_cl2.index]
y_val_cl2 = y_val[X_val_cl2.index]

X_train_cl3 = X_train_3[X_train_3.cluster == 3].drop(columns="cluster")
X_test_cl3 = X_test_3[X_test_3.cluster == 3].drop(columns="cluster")
X_val_cl3 = X_val_3[X_val_3.cluster == 3].drop(columns="cluster")

y_train_prior_cl3 = y_train_prior[X_train_cl3.index]
y_test_prior_cl3 = y_test_prior[X_test_cl3.index]
y_val_prior_cl3 = y_val_prior[X_val_cl3.index]

y_train_cl3 = y_train[X_train_cl3.index]
y_test_cl3 = y_test[X_test_cl3.index]
y_val_cl3 = y_val[X_val_cl3.index]

In [111]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.11,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_cl0, y_train_prior_cl0)
y_hat_train_prior_0 = model.predict(X_train_cl0)
y_hat_val_prior_0 = model.predict(X_val_cl0) 
y_hat_test_prior_0 = model.predict(X_test_cl0) 

print(f"MSE: {mean_squared_error(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"R2: {r2_score(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"RMSE: {rmse(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"SMAPE: {smape(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior_cl0, y_pred=y_hat_val_prior_0)}")    

MSE: 36602784109.54706
MAE: 81192.34259111251
MAPE: 0.4446207662748165
R2: 0.8131663218472218
RMSE: 191318.5409455839
SMAPE: 24.228147716313057
RMSPE: 3.1025060196093515


In [112]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_cl1, y_train_prior_cl1)
y_hat_train_prior_1 = model.predict(X_train_cl1)
y_hat_val_prior_1 = model.predict(X_val_cl1) 
y_hat_test_prior_1 = model.predict(X_test_cl1) 

print(f"MSE: {mean_squared_error(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"R2: {r2_score(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"RMSE: {rmse(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"SMAPE: {smape(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior_cl1, y_pred=y_hat_val_prior_1)}")    

MSE: 6276431298.47345
MAE: 33404.86909655906
MAPE: 0.08137021061944143
R2: 0.9154065543968974
RMSE: 79223.93134951995
SMAPE: 7.645481521342433
RMSPE: 0.16945566854708913


In [113]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":400,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":7,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_cl2, y_train_prior_cl2)
y_hat_train_prior_2 = model.predict(X_train_cl2)
y_hat_val_prior_2 = model.predict(X_val_cl2) 
y_hat_test_prior_2 = model.predict(X_test_cl2) 

print(f"MSE: {mean_squared_error(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"R2: {r2_score(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"RMSE: {rmse(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"SMAPE: {smape(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior_cl2, y_pred=y_hat_val_prior_2)}")    

MSE: 34944389496.12374
MAE: 68937.85330369484
MAPE: 0.5906724199485736
R2: 0.6225023895376279
RMSE: 186934.18493181962
SMAPE: 32.33248472026234
RMSPE: 3.3624653548484664


In [114]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":400,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":5,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_cl3, y_train_prior_cl3)
y_hat_train_prior_3 = model.predict(X_train_cl3)
y_hat_val_prior_3 = model.predict(X_val_cl3) 
y_hat_test_prior_3 = model.predict(X_test_cl3) 

print(f"MSE: {mean_squared_error(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"R2: {r2_score(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"RMSE: {rmse(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"SMAPE: {smape(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior_cl3, y_pred=y_hat_val_prior_3)}")    

MSE: 25972250404.560394
MAE: 74675.29088968004
MAPE: 0.6997323102444674
R2: 0.5798276131683539
RMSE: 161159.08415153145
SMAPE: 33.86389447482056
RMSPE: 4.608707225083973


In [115]:
y_hat_prior_cl = np.concatenate([y_hat_val_prior_0, y_hat_val_prior_1, y_hat_val_prior_2, y_hat_val_prior_3])
y_val_prior_cl = np.concatenate([y_val_prior_cl0, y_val_prior_cl1, y_val_prior_cl2, y_val_prior_cl3])

In [116]:
print(f"MSE: {mean_squared_error(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"R2: {r2_score(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"RMSE: {rmse(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"SMAPE: {smape(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior_cl, y_pred=y_hat_prior_cl)}")    

MSE: 21447522050.374027
MAE: 57950.50339881633
MAPE: 0.30904263853707187
R2: 0.827952809712373
RMSE: 146449.7253338975
SMAPE: 17.894151436580213
RMSPE: 2.539574743484283


In [117]:
X_train_appr_cl0 = X_train_cl0.copy()
X_test_appr_cl0 = X_test_cl0.copy()
X_val_appr_cl0 = X_val_cl0.copy()

X_train_appr_cl0["pred_prior_price"] = y_hat_train_prior_0
X_test_appr_cl0["pred_prior_price"] = y_hat_test_prior_0
X_val_appr_cl0["pred_prior_price"] = y_hat_val_prior_0

X_train_appr_cl1 = X_train_cl1.copy()
X_test_appr_cl1 = X_test_cl1.copy()
X_val_appr_cl1 = X_val_cl1.copy()

X_train_appr_cl1["pred_prior_price"] = y_hat_train_prior_1
X_test_appr_cl1["pred_prior_price"] = y_hat_test_prior_1
X_val_appr_cl1["pred_prior_price"] = y_hat_val_prior_1


X_train_appr_cl2 = X_train_cl2.copy()
X_test_appr_cl2 = X_test_cl2.copy()
X_val_appr_cl2 = X_val_cl2.copy()

X_train_appr_cl2["pred_prior_price"] = y_hat_train_prior_2
X_test_appr_cl2["pred_prior_price"] = y_hat_test_prior_2
X_val_appr_cl2["pred_prior_price"] = y_hat_val_prior_2

X_train_appr_cl3 = X_train_cl3.copy()
X_test_appr_cl3 = X_test_cl3.copy()
X_val_appr_cl3 = X_val_cl3.copy()

X_train_appr_cl3["pred_prior_price"] = y_hat_train_prior_3
X_test_appr_cl3["pred_prior_price"] = y_hat_test_prior_3
X_val_appr_cl3["pred_prior_price"] = y_hat_val_prior_3

In [118]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.11,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_cl0, y_train_cl0)
y_hat_val_appr_0 = model.predict(X_val_appr_cl0) 

print(f"MSE: {mean_squared_error(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"R2: {r2_score(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"RMSE: {rmse(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"SMAPE: {smape(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")
print(f"RMSPE: {rmspe(y_true=y_val_cl0, y_pred=y_hat_val_appr_0)}")    

MSE: 41403052151.22462
MAE: 80511.47946441131
MAPE: 0.23396853328194966
R2: 0.8502142677617776
RMSE: 203477.39960797763
SMAPE: 17.145084850533802
RMSPE: 1.2168591287157695


In [119]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_cl1, y_train_cl1)
y_hat_val_appr_1 = model.predict(X_val_appr_cl1)

print(f"MSE: {mean_squared_error(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"R2: {r2_score(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"RMSE: {rmse(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"SMAPE: {smape(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")
print(f"RMSPE: {rmspe(y_true=y_val_cl1, y_pred=y_hat_val_appr_1)}")    

MSE: 12221633110.900297
MAE: 44904.63342503658
MAPE: 0.07152920622623775
R2: 0.9074594555362846
RMSE: 110551.49529020536
SMAPE: 7.009758859562851
RMSPE: 0.11004791733832704


In [120]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":400,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":7,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_cl2, y_train_cl2)
y_hat_val_appr_2 = model.predict(X_val_appr_cl2) 

print(f"MSE: {mean_squared_error(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"R2: {r2_score(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"RMSE: {rmse(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"SMAPE: {smape(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")
print(f"RMSPE: {rmspe(y_true=y_val_cl2, y_pred=y_hat_val_appr_2)}")    

MSE: 62332872356.49772
MAE: 86537.12258128748
MAPE: 0.3045684569324735
R2: 0.6890521362998783
RMSE: 249665.52096053975
SMAPE: 21.932744211582904
RMSPE: 0.8344672057435556


In [121]:
parameters = {"objective": 'reg:squarederror',
            "n_estimators":400,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":5,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_cl3, y_train_cl3)
y_hat_val_appr_3 = model.predict(X_val_appr_cl3)

print(f"MSE: {mean_squared_error(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"R2: {r2_score(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"RMSE: {rmse(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"SMAPE: {smape(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")
print(f"RMSPE: {rmspe(y_true=y_val_cl3, y_pred=y_hat_val_appr_3)}")    

MSE: 41481036096.91843
MAE: 77384.67844817598
MAPE: 0.3383735836537267
R2: 0.5163267621392051
RMSE: 203668.93748659472
SMAPE: 21.858733143697986
RMSPE: 1.147480379700474


In [122]:
y_hat_cl = np.concatenate([y_hat_val_appr_0, y_hat_val_appr_1, y_hat_val_appr_2, y_hat_val_appr_3])
y_val_cl = np.concatenate([y_val_cl0, y_val_cl1, y_val_cl2, y_val_cl3])

In [123]:
y_hat_appr = (y_hat_cl - y_hat_prior_cl)/y_hat_prior_cl

In [124]:
y_appr = (y_val_cl - y_hat_prior_cl)/y_hat_prior_cl

In [125]:
print(f"MSE: {mean_squared_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAE: {mean_absolute_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"R2: {r2_score(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSE: {rmse(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"SMAPE: {smape(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSPE: {rmspe(y_true=y_appr, y_pred=y_hat_appr)}")

MSE: 0.713839594942796
MAE: 0.2115166235708938
MAPE: 4.094289411880109
R2: 0.8800447896855709
RMSE: 0.8448902857429454
SMAPE: 57.13114370846846
RMSPE: 296.9851326910885
