# 05 Modelling (Feature Selection)

In this notebbook, we examine the effect of feature selection.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [2]:
df_train = pd.read_csv("data/df_train_app.csv", index_col=0).dropna(subset="population")
df_test = pd.read_csv("data/df_test_app.csv", index_col=0).dropna(subset="population")
df_val = pd.read_csv("data/df_val_app.csv", index_col=0).dropna(subset="population")

In [3]:
# some loss functions
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def rmspe(y_true, y_pred):
    epsilon = 1e-6  # Small constant to avoid division by zero
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [4]:
df_train_2 = df_train.dropna(subset="population")
df_test_2 = df_test.dropna(subset="population")
df_val_2 = df_val.dropna(subset="population")

In [5]:
X_train_prior = df_train_2.drop(columns=["appreciation", "price", "id", "prior_saledate", "prior_price"])
X_val_prior = df_val_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])
X_test_prior = df_test_2.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])

y_train = df_train_2.loc[:, "price"]
y_val = df_val_2.loc[:, "price"]
y_test = df_test_2.loc[:, "price"]

In [6]:
y_train_prior = df_train_2.loc[:, "prior_price"]
y_val_prior = df_val_2.loc[:, "prior_price"]
y_test_prior = df_test_2.loc[:, "prior_price"]

In [7]:
columns = ['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor']

In [8]:
X_train_prior = X_train_prior[columns]
X_test_prior = X_test_prior[columns]
X_val_prior = X_val_prior[columns]

In [9]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior, y_train_prior)
y_hat_prior = model_prior.predict(X_val_prior)

print(f"MSE: {mean_squared_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"R2: {r2_score(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSE: {rmse(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"SMAPE: {smape(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior, y_pred=y_hat_prior)}")

MSE: 21746812144.419926
MAE: 58951.13992620251
MAPE: 0.3117605798437321
R2: 0.8255519720006514
RMSE: 147468.00379885777
SMAPE: 18.18318702763046
RMSPE: 2.5544143063157345


In [10]:
y_hat_train = model_prior.predict(X_train_prior)
y_hat_val = model_prior.predict(X_val_prior)
y_hat_test = model_prior.predict(X_test_prior)

In [11]:
X_train_appr = X_train_prior.copy()
X_test_appr = X_test_prior.copy()
X_val_appr = X_val_prior.copy()

X_train_appr["pred_prior_price"] = y_hat_train
X_val_appr["pred_prior_price"] = y_hat_val
X_test_appr["pred_prior_price"] = y_hat_test

In [12]:
# predict feature price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr, y_train)
y_hat = model.predict(X_val_appr)

print(f"MSE: {mean_squared_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_val, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val, y_pred=y_hat)}")

MSE: 29284990823.63912
MAE: 64543.59303165187
MAPE: 0.1744008742591075
R2: 0.8501895544603549
RMSE: 171128.57979787924
SMAPE: 13.292636477914685
RMSPE: 0.8594061280560179


In [13]:
y_hat_appr = (y_hat - y_hat_prior)/y_hat_prior
y_appr = (y_val - y_hat_prior)/y_hat_prior

In [14]:
print(f"MSE: {mean_squared_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAE: {mean_absolute_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"R2: {r2_score(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSE: {rmse(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"SMAPE: {smape(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSPE: {rmspe(y_true=y_appr, y_pred=y_hat_appr)}")

MSE: 3.878868648855498
MAE: 0.2215503332602512
MAPE: 2.261991425554606
R2: 0.8219162845071375
RMSE: 1.9694843611604278
SMAPE: 57.94181955556445
RMSPE: 49.98824114792523


### Feature Extraction

In [28]:
pca = PCA(n_components=3)

In [29]:
X_train_prior_2 = pd.concat([X_train_prior, pd.DataFrame(pca.fit_transform(X_train_prior), index=X_train_prior.index)], axis=1)
X_val_prior_2 = pd.concat([X_val_prior, pd.DataFrame(pca.fit_transform(X_val_prior), index=X_val_prior.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior, pd.DataFrame(pca.fit_transform(X_test_prior), index=X_test_prior.index)], axis=1)

In [30]:
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
columns = list(set(X_train_prior_2.columns) & set(columns))
X_train_prior_2 = X_train_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_test_prior_2 = X_test_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_val_prior_2 = X_val_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})


scaler = StandardScaler()

X_train_scaled = X_train_prior_2.drop(columns=columns)
X_test_scaled = X_test_prior_2.drop(columns=columns)
X_val_scaled = X_val_prior_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_scaled), columns=X_train_scaled.columns, index=X_train_scaled.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_scaled), columns=X_test_scaled.columns, index=X_test_scaled.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val_scaled), columns=X_val_scaled.columns, index=X_val_scaled.index)
X_train_scaled = pd.concat([X_train.loc[:, columns], X_train_scaled], axis=1)
X_test_scaled = pd.concat([X_test.loc[:, columns], X_test_scaled], axis=1)
X_val_scaled = pd.concat([X_val.loc[:, columns], X_val_scaled], axis=1)

In [31]:
k = 6 # You can choose the number of clusters based on your analysis
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_train_scaled)



In [37]:
X_train_prior_2 = pd.concat([X_train_prior_2, pd.DataFrame(kmeans.transform(X_train_scaled), index=X_train_prior_2.index)], axis=1)
X_test_prior_2 = pd.concat([X_test_prior_2, pd.DataFrame(kmeans.transform(X_test_scaled), index=X_test_prior_2.index)], axis=1)
X_val_prior_2= pd.concat([X_val_prior_2, pd.DataFrame(kmeans.transform(X_val_scaled), index=X_val_prior_2.index)], axis=1)

In [43]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior_2, y_train_prior)
y_hat_prior = model_prior.predict(X_val_prior_2)

print(f"MSE: {mean_squared_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"R2: {r2_score(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSE: {rmse(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"SMAPE: {smape(y_true=y_val_prior, y_pred=y_hat_prior)}")
print(f"RMSPE: {rmspe(y_true=y_val_prior, y_pred=y_hat_prior)}")

MSE: 21857016793.08298
MAE: 59325.246938719196
MAPE: 0.31774004512544185
R2: 0.8246679351354798
RMSE: 147841.18774239803
SMAPE: 18.22856502266729
RMSPE: 2.6964687697536567


In [44]:
X_train_appr_2 = X_train_prior_2.copy()
X_test_appr_2 = X_test_prior_2.copy()
X_val_appr_2 = X_val_prior_2.copy()

In [45]:
y_hat_train = model_prior.predict(X_train_prior_2)
y_hat_val = model_prior.predict(X_val_prior_2)
y_hat_test = model_prior.predict(X_test_prior_2)

In [46]:
X_train_appr_2["pred_prior_price"] = y_hat_train
X_val_appr_2["pred_prior_price"] = y_hat_val
X_test_appr_2["pred_prior_price"] = y_hat_test

In [47]:
# predict future price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_2, y_train)
y_hat = model.predict(X_val_appr_2)

print(f"MSE: {mean_squared_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val, y_pred=y_hat)}")
print(f"RMSE: {rmse(y_true=y_val, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val, y_pred=y_hat)}")

MSE: 28792039967.039116
MAE: 64902.86904116897
MAPE: 0.1748440905881417
R2: 0.8527112963280965
RMSE: 169682.17339201874
SMAPE: 13.36324707048929
RMSPE: 0.8621946952804941


In [48]:
y_hat_appr = (y_hat - y_hat_prior)/y_hat_prior
y_appr = (y_val - y_hat_prior)/y_hat_prior

In [49]:
print(f"MSE: {mean_squared_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAE: {mean_absolute_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"R2: {r2_score(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSE: {rmse(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"SMAPE: {smape(y_true=y_appr, y_pred=y_hat_appr)}")
print(f"RMSPE: {rmspe(y_true=y_appr, y_pred=y_hat_appr)}")

MSE: 0.9145181962115421
MAE: 0.21318037303100779
MAPE: 2.7336057415587125
R2: 0.9192788042782525
RMSE: 0.9563044474494208
SMAPE: 58.119775161460424
RMSPE: 89.26524684705142
