# 02 Modelling (Models, Features, Data Splits, Loss Functions)

In this notebook, we test different models, data splits, features and loss functions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Read the train / test / validation CSV files of the first data split;
# the first CSV column becomes the DataFrame index.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/df_train.csv", "data/df_test.csv", "data/df_val.csv")
)

# Same three partitions for the second data split.
df_train_2, df_test_2, df_val_2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/df_train_2.csv", "data/df_test_2.csv", "data/df_val_2.csv")
)

In [None]:
# Split 1 feature matrices: every column except the target "price" and the "id" column.
X_train, X_val, X_test = (
    frame.drop(columns=["price", "id"]) for frame in (df_train, df_val, df_test)
)

# Split 1 targets: the "price" column of each partition.
y_train, y_val, y_test = (frame["price"] for frame in (df_train, df_val, df_test))

In [None]:
# Split 2 feature matrices: every column except the target "price" and the "id" column.
X_train_2, X_test_2, X_val_2 = (
    frame.drop(columns=["price", "id"]) for frame in (df_train_2, df_test_2, df_val_2)
)

# Split 2 targets: the "price" column of each partition.
y_train_2, y_test_2, y_val_2 = (
    frame["price"] for frame in (df_train_2, df_test_2, df_val_2)
)

In [None]:
# some loss functions
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    residual = y_pred - y_true
    mean_sq = np.mean(np.square(residual))
    return np.sqrt(mean_sq)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed on a 0-200 scale.

    Each term is 200 * |y_pred - y_true| / (|y_true| + |y_pred|); the mean of
    those terms is returned.
    """
    abs_error = np.abs(y_pred - y_true)
    magnitude = np.abs(y_true) + np.abs(y_pred)
    return np.mean(200 * abs_error / magnitude)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    A small constant is added to the denominator to avoid division by zero.
    """
    eps = 1e-6
    relative_error = (y_true - y_pred) / (y_true + eps)
    return np.sqrt(np.mean(relative_error ** 2))

### Parameter Search

In [None]:
# Exhaustive grid search over XGBoost hyper-parameters on split 1,
# using 3-fold cross-validation scored by negative mean squared error.
model = XGBRegressor()
param_grid = {
    'max_depth': [8, 10, 12],
    'learning_rate': [0.5, 0.1, 0.01],
    'n_estimators': [300, 400, 450]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error")
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# Report the outcome so the result of the search is visible in the notebook.
print(f"Best parameters: {best_params}")
print(f"Best CV score (negative MSE): {grid_search.best_score_}")

In [None]:
# Manual search
# Best combinations found:
# 1. n_estimators: 300, learning_rate: 0.2, max_depth: 10
# 2. n_estimators: 500, learning_rate: 0.1, max_depth: 10

In [None]:
# XGBoost regressor with fixed hyper-parameters, trained on split 2
# and scored on the split-2 validation partition.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train_2)
y_hat = model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

### Compare Data Splits

In [None]:
# Data split 2: train on its training partition, evaluate on its validation partition.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train_2)
y_hat = model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Data split 1: same hyper-parameters, trained and evaluated on the first split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

### Different Models

In [None]:
# Standardize the split-2 features. The scaler is fitted on the training
# partition only and then applied to all three partitions.
scaler = StandardScaler().fit(X_train_2)
X_train_scaled, X_test_scaled, X_val_scaled = (
    pd.DataFrame(scaler.transform(part), columns=part.columns)
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# Linear regression fitted on the standardized split-2 features.
print("Linear Regression")
linear_model = LinearRegression().fit(X_train_scaled, y_train_2)
y_hat = linear_model.predict(X_val_scaled)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Single decision tree (depth capped at 10, fixed seed) on the unscaled split-2 features.
print("Decision Tree Regressor")
dt_model = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train_2, y_train_2)
y_hat = dt_model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")


In [None]:
# Random forest with library-default settings and a fixed seed, on split 2.
print("Random Forest Regressor")
rf_model = RandomForestRegressor(random_state=42).fit(X_train_2, y_train_2)
y_hat = rf_model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")


In [None]:
# scikit-learn gradient boosting with the same rounds / learning rate / depth
# as the XGBoost configuration used above, on split 2.
print("Gradient Boosting Regressor")
gb_model = GradientBoostingRegressor(
    n_estimators=350,
    learning_rate=0.1,
    max_depth=10,
    random_state=42,
).fit(X_train_2, y_train_2)
y_hat = gb_model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")


### Different Features

In [None]:
# Column groups used to build the feature subsets compared in this section.

# Core columns, always included.
features_core = [
    'city', 'yrblt', 'effyrblt',
    'nbed', 'nbath', 'nhalfbath',
    # county_* columns
    'county_Fairfax', 'county_Fairfield', 'county_Hartford',
    'county_Litchfield', 'county_Middlesex', 'county_New Haven',
    'county_New London', 'county_Tolland', 'county_Windham',
    # state_* columns
    'state_Connecticut', 'state_Virginia',
    # cond_desc_* columns
    'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
    'cond_desc_Good', 'cond_desc_Poor',
    'year', 'month', 'age', 'eff_age',
    'livarea', 'efflivarea',
    'longitude', 'latitude',
]

# distance_* columns.
features_geo_distance = [
    'distance_aerodrome', 'distance_ferry_terminal', 'distance_railway_station',
    'distance_market', 'distance_hospital', 'distance_hotel', 'distance_museum',
]

# n_* columns.
features_geo_frequency = [
    'n_reli_inst', 'n_edu_fac', 'n_healthcare', 'n_emergency',
    'n_animalcare', 'n_commu_venu', 'n_commu_serv', 'n_shopping',
    'n_food_drink', 'n_financial', 'n_transport', 'n_entertainment',
    'n_adults_entertain', 'n_sports', 'n_utilities', 'n_accommodation',
    'n_government_civic', 'n_recreational',
]

# Economic columns.
features_econ = [
    'hpi', 'household_income', 'new_housing', 'population',
    'n_poverty', 'poverty_rate', 'poverty_rate_young', 'n_poverty_young',
    'n_unemployed', 'unemployment_rate', 'civilian_labour', 'n_employed',
]


In [None]:
# Restrict each split-2 partition to the core feature columns.
X_train_core, X_test_core, X_val_core = (
    part[features_core] for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost on the core features only (split 2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_core, y_train_2)
y_hat = model.predict(X_val_core)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the distance_* columns, for each split-2 partition.
X_train_geo_dist, X_test_geo_dist, X_val_geo_dist = (
    part[features_core + features_geo_distance]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost on core + distance features (split 2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_geo_dist, y_train_2)
y_hat = model.predict(X_val_geo_dist)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the economic columns, for each split-2 partition.
X_train_econ, X_test_econ, X_val_econ = (
    part[features_core + features_econ]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost on core + economic features (split 2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_econ, y_train_2)
y_hat = model.predict(X_val_econ)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the n_* frequency columns, for each split-2 partition.
X_train_geo_freq, X_test_geo_freq, X_val_geo_freq = (
    part[features_core + features_geo_frequency]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost on core + frequency features (split 2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_geo_freq, y_train_2)
y_hat = model.predict(X_val_geo_freq)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus both geographic groups (frequency, then distance), for each split-2 partition.
X_train_geo, X_test_geo, X_val_geo = (
    part[features_core + features_geo_frequency + features_geo_distance]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost on core + all geographic features (split 2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_geo, y_train_2)
y_hat = model.predict(X_val_geo)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# XGBoost on every available feature column of split 2.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train_2)
y_hat = model.predict(X_val_2)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val_2, y_pred=y_hat)}")

### Different Loss Functions

In [None]:
def rmspe_obj(y_pred, y_train):
    """Custom XGBoost objective for the squared percentage error ((y - p) / y) ** 2.

    XGBoost's scikit-learn wrapper calls a custom objective positionally as
    ``objective(labels, predictions)``. The parameter names are kept unchanged
    for backward compatibility, but when this function is passed as
    ``objective`` to ``XGBRegressor`` the first argument (``y_pred``) carries
    the true labels and the second (``y_train``) the current predictions.

    Returns
    -------
    (grad, hess)
        First and second derivative of the loss with respect to the
        prediction, one value per sample.
    """
    epsilon = 1e-6  # Small constant to avoid division by zero
    labels, preds = y_pred, y_train

    # d/dp ((y - p) / y)^2 = -2 * (y - p) / y^2, and the second derivative is
    # 2 / y^2. Both are normalised by the *label*; the earlier version divided
    # by the prediction and had the gradient sign flipped for this call order.
    denom = labels ** 2 + epsilon
    grad = -2 * (labels - preds) / denom
    hess = 2 / denom
    return grad, hess

In [None]:
def mse_loss(y_pred, y_val):
    """Gradient and Hessian of a squared-error objective for XGBoost.

    Computes ``grad = 2 * (y_val - y_pred)`` and a constant Hessian of 2 with
    one entry per element of ``y_val``.

    NOTE: XGBoost's scikit-learn wrapper calls a custom objective positionally
    as ``objective(labels, predictions)``, so when this is passed as
    ``objective`` to ``XGBRegressor`` the parameter named ``y_pred`` receives
    the labels and ``y_val`` receives the current predictions.
    """
    n_rows = y_val.shape[0]
    grad = (y_val - y_pred) * 2
    hess = np.full(n_rows, 2)
    return grad, hess

In [None]:
def mae_loss(y_pred, y_val):
    """Gradient and Hessian of an absolute-error objective for XGBoost.

    NOTE: XGBoost's scikit-learn wrapper calls a custom objective positionally
    as ``objective(labels, predictions)``, so when this is passed as
    ``objective`` to ``XGBRegressor`` the parameter named ``y_pred`` receives
    the labels and ``y_val`` receives the current predictions.

    Returns
    -------
    (grad, hess)
        ``grad`` is ``sign(y_val - y_pred)``. ``hess`` is a vector of ones:
        the true second derivative of |x| is zero almost everywhere, but an
        all-zero Hessian makes XGBoost's Newton step degenerate (leaf weights
        are computed from G / (H + lambda), and no split can satisfy the
        default ``min_child_weight``), so a constant unit Hessian is used
        as the usual surrogate.
    """
    grad = np.sign(y_val - y_pred)
    hess = np.ones(y_val.shape[0])
    return grad, hess

In [None]:
def pseudo_huber_loss(y_pred, y_val):
    """Gradient and Hessian of the Pseudo-Huber loss with ``delta = 1``.

    With ``d = y_val - y_pred`` and ``s = 1 + (d / delta) ** 2`` this returns
    ``grad = d / sqrt(s)`` and ``hess = (1 / s) / sqrt(s)``.

    NOTE: XGBoost's scikit-learn wrapper calls a custom objective positionally
    as ``objective(labels, predictions)``, so when this is passed as
    ``objective`` to ``XGBRegressor`` the parameter named ``y_pred`` receives
    the labels and ``y_val`` receives the current predictions.
    """
    delta = 1
    residual = y_val - y_pred
    scale = 1 + (residual / delta) ** 2
    root = np.sqrt(scale)
    return residual / root, (1 / scale) / root

In [None]:
def cubic(y_pred, y_val):
    """Gradient and Hessian of the fourth-power loss ``(y_val - y_pred) ** 4``.

    Despite the name, the loss itself is quartic; its gradient is cubic in the
    residual: ``grad = 4 * d ** 3`` and ``hess = 12 * d ** 2`` with
    ``d = y_val - y_pred``.

    NOTE: XGBoost's scikit-learn wrapper calls a custom objective positionally
    as ``objective(labels, predictions)``, so when this is passed as
    ``objective`` to ``XGBRegressor`` the parameter named ``y_pred`` receives
    the labels and ``y_val`` receives the current predictions.
    """
    residual = y_val - y_pred
    first_derivative = 4 * residual ** 3
    second_derivative = 12 * residual ** 2
    return first_derivative, second_derivative

In [None]:
# rmse, smape and rmspe are defined once in the metrics cell near the top of
# this notebook. The byte-identical re-definitions that used to live here were
# removed so that there is a single definition of each metric.

In [None]:
# XGBoost trained on split 1 with the custom squared-error objective.
parameters = dict(
    objective=mse_loss,
    n_estimators=100,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=6,         # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

In [None]:
# XGBoost trained on split 1 with the custom RMSPE objective.
parameters = dict(
    objective=rmspe_obj,
    n_estimators=100,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=6,         # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

In [None]:
# XGBoost trained on split 1 with the custom absolute-error objective.
parameters = dict(
    objective=mae_loss,
    n_estimators=100,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=6,         # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

In [None]:
# XGBoost trained on split 1 with the custom Pseudo-Huber objective.
parameters = dict(
    objective=pseudo_huber_loss,
    n_estimators=100,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=6,         # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

In [None]:
# XGBoost trained on split 1 with the custom fourth-power objective (`cubic`).
parameters = dict(
    objective=cubic,
    n_estimators=100,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=6,         # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Print every evaluation metric for the validation predictions.
for metric_name, metric_fn in [
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
]:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

In [None]:
# Column groups for the second round of feature-subset experiments.
# NOTE(review): unlike the earlier definition in this notebook, this core list
# omits 'livarea' and 'efflivarea' -- presumably on purpose; confirm.
features_core = [
    'city', 'yrblt', 'effyrblt',
    'nbed', 'nbath', 'nhalfbath',
    # county_* columns
    'county_Fairfax', 'county_Fairfield', 'county_Hartford',
    'county_Litchfield', 'county_Middlesex', 'county_New Haven',
    'county_New London', 'county_Tolland', 'county_Windham',
    # state_* columns
    'state_Connecticut', 'state_Virginia',
    # cond_desc_* columns
    'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
    'cond_desc_Good', 'cond_desc_Poor',
    'year', 'month', 'age', 'eff_age',
    'longitude', 'latitude',
]

# distance_* columns.
features_geo_distance = [
    'distance_aerodrome', 'distance_ferry_terminal', 'distance_railway_station',
    'distance_market', 'distance_hospital', 'distance_hotel', 'distance_museum',
]

# n_* columns.
features_geo_frequency = [
    'n_reli_inst', 'n_edu_fac', 'n_healthcare', 'n_emergency',
    'n_animalcare', 'n_commu_venu', 'n_commu_serv', 'n_shopping',
    'n_food_drink', 'n_financial', 'n_transport', 'n_entertainment',
    'n_adults_entertain', 'n_sports', 'n_utilities', 'n_accommodation',
    'n_government_civic', 'n_recreational',
]

# Economic columns.
features_econ = [
    'hpi', 'household_income', 'new_housing', 'population',
    'n_poverty', 'poverty_rate', 'poverty_rate_young', 'n_poverty_young',
    'n_unemployed', 'unemployment_rate', 'civilian_labour', 'n_employed',
]


In [None]:
# Restrict each split-2 partition to the core feature columns.
X_train_core, X_test_core, X_val_core = (
    part[features_core] for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost (100 rounds, depth 6) on the core features of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_core, y_train_2)
y_hat = model.predict(X_val_core)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the distance_* columns, for each split-2 partition.
X_train_geo_dist, X_test_geo_dist, X_val_geo_dist = (
    part[features_core + features_geo_distance]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost (100 rounds, depth 6) on core + distance features of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_geo_dist, y_train_2)
y_hat = model.predict(X_val_geo_dist)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the economic columns, for each split-2 partition.
X_train_econ, X_test_econ, X_val_econ = (
    part[features_core + features_econ]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost (100 rounds, depth 6) on core + economic features of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_econ, y_train_2)
y_hat = model.predict(X_val_econ)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus the n_* frequency columns, for each split-2 partition.
X_train_geo_freq, X_test_geo_freq, X_val_geo_freq = (
    part[features_core + features_geo_frequency]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost (100 rounds, depth 6) on core + frequency features of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_geo_freq, y_train_2)
y_hat = model.predict(X_val_geo_freq)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# Core features plus both geographic groups (frequency, then distance), for each split-2 partition.
X_train_geo, X_test_geo, X_val_geo = (
    part[features_core + features_geo_frequency + features_geo_distance]
    for part in (X_train_2, X_test_2, X_val_2)
)

In [None]:
# XGBoost (100 rounds, depth 6) on core + all geographic features of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_geo, y_train_2)
y_hat = model.predict(X_val_geo)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")

In [None]:
# XGBoost (100 rounds, depth 6) on every available feature column of split 2.
parameters = {"objective": 'reg:squarederror',
            "n_estimators":100,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":6,  # Maximum depth of a tree
            }

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train_2)
y_hat = model.predict(X_val_2)

print(f"MSE: {mean_squared_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAE: {mean_absolute_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"MAPE: {mean_absolute_percentage_error(y_true=y_val_2, y_pred=y_hat)}")
print(f"R2: {r2_score(y_true=y_val_2, y_pred=y_hat)}")
# These three metrics had been commented out while still pointing at y_val
# (split 1); y_hat is predicted from split 2, so they are evaluated against y_val_2.
print(f"RMSE: {rmse(y_true=y_val_2, y_pred=y_hat)}")
print(f"SMAPE: {smape(y_true=y_val_2, y_pred=y_hat)}")
print(f"RMSPE: {rmspe(y_true=y_val_2, y_pred=y_hat)}")