# 02 Modelling (Manual Feature Selection)

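In this notebook, we apply manual feature selection: an XGBoost regressor is first trained on all features as a baseline, then retrained after dropping a hand-picked set of features, and both models are compared on the validation set.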

In [149]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor

In [150]:
# Read the three pre-split data sets (train, test, validation) in that order;
# the first column of each CSV is used as the DataFrame index.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/df_train_2.csv",
        "data/df_test_2.csv",
        "data/df_val_2.csv",
    )
)

In [151]:
# Feature matrices: every column except the target ("price") and "id".
X_train, X_test, X_val = (
    split.drop(columns=["price", "id"])
    for split in (df_train, df_test, df_val)
)

# Target vectors: the "price" column of each split.
y_train, y_test, y_val = (
    split["price"]
    for split in (df_train, df_test, df_val)
)

In [152]:
# some loss functions
def rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, ndarray, or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Coerce to plain arrays so that (a) Python lists are accepted and
    # (b) two pandas Series are compared by position rather than by index
    # label, which would otherwise introduce NaNs for non-matching labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each term is ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        Mean of the per-sample symmetric percentage errors. A sample where
        both the target and the prediction are zero contributes 0 (a perfect
        prediction) instead of the NaN that a raw 0/0 would produce.
    """
    # Positional comparison; also lets plain lists be passed in.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 200 * np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)

    # scale == 0 only when y_true == y_pred == 0; leave those entries at the
    # pre-filled 0 rather than dividing 0 by 0.
    terms = np.divide(numerator, scale, out=np.zeros_like(numerator), where=scale != 0)
    return np.mean(terms)

def rmspe(y_true, y_pred, epsilon=1e-6):
    """Root mean squared percentage error (as a fraction, not in percent).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    epsilon : float, default 1e-6
        Lower bound applied to ``|y_true|`` in the denominator to avoid
        division by zero.

    Returns
    -------
    float
        ``sqrt(mean(((y_true - y_pred) / max(|y_true|, epsilon)) ** 2))``.
    """
    # Positional comparison; also lets plain lists be passed in.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Clamp the magnitude instead of adding epsilon: `y_true + epsilon` is
    # exactly zero at y_true == -epsilon and skews every other denominator.
    # The sign of the denominator is irrelevant because the ratio is squared.
    denominator = np.maximum(np.abs(y_true), epsilon)
    return np.sqrt(np.mean(((y_true - y_pred) / denominator) ** 2))

In [153]:
# Baseline: XGBoost regressor fitted on the full feature set.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)

# Report every metric on the validation split, one "<label>: <value>" line each.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for label, metric in validation_metrics:
    print(f"{label}: {metric(y_true=y_val, y_pred=y_hat)}")

MSE: 39580179840.86055
MAE: 59818.07929516394
MAPE: 27.778591455226675
R2: 0.8107326776815031
RMSE: 198947.6811648242
SMAPE: 15.64003403007576
RMSPE: 2010.568970144624


#### Manual Feature Selection

In [154]:
# Hand-picked features to exclude from the reduced model.
# NOTE(review): the selection criterion is not recorded in this notebook.
manually_dropped_features = [
    "state_Virginia",
    "n_poverty_young",
    "n_employed",
    "county_Fairfax",
    "state_Connecticut",
    "cond_desc_Average Plus",
    "distance_hotel",
    "n_healthcare",
    "n_recreational",
]

# Apply the identical column removal to every split so they stay aligned.
X_train_2 = X_train.drop(columns=manually_dropped_features)
X_test_2 = X_test.drop(columns=manually_dropped_features)
X_val_2 = X_val.drop(columns=manually_dropped_features)

In [155]:
# Reduced model: same hyper-parameters as the baseline, but fitted on the
# manually reduced feature set (X_train_2 / X_val_2).
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train)
y_hat = model.predict(X_val_2)

# Report every metric on the validation split, one "<label>: <value>" line each.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for label, metric in validation_metrics:
    print(f"{label}: {metric(y_true=y_val, y_pred=y_hat)}")

MSE: 39462572902.69741
MAE: 60019.51382863653
MAPE: 28.425632945259977
R2: 0.8112950589127585
RMSE: 198651.88874686646
SMAPE: 15.665694524668652
RMSPE: 2238.3988073076935


In [145]:
# NOTE(review): this cell repeats the previous cell's fit and evaluation on
# X_train_2 / X_val_2, yet its saved output differs and its execution count
# (145) precedes the cells above. Presumably it was last run against an
# earlier version of X_train_2 -- re-run it on a fresh kernel or remove it.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_2, y_train)
y_hat = model.predict(X_val_2)

# Label -> metric function, evaluated in insertion order on the validation split.
metric_by_label = {
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "MAPE": mean_absolute_percentage_error,
    "R2": r2_score,
    "RMSE": rmse,
    "SMAPE": smape,
    "RMSPE": rmspe,
}
for label, metric in metric_by_label.items():
    print(f"{label}: {metric(y_true=y_val, y_pred=y_hat)}")

MSE: 44851449366.6923
MAE: 67753.08245647169
MAPE: 30.30833792208257
R2: 0.7855261457156398
RMSE: 211781.6077158078
SMAPE: 17.46996224620131
RMSPE: 2509.5342936162206
