# 02 Modelling (Feature Selection)

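In this notebook, we compare three feature selection strategies (mRMR, SelectKBest with mutual information, and RFE). For each strategy, an XGBoost regressor is trained on the selected features and scored on the validation and test splits.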

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE
from skfeature.function.information_theoretical_based import MRMR

from sklearn.linear_model import LinearRegression

In [2]:
# Read the three pre-split CSV files; the first column of each file is used as the index.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/df_train_2.csv",
        "data/df_test_2.csv",
        "data/df_val_2.csv",
    )
)

In [3]:
# Feature matrices: every column except the target ("price") and the "id" column.
X_train, X_test, X_val = (
    frame.drop(columns=["price", "id"]) for frame in (df_train, df_test, df_val)
)

# Target vectors: the "price" column of each split.
y_train, y_test, y_val = (
    frame.loc[:, "price"] for frame in (df_train, df_test, df_val)
)

In [4]:
# Additional evaluation metrics, reported alongside the sklearn ones
def rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Args:
        y_true: Observed values (array-like supporting element-wise arithmetic).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The square root of the mean squared residual, in the units of the target.
    """
    residuals = y_pred - y_true
    mean_squared_residual = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_residual)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (range 0 to 200).

    Each term is ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``. A term where
    both the target and the prediction are zero is an exact prediction and is
    scored as 0 instead of evaluating 0/0, which would turn the whole mean into NaN.

    Inputs are converted to float arrays and compared position by position
    (no pandas index alignment), matching how the sklearn metrics treat them.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-sample symmetric percentage errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 200 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Only divide where the denominator is non-zero; the remaining terms keep
    # the pre-filled 0 (denominator == 0 implies y_true == y_pred == 0).
    per_sample = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(per_sample)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, returned as a fraction (not multiplied by 100).

    Args:
        y_true: Observed values (array-like supporting element-wise arithmetic).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The square root of the mean squared relative error, where each error is
        divided by ``y_true + 1e-6``.
    """
    epsilon = 1e-6  # added to the denominator so a target of exactly 0 does not divide by zero
    relative_error = (y_true - y_pred) / (y_true + epsilon)
    return np.sqrt(np.mean(relative_error ** 2))

### MRMR

In [5]:
# mRMR feature selection on the training split.
# The result is a tuple; its first element holds the positions of the selected columns.
k = 10  # how many features to keep
selected_features = MRMR.mrmr(
    X_train.values,
    y_train.values,
    n_selected_features=k,
)


In [6]:
# Raw mRMR result: a tuple of three arrays, with the selected column positions first.
# NOTE(review): the second and third arrays look like per-feature scores — confirm their
# exact meaning against the skfeature documentation.
selected_features

(array([ 9, 66, 14, 59, 11, 55, 12, 64, 13, 58]),
 array([ 8.97218251, -0.01135385,  0.32457738, -0.01337415,  0.32216236,
        -0.0208459 ,  0.31808128, -0.03281177,  0.31144909, -0.05780968]),
 array([8.97218251e+00, 4.45756273e-03, 8.97217254e+00, 3.41725120e-03,
        8.97208314e+00, 6.14227908e-03, 8.97205674e+00, 1.38188469e-02,
        8.97195161e+00, 1.72433108e-02]))

In [7]:
# Positional indices (into the columns of X_train) of the features picked by mRMR.
selected_features[0]

array([ 9, 66, 14, 59, 11, 55, 12, 64, 13, 58])

In [8]:
# The mRMR indices are positions in X_train's column order. Translate them to column
# names once and select by name, so the test and validation frames get the same features
# even if their column order differs from X_train's. This mirrors how the SelectKBest
# and RFE subsets are built in this notebook.
selected_feature_names_mrmr = X_train.columns[selected_features[0]]

X_train_mrmr = X_train[selected_feature_names_mrmr]
X_test_mrmr = X_test[selected_feature_names_mrmr]
X_val_mrmr = X_val[selected_feature_names_mrmr]

In [9]:
# Train XGBoost on the mRMR feature subset and score it on the validation split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_mrmr, y_train)
y_hat = model.predict(X_val_mrmr)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 109066479139.45757
MAE: 170022.34642273994
MAPE: 29.593937994729774
R2: 0.4784581438379205
RMSE: 330252.1447916085
SMAPE: 43.75452198115559
RMSPE: 2629.7537485714556


In [10]:
# Refit the same XGBoost configuration on the mRMR feature subset and score it on the test split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_mrmr, y_train)
y_hat = model.predict(X_test_mrmr)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 115914455764.22357
MAE: 170206.83254689796
MAPE: 20.436272959254566
R2: 0.4485124268958164
RMSE: 340462.12089485605
SMAPE: 43.6771322240549
RMSPE: 913.2130165092813


### SelectKBest (mutual information)

In [11]:
# Select the top k features by estimated mutual information with the target.
# mutual_info_regression is a randomized estimator, so without a fixed random_state the
# scores (and possibly the selected features) change between runs. The seed is pinned
# here so that Restart & Run All reproduces the same feature subset.
k = 10
selector_mi = SelectKBest(
    score_func=lambda X, y: mutual_info_regression(X, y, random_state=42),
    k=k,
)
X_selected_mi = selector_mi.fit_transform(X_train, y_train)

# Boolean support mask -> names of the retained columns.
selected_feature_names_mi = X_train.columns[selector_mi.get_support()]


In [12]:
# Restrict all three splits to the columns chosen by the mutual-information selector.
X_train_mi, X_test_mi, X_val_mi = (
    split[selected_feature_names_mi] for split in (X_train, X_test, X_val)
)

In [13]:
# Train XGBoost on the mutual-information feature subset and score it on the validation split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_mi, y_train)
y_hat = model.predict(X_val_mi)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 161557788976.78708
MAE: 181929.92293517585
MAPE: 24.822141107119954
R2: 0.22745146074938993
RMSE: 401942.51949350553
SMAPE: 43.10971910757757
RMSPE: 1177.7680450018383


In [14]:
# Refit the same XGBoost configuration on the mutual-information feature subset and
# score it on the test split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_mi, y_train)
y_hat = model.predict(X_test_mi)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 162582000138.378
MAE: 182216.91787008522
MAPE: 26.23100631541109
R2: 0.22648170070249485
RMSE: 403214.5832412042
SMAPE: 43.26171412154254
RMSPE: 1228.6340408770625


### RFE

In [15]:
# Recursive feature elimination driven by an XGBoost regressor:
# one feature is dropped per iteration until 55 remain.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

estimator = XGBRegressor(**parameters)
selector_rfe = RFE(estimator, n_features_to_select=55, step=1)
X_selected_rfe = selector_rfe.fit_transform(X_train, y_train)

# Boolean support mask -> names of the retained columns.
rfe_support_mask = selector_rfe.get_support()
selected_feature_names_rfe = X_train.columns[rfe_support_mask]


In [16]:
# Restrict all three splits to the columns retained by RFE.
X_train_rfe, X_test_rfe, X_val_rfe = (
    split[selected_feature_names_rfe] for split in (X_train, X_test, X_val)
)

In [17]:
# Train XGBoost on the RFE feature subset and score it on the validation split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_rfe, y_train)
y_hat = model.predict(X_val_rfe)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39139954732.50104
MAE: 60026.28633276946
MAPE: 26.843362576905424
R2: 0.8128377774514278
RMSE: 197838.20342012064
SMAPE: 15.685094777927645
RMSPE: 1772.2296761862274


In [18]:
# Refit the same XGBoost configuration on the RFE feature subset and score it on the test split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_rfe, y_train)
y_hat = model.predict(X_test_rfe)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 41982461750.9394
MAE: 60455.27642792355
MAPE: 20.400995884203834
R2: 0.8002595466517233
RMSE: 204896.22190499122
SMAPE: 15.776542853204715
RMSPE: 975.6298177669254


In [20]:
# Second RFE run with the same XGBoost estimator, this time keeping 50 features.
# This overwrites selector_rfe / selected_feature_names_rfe from the 55-feature run.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

estimator = XGBRegressor(**parameters)
selector_rfe = RFE(estimator, n_features_to_select=50, step=1)
X_selected_rfe = selector_rfe.fit_transform(X_train, y_train)

# Boolean support mask -> names of the retained columns.
rfe_support_mask = selector_rfe.get_support()
selected_feature_names_rfe = X_train.columns[rfe_support_mask]


In [21]:
# Names of the columns retained by the most recent RFE run.
selected_feature_names_rfe

Index(['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor'],
      dtype='object')

In [22]:
# Rebuild the RFE subsets of all three splits from the current list of retained columns.
X_train_rfe, X_test_rfe, X_val_rfe = (
    split[selected_feature_names_rfe] for split in (X_train, X_test, X_val)
)

In [23]:
# Train XGBoost on the current RFE feature subset and score it on the validation split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_rfe, y_train)
y_hat = model.predict(X_val_rfe)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39127435488.808556
MAE: 60071.32587975346
MAPE: 28.01581486794406
R2: 0.8128976428623647
RMSE: 197806.56078302496
SMAPE: 15.692555953301708
RMSPE: 1878.978749028425


In [24]:
# Refit the same XGBoost configuration on the current RFE feature subset and score it
# on the test split.
parameters = dict(
    objective='reg:squarederror',
    n_estimators=350,    # boosting rounds
    learning_rate=0.1,   # shrinkage applied at each boosting step
    max_depth=10,        # depth limit per tree
)

model = XGBRegressor(**parameters)
model.fit(X_train_rfe, y_train)
y_hat = model.predict(X_test_rfe)

# Print every metric as "NAME: value", in a fixed order.
for metric_name, metric_fn in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
):
    print(f"{metric_name}: {metric_fn(y_true=y_test, y_pred=y_hat)}")

MSE: 41637620494.6193
MAE: 60414.97664862687
MAPE: 20.453423300975334
R2: 0.8019002019634387
RMSE: 204052.98452759592
SMAPE: 15.758015716652533
RMSPE: 974.7378292905348
