In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

sns.set_theme(palette="Set2")

# execution time
from timeit import default_timer as timer
from datetime import timedelta

# increase column width
pd.set_option('display.max_colwidth', 200)

# Load the data

In [2]:
trainset = pd.read_csv("trainset-ca-housing.csv")
testset = pd.read_csv("testset-ca-housing.csv")

Xtrain = trainset.drop("median_house_value", axis=1)
ytrain = trainset["median_house_value"].copy()
Xtest = testset.drop("median_house_value", axis=1)
ytest = testset["median_house_value"].copy()

# Feasure selection using Pearson's r

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import r_regression 

selector = SelectKBest(r_regression, k=10)
Xtrain_tmp = selector.fit_transform(Xtrain, ytrain)
Xtest_tmp = selector.transform(Xtest)

In [4]:
selector.get_feature_names_out()

array(['longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'rooms_per_household',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype=object)

In [5]:
# fit_transform returns a NumPy array, so we need to put it back 
# into a Pandas dataframe
Xtrain_tmp = pd.DataFrame(Xtrain_tmp, columns=selector.get_feature_names_out())
Xtest_tmp = pd.DataFrame(Xtest_tmp, columns=selector.get_feature_names_out())

In [6]:
Xtrain_tmp.shape, Xtest_tmp.shape

((15262, 10), (3819, 10))

# Recursive Feature Elimination

In [7]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=7, max_depth=10)
selector = RFE(model, n_features_to_select=5, step=1)
selector = selector.fit(Xtrain, ytrain)

In [8]:
Xtrain_tmp = selector.transform(Xtrain)
Xtest_tmp = selector.transform(Xtest)

In [9]:
selector.get_feature_names_out()

array(['longitude', 'housing_median_age', 'total_bedrooms',
       'median_income', 'rooms_per_household'], dtype=object)

In [10]:
Xtrain_tmp.shape, Xtest_tmp.shape

((15262, 5), (3819, 5))