<a href="https://colab.research.google.com/github/vannicc/CCADMACL_EXERCISES_COM222/blob/main/Exercise1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 1

Use all feature selection methods to find the best features

## Dataset Information

## Features

Number of Instances: 20640

Number of Attributes: 8 numeric, predictive attributes and the target

Attribute Information:

MedInc - median income in block group

HouseAge - median house age in block group

AveRooms - average number of rooms per household

AveBedrms - average number of bedrooms per household

Population - block group population

AveOccup - average number of household members

Latitude - block group latitude

Longitude - block group longitude

## Target
The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

In [99]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [100]:
housing = fetch_california_housing(as_frame=True)
df = pd.concat([housing.data, housing.target], axis=1)

In [101]:
df_housing_features = pd.DataFrame(housing.data, columns=housing.feature_names)
df_housing_features.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [102]:
df_housing_target = pd.DataFrame(housing.target, columns=["MedHouseVal"])
df_housing_target.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [103]:
df.corr()['MedHouseVal'].abs().sort_values(ascending=False)

Unnamed: 0,MedHouseVal
MedHouseVal,1.0
MedInc,0.688075
AveRooms,0.151948
Latitude,0.14416
HouseAge,0.105623
AveBedrms,0.046701
Longitude,0.045967
Population,0.02465
AveOccup,0.023737


1. Use any filter method to select the best features

In [104]:
# put your answer here
from sklearn.feature_selection import f_classif

threshold = 5 # the number of most relevant features
high_score_features = []
feature_scores = f_classif(df_housing_features, df_housing_target.values.ravel())[0]

In [105]:
for score, f_name in sorted(zip(feature_scores, df_housing_features.columns), reverse=True)[:threshold]:
      # print(f_name, score)
      high_score_features.append(f_name)

df_housing_fc = df_housing_features[high_score_features]
df_housing_fc.columns

Index(['MedInc', 'Latitude', 'Population', 'HouseAge', 'Longitude'], dtype='object')

2. Use any wrapper method to select the best features

In [106]:
# put your answer here
from sklearn.ensemble import RandomForestRegressor # Changed to Regressor for continuous target
from sklearn.feature_selection import RFE

threshold = 5 # the number of most relevant features
# Changed to Regressor for continuous target
model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
selector = RFE(model_rf, n_features_to_select=5, step=1)

selector = selector.fit(df_housing_features, df_housing_target.values.ravel())
selector_ind = selector.get_support()
df_housing_rfe = df_housing_features.iloc[:, selector_ind]
df_housing_rfe.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveOccup', 'Latitude'], dtype='object')

3. Use any embedded method to select the best features

In [114]:
# put your answer here
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor # Import RandomForestRegressor

# Changed to Regressor for continuous target
model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model_rf.fit(X_train, y_train.values.ravel())
model_rf.predict(X_test)

sel_sfm = SelectFromModel(model_rf, prefit=True)
sel_sfm_index = sel_sfm.get_support()
df_housing_sfm = df_housing_features.iloc[:, sel_sfm_index]
df_housing_sfm.columns

Index(['MedInc', 'AveOccup'], dtype='object')

Comparison

In [108]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [109]:
#Default
X_train, X_test, y_train, y_test = train_test_split(df_housing_features, df_housing_target, test_size=0.2, random_state=0) # Added train_test_split

model = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [110]:
#F_classif
X_train_fc, X_test_fc, y_train_fc, y_test_fc = train_test_split(df_housing_fc, df_housing_target, test_size=0.2, random_state=0)
model_fc = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model_fc.fit(X_train_fc, y_train_fc)

  return fit_method(estimator, *args, **kwargs)


In [111]:
#rfe
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(df_housing_rfe, df_housing_target, test_size=0.2, random_state=0)
model_rfe = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=3)
model_rfe.fit(X_train_rfe, y_train_rfe)

  return fit_method(estimator, *args, **kwargs)


In [115]:
default_preds = model.predict(X_test)
fc_preds = model_fc.predict(X_test_fc)
rfe_preds = model_rfe.predict(X_test_rfe)

default_rmse = mean_squared_error(y_test, default_preds, squared=False)
fc_rmse = mean_squared_error(y_test_fc, fc_preds, squared=False)
rfe_rmse = mean_squared_error(y_test_rfe, rfe_preds, squared=False)
sfm_rmse = mean_squared_error(y_test_sfm, sfm_preds, squared=False)

print(f"Default RMSE: {default_rmse}")
print(f"F_classif RMSE: {fc_rmse}")
print(f"RFE RMSE: {rfe_rmse}")
print(f"SFM RMSE: {sfm_rmse}")

Default RMSE: 0.7855725233084597
F_classif RMSE: 0.8226803956852501
RFE RMSE: 0.7851767281378094
SFM RMSE: 0.7896646751110269


