# Exhaustive Feature Selection

In [2]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
import calendar as cal
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS


from IPython.core.display import HTML
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [3]:
# Load the raw dataset from its hosted CSV; the `date` column is parsed
# into timestamps at read time so the `.dt` accessors work on it later.
data_url = 'https://s3.us-east-2.amazonaws.com/ads-demo1/E_Dataset.csv'
df = pd.read_csv(data_url, parse_dates=['date'])

In [4]:
# Show a bounded preview rather than dumping the entire frame into the output.
df.head(10)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.530000,6.600000,733.500000,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.560000,6.483333,733.600000,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.500000,6.366667,733.700000,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.400000,6.250000,733.800000,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.400000,6.133333,733.900000,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
5,2016-01-11 17:50:00,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.890000,...,17.000000,45.290000,6.016667,734.000000,92.000000,5.333333,43.833333,4.800000,44.919484,44.919484
6,2016-01-11 18:00:00,60,50,19.890000,45.766667,19.200000,44.500000,19.790000,44.900000,18.890000,...,17.000000,45.290000,5.900000,734.100000,92.000000,5.000000,40.000000,4.700000,47.233763,47.233763
7,2016-01-11 18:10:00,60,50,19.856667,45.560000,19.200000,44.500000,19.730000,44.900000,18.890000,...,17.000000,45.290000,5.916667,734.166667,91.833333,5.166667,40.000000,4.683333,33.039890,33.039890
8,2016-01-11 18:20:00,60,40,19.790000,45.597500,19.200000,44.433333,19.730000,44.790000,18.890000,...,17.000000,45.290000,5.933333,734.233333,91.666667,5.333333,40.000000,4.666667,31.455702,31.455702
9,2016-01-11 18:30:00,70,40,19.856667,46.090000,19.230000,44.400000,19.790000,44.863333,18.890000,...,17.000000,45.290000,5.950000,734.300000,91.500000,5.500000,40.000000,4.650000,3.089314,3.089314


# Feature Engineering

In [5]:
# Derive calendar and time-of-day features from the parsed `date` column.
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
# `Series.dt.week` was removed in pandas 2.0; the ISO calendar week is the
# same quantity. Cast back to int64 because isocalendar() yields UInt32.
df['weekNumberInYear'] = df['date'].dt.isocalendar().week.astype('int64')
df['dayOfMonth'] = df['date'].dt.day
# Vectorised accessor instead of a per-row lambda over calendar.day_name.
# It returns English day names by default, which the dummy-column names
# listed in `features` ('dayInWeek_Monday', ...) depend on.
df['dayInWeek'] = df['date'].dt.day_name()
df['hourOfDay'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

# 1 when the row falls on Saturday or Sunday, 0 otherwise.
df['WeekendFlag'] = df['dayInWeek'].isin(['Saturday', 'Sunday']).astype('int64')
# Seconds elapsed since 00:00 on the row's own day.
df['TotalSecondsMidnight'] = (df['date'].dt.hour * 3600) + (df['date'].dt.minute * 60) + (df['date'].dt.second)

# One-hot encode the categorical helpers; the explicit int64 cast keeps the
# indicators as 0/1 integers whatever dtype get_dummies returns by default.
dayInWeek = pd.get_dummies(df.dayInWeek, prefix='dayInWeek').astype('int64')
weekendflag = pd.get_dummies(df.WeekendFlag, prefix='weekendflag').astype('int64')

# Append the indicator columns to the working frame.
df = pd.concat([df, dayInWeek, weekendflag], axis=1)

In [6]:
# Columns kept for modelling, grouped by origin. 'Appliances' is the target
# and stays first; the remaining 35 entries are the candidate predictors.
features = (
    # target + lighting load
    ['Appliances', 'lights']
    # indoor temperature / relative-humidity sensor pairs
    + ['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3']
    + ['T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6']
    + ['T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9']
    # outdoor weather readings
    + ['T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint']
    # engineered time-of-day feature
    + ['TotalSecondsMidnight']
    # one-hot day-of-week indicators
    + ['dayInWeek_Friday', 'dayInWeek_Monday', 'dayInWeek_Saturday', 'dayInWeek_Sunday']
    + ['dayInWeek_Thursday', 'dayInWeek_Tuesday', 'dayInWeek_Wednesday']
    # one-hot weekend indicators
    + ['weekendflag_0', 'weekendflag_1']
)

In [7]:
# Keep only the modelling columns, in the order given by `features`.
df = df.loc[:, features]

In [35]:
# Show a bounded preview rather than dumping the entire frame into the output.
df.head(10)

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,TotalSecondsMidnight,dayInWeek_Friday,dayInWeek_Monday,dayInWeek_Saturday,dayInWeek_Sunday,dayInWeek_Thursday,dayInWeek_Tuesday,dayInWeek_Wednesday,weekendflag_0,weekendflag_1
0,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,...,61200,0,1,0,0,0,0,0,1,0
1,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,...,61800,0,1,0,0,0,0,0,1,0
2,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,62400,0,1,0,0,0,0,0,1,0
3,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,...,63000,0,1,0,0,0,0,0,1,0
4,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,...,63600,0,1,0,0,0,0,0,1,0
5,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.890000,45.730000,...,64200,0,1,0,0,0,0,0,1,0
6,60,50,19.890000,45.766667,19.200000,44.500000,19.790000,44.900000,18.890000,45.790000,...,64800,0,1,0,0,0,0,0,1,0
7,60,50,19.856667,45.560000,19.200000,44.500000,19.730000,44.900000,18.890000,45.863333,...,65400,0,1,0,0,0,0,0,1,0
8,60,40,19.790000,45.597500,19.200000,44.433333,19.730000,44.790000,18.890000,45.790000,...,66000,0,1,0,0,0,0,0,1,0
9,70,40,19.856667,46.090000,19.230000,44.400000,19.790000,44.863333,18.890000,46.096667,...,66600,0,1,0,0,0,0,0,1,0


# Train and Test


In [8]:
# Train/test split
# Predictors are every retained column except the target `Appliances`.
X = df.drop('Appliances', axis=1)
y = df['Appliances']

# Hold out 30% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Convert the targets to plain 1-D NumPy arrays. `.values` is used instead of
# `Series.ravel()`, which is deprecated in recent pandas releases; it also
# matches the `X_train.values` call used when fitting the selector.
y_train = y_train.values
y_test = y_test.values

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (13814, 35) (13814,)
Testing dataset shape: (5921, 35) (5921,)


# Linear Regressor with Exhaustive Search

In [37]:
# Ordinary least-squares estimator with default settings; it is handed to the
# exhaustive feature selector and later refit on the selected columns.
lr1 = LinearRegression()

In [15]:
# Build linear regressor to use in feature selection
#rf1 = LinearRegression( n_jobs=-1)



In [38]:
# Configure the exhaustive search: every combination of 3 or 4 features is
# scored by negative mean squared error. cv=0 turns cross-validation off in
# mlxtend, so each subset is scored on the data it was fit on.
efs = EFS(
    estimator=lr1,
    min_features=3,
    max_features=4,
    scoring='neg_mean_squared_error',
    cv=0,
)

In [39]:
# Run the exhaustive search over the training data (58905 subsets, per the
# progress output). The predictors are passed as a bare array, so the
# selected features are reported as positional column indices.
efs = efs.fit(X_train.values, y_train)

Features: 58905/58905

In [13]:
# Positional indices (into X_train's columns) of the best-scoring subset.
print('Selected features:', efs.best_idx_)

Selected features: (0, 2, 4, 16)


In [43]:
# Unpack the winning index tuple into a list usable with `.iloc`.
feat_cols = [*efs.best_idx_]
print(feat_cols)

[0, 2, 4, 16]


In [45]:
# Refit the linear model on only the columns chosen by the exhaustive search.
X_train_selected = X_train.iloc[:, feat_cols]
lr1.fit(X_train_selected, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [46]:
# Predict the held-out rows using the same column subset the model was fit on.
X_test_selected = X_test.iloc[:, feat_cols]
pred = lr1.predict(X_test_selected)

In [47]:
# Coefficient of determination of the selected-feature model on the test set.
r2_score(y_true=y_test, y_pred=pred)

0.11544846707140755

In [48]:
# Test-set error of the selected-feature model: MAE first, then MSE.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_test, pred))

55.10267618162067
9413.427985171887


In [9]:
# Baseline estimator: the same linear model, trained on every predictor
# so it can be compared against the selected-feature model.
lr2 = LinearRegression()

In [10]:
# Fit the baseline on the full predictor matrix.
lr2.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Baseline predictions for the held-out rows, using all predictors.
preds_all_features = lr2.predict(X=X_test)

In [12]:
# Coefficient of determination of the all-features baseline on the test set.
r2_score(y_true=y_test, y_pred=preds_all_features)

0.17646027028152234

In [13]:
# Test-set error of the all-features baseline: MAE first, then MSE.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_test, preds_all_features))

53.78235804565764
8764.138266728476


### The EFS model uses only 3 or 4 of the 35 available features, so it performs worse than the non-EFS model trained on all features (test R² ≈ 0.115 vs ≈ 0.176).