# Forward / Backward Feature Selection

In [26]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
import calendar as cal
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs


from IPython.core.display import HTML
%matplotlib inline

In [3]:
# Read data
df = pd.read_csv('https://s3.us-east-2.amazonaws.com/ads-demo1/E_Dataset.csv',parse_dates=['date'])

In [4]:
display(HTML(df.head().to_html()))

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


# Feature Engineering

In [5]:
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
df['weekNumberInYear'] = df['date'].dt.week
df['dayOfMonth'] = df['date'].dt.day
df['dayInWeek'] = df['date'].dt.dayofweek.apply(lambda x : cal.day_name[x])
df['hourOfDay'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

df['WeekendFlag'] = df['dayInWeek'].apply(lambda x : 1 if (x == 'Saturday') or (x == 'Sunday') else 0).astype('int64')
df['TotalSecondsMidnight'] = (df['date'].dt.hour * 3600) + (df['date'].dt.minute * 60) + (df['date'].dt.second)

dayInWeek = pd.get_dummies(df.dayInWeek,prefix='dayInWeek').astype('int64')
weekendflag = pd.get_dummies(df.WeekendFlag,prefix='weekendflag').astype('int64')

df = pd.concat([df,dayInWeek,weekendflag],axis=1)

In [6]:
features = ['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 
            'T3','RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8','RH_8', 'T9', 'RH_9', 
            'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed','Visibility', 'Tdewpoint','TotalSecondsMidnight', 
            'dayInWeek_Friday','dayInWeek_Monday', 'dayInWeek_Saturday', 'dayInWeek_Sunday','dayInWeek_Thursday', 'dayInWeek_Tuesday', 'dayInWeek_Wednesday',
            'weekendflag_0', 'weekendflag_1']

In [7]:
df = df[features]

In [17]:
df.shape

(19735, 36)

In [18]:
rmse_dict = {}

def rmse(correct,estimated):
    rmse_val = np.sqrt(mean_squared_error(correct,estimated)) 
    return rmse_val

def calc_error_metric(name, model, X_train, y_train, X_test, y_test):
    
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)
        
    #MAE, RMS, MAPE, R2
    
    r2_train = r2_score(y_train, y_train_predicted)
    r2_test = r2_score(y_test, y_test_predicted)
    
    rms_train = rmse(y_train, y_train_predicted)
    rms_test = rmse(y_test, y_test_predicted)
        
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)
        
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100
    
    rmse_dict[name] = rms_test
        
    df_local = pd.DataFrame({'Model':[name],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test],
                            'mape_train':[mape_train],
                            'mape_test':[mape_test]})

    error_metric = pd.concat([df_local])
 
    return error_metric

# Train and Test Split

In [8]:
# Train/test split
X = df.drop('Appliances',axis=1)
y = df['Appliances']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Random Forrest with All Features Included

In [19]:
rf_all_features = RandomForestRegressor(n_estimators=30, max_depth=30, n_jobs=-1)
rf_all_features.fit(X_train,y_train)
calc_error_metric('Random Forrest All Features',rf_all_features,X_train,y_train,X_test,y_test)

Unnamed: 0,Model,r2_train,r2_test,rms_train,rms_test,mae_train,mae_test,mape_train,mape_test
0,Random Forrest All Features,0.926856,0.519981,27.652819,71.472937,12.811918,33.544789,12.673099,32.406006


# Random Forest with Forward Feature Selection

In [9]:
# Build RF classifier to use in feature selection
rf1 = RandomForestRegressor(n_estimators=30, max_depth=30, n_jobs=-1)

In [10]:
# Build step forward feature selection
sfs1 = sfs(rf1, k_features=15, forward=True, verbose=2,scoring='neg_mean_squared_error',cv=2,n_jobs=-1)

In [11]:
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:   15.8s finished

[2018-11-04 21:41:17] Features: 1/30 -- score: -9057.664519878455[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:   18.7s finished

[2018-11-04 21:41:36] Features: 2/30 -- score: -8823.408226180793[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:   22.1s finished

[2018-11-04 21:41:58] Features: 3/30 -- score: -6568.073696611838[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:   24.4s finished

[2018-11-04 21:42:22] Features: 4/30 -- score: -6142.487788319474[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed:   2

In [12]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[1, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]


In [13]:
rf_fwd = RandomForestRegressor(n_estimators=30, max_depth=30)
rf_fwd.fit(X_train.iloc[:, feat_cols], y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [21]:
calc_error_metric('Random Forrest FWD Selection',rf_fwd,X_train.iloc[:, feat_cols],y_train,X_test.iloc[:,feat_cols],y_test)

Unnamed: 0,Model,r2_train,r2_test,rms_train,rms_test,mae_train,mae_test,mape_train,mape_test
0,Random Forrest FWD Selection,0.935977,0.575913,25.871247,67.179993,11.95273,31.081771,11.704076,30.192527


# Random Forest with Backward Feature Selection

In [27]:
rf2 = RandomForestRegressor(n_estimators=30,max_depth=30,n_jobs=-1)

In [28]:
# Build step backward feature selection
sfs_bwd = sfs(rf2, k_features=30, forward=False, verbose=2,scoring='neg_mean_squared_error',cv=2,n_jobs=-1)

In [30]:
sfs_bwd = sfs_bwd.fit(X_train.values,y_train.values)

[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.6min finished

[2018-11-04 22:17:40] Features: 34/30 -- score: -6012.418103961182[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:  1.6min finished

[2018-11-04 22:19:18] Features: 33/30 -- score: -5900.1960127265365[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:  1.5min finished

[2018-11-04 22:20:48] Features: 32/30 -- score: -5888.020126303865[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  1.5min finished

[2018-11-04 22:22:19] Features: 31/30 -- score: -5885.324066461927[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:  1.4min finished

[2018-11-04 22:23:42] Features: 30/30 -- score: -5906.5993726289

In [31]:
feat_cols_bw = list(sfs_bwd.k_feature_idx_)
print(feat_cols_bw)

[1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34]


In [33]:
rf_bw = RandomForestRegressor(n_estimators=30, max_depth=30)
rf_bw.fit(X_train.iloc[:, feat_cols_bw], y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [34]:
calc_error_metric('Random Forrest BWD Selection',
                  rf_bw,X_train.iloc[:, feat_cols_bw],y_train,
                  X_test.iloc[:,feat_cols_bw],y_test)

Unnamed: 0,Model,r2_train,r2_test,rms_train,rms_test,mae_train,mae_test,mape_train,mape_test
0,Random Forrest BWD Selection,0.935673,0.561712,25.932621,68.295498,12.082505,31.644982,11.917106,30.582572


### The Random Forrest model with Forward Selection yields a better model than the Backward Selection.  
### However both the models are better than the one which contains all the features.