# Project 2 Part 3: Feature Engineering and Preprocessing

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV

In [3]:
# Always display every column and every row of a DataFrame
# (passing None removes pandas' default truncation limit for that option)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Import the cleaned, concatenated dataset.
# Training and test rows live in the same frame here and are separated by row position further down.
concat_df = pd.read_csv('datasets/concat_df_cleaned.csv')

In [5]:
concat_df.shape

(2926, 282)

## Feature Engineering Methodology

#### "Art is the reduction of the unnecessary" Pablo Picasso

After cleaning null values and dummifying the categorical data, I ended up with 282 columns in the overall concatenated dataset. I wish to avoid the kitchen-sink approach of throwing all features into the models, as there are bound to be many irrelevant and redundant features.

These are the downsides of the kitchen-sink approach that I wish to avoid:

### Downsides of having too many features

1) Overfitting - leading to less accurate results.

2) Slow and expensive processing and execution time.

3) Curse of dimensionality where predictive power decreases as dimensionality increases.

4) Law of Parsimony and Occam's razor: the simplest method that reaches the same result should be used. In this case, for the same results, a model with fewer and easier-to-explain features is preferable.


### Methods used for feature selection

Out of the 12 methods that my external research has netted me, I will be using a mix of different methods, including:

1,2) Correlation analysis - Pairwise correlation and correlation with target 

3) Variance analysis - to drop low variation features

4) Backward elimination (RFE)

Together with other steps that are located in Data Cleaning and my final Model notebook which are: 

5) Multicollinearity

6) Lasso

7) Intuition on irrelevant and redundant features

### External research

Recursive feature elimination
https://www.linkedin.com/pulse/what-recursive-feature-elimination-amit-mittal

Vishal Patel | A Practical Guide to Dimensionality Reduction Techniques
https://www.youtube.com/watch?v=ioXKxulmwVQ

## Correlation analysis

In [6]:
# Preliminary separation of the concatenated dataframe into training and test rows.
# The boundary sits at row 2048 rather than 2051 because 3 outliers were removed from the training data.
housing, test = concat_df.iloc[:2048], concat_df.iloc[2048:]

A correlation matrix will be created using only the housing training dataset, to ensure there is no leakage of test data. As I will be looking at the correlation with sale price, I cannot create it while the test data is still inside the concatenated dataframe.

In [7]:
# Absolute correlation between every pair of columns in the training data
corr_matrix = housing.corr().abs()

# Boolean mask of the strict upper triangle (k=1 leaves out the diagonal), so that
# each pair appears once and a column is never paired with itself
upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

# Blank out everything outside the mask, flatten to a (v1, v2)-indexed series
# and rank the pairs from most to least correlated
sol = corr_matrix.where(upper_triangle).stack().sort_values(ascending=False)

# Keep the 20 strongest pairs; reset_index turns the two index levels into ordinary columns
corr_df = sol.head(20).reset_index()

# Rename columns
corr_df.columns = ['v1', 'v2', 'pair_corr']

In [8]:
def corr_target(row, target_corr=None):
    """Attach each feature's correlation with SalePrice to a correlation-pair row.

    Parameters
    ----------
    row : pandas.Series
        One row of the pairwise-correlation table; must contain the feature
        names under the keys 'v1' and 'v2'.
    target_corr : pandas.Series, optional
        Correlation of every feature with 'SalePrice', indexed by feature name.
        When omitted it is derived from the global ``housing`` dataframe.
        Passing a precomputed series avoids rebuilding the correlation matrix
        on every call (e.g. when used with ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        The same row, with 'v1_y_corr' and 'v2_y_corr' added.
    """
    if target_corr is None:
        # Build the correlation matrix once per call; the original built it
        # separately for each of the two lookups below.
        target_corr = housing.corr()['SalePrice']
    row['v1_y_corr'] = target_corr[row['v1']]
    row['v2_y_corr'] = target_corr[row['v2']]
    return row

In [9]:
# Create df with pairwise correlation and correlation to target.
# The feature-vs-SalePrice correlations are computed once and then looked up by
# feature name, instead of rebuilding the whole correlation matrix for every row.
target_corr = housing.corr()['SalePrice']
corr_df = corr_df.assign(
    v1_y_corr=corr_df['v1'].map(target_corr),
    v2_y_corr=corr_df['v2'].map(target_corr),
)

In [10]:
corr_df.head(20)

Unnamed: 0,v1,v2,pair_corr,v1_y_corr,v2_y_corr
0,Central Air_N,Central Air_Y,1.0,-0.277493,0.277493
1,Garage Qual_None,Garage Cond_None,1.0,-0.230954,-0.230954
2,Garage Finish_None,Garage Qual_None,1.0,-0.230954,-0.230954
3,Garage Finish_None,Garage Cond_None,1.0,-0.230954,-0.230954
4,Street_Grvl,Street_Pave,1.0,-0.069864,0.069864
5,Bldg Type_Duplex,MS SubClass_90,1.0,-0.103759,-0.103759
6,Garage Yr Blt,Garage Qual_None,0.998579,0.258554,-0.230954
7,Garage Yr Blt,Garage Finish_None,0.998579,0.258554,-0.230954
8,Garage Yr Blt,Garage Cond_None,0.998579,0.258554,-0.230954
9,Exterior 1st_CemntBd,Exterior 2nd_CmentBd,0.988254,0.168285,0.157714


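A large number of `Exterior 1st` and `Exterior 2nd` dummy features (the first and second exterior covering material of the house) kept appearing among the highly correlated pairs. To reduce these near-duplicate terms, I will create one interaction feature per material (the product of the matching `Exterior 1st` and `Exterior 2nd` dummies) before dropping the original exterior feature columns.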

In [11]:
# Three 'Exterior 2nd' dummies are spelled differently from their 'Exterior 1st' counterparts.
# Align them with the 'Exterior 1st' spelling so the pairing loop below can match columns by suffix.
exterior_2nd_renames = {
    'Exterior 2nd_Wd Shng': 'Exterior 2nd_WdShing',
    'Exterior 2nd_Brk Cmn': 'Exterior 2nd_BrkComm',
    'Exterior 2nd_CmentBd': 'Exterior 2nd_CemntBd',
}
concat_df = concat_df.rename(columns=exterior_2nd_renames)

In [12]:
# Pick out every column whose name contains 'Exterior 1st'
is_exterior_1st = concat_df.columns.str.contains('Exterior 1st')
ext_features = concat_df.columns[is_exterior_1st]
ext_features

Index(['Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm',
       'Exterior 1st_BrkFace', 'Exterior 1st_CBlock', 'Exterior 1st_CemntBd',
       'Exterior 1st_HdBoard', 'Exterior 1st_ImStucc', 'Exterior 1st_MetalSd',
       'Exterior 1st_Plywood', 'Exterior 1st_PreCast', 'Exterior 1st_Stone',
       'Exterior 1st_Stucco', 'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng',
       'Exterior 1st_WdShing'],
      dtype='object')

In [13]:
# Build one interaction column per exterior type: the product of the matching
# 'Exterior 1st' and 'Exterior 2nd' dummies. Both source columns are dropped afterwards.
for first_name in ext_features:
    ext_type = first_name.split('_')[1]
    first_col = f'Exterior 1st_{ext_type}'
    second_col = f'Exterior 2nd_{ext_type}'
    concat_df[f'Ext{ext_type}'] = concat_df[first_col] * concat_df[second_col]
    concat_df = concat_df.drop(columns=[first_col, second_col])

In [14]:
# Columns removed because of a perfect pairwise correlation score of 1;
# all the Garage "*_None" dummies also go because of their high correlation with Garage Yr Blt
perfectly_correlated = [
    'Street_Grvl',
    'Garage Cond_None',
    'Garage Finish_None',
    'Garage Qual_None',
    'Central Air_N',
    'MS SubClass_90',
]
concat_df = concat_df.drop(columns=perfectly_correlated)

In [15]:
# Columns removed because of a high pairwise correlation score.
# The original note says "lower of two pair" - presumably the member of each pair
# with the weaker correlation to SalePrice; confirm against the correlation table.
highly_correlated = [
    'MS SubClass_190',
    'MS SubClass_80',
    'Roof Style_Gable',
    'MS SubClass_50',
    'Garage Cars',
    'MS Zoning_FV',
    'MS SubClass_45',
]
concat_df = concat_df.drop(columns=highly_correlated)

In [16]:
concat_df.shape

(2926, 253)

### Low Variance analysis

I have managed to shrink the number of features to 253 in my previous step. In the next steps ahead, a large number of features with low variance will be identified, which suggests that these values do not change much and are pretty constant. As such, I will be dropping these features. A total of 85 features will be dropped through low variance. I have looked through the features and I do not disagree with the dropping of these features.

In [17]:
# Per-column variance, largest first, keeping only the columns whose variance is below 0.009
low_variance = (
    concat_df.var()
    .sort_values(ascending=False)
    .loc[lambda variances: variances < 0.009]
)

In [18]:
# Remove every feature flagged as low variance (var < 0.009)
low_var_drop_list = list(low_variance.index)
concat_df = concat_df.drop(columns=low_var_drop_list)

In [19]:
concat_df.shape

(2926, 168)

In [20]:
# Second train/test separation, now on the reduced feature set:
# rows up to (not including) position 2048 are training data, the remainder is test data
housing, test = concat_df.iloc[:2048], concat_df.iloc[2048:]

### Recursive feature elimination 

Recursive feature elimination (RFE) is a feature selection method that fits a model and removes the weakest feature (or features) until the specified number of features is reached. Using RFE with cross-validation (RFECV), I will let scikit-learn find the number of features that gives the best cross-validated score, i.e. the highest negative mean squared error (equivalently, the lowest mean squared error). As before, RFE will only be run on the training dataset to ensure that there is no leakage of test data.

In [21]:
housing.shape

(2048, 168)

In [22]:
test.shape

(878, 168)

In [23]:
# Candidate predictors: every numeric (or boolean) column except the target.
# select_dtypes is the public pandas API for this; the original used the private
# DataFrame._get_numeric_data() helper, which is not part of the supported interface.
numeric_columns = housing.select_dtypes(include=['number', 'bool']).columns
features = [col for col in numeric_columns if col != 'SalePrice']

# Design matrix and target vector
X = housing[features]
y = housing['SalePrice']

In [24]:
# Recursive feature elimination with 20-fold cross-validation, scored on negative MSE.
# RFECV is already imported with the other scikit-learn imports at the top of the
# notebook, so it is not imported again here.
rfe_inputs = housing.loc[:, housing.columns != 'SalePrice']
rfe_target = housing['SalePrice']

selector = RFECV(estimator=LinearRegression(), cv=20, scoring='neg_mean_squared_error')
selector.fit(rfe_inputs, rfe_target)
print(f'Optimal number of features: {selector.n_features_:d}')

Optimal number of features: 166


RFECV selected 166 of the 167 candidate features. The single weakest feature is therefore eliminated, which should improve the negative mean squared error score (i.e. lower the mean squared error).

In [25]:
# Names of the columns RFECV kept: selector.support_ is applied as a mask over the same
# non-target columns (in the same order) that the selector was fitted on
candidate_columns = housing.columns[housing.columns != 'SalePrice']
final_column = candidate_columns[selector.support_].tolist()

In [26]:
final_column

['Lot Frontage',
 'Lot Area',
 'Lot Shape',
 'Land Slope',
 'Overall Qual',
 'Overall Cond',
 'Mas Vnr Area',
 'Exter Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'BsmtFin Type 2',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Heating QC',
 'Electrical',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Yr Blt',
 'Garage Area',
 'Paved Drive',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 'Screen Porch',
 'Pool Area',
 'Fence',
 'Misc Val',
 'Age',
 'Misc Feature_Shed',
 'Alley_Grvl',
 'Alley_Pave',
 'Fireplace Qu_Ex',
 'Fireplace Qu_Fa',
 'Fireplace Qu_Gd',
 'Fireplace Qu_Po',
 'Fireplace Qu_TA',
 'Garage Type_Attchd',
 'Garage Type_Basment',
 'Garage Type_BuiltIn',
 'Garage Type_Detchd',
 'Bsmt Cond_Fa',
 'Bsmt Cond_Gd',
 'Bsmt Cond_TA',
 'Bsmt Qual_Ex',
 'Bsmt Qual_Fa',
 'Bsmt Qual_Gd',
 'Bsmt

In [27]:
# Final concatenated df: only the columns listed in final_column, in their existing order
is_selected = concat_df.columns.isin(final_column)
concat_df_final = concat_df.loc[:, is_selected]

In [28]:
#Add SalePrice back as the first column.
# DataFrame.insert modifies the frame in place and raises
# "ValueError: cannot insert SalePrice, already exists" if the column is present,
# so the guard keeps this cell safe to re-run.
if 'SalePrice' not in concat_df_final.columns:
    concat_df_final.insert(0, 'SalePrice', concat_df['SalePrice'])

In [29]:
concat_df_final.shape

(2926, 167)

In [30]:
# Final train/test separation on the selected features:
# the first 2048 rows form the housing training data, everything after is the test data
housing, test = concat_df_final.iloc[:2048], concat_df_final.iloc[2048:]

### Final checks to ensure same number of columns

In [31]:
# Checking for features in housing (training) dataset but not in test dataset
# (iterating a DataFrame yields its column labels; an empty list means none are missing)
[x for x in housing if x not in test]

[]

In [32]:
# Checking for features in test dataset but not in housing dataset
[x for x in test if x not in housing]

[]

In [33]:
housing.shape

(2048, 167)

In [34]:
test.shape

(878, 167)

### Saving processed test and housing dataset to be used in Part 4

In [35]:
# Save the processed test dataset for use in Part 4
# (index=False leaves the row index out of the CSV)
test.to_csv('./datasets/test_final.csv', index=False)

In [36]:
# Save the processed housing training dataset for use in Part 4
# (index=False leaves the row index out of the CSV)
housing.to_csv('./datasets/housing_final.csv', index=False)