%% [markdown]<br>
### Import necessary packages<br>


%%

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%%

In [2]:
# Load the housing data set from the project's data directory.
df = pd.read_csv(filepath_or_buffer='data/housing.csv')

# Report the (rows, columns) dimensions, then preview the first five rows.
print('Shape: ', df.shape)
print(df.head(n=5))

Shape:  (1460, 63)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour  \
0   1          60       RL         65.0     8450   Pave      Reg         Lvl   
1   2          20       RL         80.0     9600   Pave      Reg         Lvl   
2   3          60       RL         68.0    11250   Pave      IR1         Lvl   
3   4          70       RL         60.0     9550   Pave      IR1         Lvl   
4   5          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig  ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea  \
0    AllPub    Inside  ...             0         0           0        0   
1    AllPub       FR2  ...             0         0           0        0   
2    AllPub    Inside  ...             0         0           0        0   
3    AllPub    Corner  ...           272         0           0        0   
4    AllPub       FR2  ...             0         0           0        0   

  MiscVal MoSold  YrSold  SaleType  SaleCondition

%% [markdown]<br>
### Display the data types of the dataframe

%%

In [3]:
# Show the dtype pandas assigned to each column of the housing frame.
print(df.dtypes)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 63, dtype: object


%% [markdown]<br>
### Split the data into train and test sets. In this housing data, "SalePrice" is the dependent (target) variable.

%%

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; "SalePrice" is the target and every
# other column is a feature. random_state is fixed so the split is reproducible.
#
# train_test_split returns the pieces grouped per input array, i.e.
# (X_train, X_test, y_train, y_test). Unpacking them in any other order
# silently binds feature frames to the target names and vice versa.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['SalePrice'], axis=1),
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)

%% [markdown]<br>
### Select the numeric columns only

%%

In [5]:
# Keep only the numeric feature columns of the training split.
numeric_X_train = X_train.select_dtypes(include=[np.number])

In [6]:
# Summarise the numeric training subset: how many columns it has,
# what they are called, and what the first five rows look like.
print(numeric_X_train.shape[1])
print(numeric_X_train.columns)
print(numeric_X_train.head(n=5))

35
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
        Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
64      65          60    70.049958     9375            7            5   
682    683         120    70.049958     2887            6            5   
960    961          20    50.000000     7207            5            7   
1384  1385          50    60.000000     9060            6            5   
1100  1101          30    60.000000     8400            2            5 

%% [markdown]<br>
### Use the VarianceThreshold feature selector to keep the features whose<br>
### variance is greater than zero

%%

In [7]:
from sklearn.feature_selection import VarianceThreshold

In [8]:
# Configure the selector with a variance threshold of 0, then fit it on the
# numeric training columns so it learns which features clear that threshold.
vs_constants = VarianceThreshold(threshold=0)
# Left as the cell's final expression so the notebook echoes the fitted selector.
vs_constants.fit(numeric_X_train)

VarianceThreshold(threshold=0)

In [9]:
# Boolean mask over the numeric columns: True where the fitted selector
# keeps the feature. Fetch it once and report its length and contents.
support_mask = vs_constants.get_support()
print(len(support_mask))
print(support_mask)

35
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]


%% [markdown]<br>
### Get the names of the constant columns (those not selected by VarianceThreshold)

%%

In [10]:
# Labels the fitted selector retained; computed once rather than per column.
selected_columns = numeric_X_train.columns[vs_constants.get_support()]

# Any numeric column missing from the retained set was rejected as constant.
constant_columns = [
    name for name in numeric_X_train.columns
    if name not in selected_columns
]

# Compare the column counts at each stage of the selection.
print('Lenght of X train columns: ', len(X_train.columns))
print('Lenght of numeric X train columns: ', len(numeric_X_train.columns))
print('Lenght of constant columns: ', len(constant_columns))

Lenght of X train columns:  62
Lenght of numeric X train columns:  35
Lenght of constant columns:  0


%%