<h1><center>Regularization & Feature Selection</center></h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
# Read the train and test CSVs from ../Data into DataFrames.
train_csv = "../Data/cleaned_housing_train_set.csv"
test_csv = "../Data/cleaned_housing_test_set.csv"
housing_train_data = pd.read_csv(train_csv)
housing_test_data = pd.read_csv(test_csv)

In [3]:
# Preview the first five rows of the training frame.
housing_train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LandSlope,...,Neighborhood_NridgHt,Neighborhood_NWAmes,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,1,60,5,65.0,8450,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,20,5,80.0,9600,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,60,5,68.0,11250,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,70,5,60.0,9550,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,50,5,85.0,14115,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# (rows, columns) of the training frame.
housing_train_data.shape

(1149, 100)

In [5]:
# (rows, columns) of the test frame; the column count should match the training frame.
housing_test_data.shape

(288, 100)

In [6]:
# List the column labels available in the training frame.
housing_train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'FstFlrSF', 'SecndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       'ThreeSsnPorch', 'ScreenPorch'

In [7]:
# Every column except the target 'SalePrice' is used as a model input.
# NOTE(review): this keeps the 'Id' column as a predictor (it is the first
# label in the column listing) — confirm that is intended.
input_features = [col for col in housing_train_data.columns if col != 'SalePrice']

In [8]:
# Target vector and design matrix for the training split.
train_y = housing_train_data.loc[:, 'SalePrice']
train_X = housing_train_data[input_features]

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Fit the scaler on the training split only, then standardize that split.
sc = StandardScaler().fit(train_X)
scaled_train_X = sc.transform(train_X)

In [21]:
# Linear model trained by SGD on the standardized features.
# NOTE(review): no `penalty` is passed, so the estimator's default is used
# (the repr printed by this cell reports penalty='l2') — confirm this is the
# regularizer intended for the feature-selection step further down.
sgd_params = dict(alpha=0.3, eta0=0.01, max_iter=200, random_state=20)
sgd = SGDRegressor(**sgd_params)
sgd.fit(scaled_train_X, train_y)

SGDRegressor(alpha=0.3, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=200,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=20,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [22]:
# R^2 on the training split (the same data the model was fitted on).
train_predictions = sgd.predict(scaled_train_X)
r2_score(train_y, train_predictions)

0.832003335886051

### Test Set

In [13]:
# Target vector and design matrix for the held-out split, using the same
# input columns as the training split.
test_y = housing_test_data.loc[:, 'SalePrice']
test_X = housing_test_data[input_features]

In [14]:
# Reuse the scaler fitted on the training split; it is deliberately not refit
# on the test data.
scaled_test_X = sc.transform(test_X)

In [23]:
# R^2 on the held-out split.
test_predictions = sgd.predict(scaled_test_X)
r2_score(test_y, test_predictions)

0.83548285872368

### Feature Selection

In [24]:
# Dump the fitted weight vector; entries line up positionally with
# input_features (the column order the model was trained on).
print(sgd.coef_)

[ -781.43531164 -4285.09417182   249.09433036 -2238.94597066
  1330.13227026  1236.48141592 -1360.65734199  1266.34662518
 -1597.68619021  1159.30888039  -434.22622288 -2374.66160847
 -1339.2357939   -105.57841395 11290.0722243   2829.91680478
  2846.60628616  2800.20398117  3118.26427491  4141.21899312
  -999.49781447  -503.48302078 -5723.81404647   682.65990827
  1808.43828427     0.           103.85587998     0.
     0.          2557.29760642     0.           586.85794948
  -264.83046365  2613.22240576   206.65221589 -2610.6613561
   258.77052062   438.92161025  4025.82413636  4877.78242769
  -340.661859    6959.22030523  2574.8488      1275.35955533
  5180.06157897  1976.93434982 -1676.85479697 -3586.75284175
 -5741.99049571  5126.8451916  -2376.54651938  3803.69866991
     0.          2889.93643554     0.          5823.11475063
  1761.80876873     0.             0.          -760.60869973
  3445.762869     154.03686269  -449.699214     861.70949984
  2395.73392653 -2973.46606405  -

In [25]:
# Report every input feature whose fitted weight is exactly zero.
for feature_name, weight in zip(input_features, sgd.coef_):
    if weight != 0:
        continue
    print(feature_name, weight)

BsmtQual 0.0
BsmtExposure 0.0
BsmtFinType1 0.0
BsmtFinType2 0.0
GarageType 0.0
GarageFinish 0.0
GarageQual 0.0
GarageCond 0.0
Neighborhood_Names 0.0


### Let's eliminate the unimportant (zero-coefficient) features and rebuild the model to see how it performs.

In [26]:
# Derive the exclusion list from the fitted model instead of hard-coding names
# copied from printed output, so it cannot drift if the data or the
# hyper-parameters change.
# Guard: `sgd` is rebound later in the notebook to a model trained on fewer
# columns, so re-running this cell out of order would misalign names and
# weights; fail loudly instead.
assert len(sgd.coef_) == len(input_features), (
    "sgd was not fitted on input_features; re-run the full-feature fit first"
)
zero_weight_features = {
    name for name, weight in zip(input_features, sgd.coef_) if weight == 0
}
selected_features = [x for x in input_features if x not in zero_weight_features]

In [27]:
# Count the features that remain after the exclusion.
len(selected_features)

90

In [30]:
# Refit the same pipeline (scaler + SGD, identical hyper-parameters) on the
# reduced feature set.
# NOTE: this rebinds train_X, sc, scaled_train_X and sgd; the earlier
# full-feature objects are no longer reachable after this cell runs.
train_y = housing_train_data.loc[:, 'SalePrice']
train_X = housing_train_data[selected_features]

sc = StandardScaler().fit(train_X)
scaled_train_X = sc.transform(train_X)

sgd = SGDRegressor(alpha=0.3, eta0=0.01, max_iter=200, random_state=20)
sgd.fit(scaled_train_X, train_y)

# R^2 on the training split for the reduced model.
train_predictions = sgd.predict(scaled_train_X)
r2_score(train_y, train_predictions)

0.832003335886051

In [31]:
# Score the reduced model on the held-out split, reusing the scaler that was
# fitted on the reduced training matrix.
test_y = housing_test_data.loc[:, 'SalePrice']
test_X = housing_test_data[selected_features]

scaled_test_X = sc.transform(test_X)

test_predictions = sgd.predict(scaled_test_X)
r2_score(test_y, test_predictions)

0.83548285872368

### Play with sklearn.linear_model.ElasticNet
### Play with sklearn.linear_model.Lasso
### Play with sklearn.linear_model.Ridge