## ENV Setup

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import cross_validate

# Initial Investigation & Processing
Read in the housing data from kaggle and investigate the data types as well as missing values. For this initial implementation only numerical data will be used 

In [4]:
house_data = pd.read_csv('./housing_kaggle_train.csv')

In [5]:
# Initial look at data
print(house_data.shape)
print(house_data.head())
print(house_data.isnull().sum())

(1460, 81)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2 

In [6]:
# Select only numeric factors for initial regression 
house_data_num = house_data.select_dtypes(['int64', 'float64'])
print(house_data_num.head())

   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1    ...      WoodDeckSF  OpenPorchSF  \
0          2003       196.0         706    ...               0           61   
1          1976         0.0         978    ...             298            0   
2          2002       162.0         486    ...               0           42   
3          1970         0.0         216    ...               0           35   
4          2000       350.0         655    ...             192           84   

   EnclosedPorch  3SsnPorch  ScreenPorch  Po

### Train & Test split
Split the data into two groups roughly 70:30 for the training and test sets respectively

In [7]:
house_train, house_test, target_train, target_test = train_test_split(
    house_data_num.iloc[:,house_data_num.columns != 'SalePrice'], 
    house_data_num['SalePrice'], test_size=0.33, random_state=42)

In [8]:
# Replace NaN & Nulls with the median of the column then standardise
print(house_train.isnull().sum())

house_train = house_train.fillna(house_train.median())
print(house_train.isnull().sum())

print(house_test.isnull().sum())

house_test = house_test.fillna(house_train.median())
print(house_test.isnull().sum())

Id                 0
MSSubClass         0
LotFrontage      183
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         2
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       53
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64
Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
B

### Scale / Normalise the data
In order to take advantage of feature selection with lasso regression the data must first be scaled to accurately gauge how much influnce each factotor has on the models accuracy.

In [9]:
scaler = StandardScaler()
scaler.fit(house_train)
house_train_scaled = scaler.transform(house_train)
house_test_scaled = scaler.transform(house_test)

  return self.partial_fit(X, y)
  app.launch_new_instance()


## Modelling
Using Lasso regression, fit a model and identify the factors most contributing to the models performance. Once fit, test the score of the fitted model

In [10]:
# Lasso regression to identify factors
lasso = linear_model.Lasso(alpha=1)
lasso.fit(house_train_scaled, target_train)
print(lasso.coef_)

[-1083.80772589 -9300.97132316 -3226.97289586  5567.9339081
 25007.36195276  3264.44192893  7460.7889029   4309.87938111
  4636.80194279  4114.54927755  -151.10248799     0.
   136.04618342 13219.00426165 13264.3992655   2599.96531143
  2164.49921224  6271.03971668   -87.38119454  2898.815841
    47.28086466 -7627.50058546 -2199.00894872 11184.20720622
  3376.44040775  2290.1809666   9349.4514674     87.64147296
  3408.56778486  -792.89021574   927.51999443  1643.37333593
  5060.53718887 -5232.10825868  -475.54045883  -124.06356004
  -267.74915703]


In [11]:
lasso.score(house_test_scaled, target_test)

0.7849895105683152

### Linear Regression Models

In [12]:
reg = linear_model.LinearRegression()
reg_std = linear_model.LinearRegression()

reg.fit(house_train, target_train)
reg_std.fit(house_train_scaled, target_train)

print(reg.coef_)
print(reg_std.coef_)

[-2.53694161e+00 -2.17102867e+02 -1.40368828e+02  4.82166940e-01
  1.82549595e+04  2.93482163e+03  2.42832760e+02  2.07644232e+02
  2.65232481e+01  7.04007679e+00 -2.91363068e+00 -1.90904847e+00
  2.21739764e+00  5.21575432e+00  1.48287256e+00  2.61874092e+01
  3.28860361e+01  1.20547814e+04 -3.60760732e+02  5.32858709e+03
  9.76494410e+01 -9.40831571e+03 -1.02593285e+04  6.91622569e+03
  5.18926608e+03  9.44398674e+01  1.27610491e+04  4.17867532e-01
  2.59362668e+01 -1.14057296e+01  1.49771916e+01  4.94624666e+01
  8.91274945e+01 -1.43461687e+02 -1.38191014e+00 -4.68628628e+01
 -2.02709325e+02]
[-1.08502256e+03 -9.30316806e+03 -3.21978776e+03  5.56571892e+03
  2.49826291e+04  3.27336925e+03  7.48232162e+03  4.30752781e+03
  4.64164133e+03  1.69848260e+16  5.60366594e+15  1.64016761e+16
 -1.65310775e+16 -2.09572579e+16 -2.36292736e+16 -2.55006874e+15
  2.81720481e+16  6.28410745e+03 -7.93892806e+01  2.90315400e+03
  5.27993080e+01 -7.63034319e+03 -2.19846950e+03  1.11963883e+04
  3.379

In [14]:
print(reg.score(house_test, target_test))
print(reg_std.score(house_test_scaled, target_test))

0.7849800694711453
0.7849595300702676


### Cross Validation for Model selection

In [23]:
lasso_cv = cross_validate(lasso, house_test_scaled, target_test, cv=(len(house_test_scaled)//10))
reg_cv = cross_validate(lasso, house_test, target_test, cv=(len(house_test)//10))
reg_std_cv = cross_validate(lasso, house_test_scaled, target_test, cv=(len(house_test_scaled)//10))









In [25]:
print(lasso_cv.keys())

dict_keys(['fit_time', 'score_time', 'test_score', 'train_score'])


In [31]:
print("\n-----------\tLasso Regression\t-----------")
print(lasso_cv['train_score'].mean())
print(lasso_cv['test_score'].mean())


print("\n-----------\tRegression\t-----------")
print(reg_cv['train_score'].mean())
print(reg_cv['test_score'].mean())

print("\n-----------\tRegression STD\t-----------")
print(reg_std_cv['train_score'].mean())
print(reg_std_cv['test_score'].mean())


-----------	Lasso Regression	-----------
0.8785364870983864
0.7240172682503259

-----------	Regression	-----------
0.878536479875437
0.7239234522634709

-----------	Regression STD	-----------
0.8785364870983864
0.7240172682503259


# Conclusion
As seen in the above results, there is little improvement provided by lasso regression in the feature selection. For that reason and to avoid computational complexity, a simple linear model should be used with out standardising the predictors.

Thanks =)