In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

In [59]:
house_data = pd.read_csv('./housing_kaggle_train.csv')

In [60]:
# Initial look at data
print(house_data.shape)
print(house_data.head())
house_data.isnull().sum()

(1460, 81)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2 

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [61]:
# Select only numeric factors for initial regression 
house_data_num = house_data.select_dtypes(['int64', 'float64'])
print(house_data_num.head())

   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1    ...      WoodDeckSF  OpenPorchSF  \
0          2003       196.0         706    ...               0           61   
1          1976         0.0         978    ...             298            0   
2          2002       162.0         486    ...               0           42   
3          1970         0.0         216    ...               0           35   
4          2000       350.0         655    ...             192           84   

   EnclosedPorch  3SsnPorch  ScreenPorch  Po

### Train & Test split

In [63]:
house_train, house_test, target_train, target_test = train_test_split(
    house_data_num.iloc[:,house_data_num.columns != 'SalePrice'], 
    house_data_num['SalePrice'], test_size=0.33, random_state=42)

In [65]:
# Replace NaN & Nulls with the median of the column then standardise
print(house_train.isnull().sum())

house_train = house_train.fillna(house_train.median())
print(house_train.isnull().sum())

print(house_test.isnull().sum())

house_test = house_test.fillna(house_train.median())
print(house_test.isnull().sum())

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64
Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0

In [66]:
scaler = StandardScaler()
scaler.fit(house_train)
house_train_scaled = scaler.transform(house_train)
house_test_scaled = scaler.transform(house_test)

  return self.partial_fit(X, y)
  app.launch_new_instance()


In [80]:
# Lasso regression to identify factors
lasso = linear_model.Lasso(alpha=1)
lasso.fit(house_train_scaled, target_train)
print(lasso.coef_)

[-1083.80772589 -9300.97132316 -3226.97289586  5567.9339081
 25007.36195276  3264.44192893  7460.7889029   4309.87938111
  4636.80194279  4114.54927755  -151.10248799     0.
   136.04618342 13219.00426165 13264.3992655   2599.96531143
  2164.49921224  6271.03971668   -87.38119454  2898.815841
    47.28086466 -7627.50058546 -2199.00894872 11184.20720622
  3376.44040775  2290.1809666   9349.4514674     87.64147296
  3408.56778486  -792.89021574   927.51999443  1643.37333593
  5060.53718887 -5232.10825868  -475.54045883  -124.06356004
  -267.74915703]


In [81]:
lasso.score(house_test_scaled, target_test)

0.7849895105683152

In [82]:
results = []
for i in np.arange(0, 1, 0.1):
    print(i)
    lasso_test = linear_model.Lasso(alpha=i)
    lasso_test.fit(house_train_scaled, target_train)
    score = lasso_test.score(house_test_scaled, target_test)
    results.append((i, score))
    
print(results)
    

0.0


  positive)


0.1




0.2




0.30000000000000004
0.4
0.5
0.6000000000000001
0.7000000000000001
0.8
0.9
[(0.0, 0.7849800694711407), (0.1, 0.7849810130040558), (0.2, 0.7849819564123158), (0.30000000000000004, 0.7849829053876289), (0.4, 0.7849838466575371), (0.5, 0.7849847886511799), (0.6000000000000001, 0.7849857330085324), (0.7000000000000001, 0.7849866753493205), (0.8, 0.7849876279869215), (0.9, 0.7849885647210982)]


In [83]:
lasso.predict(house_test)

array([9.30959118e+07, 1.41362422e+08, 9.21287038e+07, 8.14003455e+07,
       1.28949761e+08, 4.40645481e+07, 1.46895533e+08, 6.15216357e+07,
       4.60914984e+07, 1.40763841e+08, 8.37249690e+07, 9.61656427e+07,
       1.39325839e+08, 1.08347362e+08, 9.74207514e+07, 9.39208107e+07,
       1.00009464e+08, 8.51813203e+07, 8.71435087e+07, 1.15381120e+08,
       1.27895761e+08, 1.01615014e+08, 9.86559313e+07, 7.50109712e+07,
       1.02148498e+08, 9.02485819e+07, 9.80440135e+07, 1.46797794e+08,
       1.11818110e+08, 6.97995684e+07, 8.74111307e+07, 1.23528042e+08,
       1.69382210e+08, 8.97387533e+07, 1.28442697e+08, 1.00329808e+08,
       9.56768559e+07, 1.02734406e+08, 1.50967562e+08, 1.10915592e+08,
       7.59305703e+07, 1.11826760e+08, 9.67387063e+07, 1.30963212e+08,
       1.00695089e+08, 1.13835566e+08, 8.45298632e+07, 8.81437658e+07,
       1.45997120e+08, 1.14798060e+08, 8.30800959e+07, 1.12412048e+08,
       6.08174646e+07, 1.62234128e+08, 6.77235984e+07, 1.25202134e+08,
      