## Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import  RFE
from sklearn.pipeline import Pipeline


In [2]:
housing = pd.read_csv('../datasets/housing.csv',index_col='Id')

In [3]:
housing

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice,BedroomA bvGr
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,SC60,RL,0.0,13517,2,,3,Lvl,4,...,0,0,No,No,0,Mar,2010,WD,130500,3
544,531379050,SC60,RL,43.0,11492,2,,3,Lvl,4,...,0,0,No,No,0,Apr,2009,WD,220000,4
153,535304180,SC20,RL,68.0,7922,2,,4,Lvl,4,...,0,0,No,No,0,Jan,2010,WD,109000,3
318,916386060,SC60,RL,73.0,9802,2,,4,Lvl,4,...,0,0,No,No,0,Apr,2010,WD,174000,3
255,906425045,SC50,RL,82.0,14235,2,,3,Lvl,4,...,0,0,No,No,0,Mar,2010,WD,138500,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,921126030,SC20,RL,79.0,11449,2,,3,HLS,4,...,0,0,No,No,0,Jan,2008,WD,298751,3
785,905377130,SC30,RL,0.0,12342,2,,3,Lvl,4,...,0,0,No,No,0,Mar,2009,WD,82500,1
916,909253010,SC50,RL,57.0,7558,2,,4,Bnk,4,...,0,0,No,No,0,Mar,2009,WD,177000,3
639,535179160,SC20,RL,80.0,10400,2,,4,Lvl,4,...,0,0,No,No,0,Nov,2009,WD,144000,3


## Preprocessing and Feature Engineering

In [4]:
#Creating new features to analyse age related features better

housing ['house age'] = housing['Yr Sold'] - housing['Year Built'] 
housing ['reno newness'] = housing['Yr Sold'] - housing['Year Remod/Add'] 

In [5]:
#Changing the data type of some numerical features as they should be categorical
housing[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']] = housing[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']].astype(str) 

In [6]:
# Droping PID and Year of remodling as they do not add information to analysis
housing.drop(['PID','Year Remod/Add'] , axis =1, inplace = True)

In [7]:
categorical_features = housing.select_dtypes(include = ["object"]).columns
categorical_features

Index(['MS SubClass', 'MS Zoning', 'Alley', 'Land Contour', 'Lot Config',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Year Built', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'BsmtFin Type 1',
       'BsmtFin Type 2', 'Heating', 'Central Air', 'Electrical', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Fence', 'Misc Feature', 'Mo Sold',
       'Yr Sold', 'Sale Type'],
      dtype='object')

In [8]:
#Enconding categorical features with get_dummies

housing =pd.get_dummies(housing, columns =['MS SubClass', 'MS Zoning', 'Alley', 'Land Contour', 'Lot Config',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Year Built', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'BsmtFin Type 1',
       'BsmtFin Type 2', 'Heating', 'Central Air', 'Electrical', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Fence', 'Misc Feature', 'Mo Sold',
       'Yr Sold', 'Sale Type'], drop_first = True)

In [9]:
#sns_plot.figure.savefig("output.png")

In [10]:
#dropping outliers

housing = housing[(housing['Gr Liv Area']<4000)]

In [11]:

housing.shape


(2049, 458)

### Selecting features via correlation and RFE

In [12]:
cor =housing.corr()

In [13]:
cor_target = abs(cor['SalePrice'])

In [14]:
drop_columns = pd.DataFrame([cor_target[cor_target<0.25]])

In [15]:
housing =housing.drop(columns =drop_columns)

In [16]:
housing.shape

(2049, 51)

In [17]:
X = housing.drop(columns ="SalePrice")
y = housing['SalePrice']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

In [19]:
# Scaling data
ss = StandardScaler()
ss.fit(X_train)

X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [20]:
#Instatiating model and recursive feature elimination. Fitting data.
lasso = LassoCV(n_alphas = 200, cv =5)
rfe = RFE(lasso)
rfe_fit = rfe.fit(X_train,y_train)

In [21]:
# creating a dataframe of top features after RFE
rfe_features = pd.DataFrame(rfe_fit.support_)
rfe_rank = pd.DataFrame(rfe_fit.ranking_)
rfe_columns = pd.DataFrame(X.columns)
# concating into two dataframes 
features = pd.concat([rfe_columns,rfe_features,rfe_rank],axis=1)
# naming the dataframe columns
features.columns = ['Features','Selected','Rank']

In [22]:
features

Unnamed: 0,Features,Selected,Rank
0,Lot Area,True,1
1,Lot Shape,True,1
2,Overall Qual,True,1
3,Mas Vnr Area,True,1
4,Exter Qual,True,1
5,Bsmt Qual,True,1
6,Bsmt Exposure,True,1
7,BsmtFin SF 1,True,1
8,Total Bsmt SF,True,1
9,Heating QC,True,1


In [23]:
# isolate the top features into a dataframe
top_f = features.loc[features['Selected']==True]

In [24]:
top_f = [i for i in top_f['Features']]

In [25]:
top_f

['Lot Area',
 'Lot Shape',
 'Overall Qual',
 'Mas Vnr Area',
 'Exter Qual',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin SF 1',
 'Total Bsmt SF',
 'Heating QC',
 '1st Flr SF',
 'Gr Liv Area',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'house age',
 'reno newness',
 'MS SubClass_SC60',
 'MS Zoning_RM',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_StoneBr',
 'Year Built_2008',
 'Roof Style_Gable',
 'Roof Style_Hip']

In [26]:
X_top = housing[top_f]

In [27]:
X_top

Unnamed: 0_level_0,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin SF 1,Total Bsmt SF,Heating QC,...,house age,reno newness,MS SubClass_SC60,MS Zoning_RM,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_StoneBr,Year Built_2008,Roof Style_Gable,Roof Style_Hip
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,13517,3,6,289.0,4,3,0,533.0,725.0,5,...,34,5,1,0,0,0,0,0,1,0
544,11492,3,7,132.0,4,4,0,637.0,913.0,5,...,13,12,1,0,0,0,0,0,1,0
153,7922,4,5,0.0,3,3,0,731.0,1057.0,3,...,57,3,0,0,0,0,0,0,1,0
318,9802,4,5,0.0,3,4,0,0.0,384.0,4,...,4,3,1,0,0,0,0,0,1,0
255,14235,3,6,0.0,3,2,0,0.0,676.0,3,...,110,17,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,11449,3,8,0.0,4,4,2,1011.0,1884.0,5,...,1,1,0,0,0,0,0,0,1,0
785,12342,3,4,0.0,3,3,0,262.0,861.0,5,...,69,59,0,0,0,0,0,0,1,0
916,7558,4,6,0.0,3,3,0,0.0,896.0,4,...,81,59,0,0,0,0,0,0,1,0
639,10400,4,4,0.0,3,3,0,155.0,1200.0,3,...,53,53,0,0,0,0,0,0,1,0


In [29]:
# Saving X: Top 25 columns and Y data to CSV
X_top.to_csv('../datasets/X_top.csv')
y.to_csv('../datasets/y.csv')