# Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
%store -r ames
%store -r features
%store -r cat

In [4]:
ames.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,68.0,13517,Pave,No Alley,IR1,Lvl,...,0,0,No Pool,No Fence,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,No Alley,IR1,Lvl,...,0,0,No Pool,No Fence,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,No Alley,Reg,Lvl,...,0,0,No Pool,No Fence,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,No Alley,Reg,Lvl,...,0,0,No Pool,No Fence,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,No Alley,IR1,Lvl,...,0,0,No Pool,No Fence,,0,3,2010,WD,138500


In [6]:
ames.shape

(2051, 81)

## One Hot-Encoding

In [22]:
# finds the first element in each category when sorted, because these will be dropped when dummied
comp = []
for x in cat:
    comp.append(sorted(ames[x].unique())[0])

In [23]:
comp

[20,
 'A (agr)',
 'Grvl',
 'Grvl',
 'IR1',
 'Bnk',
 'AllPub',
 'Corner',
 'Gtl',
 'Blmngtn',
 'Artery',
 'Artery',
 '1Fam',
 '1.5Fin',
 'Flat',
 'ClyTile',
 'AsbShng',
 'AsbShng',
 'BrkCmn',
 'Ex',
 'Ex',
 'BrkTil',
 'Ex',
 'Ex',
 'Av',
 'ALQ',
 'ALQ',
 'GasA',
 'Ex',
 'N',
 'FuseA',
 'Ex',
 'Maj1',
 'Ex',
 '2Types',
 'Fin',
 'Ex',
 'Ex',
 'N',
 'Ex',
 'GdPrv',
 'Elev',
 'COD']

Above are features that will be dropped once one-hot encoded is used.  When making inference from the linear model's coefficient, all these variables are held constant

In [25]:
features

['Lot Frontage',
 'Lot Area',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'Bsmt Unf SF',
 'BsmtFin SF 2',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val',
 'Mo Sold',
 'Yr Sold',
 'Total Bsmt SF']

In [27]:
cat

['MS SubClass',
 'MS Zoning',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Exter Qual',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 'Kitchen Qual',
 'Functional',
 'Fireplace Qu',
 'Garage Type',
 'Garage Finish',
 'Garage Qual',
 'Garage Cond',
 'Paved Drive',
 'Pool QC',
 'Fence',
 'Misc Feature',
 'Sale Type']

In [40]:
cate = pd.get_dummies(ames, columns = cat, drop_first = True)

In [41]:
cate

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Misc Feature_Shed,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,68.0,13517,6,8,1976,2005,289.0,533.0,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,43.0,11492,7,5,1996,1997,132.0,637.0,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,68.0,7922,5,7,1953,2007,0.0,731.0,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,73.0,9802,5,5,2006,2007,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,82.0,14235,6,8,1900,1993,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,79.0,11449,8,5,2007,2007,0.0,1011.0,...,0,0,0,0,0,0,0,0,0,1
2047,785,905377130,68.0,12342,4,5,1940,1950,0.0,262.0,...,0,0,0,0,0,0,0,0,0,1
2048,916,909253010,57.0,7558,6,6,1928,1950,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2049,639,535179160,80.0,10400,4,5,1956,1956,0.0,155.0,...,0,0,0,0,0,0,0,0,0,1


In [42]:
cate.shape

(2051, 279)

In [43]:
df = pd.merge(left = ames[features], right = cate, left_index = True, right_index = True )

In [44]:
ames[features].shape

(2051, 34)

In [46]:
df.shape

(2051, 313)

Index(['Lot Frontage_x', 'Lot Area_x', 'Overall Qual_x', 'Overall Cond_x',
       'Year Built_x', 'Year Remod/Add_x', 'Mas Vnr Area_x', 'BsmtFin SF 1_x',
       'Bsmt Unf SF_x', 'BsmtFin SF 2_x',
       ...
       'Misc Feature_Shed', 'Misc Feature_TenC', 'Sale Type_CWD',
       'Sale Type_Con', 'Sale Type_ConLD', 'Sale Type_ConLI',
       'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth', 'Sale Type_WD '],
      dtype='object', length=313)

## Baseline Model Score

In [51]:
X = df.drop(columns ='SalePrice')
y = ames['SalePrice']

In [52]:
lr = LinearRegression()

In [54]:
cross_val_score(lr,X, y).mean() # this is the baseline Score

0.8038096976011048

We need to improve the model with scores higher then .804

### Model 1

In [3]:
X = ames[['Mas Vnr Area','Total Bsmt SF','1st Flr SF','Gr Liv Area',
         'Full Bath','TotRms AbvGrd','Fireplaces','Garage Yr Blt','Garage Cars','Garage Area','Open Porch SF','Wood Deck SF','Lot Area']]
y = ames['SalePrice']

Train Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, test_size = .1, train_size = .9)

Scaling

In [5]:
ss = StandardScaler()

Xs_train = ss.fit_transform(X_train) # data leakage if you put test data.
Xs_test = ss.transform(X_test) # test, 

Methods

fit(X[, y]) - Compute the mean and std to be used for later scaling.
fit_transform(X[, y]) - Fit to data, then transform it.
get_params([deep]) - parameters for this estimator.
inverse_transform(X[, copy])
Scale back the data to the original representation
partial_fit(X[, y])
Online computation of mean and std on X for later scaling.
set_params(**params)
Set the parameters of this estimator.
transform(X[, copy]) -Perform standardization by centering and scaling

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 

Instantiate, Fit, Score Model

In [6]:
lr = LinearRegression()

In [7]:
lr.fit(X_train, y_train)

LinearRegression()

In [8]:
lr.score(X_train, y_train), lr.score(X_test, y_test) # this model did worst then baseline score.

(0.7149251482994713, 0.7904325092185084)

Low Bias, high variance - ok for first model  r2 score without scaling

In [9]:
cross_val_score(lr, X, y) # checking the score 5 more times

array([0.71976524, 0.76441344, 0.6246985 , 0.77556205, 0.67102882])

### Model 2 - includes Scaling

Scaling our features is critical because it put all our features into the same "measured unit".  Sklearn turns each feature into mean and standard deviation.

In [12]:
X = ames[['Mas Vnr Area','Total Bsmt SF','1st Flr SF','Gr Liv Area',
         'Full Bath','TotRms AbvGrd','Fireplaces','Garage Yr Blt','Garage Cars','Garage Area','Open Porch SF','Wood Deck SF','Lot Area']]
y = ames['SalePrice']

In [13]:
ss = StandardScaler()

Xs_train = ss.fit_transform(X_train) # data leakage if you put test data.
Xs_test = ss.transform(X_test) # test, 

Methods

fit(X[, y]) - Compute the mean and std to be used for later scaling.
fit_transform(X[, y]) - Fit to data, then transform it.
get_params([deep]) - parameters for this estimator.
inverse_transform(X[, copy])
Scale back the data to the original representation
partial_fit(X[, y])
Online computation of mean and std on X for later scaling.
set_params(**params)
Set the parameters of this estimator.
transform(X[, copy]) -Perform standardization by centering and scaling

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 

Instantiate, Fit, Score Model

In [14]:
lr = LinearRegression()

In [15]:
lr.fit(Xs_train, y_train)

LinearRegression()

In [16]:
lr.score(Xs_train, y_train), lr.score(Xs_test, y_test) # this model did worst then baseline score.

(0.7149251482994712, 0.7904325092185089)

The model is currently underfit, low bias, and high variance.  At this point we want to see which features make a difference in the model.  Lasso is good for feature selection.

### Model 3 - Numeric Features

In [18]:
X = ames[features]
y = ames['SalePrice']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, test_size = .1, train_size = .9)
ss = StandardScaler()

Xs_train = ss.fit_transform(X_train) # data leakage if you put test data.
Xs_test = ss.transform(X_test) # test, 

In [20]:
lr = LinearRegression()
lr.fit(Xs_train, y_train)

LinearRegression()

In [22]:
lr.score(Xs_train, y_train), lr.score(Xs_test, y_test) 

(0.8280297407245422, 0.8786907586695374)

The model is still currently underfitting.  Linear Regression models tend to be high bias and low variance.

### Model 4 - Numeric and Categorical columns

In [64]:
Steps in this process:
    - add numeric and categorical columns
    - create linear regression model

SyntaxError: invalid syntax (<ipython-input-64-2f6049e4318d>, line 1)

In [65]:
# one-hot-encode the categorical columns
comb_ames = pd.DataFrame()
comb_ames = pd.get_dummies(data = ames, columns = cat, drop_first = True)

In [66]:
comb_ames

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Misc Feature_Shed,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,68.0,13517,6,8,1976,2005,289.0,533.0,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,43.0,11492,7,5,1996,1997,132.0,637.0,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,68.0,7922,5,7,1953,2007,0.0,731.0,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,73.0,9802,5,5,2006,2007,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,82.0,14235,6,8,1900,1993,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,79.0,11449,8,5,2007,2007,0.0,1011.0,...,0,0,0,0,0,0,0,0,0,1
2047,785,905377130,68.0,12342,4,5,1940,1950,0.0,262.0,...,0,0,0,0,0,0,0,0,0,1
2048,916,909253010,57.0,7558,6,6,1928,1950,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2049,639,535179160,80.0,10400,4,5,1956,1956,0.0,155.0,...,0,0,0,0,0,0,0,0,0,1


In [67]:
comb_ames = pd.merge(left = comb_ames, right = ames[features], left_index = True, right_index = True)

In [68]:
ames[features]

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,Bsmt Unf SF,BsmtFin SF 2,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Total Bsmt SF
0,68.0,13517,6,8,1976,2005,289.0,533.0,192.0,0.0,...,0,44,0,0,0,0,0,3,2010,725.0
1,43.0,11492,7,5,1996,1997,132.0,637.0,276.0,0.0,...,0,74,0,0,0,0,0,4,2009,913.0
2,68.0,7922,5,7,1953,2007,0.0,731.0,326.0,0.0,...,0,52,0,0,0,0,0,1,2010,1057.0
3,73.0,9802,5,5,2006,2007,0.0,0.0,384.0,0.0,...,100,0,0,0,0,0,0,4,2010,384.0
4,82.0,14235,6,8,1900,1993,0.0,0.0,676.0,0.0,...,0,59,0,0,0,0,0,3,2010,676.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,79.0,11449,8,5,2007,2007,0.0,1011.0,873.0,0.0,...,0,276,0,0,0,0,0,1,2008,1884.0
2047,68.0,12342,4,5,1940,1950,0.0,262.0,599.0,0.0,...,158,0,0,0,0,0,0,3,2009,861.0
2048,57.0,7558,6,6,1928,1950,0.0,0.0,896.0,0.0,...,0,0,0,0,0,0,0,3,2009,896.0
2049,80.0,10400,4,5,1956,1956,0.0,155.0,295.0,750.0,...,0,189,140,0,0,0,0,11,2009,1200.0


In [69]:
34 + 279 # features = 34, one

313

In [70]:
comb_ames # need to add 'SalePrice' column

Unnamed: 0,Id,PID,Lot Frontage_x,Lot Area_x,Overall Qual_x,Overall Cond_x,Year Built_x,Year Remod/Add_x,Mas Vnr Area_x,BsmtFin SF 1_x,...,Wood Deck SF_y,Open Porch SF_y,Enclosed Porch_y,3Ssn Porch_y,Screen Porch_y,Pool Area_y,Misc Val_y,Mo Sold_y,Yr Sold_y,Total Bsmt SF_y
0,109,533352170,68.0,13517,6,8,1976,2005,289.0,533.0,...,0,44,0,0,0,0,0,3,2010,725.0
1,544,531379050,43.0,11492,7,5,1996,1997,132.0,637.0,...,0,74,0,0,0,0,0,4,2009,913.0
2,153,535304180,68.0,7922,5,7,1953,2007,0.0,731.0,...,0,52,0,0,0,0,0,1,2010,1057.0
3,318,916386060,73.0,9802,5,5,2006,2007,0.0,0.0,...,100,0,0,0,0,0,0,4,2010,384.0
4,255,906425045,82.0,14235,6,8,1900,1993,0.0,0.0,...,0,59,0,0,0,0,0,3,2010,676.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,79.0,11449,8,5,2007,2007,0.0,1011.0,...,0,276,0,0,0,0,0,1,2008,1884.0
2047,785,905377130,68.0,12342,4,5,1940,1950,0.0,262.0,...,158,0,0,0,0,0,0,3,2009,861.0
2048,916,909253010,57.0,7558,6,6,1928,1950,0.0,0.0,...,0,0,0,0,0,0,0,3,2009,896.0
2049,639,535179160,80.0,10400,4,5,1956,1956,0.0,155.0,...,0,189,140,0,0,0,0,11,2009,1200.0


In [71]:
comb_ames = pd.merge(left = comb_ames, right = ames['SalePrice'], left_index = True, right_index = True)

In [72]:
comb_ames

Unnamed: 0,Id,PID,Lot Frontage_x,Lot Area_x,Overall Qual_x,Overall Cond_x,Year Built_x,Year Remod/Add_x,Mas Vnr Area_x,BsmtFin SF 1_x,...,Open Porch SF_y,Enclosed Porch_y,3Ssn Porch_y,Screen Porch_y,Pool Area_y,Misc Val_y,Mo Sold_y,Yr Sold_y,Total Bsmt SF_y,SalePrice_y
0,109,533352170,68.0,13517,6,8,1976,2005,289.0,533.0,...,44,0,0,0,0,0,3,2010,725.0,130500
1,544,531379050,43.0,11492,7,5,1996,1997,132.0,637.0,...,74,0,0,0,0,0,4,2009,913.0,220000
2,153,535304180,68.0,7922,5,7,1953,2007,0.0,731.0,...,52,0,0,0,0,0,1,2010,1057.0,109000
3,318,916386060,73.0,9802,5,5,2006,2007,0.0,0.0,...,0,0,0,0,0,0,4,2010,384.0,174000
4,255,906425045,82.0,14235,6,8,1900,1993,0.0,0.0,...,59,0,0,0,0,0,3,2010,676.0,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,79.0,11449,8,5,2007,2007,0.0,1011.0,...,276,0,0,0,0,0,1,2008,1884.0,298751
2047,785,905377130,68.0,12342,4,5,1940,1950,0.0,262.0,...,0,0,0,0,0,0,3,2009,861.0,82500
2048,916,909253010,57.0,7558,6,6,1928,1950,0.0,0.0,...,0,0,0,0,0,0,3,2009,896.0,177000
2049,639,535179160,80.0,10400,4,5,1956,1956,0.0,155.0,...,189,140,0,0,0,0,11,2009,1200.0,144000


In [97]:
comb_ames.drop(columns = ['SalePrice_x'], inplace = True)

In [98]:
X= comb_ames.drop(columns = ['SalePrice_y'])
y = comb_ames['SalePrice_y']

X_train,X_test,y_train, y_test = train_test_split(X, y, random_state = 42, test_size = .1, train_size = .9 )


In [99]:
X_train.shape

(1845, 312)

In [100]:
y_train.shape

(1845,)

In [101]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test) # the model is currently slightly overfit

(0.9453435706473343, 0.9215803187408077)

### Model 3 - Lasso Regression