# Ames housing - linear regression project

## Libraries import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data import

In [2]:
df = pd.read_csv('DATA/ames_final_processed.csv')

In [3]:
df.head()

Unnamed: 0,Lot.Frontage,Lot.Area,Overall.Qual,Overall.Cond,Year.Built,Year.Remod.Add,Mas.Vnr.Area,BsmtFin.SF.1,BsmtFin.SF.2,Bsmt.Unf.SF,...,Sale.Type_ConLw,Sale.Type_New,Sale.Type_Oth,Sale.Type_VWD,Sale.Type_WD,Sale.Condition_AdjLand,Sale.Condition_Alloca,Sale.Condition_Family,Sale.Condition_Normal,Sale.Condition_Partial
0,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Columns: 274 entries, Lot.Frontage to Sale.Condition_Partial
dtypes: float64(11), int64(263)
memory usage: 6.1 MB


In [5]:
df.columns[0:40]

Index(['Lot.Frontage', 'Lot.Area', 'Overall.Qual', 'Overall.Cond',
       'Year.Built', 'Year.Remod.Add', 'Mas.Vnr.Area', 'BsmtFin.SF.1',
       'BsmtFin.SF.2', 'Bsmt.Unf.SF', 'Total.Bsmt.SF', 'X1st.Flr.SF',
       'X2nd.Flr.SF', 'Low.Qual.Fin.SF', 'Gr.Liv.Area', 'Bsmt.Full.Bath',
       'Bsmt.Half.Bath', 'Full.Bath', 'Half.Bath', 'Bedroom.AbvGr',
       'Kitchen.AbvGr', 'TotRms.AbvGrd', 'Fireplaces', 'Garage.Yr.Blt',
       'Garage.Cars', 'Garage.Area', 'Wood.Deck.SF', 'Open.Porch.SF',
       'Enclosed.Porch', 'X3Ssn.Porch', 'Screen.Porch', 'Pool.Area',
       'Misc.Val', 'Mo.Sold', 'Yr.Sold', 'SalePrice', 'MS.SubClass_150',
       'MS.SubClass_160', 'MS.SubClass_180', 'MS.SubClass_190'],
      dtype='object')

## Data prepartion

### X -> features, y -> labels (SalePrice)

In [6]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

### Features split (10% for test)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

### Data scaling

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler() #scaler defnition

In [11]:
scaler.fit(X_train)

StandardScaler()

In [12]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## L1 and L2 regularization - Elastic Net model (using Grid Search)

In [13]:
from sklearn.linear_model import ElasticNet

In [14]:
base_elastic_model = ElasticNet() #model definition

In [15]:
#grid of values for ElasticNet (alpha and L1 ratio)
g_o_v = {'alpha':[0.01, 0.1, 1, 50, 100], 'l1_ratio':[0.01, 0.1, 0.5, 0.9, 1]}

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
#grid model definition
grid_model = GridSearchCV(estimator = base_elastic_model,
                          param_grid = g_o_v,
                          scoring='neg_mean_squared_error',
                          cv = 5, verbose = 2)

In [18]:
grid_model.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] alpha=0.01, l1_ratio=0.01 .......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  positive)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ........................ alpha=0.01, l1_ratio=0.01, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.01 .......................................


  positive)


[CV] ........................ alpha=0.01, l1_ratio=0.01, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.01 .......................................


  positive)


[CV] ........................ alpha=0.01, l1_ratio=0.01, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.01 .......................................


  positive)


[CV] ........................ alpha=0.01, l1_ratio=0.01, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.01 .......................................


  positive)


[CV] ........................ alpha=0.01, l1_ratio=0.01, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.1 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.1, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.1 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.1, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.1 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.1, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.1 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.1, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.1 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.1, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.5 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.5, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.5 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.5, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.5 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.5, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.5 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.5, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.5 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.5, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.9 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.9 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.9 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.9 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.01, l1_ratio=0.9 ........................................


  positive)


[CV] ......................... alpha=0.01, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.01, l1_ratio=1 ..........................................


  positive)


[CV] ........................... alpha=0.01, l1_ratio=1, total=   0.3s
[CV] alpha=0.01, l1_ratio=1 ..........................................


  positive)


[CV] ........................... alpha=0.01, l1_ratio=1, total=   0.3s
[CV] alpha=0.01, l1_ratio=1 ..........................................


  positive)


[CV] ........................... alpha=0.01, l1_ratio=1, total=   0.3s
[CV] alpha=0.01, l1_ratio=1 ..........................................


  positive)


[CV] ........................... alpha=0.01, l1_ratio=1, total=   0.3s
[CV] alpha=0.01, l1_ratio=1 ..........................................


  positive)


[CV] ........................... alpha=0.01, l1_ratio=1, total=   0.3s
[CV] alpha=0.1, l1_ratio=0.01 ........................................
[CV] ......................... alpha=0.1, l1_ratio=0.01, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.01 ........................................
[CV] ......................... alpha=0.1, l1_ratio=0.01, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.01 ........................................
[CV] ......................... alpha=0.1, l1_ratio=0.01, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.01 ........................................
[CV] ......................... alpha=0.1, l1_ratio=0.01, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.01 ........................................
[CV] ......................... alpha=0.1, l1_ratio=0.01, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.1 .........................................
[CV] .......................... alpha=0.1, l1_ratio=0.1, total=   0.1s
[CV] alpha=0.1, l1_ratio=0.1 .........................................
[CV] .

  positive)


[CV] .......................... alpha=0.1, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.1, l1_ratio=0.9 .........................................


  positive)


[CV] .......................... alpha=0.1, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.1, l1_ratio=0.9 .........................................


  positive)


[CV] .......................... alpha=0.1, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.1, l1_ratio=0.9 .........................................


  positive)


[CV] .......................... alpha=0.1, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.1, l1_ratio=0.9 .........................................


  positive)


[CV] .......................... alpha=0.1, l1_ratio=0.9, total=   0.3s
[CV] alpha=0.1, l1_ratio=1 ...........................................


  positive)


[CV] ............................ alpha=0.1, l1_ratio=1, total=   0.3s
[CV] alpha=0.1, l1_ratio=1 ...........................................


  positive)


[CV] ............................ alpha=0.1, l1_ratio=1, total=   0.3s
[CV] alpha=0.1, l1_ratio=1 ...........................................


  positive)


[CV] ............................ alpha=0.1, l1_ratio=1, total=   0.3s
[CV] alpha=0.1, l1_ratio=1 ...........................................


  positive)


[CV] ............................ alpha=0.1, l1_ratio=1, total=   0.3s
[CV] alpha=0.1, l1_ratio=1 ...........................................


  positive)


[CV] ............................ alpha=0.1, l1_ratio=1, total=   0.3s
[CV] alpha=1, l1_ratio=0.01 ..........................................
[CV] ........................... alpha=1, l1_ratio=0.01, total=   0.0s
[CV] alpha=1, l1_ratio=0.01 ..........................................
[CV] ........................... alpha=1, l1_ratio=0.01, total=   0.0s
[CV] alpha=1, l1_ratio=0.01 ..........................................
[CV] ........................... alpha=1, l1_ratio=0.01, total=   0.0s
[CV] alpha=1, l1_ratio=0.01 ..........................................
[CV] ........................... alpha=1, l1_ratio=0.01, total=   0.0s
[CV] alpha=1, l1_ratio=0.01 ..........................................
[CV] ........................... alpha=1, l1_ratio=0.01, total=   0.0s
[CV] alpha=1, l1_ratio=0.1 ...........................................
[CV] ............................ alpha=1, l1_ratio=0.1, total=   0.0s
[CV] alpha=1, l1_ratio=0.1 ...........................................
[CV] .

  positive)


[CV] .............................. alpha=1, l1_ratio=1, total=   0.3s
[CV] alpha=1, l1_ratio=1 .............................................


  positive)


[CV] .............................. alpha=1, l1_ratio=1, total=   0.3s
[CV] alpha=1, l1_ratio=1 .............................................


  positive)


[CV] .............................. alpha=1, l1_ratio=1, total=   0.3s
[CV] alpha=1, l1_ratio=1 .............................................


  positive)


[CV] .............................. alpha=1, l1_ratio=1, total=   0.3s
[CV] alpha=1, l1_ratio=1 .............................................


  positive)


[CV] .............................. alpha=1, l1_ratio=1, total=   0.3s
[CV] alpha=50, l1_ratio=0.01 .........................................
[CV] .......................... alpha=50, l1_ratio=0.01, total=   0.0s
[CV] alpha=50, l1_ratio=0.01 .........................................
[CV] .......................... alpha=50, l1_ratio=0.01, total=   0.0s
[CV] alpha=50, l1_ratio=0.01 .........................................
[CV] .......................... alpha=50, l1_ratio=0.01, total=   0.0s
[CV] alpha=50, l1_ratio=0.01 .........................................
[CV] .......................... alpha=50, l1_ratio=0.01, total=   0.0s
[CV] alpha=50, l1_ratio=0.01 .........................................
[CV] .......................... alpha=50, l1_ratio=0.01, total=   0.0s
[CV] alpha=50, l1_ratio=0.1 ..........................................
[CV] ........................... alpha=50, l1_ratio=0.1, total=   0.0s
[CV] alpha=50, l1_ratio=0.1 ..........................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:   16.9s finished


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.01, 0.1, 1, 50, 100],
                         'l1_ratio': [0.01, 0.1, 0.5, 0.9, 1]},
             scoring='neg_mean_squared_error', verbose=2)

### Best results:

In [19]:
grid_model.best_estimator_

ElasticNet(alpha=100, l1_ratio=1)

## Model evaluation on the test data (10% unseen data)

In [20]:
y_prediction = grid_model.predict(X_test)

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [22]:
MAE = round(mean_absolute_error(y_test, y_prediction),1)

In [23]:
RMSE = round(np.sqrt(mean_squared_error(y_test,y_prediction)),1)

In [24]:
MAE #[$]

13856.8

In [25]:
RMSE #[$]

19625.5

## Performance achieved on the test data set:
### MAE =    $\$$ 13856.8
### RMSE =    $\$$ 19625.5