# Regression Assignment - Ames Housing Dataset
Daniel Trovato

This assignment is a regression analysis of the Ames, Iowa housing dataset. It includes feature engineering and various regression models.

### Importing the Python packages and data

In [29]:
#Importing the python data and regression packages
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf


In [30]:
# Load the housing data into the `ames` DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
ames = pd.read_csv(filepath_or_buffer='data/ames_housing.csv')

### Cleaning the data

In [None]:
#The columns below contain missing values (where listed: non-null count out of 1460 rows, then dtype)
# LotFrontage
# Alley
# MasVnrType    1452  non-null  object
# MasVnrArea    1452  non-null  float64
# BsmtQual      1423  non-null  object
# BsmtCond      1423  non-null  object
# BsmtExposure  1422  non-null  object
# BsmtFinType1  1423  non-null  object
# BsmtFinType2  1422  non-null  object
# Electrical    1459  non-null  object
# FireplaceQu   770   non-null  object
# GarageType    1379  non-null  object
# GarageYrBlt   1379  non-null  float64
# GarageFinish  1379  non-null  object
# GarageQual    1379  non-null  object
# GarageCond    1379  non-null  object
# PoolQC        7     non-null  object
# Fence         281   non-null  object
# MiscFeature   54    non-null  object


In [36]:
# Placeholder written into each column that has missing values: a sentinel
# string for the text (object) columns and 0 for the numeric ones.
missing_value_fill = {
    'Alley': "None",
    'LotFrontage': 0,
    'MasVnrType': "None",
    'MasVnrArea': 0,
    'BsmtQual': "No",
    'BsmtCond': "No",
    'BsmtExposure': "No",
    'BsmtFinType1': "No",
    'BsmtFinType2': "No",
    'Electrical': "None",
    'FireplaceQu': "None",
    'GarageType': "None",
    'GarageYrBlt': 0,
    'GarageFinish': "None",
    'GarageQual': "None",
    'GarageCond': "None",
    'PoolQC': "None",
    'Fence': "None",
    'MiscFeature': "None",
}

# NOTE(review): "No" may already be a genuine BsmtExposure category in the data
# dictionary (no exposure); if so, this fill merges it with "value missing" --
# confirm against the dataset documentation.
for column_name, placeholder in missing_value_fill.items():
    # Column-by-column assignment: a misspelled column name raises KeyError.
    ames[column_name] = ames[column_name].fillna(placeholder)

In [39]:
# Re-inspect the per-column non-null counts and dtypes after the missing values were filled.
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non

In [None]:
#Converting some of the quality variables into numbers

In [47]:
# Turn the GarageQual rating strings into integer scores. "None" is the
# placeholder the fill step wrote for rows where the rating was missing.
garage_quality_scores = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ames = ames.replace({"GarageQual": garage_quality_scores})

In [45]:
# Turn the BsmtQual rating strings into integer scores. "No" is the
# placeholder the fill step wrote for rows where the rating was missing.
basement_quality_scores = {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ames = ames.replace({"BsmtQual": basement_quality_scores})

In [None]:
# Turn the BsmtCond rating strings into integer scores. "No" is the
# placeholder the fill step wrote for rows where the rating was missing.
basement_condition_scores = {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ames = ames.replace({"BsmtCond": basement_condition_scores})

In [59]:
# Turn the KitchenQual rating strings into integer scores.
# NOTE(review): no cell shown here writes "No" into KitchenQual, so that key is
# presumably never matched -- confirm.
kitchen_quality_scores = {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ames = ames.replace({"KitchenQual": kitchen_quality_scores})

### Feature Engineering

In [62]:
# Ordinal scores for the quality/condition ratings. "None" and "No" are the
# placeholders the cleaning cell writes for missing ratings, so both score 0.
QUALITY_SCORES = {"None": 0, "No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}


def quality_score(column):
    """Return ``ames[column]`` as a numeric Series of quality scores.

    Rating strings are looked up in ``QUALITY_SCORES``. Values that are already
    numeric (the column was encoded by an earlier cell) pass through unchanged,
    so this cell gives the same result whether or not the encoding cells ran
    and however often it is re-run.

    Parameters:
        column: name of a rating column in ``ames``.

    Returns:
        A numeric pandas Series aligned with ``ames``.

    Raises:
        ValueError: if the column holds a string that is not a known rating.
    """
    scored = ames[column].map(lambda rating: QUALITY_SCORES.get(rating, rating))
    # to_numeric fails loudly on any leftover string instead of letting it
    # flow into the products below.
    return pd.to_numeric(scored)


# Every product goes through quality_score(): GarageCond and PoolQC are still
# rating strings at this point, and integer * string silently yields a
# repeated string (3 * "TA" == "TATATA") rather than a score.
ames['BasementOverall'] = quality_score('BsmtCond') * quality_score('BsmtQual')
ames['OverallGrade'] = ames['OverallQual'] * ames['OverallCond']
ames['GarageOverall'] = quality_score('GarageQual') * quality_score('GarageCond')
ames['PoolOverall'] = ames['PoolArea'] * quality_score('PoolQC')
#Common wisdom is to redo your kitchen before selling your house.  Is it correct?
ames['KitchenScore'] = ames['KitchenAbvGr'] * quality_score('KitchenQual')
#Overall score for the parts of a home commonly held to be most valuable: bedrooms, baths, and kitchen
ames['LivingOverall'] = (
    ames['BedroomAbvGr']
    + ames['FullBath']
    + 0.5 * ames['HalfBath']  # a half bath counts as half a full bath
    + ames['KitchenScore']
)

### Linear Regression

In [74]:
#Importing the regression packages
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [72]:
# Baseline model: linear regression with the GrLivArea column as the only
# predictor and SalePrice as the target.
# NOTE(review): the model is fitted on every row; train_test_split is imported
# above but not used in the cells shown.
X, y = ames['GrLivArea'], ames['SalePrice']
lm = LinearRegression()
# The estimator needs a 2-D feature matrix, so the 1-D column gets a second axis.
lm.fit(X.values[:, np.newaxis], y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [73]:
# Display the fitted coefficient for the single GrLivArea predictor.
lm.coef_

array([107.13035897])

In [76]:
# In-sample predictions: these are the same rows the model was fitted on.
predictions = lm.predict(X.values[:, np.newaxis])

In [77]:
# Mean squared error between the observed sale prices and the model's predictions.
mse = mean_squared_error(y_true=y, y_pred=predictions)

In [78]:
# Root of the MSE, which puts the error back in the same units as SalePrice.
rmse = np.sqrt(mse)

In [79]:
# Report both error metrics; the pieces are joined with single spaces, exactly
# as print() does with several arguments.
report_parts = ("MSE: ", str(mse), "\nRMSE: ", str(rmse))
print(" ".join(report_parts))

MSE:  3139843209.6665273 
RMSE:  56034.303865279944
