In [15]:
# Import Required Libraries
import pandas as pd
from datetime import datetime
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Data Loading

In [3]:
# Path of the file to read
iowa_file_path = 'Data/Housing Prices Competition for Kaggle Learn Users/train.csv'

# Fill in the line below to read the file into a variable home_data
home_data = pd.read_csv(iowa_file_path)

# Call line below with no argument to check that you've loaded the data correctly
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# What is the average lot size (rounded to nearest integer)?
avg_lot_size = round(home_data['LotArea'].mean())

# As of today, how old is the newest home (current year - the date in which it was built)
current_year = datetime.now().year
newest_home_age = current_year - home_data['YearBuilt'].max()

print("Average Lo Size: ", avg_lot_size)
print("Newest Home Age: ", newest_home_age)

Average Lo Size:  10517
Newest Home Age:  14


# Prediction Target

##### Targrt Variable which corresponds to SalesPrice and store it in y variable

In [6]:
y = home_data.SalePrice

# Choosing Features

In [9]:
feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

X = home_data[feature_names]

In [10]:
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


# Machine Learning Model

In [11]:
home_data_model = DecisionTreeRegressor(random_state=1)

#Fit the model
home_data_model.fit(X, y)

# Make Predictions

In [13]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(home_data_model.predict(X))

Making predictions for the following 5 houses:
   LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
The predictions are
[208500. 181500. 223500. ... 266500. 142125. 147500.]


In [14]:
# Compare predictions with actual home values
comparison = pd.DataFrame({'Actual Home Values': y.head(), 'Predicted Home Values': home_data_model.predict(X.head())})
comparison

Unnamed: 0,Actual Home Values,Predicted Home Values
0,208500,208500.0
1,181500,181500.0
2,223500,223500.0
3,140000,140000.0
4,250000,250000.0


# Model Validation

In [16]:
predicted_home_prices = home_data_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

62.35433789954339

In [17]:
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.

train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=0)

# Define model
home_data_model = DecisionTreeRegressor()
# Fit model
home_data_model.fit(train_X, train_y)

In [19]:
# get predicted prices on validation data
val_predictions = home_data_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

32427.87671232877


In [20]:
# Compare predictions with actual home values
comparison = pd.DataFrame({'Actual Home Values': val_y, 'Predicted Home Values': val_predictions})
comparison

Unnamed: 0,Actual Home Values,Predicted Home Values
529,200624,335000.0
491,133000,205000.0
459,110000,124000.0
279,192000,205000.0
655,88000,91500.0
...,...,...
583,325000,265979.0
1245,178000,190000.0
1390,235000,217000.0
1375,239000,245000.0
