In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Location of the housing CSV; the path is relative, so it is resolved
# against the kernel's current working directory.
file_path = "iowa_data.csv"

# Read the whole file into a DataFrame used by every later cell.
home_data = pd.read_csv(file_path)

# Last expression of the cell: displays the summary-statistics table
# (count, mean, std, min, quartiles, max) as a quick sanity check.
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [31]:
# Prediction target: the SalePrice column.
y = home_data.SalePrice

# Columns fed to the models as input features.
feature_names = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]

# Feature matrix restricted to the selected columns.
X = home_data[feature_names]

# Split features and target into training and validation parts;
# random_state=1 seeds the split so reruns produce the same partition.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [10]:
# Baseline decision tree: no leaf-node cap is passed here, and the estimator
# is seeded with random_state=1.
iowa_home_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only; val_X / val_y are kept for evaluation.
iowa_home_model.fit(train_X, train_y)

In [13]:
# Predict sale prices for the validation features with the fitted tree.
# NOTE: the variable keeps its existing spelling (`predicitions`) because the
# validation-MAE cell further down reads this exact name.
predicitions = iowa_home_model.predict(val_X)

# Show the first five predictions next to the first rows of the true prices.
# The original print referenced `predicitons`, a name that is never defined,
# so the cell raised NameError on a fresh kernel; it now uses the variable
# assigned above.
print("Previsões realizadas: ", predicitions[:5])
print("Preço das casas de entrada como teste: ", val_y.head())

Previsões realizadas:  [208500. 176432. 239500. 200000. 135000.]
Preço das casas de entrada como teste:  536    188000
236    185500
244    205000
941    214000
394    109000
Name: SalePrice, dtype: int64


In [27]:
# Metric used for every model comparison in this notebook.
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out prices and the predictions
# produced by the baseline tree in the previous cell.
val_mae = mean_absolute_error(val_y, predicitions)

# "{:,.0f}" prints the value with thousands separators and no decimals.
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 32,929


In [22]:
# Helper that scores a decision tree of a given size: it returns the
# validation MAE (mean absolute error) obtained when the tree is capped at
# `number_leaf_nodes` leaf nodes.

def get_mae(number_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    Args:
        number_leaf_nodes: value passed as ``max_leaf_nodes`` to the tree.
        train_X: features used to fit the tree.
        val_X: features the fitted tree predicts on.
        train_y: target values used to fit the tree.
        val_y: true target values compared against the predictions.

    Returns:
        The mean absolute error between ``val_y`` and the tree's predictions
        for ``val_X``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=number_leaf_nodes, random_state=1)
    val_predictions = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_predictions)


# Evaluate several candidate leaf-node caps for the decision tree and keep the
# one that yields the lowest mean absolute error (MAE) on the validation split.

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Map each candidate size to the validation MAE it achieves.
scores = {
    leaf_count: get_mae(leaf_count, train_X, val_X, train_y, val_y)
    for leaf_count in candidate_max_leaf_nodes
}

# The dictionary key (tree size) whose score is the smallest.
best_tree_size = min(scores, key=lambda leaf_count: scores[leaf_count])

25


In [23]:
# Build a new model using the leaf-node cap that gave the lowest validation MAE.

iowa_home_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# Fit on the full X / y rather than the training split. This rebinds
# iowa_home_model, so the baseline tree fitted earlier is no longer
# reachable under this name.
iowa_home_model.fit(X, y)

In [33]:
# Train a second model with a random forest for comparison.

from sklearn.ensemble import RandomForestRegressor

# Forest with library-default settings apart from the fixed seed.
rf_model = RandomForestRegressor(random_state=1)

# Fit on the training split only, as for the baseline tree.
rf_model.fit(train_X, train_y)

# Validation MAE of the forest: predictions for val_X compared with val_y.
rf_val_mae = mean_absolute_error(val_y, rf_model.predict(val_X))

# "{:,.2f}" prints the value with thousands separators and two decimals.
print("Validation MAE for Random Forest Model: {:,.2f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 21,857.16
