<a href="https://colab.research.google.com/github/victorhmota/cursoml/blob/main/boston_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Fix one seed so the train/test split and the seeded models below are reproducible.
seed = 10
np.random.seed(seed)

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Label font sizes applied to every figure: axis labels 14, tick labels 12.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
  mpl.rc(rc_group, labelsize=label_size)

In [None]:
import warnings
# Suppress any warning whose message starts with "internal gelsd".
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
# Load the Boston housing data.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import itself raises ImportError. In that case rebuild the same arrays from the
# original source file, following the recipe given in the deprecation notice.
try:
  from sklearn.datasets import load_boston
  housing = load_boston()
except ImportError:
  from sklearn.utils import Bunch

  # After a 22-line header, every record is spread over two physical lines:
  # 11 values on the first, then 2 more features plus the target on the second.
  raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                       sep=r"\s+", skiprows=22, header=None)
  housing = Bunch(
      data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
      target=raw_df.values[1::2, 2],
      feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                              'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
  )

In [None]:
# Show the raw dataset object returned by the loader.
housing

In [None]:
# Features as a DataFrame with named columns.
X = pd.DataFrame(housing.data, columns=housing.feature_names)
# Target wrapped in a DataFrame without column names, so its column gets the
# default label 0 (the scatter plots further down select it with y=0).
y = pd.DataFrame(housing.target)

In [None]:
#Análise exploratória

In [None]:
X.head() # Note that 'CHAS' is a dummy variable

In [None]:
# Summary statistics of every feature.
X.describe()

In [None]:
# Summary statistics of the target.
y.describe()

In [None]:
# One 50-bin histogram per feature.
X.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Split into training and test sets (80:20, i.e. test_size = 0.20), seeded for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = seed)

In [None]:
# Data visualisation: work on copies of the training split so the originals stay untouched.
boston, boston_prices = X_train.copy(), y_train.copy()
# Features and target side by side in a single frame (target is the last column).
boston_housing = pd.concat([boston, boston_prices], axis='columns')

In [None]:
# Pairwise correlations between all columns of the training frame (features + target).
corr_matrix = boston_housing.corr()
corr_matrix

In [None]:
# Correlation of every column with the last column (the price), highest first.
# Strong positive correlation between 'RM' (number of rooms per house) and the price;
# strong negative correlation between the lower-status population and the price.
corr_matrix.iloc[:,-1].sort_values(ascending=False)

In [None]:
# 'RM' against the price column (labelled 0); low alpha makes dense regions visible.
boston_housing.plot.scatter(x='RM', y=0, alpha=0.2)
plt.show()

In [None]:
# 'LSTAT' against the price column (labelled 0); low alpha makes dense regions visible.
boston_housing.plot.scatter(x='LSTAT', y=0, alpha=0.2)
plt.show()

In [None]:
# Check for missing data: flag every row holding at least one null value.
row_has_null = boston_housing.isnull().any(axis=1)
saw_incomplete_rows = boston_housing[row_has_null].head()
saw_incomplete_rows
# There is no missing data

In [None]:
# Preprocessing pipeline: a single standard-scaling step.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scaling_step = ('std_scaler', StandardScaler())
pipeline = Pipeline(steps=[scaling_step])

In [None]:
# Fit the pipeline on the training features and transform them in one step.
boston_prepared = pipeline.fit_transform(boston)

In [None]:
# Train the model: ordinary linear regression on the prepared training features.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(boston_prepared, boston_prices)

In [None]:
# Sanity check on the first five training rows: transform them with the
# already-fitted pipeline and look at the model's predictions.
some_data = boston.head(5)
some_labels = boston_prices.head(5)
some_prepared_data = pipeline.transform(some_data)

lin_reg.predict(some_prepared_data)

In [None]:
# True prices of the same five rows, for comparison with the predictions.
print('Label:',some_labels)

In [None]:
# Evaluate the linear model on the training set with MSE and RMSE.

from sklearn.metrics import mean_squared_error as MSE
# The model was fitted on the scaled features, so it must predict on
# `boston_prepared` too; passing the raw `boston` frame gives meaningless errors.
# (The variable name is kept as-is because the MAE cell below reads it.)
boston_predictons = lin_reg.predict(boston_prepared)
lin_mse = MSE(boston_prices, boston_predictons)
lin_rmse = np.sqrt(lin_mse)
print(lin_mse, lin_rmse)


In [None]:
from sklearn.metrics import mean_absolute_error as MAE

# Mean absolute error of the linear model, using the predictions computed in the previous cell.
lin_mae = MAE(boston_prices, boston_predictons)
lin_mae

In [None]:
#Testando outros modelos

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded decision tree fitted on the prepared training features.
tree_reg = DecisionTreeRegressor(random_state = seed)
tree_reg.fit(boston_prepared, boston_prices)
# Predictions on the same data the tree was fitted on, so any error computed
# from them is a training error.
tree_prediction = tree_reg.predict(boston_prepared)

In [None]:
# Training-set MSE of the decision tree.
tree_mse = MSE(boston_prices, tree_prediction)
tree_mse

In [None]:
#Validação cruzada

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scorer is the *negated* MSE,
# so the sign is flipped before taking the square root in the next cell.
scores = cross_val_score(tree_reg, boston_prepared, boston_prices, cv = 10, scoring = 'neg_mean_squared_error')
scores

In [None]:
# Flip the sign of the negated-MSE scores and take the root to get one RMSE per fold.
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores

In [None]:
def display_scores(scores):
  """Print the scores, then their mean and their standard deviation.

  `scores` must provide `.mean()` and `.std()` (for example a NumPy array).
  Returns None.
  """
  # Each statistic is computed lazily, right before its own line is printed.
  report = (
      ('Score:', lambda values: values),
      ('Mean:', lambda values: values.mean()),
      ('SD:', lambda values: values.std()),
  )
  for label, compute in report:
    print(label, compute(scores))

In [None]:
# Decision tree: per-fold RMSE, mean and standard deviation.
display_scores(tree_rmse_scores)

In [None]:
# Linear regression: 10-fold cross-validated RMSE (the scorer returns negated MSE).
lin_rmse_scores = np.sqrt(-cross_val_score(lin_reg, boston_prepared, boston_prices, scoring='neg_mean_squared_error', cv=10))
# Report the per-fold scores computed above, not `lin_rmse`, which is the
# single training-set RMSE from the earlier metrics cell.
display_scores(lin_rmse_scores)

In [None]:
# Random forest with 10 trees, seeded for reproducibility.
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators = 10, random_state=seed)
forest_reg.fit(boston_prepared, boston_prices)

In [None]:
# 10-fold cross-validation of the forest; negated-MSE scores are converted to per-fold RMSE.
forest_scores = cross_val_score(forest_reg, boston_prepared, boston_prices, cv=10, scoring='neg_mean_squared_error')
forest_rmse = np.sqrt(-forest_scores)
display_scores(forest_rmse)

In [None]:
# Tune the random forest's hyperparameters with a grid search.
from sklearn.model_selection import GridSearchCV

# Two grids: 3 x 4 = 12 combinations with the default bootstrap setting,
# plus 2 x 3 = 6 combinations with bootstrap disabled.
param_grid = [{'n_estimators': [3,10,30], 'max_features':[2, 4, 6, 8]}, 
              {'bootstrap': [False], 'n_estimators': [3,10], 'max_features':[2, 3, 4]}]

# Rebinds `forest_reg` to a fresh, unfitted estimator that the search uses as its base.
forest_reg = RandomForestRegressor(random_state=seed)
# 5-fold cross-validation per combination, scored by negated MSE.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score = True)
grid_search.fit(boston_prepared, boston_prices)

In [None]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# The estimator built with the selected hyperparameters.
grid_search.best_estimator_

In [None]:
# RMSE of every hyperparameter combination tried by the grid search.
cvres = grid_search.cv_results_
# Mean test scores are negated MSE values, hence the sign flip before the root.
candidate_rmse = np.sqrt(-cvres['mean_test_score'])
for rmse, candidate in zip(candidate_rmse, cvres['params']):
  print(rmse, candidate)

In [None]:
#Escolhendo o melhor modelo e finalizando

In [None]:
# The best estimator found by the grid search becomes the final model.
final_model = grid_search.best_estimator_

# Only transform the test features: the pipeline was already fitted on the training data.
X_test_prepared = pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# MSE and RMSE on the held-out test set.
final_mse = MSE(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# Test-set MSE followed by RMSE.
print(final_mse, final_rmse)

17.76077254901961 4.214353159029225
