<a href="https://colab.research.google.com/github/yMugrelo/The-Boston-Housing/blob/main/boston-housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Make housing.csv available to the notebook.
# The Colab upload dialog is only opened when the file is not already in the
# working directory, so "Restart & Run All" does not block on a prompt once the
# data is present, and the cell is a no-op wherever the file already exists.
import os

uploaded = {}  # stays empty when no upload was necessary
if not os.path.exists('housing.csv'):
  # google.colab only exists inside Colab; outside Colab, with the file
  # missing, this import fails exactly as the unconditional import did.
  from google.colab import files
  uploaded = files.upload()

| Coluna      | Significado                                                                                       |
| ----------- | ------------------------------------------------------------------------------------------------- |
| **CRIM**    | Taxa de criminalidade por cidade (per capita)                                                     |
| **ZN**      | Proporção de terrenos residenciais de mais de 25.000 pés²                                         |
| **INDUS**   | Proporção de acres ocupados por negócios não varejistas por cidade                                |
| **CHAS**    | Dummy (0 ou 1) se a casa está próxima ao rio Charles (1 = sim)                                    |
| **NOX**     | Concentração de óxidos nítricos (poluição do ar)                                                  |
| **RM**      | Número médio de quartos por residência                                                            |
| **AGE**     | Proporção de unidades ocupadas pelos proprietários construídas antes de 1940                      |
| **DIS**     | Distância ponderada para cinco centros de emprego em Boston                                       |
| **RAD**     | Índice de acessibilidade a rodovias radiais                                                       |
| **TAX**     | Taxa de imposto predial por \$10.000                                                              |
| **PTRATIO** | Proporção aluno-professor por cidade                                                              |
| **B**       | Proporção de população negra por cidade *(B = 1000(Bk − 0.63)², onde Bk é a proporção de negros)* |
| **LSTAT**   | Percentual da população de baixa renda                                                            |
| **MEDV**    | Valor médio das casas ocupadas pelos proprietários (em \$1.000)                                   |


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Column labels applied to the header-less file, in file order.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# Fields are separated by runs of whitespace and the file has no header row.
df = pd.read_csv('housing.csv', sep=r"\s+", names=column_names, header=None)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Summary statistics for every column. Leaving the frame as the cell's last
# expression uses the notebook's rich table display instead of plain text.
df.describe()

In [None]:
from scipy import stats

# One box plot per column on a 2 x 7 grid (14 panels).
fig, axs = plt.subplots(ncols = 7, nrows = 2, figsize = (20,10))
axs = axs.flatten()
# enumerate supplies the panel position, so no hand-maintained counter is
# needed; a frame with more columns than panels still raises IndexError.
for i, k in enumerate(df.columns):
  sns.boxplot(y = k, data = df, ax = axs[i])
plt.tight_layout(pad = 0.4, w_pad = 0.5, h_pad = 5.0)

In [None]:
# Report, per column, the percentage of rows lying outside the
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences.
for k, v in df.items():
  q1 = v.quantile(0.25)
  q3 = v.quantile(0.75)
  iqr = q3 - q1
  lower_fence = q1 - 1.5 * iqr
  upper_fence = q3 + 1.5 * iqr
  # Strict comparisons: a value sitting exactly on a fence is not an outlier.
  # With <= / >=, a column whose IQR is 0 (q1 == q3) has both fences equal to
  # that quartile value, so every row equal to it was counted as an outlier.
  outliers = v[(v < lower_fence) | (v > upper_fence)]
  perc = len(outliers) * 100.0 / len(df)
  print("Column %s outliers = %.2f%%" % (k, perc))

In [None]:
# Keep only the rows whose MEDV is not >= 50.0, then show the remaining
# (rows, columns) shape.
# NOTE(review): 50.0 presumably marks capped target values - confirm.
medv_at_or_above_50 = df['MEDV'] >= 50.0
df = df.loc[~medv_at_or_above_50]
print(df.shape)

In [None]:
# One histogram per column on a 2 x 7 grid (14 panels).
fig, axs = plt.subplots(ncols = 7, nrows = 2 , figsize = (20,10))
axs = axs.flatten()
# enumerate supplies the panel position, so no hand-maintained counter is
# needed; a frame with more columns than panels still raises IndexError.
for i, (k, v) in enumerate(df.items()):
  sns.histplot(v, ax = axs[i])
plt.tight_layout(pad = 0.4, w_pad = 0.5, h_pad = 5.0)

In [None]:
# Absolute pairwise correlations between all columns, drawn as an annotated
# heatmap on a 20 x 10 figure.
abs_corr = df.corr().abs()
plt.figure(figsize = (20, 10))
sns.heatmap(abs_corr, annot = True)

In [None]:
from sklearn import preprocessing

# Select eight features, min-max scale them, and draw one regression plot of
# MEDV against each scaled feature.
min_max_scaler = preprocessing.MinMaxScaler()
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray']
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
x = df.loc[:, column_sels]
y = df['MEDV']
# fit_transform returns a bare array. Rebuilding the frame with the original
# row labels keeps x aligned with y; the default RangeIndex would differ from
# y's labels whenever df has had rows removed, and any label-based operation
# between x and y would then pair the wrong rows.
x = pd.DataFrame(data = min_max_scaler.fit_transform(x), columns = column_sels, index = x.index)
fig, axs = plt.subplots(ncols = 4, nrows = 2, figsize = (20, 10))
axs = axs.flatten()
# One panel and one colour per selected feature.
for i, k in enumerate(column_sels):
  sns.regplot(y = y, x = x[k], ax = axs[i], color = colors[i])

plt.tight_layout(pad = 0.4, w_pad = 0.5, h_pad = 5.0)


In [None]:
# Log-transform the target, and every feature whose absolute skewness is
# above 0.3, using log1p.
# NOTE: x and y are overwritten here, so running this cell a second time
# applies the transform again on top of the already transformed values.
y = np.log1p(y)
feature_skew = x.skew()
for col in feature_skew.index[feature_skew.abs() > 0.3]:
  x[col] = np.log1p(x[col])

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

from sklearn.model_selection import KFold
# 10-fold cross-validation of plain and ridge linear regression on the
# min-max scaled features.
l_regression = linear_model.LinearRegression()
# Fixed seed: with shuffle=True and no random_state the folds change on every
# run and on every cross_val_score call, so results were not reproducible and
# the models were not compared on the same splits.
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
min_max_scaler = preprocessing.MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows before the folds are made, so
# each validation fold influences the scaling - consider a Pipeline instead.
x_scaled = min_max_scaler.fit_transform(x)

# scoring='neg_mean_squared_error' returns the negated MSE. The raw scores are
# kept as returned; only the printed mean is sign-flipped so that the value
# labelled "MSE" is an actual (non-negative) MSE.
scores = cross_val_score(l_regression, x_scaled, y, scoring = 'neg_mean_squared_error', cv = kf, n_jobs = -1)
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

# Per-model fold scores (negated MSE), collected for the final comparison.
scores_map = {}
scores_map['Linear Regression'] = scores
l_ridge = linear_model.Ridge()
# NOTE(review): the Ridge scores are printed but never stored in scores_map -
# confirm whether that omission is intentional.
scores  = cross_val_score(l_ridge, x_scaled, y, scoring = 'neg_mean_squared_error', cv = kf, n_jobs = -1)
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Ridge regression on degree-3 polynomial features, cross-validated with kf.
model = make_pipeline(PolynomialFeatures(degree = 3), linear_model.Ridge())
scores = cross_val_score(model, x_scaled, y, scoring = 'neg_mean_squared_error', cv = kf)
# Stored as returned by the scorer, i.e. as negated MSE.
scores_map['PolyRidge'] = scores
# Flip the sign for display so the value labelled "MSE" is non-negative.
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# RBF-kernel support vector regression, cross-validated with kf.
svb_rbf = SVR(kernel = 'rbf', C = 1e3, gamma = 0.1)
scores = cross_val_score(svb_rbf, x_scaled, y, scoring = 'neg_mean_squared_error', cv = kf, n_jobs = -1)
# Stored as returned by the scorer, i.e. as negated MSE.
scores_map['SVR'] = scores
# Flip the sign for display so the value labelled "MSE" is non-negative.
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited decision tree regressor, cross-validated with kf.
desc_tr = DecisionTreeRegressor(max_depth=5)

scores = cross_val_score(desc_tr, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
# Stored as returned by the scorer, i.e. as negated MSE.
scores_map['DecisionTreeRegressor'] = scores
# Flip the sign for display so the value labelled "MSE" is non-negative.
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# 7-nearest-neighbours regressor, cross-validated with kf.
knn = KNeighborsRegressor(n_neighbors=7)
scores = cross_val_score(knn, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
# Stored as returned by the scorer, i.e. as negated MSE.
scores_map['KNeighborsRegressor'] = scores

# The metric is a mean squared error, not an accuracy, so the label says MSE;
# the sign is flipped for display so the value is non-negative.
print("KNN MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with a fixed seed, cross-validated with kf.
gbr = GradientBoostingRegressor(alpha=0.9,learning_rate=0.05, max_depth=2, min_samples_leaf=5, min_samples_split=2, n_estimators=100, random_state=30)

scores = cross_val_score(gbr, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
# Stored as returned by the scorer, i.e. as negated MSE.
scores_map['GradientBoostingRegressor'] = scores
# Flip the sign for display so the value labelled "MSE" is non-negative.
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std()))

In [None]:
# Horizontal box plot comparing the per-fold errors of every recorded model.
plt.figure(figsize = (20, 10))
# scores_map is rebound from the dict of per-model score arrays to a frame
# with one column per model and one row per fold.
scores_map = pd.DataFrame(scores_map)
# The stored values come from scoring='neg_mean_squared_error', i.e. they are
# negated MSEs. Negate them back so the axis labelled "MSE" shows real,
# non-negative errors; scores_map itself is left untouched.
mse_scores = -scores_map
sns.boxplot(data = mse_scores, orient = 'h', palette = 'Set2')
plt.xlabel('MSE')