# Seção 14 - Regressão Linear (Desafio_Auto Imports Database)
Para esta seção, usarei a base de dados descrita abaixo:

1. Title: 1985 Auto Imports Database

2. Source Information:
   -- Creator/Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
   -- Date: 19 May 1987
   -- Sources:
     1) 1985 Model Import Car and Truck Specifications, 1985 Ward's Automotive Yearbook.
     2) Personal Auto Manuals, Insurance Services Office, 160 Water Street, New York, NY 10038 
     3) Insurance Collision Report, Insurance Institute for Highway Safety, Watergate 600, Washington, DC 20037
        
A base acima citada pode ser acessada no link: https://archive.ics.uci.edu/ml/datasets/Automobile.

In [22]:
# Instruções basiconas de sempre.

# Lidar com dataframes.
import numpy as np
import pandas as pd

# Lidar com gráficos.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from plotting import (multiple_histograms_plot,
                      bar_plot_with_categorical,
                      plot_confusion_matrix,
                      plot_confusion_matrix_2,
                      plot_roc)

# Lidar com preparação de dados.
from data_prep import data_prep as dp # Eu que fiz esse modulinho ("uuuuuuuuuma bosts!").
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Lidar com validação de modelos.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Notebook display settings: show every column, do not truncate cell text,
# and print up to 500 rows.
pd.set_option('display.max_columns', None)
try:
    # None is the supported "no limit" value; -1 was deprecated in pandas 1.0
    # and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here and use -1 for
    # "no limit".
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

In [23]:
# Path of the Auto Imports CSV, relative to the notebook's working directory.
nomeDaBase = "../bases/carros/imports-85.csv"

# Load the comma-separated, UTF-8 encoded file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer=nomeDaBase,
    encoding="utf-8",
    sep=",",
)

In [24]:
# Show five randomly chosen rows, transposed so each attribute is a row.
df.sample(n=5).transpose()

Unnamed: 0,204,18,109,176,85
symboling,-1,2,0,-1,1
normalized-losses,95,121,?,65,125
make,volvo,chevrolet,peugot,toyota,mitsubishi
fuel-type,gas,gas,gas,gas,gas
aspiration,turbo,std,std,std,std
num-of-doors,four,two,four,four,four
body-style,sedan,hatchback,wagon,sedan,sedan
drive-wheels,rwd,fwd,rwd,fwd,fwd
engine-location,front,front,front,front,front
wheel-base,109.1,88.4,114.2,102.4,96.3


In [25]:
# Print the dtype and non-null count of every column in the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-nul

In [26]:
# Summary statistics (count, mean, std, quartiles, min/max) of the raw data.
df.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


### Observação:
Na base, os valores com a string "?" indicam *nulls*. Creio que isso pode nos levar a conclusões errôneas, como estimar erroneamente a quantidade de *nulls*. Vou substituir os "?" por `np.nan`.

In [27]:
# The dataset marks missing values with the string "?"; turn them into NaN
# (in place) so pandas counts them as nulls.
df.replace({"?": np.nan}, inplace=True)

In [28]:
# Re-check non-null counts now that "?" has been replaced with NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression-ratio    205 non-null float64
horsepower           203 non-nul

**ATENÇÃO:** Perceba que agora alguns dos atributos possuem nulos.

Agora, vou transformar os atributos que ainda estão como objects em floats.

In [29]:
# Columns that will be cast to float64 in the next step.
lista_converter = [
    "normalized-losses",
    "bore",
    "stroke",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

In [30]:
# Cast every column named in lista_converter to float64 in one assignment.
df[lista_converter] = df[lista_converter].astype("float64")

In [31]:
# Confirm the columns in lista_converter are now float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null float64
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null float64
stroke               201 non-null float64
compression-ratio    205 non-null float64
horsepower           203 non-

In [32]:
# Work on an independent copy. A later cell drops the categorical columns
# from df_teste in place, and a plain `df_teste = df` alias would remove
# those columns from df as well.
df_teste = df.copy()

In [33]:
# Text-valued (categorical) attributes that will be one-hot encoded.
lista_atributos_categoricos = [
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "engine-type",
    "num-of-cylinders",
    "fuel-system",
]

In [35]:
# Build indicator (dummy) columns for the categorical attributes, then remove
# the original categorical columns from df_teste in place.
state_dummies = pd.get_dummies(data=df_teste[lista_atributos_categoricos])
df_teste.drop(columns=lista_atributos_categoricos, inplace=True)

In [37]:
# Append the dummy columns side by side with the remaining columns.
df_teste = pd.concat(objs=[df_teste, state_dummies], axis="columns")

In [38]:
# Preview the first rows of the encoded frame (numeric + dummy columns).
df_teste.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,make_mercury,make_mitsubishi,make_nissan,make_peugot,make_plymouth,make_porsche,make_renault,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel-type_diesel,fuel-type_gas,aspiration_std,aspiration_turbo,num-of-doors_four,num-of-doors_two,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,engine-location_front,engine-location_rear,engine-type_dohc,engine-type_dohcv,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor,num-of-cylinders_eight,num-of-cylinders_five,num-of-cylinders_four,num-of-cylinders_six,num-of-cylinders_three,num-of-cylinders_twelve,num-of-cylinders_two,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


### Problemas encontrados na base:

Como é possível observar nas duas células acima, temos os seguintes pontos a tratar na base fornecida:

1) Há *"missing values"* na coluna "age"; e

2) Para o atributo *"age"*, há valores negativos, o que não faz sentido.

### Resolvendo os problemas na base:

A abordagem tomada será:

1) Substituição dos valores negativos de idade para a média dos valores não negativos; e

2) Substituição dos valores faltantes pelo valor médio.

#### OBSERVAÇÃO:

No mundo real, os problemas com as bases são muito piores. Deve-se ter muita atenção pois os dados podem não representar a realidade do processo.

In [None]:
# Copy the original df into a new frame (df_tratado) and replace negative
# "age" values with the mean of the strictly positive ones.
# NOTE(review): no "age" column appears in the df.info()/head() output of
# this dataset -- this cell looks carried over from another notebook; confirm.

df_tratado = df.copy()
idade_negativa = df_tratado["age"] < 0
media_idades_positivas = df_tratado.loc[df_tratado["age"] > 0, "age"].mean()
# Single .loc assignment instead of chained indexing (df.age[mask] = ...),
# which may write to a temporary copy and triggers SettingWithCopyWarning.
df_tratado.loc[idade_negativa, "age"] = media_idades_positivas

In [None]:
# Fill NaN entries in the first four columns (positions 0-3) using the
# imputer's 'mean' strategy.

imputer = SimpleImputer(strategy='mean', missing_values=np.nan, copy=True)
primeiras_colunas = df_tratado.iloc[:, 0:4]
df_tratado.iloc[:, 0:4] = imputer.fit_transform(primeiras_colunas)

In [None]:
# Resulting summary statistics after the treatment steps:

df_tratado.describe()

In [None]:
# Check dtypes and non-null counts of the treated frame.
df_tratado.info()

## 1) Análise preliminar dos dados:
Como é possível observar, no dataset acima não há dados categóricos, somente numéricos. Neste caso, uma estratégia satisfatória para preparar esse dataset para o modelo de aprendizagem Bayesiana é normalizar os dados para que amplitudes e escalas muito diferentes não enviesem o modelo.
Na verdade, a aprendizagem por árvores de decisão não é impactada por atributos com escalas diferentes, mas é uma boa prática normalizar os valores.

In [None]:
# Scale the first three columns (positions 0-2) with StandardScaler and write
# the scaled values back into df_tratado.

scaler = StandardScaler()
colunas_escalonadas = scaler.fit_transform(df_tratado.iloc[:, :3])
df_tratado.iloc[:, :3] = colunas_escalonadas

In [None]:
# Summary statistics after scaling the first three columns.
df_tratado.describe()

## 2) Aplicação do k-means

O principal argumento (e também fraqueza) do modelo k-means (e, também, do k-medians) é a indicação a priori da quantidade de clusters. Portanto, faremos duas abordagens: (1) definição arbitrária da quantidade de clusters e (2) definição da quantidade ideal de clusters através da curva de entropia vs n.º clusters.

### 2.1) Simplesmente aplicando o K-means com split test

In [None]:
# Split predictors from the target: columns 0-2 are the features (x) and
# column 3 is the response (y).

x, y = df_tratado.iloc[:, :3], df_tratado.iloc[:, 3]

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.cluster import KMeans

# Fit a two-cluster K-means model on the training split, then assign each
# test row to one of the learned clusters.
# NOTE(review): y_train is passed to fit(), but KMeans is unsupervised --
# confirm against the scikit-learn docs that the argument is ignored.
classificador = KMeans(random_state=42, n_clusters=2).fit(x_train, y_train)
y_pred = classificador.predict(x_test)

### Verificando como o modelo se saiu

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Print three evaluation views of y_pred against y_test: accuracy, the
# per-class classification report, and the confusion matrix.
# NOTE(review): y_pred holds K-means cluster ids; whether they line up with
# the labels in y_test is not established by this notebook -- confirm.
print("Métrica 1 (Acurácia): {:.6}.".format(
    accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)))
print("Métrica 2 (Relatório de classificação):\n{}.".format(
    classification_report(y_test, y_pred)))
# The third output is the confusion matrix; its label previously repeated
# "Métrica 2 (Relatório de classificação)".
print("Métrica 3 (Matriz de confusão):\n{}.".format(
    confusion_matrix(y_test, y_pred)))

In [None]:
# Draw the normalized confusion matrix with the helper imported from the
# local `plotting` module; the trailing semicolon keeps the notebook from
# echoing the call's return value.
plot_confusion_matrix_2(y_test, y_pred, normalize=True);

### Agora, usando validação cruzada.

In [None]:
# KNeighborsClassifier is not brought in by the notebook's import cell, so
# import it here (same in-cell pattern used for KMeans); without this the
# cell fails with NameError.
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; Minkowski metric with p=2.
classificador = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classificador.fit(x, y)

# NOTE(review): cv=200 on the 205-row dataset loaded above leaves roughly one
# sample per fold -- confirm this is intended.
resultados = cross_val_score(classificador, x, y, cv=200)

classe = y.values

# Mean and standard deviation of the per-fold scores.
print("Acurácia média: {:.3} ± {:.3}.".format(resultados.mean(), resultados.std()))