# ECO904 - Atividade 1 - Etapa 3 - Capacidade de Aprendizado sobre a Base

[Proposta On-line](https://docs.google.com/document/d/e/2PACX-1vTbH1JBaBWc30jTjL6ECWhNfLF23-Iv9afdu7KL2oVP8WbiDxUewcHaAE5y6dQJVs6heCOiGmyO9fFX/pub)

- Incorporar um pipeline no aprendizado;
- Utilizar pré-processamento na base de dados;
- Reduzir a dimensionalidade da base;
- Incorporar a validação cruzada para melhor avaliar as técnicas;
- Reavaliar as metodologias e modelos com essa nova abordagem.

In [1]:
import pandas as pd
# Load the dataset from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='base_filtrada.csv')
# Preview the first five rows as the cell's displayed output.
df.head(n=5)

Unnamed: 0,dispositivo_1,dispositivo_2,dispositivo_3,dispositivo_4,dispositivo_5,dispositivo_6,dispositivo_7,dispositivo_8,dispositivo_9,dispositivo_10,...,dispositivo_42,dispositivo_43,dispositivo_44,dispositivo_45,dispositivo_46,dispositivo_47,dispositivo_48,dispositivo_49,dispositivo_50,falha
0,48.7,36.58,42.64,51.02,66.17,43.68,51.84,57.06,40.92,33.1,...,42.58,45.03,55.41,56.54,34.13,50.11,49.88,49.82,69.11,0
1,45.65,69.17,48.58,34.39,42.41,41.61,59.15,55.03,59.03,59.72,...,74.03,48.05,39.78,58.47,63.05,54.8,68.53,45.07,71.07,0
2,63.11,49.81,38.17,59.98,61.59,59.39,48.5,55.62,52.2,30.47,...,43.08,47.89,32.3,66.46,54.78,60.01,21.4,53.12,50.01,0
3,28.41,38.22,43.15,39.12,58.32,71.58,36.61,45.84,35.68,45.38,...,58.2,55.04,36.48,52.88,54.85,66.86,50.58,58.64,53.66,0
4,64.94,49.23,63.78,54.09,53.86,66.0,36.42,23.26,46.84,57.69,...,55.68,57.47,42.4,49.21,52.69,66.94,55.73,38.38,38.92,1


In [4]:
from sklearn.model_selection import train_test_split

# Separate the 'falha' target vector from the remaining feature columns.
y = df['falha'].values
X = df.drop(columns='falha').values

# Hold out 95% of the rows for testing, so only 5% is used for training.
# The fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.95,
    random_state=42,
)

# Display the shapes of the four resulting arrays as the cell output.
tuple(parte.shape for parte in (X_train, X_test, y_train, y_test))

((6912, 50), (131328, 50), (6912,), (131328,))

## Incorporar um pipeline no aprendizado

In [11]:
from sklearn.pipeline import Pipeline

## Utilizar pré-processamento na base de dados e reduzir a dimensionalidade da base

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD

## Incorporar a validação cruzada para melhor avaliar as técnicas

In [7]:
from sklearn.model_selection import cross_validate

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
import numpy as np
from tqdm import tqdm

# Number of components kept by BOTH reducers.
# PCA() with its default n_components=None keeps every component, i.e. it
# rotates the data without reducing it, while TruncatedSVD silently defaults
# to 2. Making the value explicit and shared guarantees that the "reduced"
# pipelines really are reduced and that the two reducers are comparable.
# 2 preserves the TruncatedSVD setting that was already in effect; tune it
# here if a different target dimensionality is wanted.
n_componentes = 2

# Candidate preprocessing stages; None means "no scaling".
prepros = [
    None,
    StandardScaler(),
    MinMaxScaler(),
]

# Candidate dimensionality reducers; None means "keep all features".
redutores = [
    None,
    PCA(n_components=n_componentes, random_state=42),
    TruncatedSVD(n_components=n_componentes, random_state=42),
]

# Candidate classifiers, all seeded for reproducibility.
aprendizados = [
    RandomForestClassifier(random_state=42),
    DecisionTreeClassifier(random_state=42),
    SVC(random_state=42),
    LogisticRegression(random_state=42),
    MLPClassifier(random_state=42),
    DummyClassifier(random_state=42),
]

# Full grid: every preprocessing x reducer x classifier combination.
# It is materialised as a list so tqdm can show the total in the progress bar.
combinacoes = [
    (pp, red, ap)
    for pp in prepros
    for red in redutores
    for ap in aprendizados
]

resultados = []
for pp, red, ap in tqdm(combinacoes):

    # Only the stages that are present become pipeline steps; each step is
    # named after the class of its estimator.
    steps = [
        (etapa.__class__.__name__, etapa)
        for etapa in (pp, red, ap)
        if etapa is not None
    ]

    pipe = Pipeline(steps)

    # 5-fold cross-validation on the training portion, scored with both
    # accuracy and F1, using all available cores.
    cv = cross_validate(pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1'], n_jobs=-1)

    # One row per combination holding the mean of each metric over the folds.
    # An absent stage is recorded under the class name of None ('NoneType').
    res = {
        'preprocessamento': pp.__class__.__name__,
        'reducao': red.__class__.__name__,
        'aprendizado': ap.__class__.__name__,
        'acuracia': np.mean(cv['test_accuracy']),
        'f1': np.mean(cv['test_f1']),
    }
    resultados.append(res)

df_res = pd.DataFrame(resultados)
# Show the evaluated combinations ordered from best to worst mean F1.
df_res.sort_values('f1', ascending=False)

 13%|█▎        | 7/54 [00:18<02:13,  2.85s/it]