In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV



In [None]:

# Load the labelled training set and the test set from the local data folder.
treino = pd.read_csv('./data/train.csv')
teste = pd.read_csv('./data/test.csv')

# Preview the first rows of both frames to sanity-check the load.
display(treino.head(), teste.head())

### Criando pipeline

In [None]:
# Baseline pipeline: one-hot encode every column, scale the encoded matrix,
# then fit a random forest on the result.
# The variable is named `pipeline_inicial` because that is the name every
# later cell reads.
pipeline_inicial = Pipeline([
    # handle_unknown='ignore' lets categories unseen during fit be encoded as all zeros.
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    # NOTE: this step is labelled 'min_max_scaler' but holds a StandardScaler.
    # The label is kept unchanged so that lookups by step name keep working.
    # with_mean=False skips centering, which is required for sparse input.
    ('min_max_scaler', StandardScaler(with_mean=False)),
    ('classificador', RandomForestClassifier())
])

pipeline_inicial

In [None]:
# Show the second (name, estimator) entry of the pipeline's step list.
pipeline_inicial.steps[1]

### make_pipeline
---



In [None]:
# make_pipeline builds the same kind of pipeline without explicit step names:
# each step is named automatically after the lowercased class name.
make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    StandardScaler(with_mean=False),
    RandomForestClassifier(),
)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomforestclassifier', RandomForestClassifier())])

In [None]:
# Split the labelled data into training and validation sets.
# Features are every column except the target 'Survived'.
X = treino.drop('Survived', axis = 1)
y = treino['Survived']

# random_state pins the shuffle so the split, and every score computed from
# it, is reproducible across kernel restarts.
X_treino, X_valid, y_treino, y_valid = train_test_split(X, y, random_state=42)

# Display the shapes of the four resulting objects.
X_treino.shape, X_valid.shape, y_treino.shape, y_valid.shape


In [None]:
# Fit every step of the baseline pipeline on the training split.
pipeline_inicial.fit(X_treino, y_treino)

Pipeline(steps=[('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                ('min_max_scaler', StandardScaler(with_mean=False)),
                ('classificador', RandomForestClassifier())])

In [None]:
# Predict the target for the held-out validation rows.
pipeline_inicial.predict(X_valid)

In [None]:
# Score the baseline pipeline on the validation split
# (for a classifier, `score` reports mean accuracy).
pipeline_inicial.score(X_valid, y_valid)

### Separando as transformações das variáveis categóricas e numéricas

In [None]:
# List the dtype of each feature column to tell text columns from numeric ones.
X_treino.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
# Preview the first rows of the training features.
X_treino.head()

In [None]:
# Show the dtype name of a text column; the same `.dtype.name` check is used
# below to pick out the categorical columns.
X_treino['Name'].dtype.name

In [None]:
# Categorical features: every column whose dtype name is 'object'.
variaveis_categoricas = [
    nome
    for nome, tipo in X_treino.dtypes.items()
    if tipo.name == 'object'
]
variaveis_categoricas

In [None]:
# Numeric features: every remaining column, in the original column order.
variaveis_numericas = [
    nome
    for nome in X_treino.columns
    if nome not in variaveis_categoricas
]
variaveis_numericas

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [None]:
# Preprocessing for categorical columns: fill missing values with the literal
# 'missing' category, then one-hot encode.
pipeline_categoricas = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # `sparse_output` replaces the `sparse` argument, which was deprecated in
    # scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
    # False makes the encoder return a dense array.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [None]:
# Preprocessing for numeric columns: fill missing values with the column
# median, then apply MinMaxScaler.
pipeline_numericas = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
)

In [None]:

# Route each group of columns to its own preprocessing pipeline.
pre_processamento = ColumnTransformer(
    transformers=[
        ('cat', pipeline_categoricas, variaveis_categoricas),
        ('num', pipeline_numericas, variaveis_numericas),
    ]
)

In [None]:
# Two candidate models on top of the same preprocessing step.
# NOTE: both pipelines hold the same `pre_processamento` object (make_pipeline
# does not copy its steps), so fitting one pipeline refits the preprocessor
# seen by the other.
pipeline_random_forest = make_pipeline(
    pre_processamento,
    RandomForestClassifier(random_state=42),
)
pipeline_log_reg = make_pipeline(
    pre_processamento,
    LogisticRegression(random_state=42),
)

In [None]:
# Display the structure of the random-forest pipeline.
pipeline_random_forest

In [None]:
# Fit the random-forest pipeline on the training split, then report its
# score on the validation split (only the last expression is displayed).
pipeline_random_forest.fit(X_treino, y_treino)
pipeline_random_forest.score(X_valid, y_valid)

In [None]:
# Fit the logistic-regression pipeline on the training split, then report its
# score on the validation split (only the last expression is displayed).
pipeline_log_reg.fit(X_treino, y_treino)
pipeline_log_reg.score(X_valid, y_valid)

### Usando cross validation

In [None]:
# 10-fold splitter; rows are shuffled before splitting, with a fixed seed so
# the folds are the same on every run.
validacao_cruzada = KFold(n_splits=10, shuffle=True, random_state=42)
validacao_cruzada

In [None]:
# Per-fold scores of the random-forest pipeline.
# Cross-validation runs on the training split: the validation split is much
# smaller and is meant to stay untouched as a hold-out set.
cross_val_score(pipeline_random_forest, X_treino, y_treino, cv=validacao_cruzada)

In [None]:
# Mean cross-validated score of the random-forest pipeline.
# Cross-validation runs on the training split so the validation split remains
# an untouched hold-out set.
acuracia_media_rf = cross_val_score(
    pipeline_random_forest, X_treino, y_treino, cv=validacao_cruzada
).mean()
acuracia_media_rf

In [None]:
# Mean cross-validated score of the logistic-regression pipeline.
# Cross-validation runs on the training split so the validation split remains
# an untouched hold-out set.
acuracia_media_log_reg = cross_val_score(
    pipeline_log_reg, X_treino, y_treino, cv=validacao_cruzada
).mean()
acuracia_media_log_reg