In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-processamento
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [33]:
# Read the data dictionary and the raw Titanic table from the project's data folders,
# then summarise the dictionary's columns and non-null counts.
df_dict = pd.read_csv('../data/external/dictionary.csv')
df = pd.read_csv('../data/raw/data_titanic.csv')
df_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Variável   15 non-null     object
 1   Descrição  15 non-null     object
 2   Tipo       15 non-null     object
 3   Subtipo    15 non-null     object
 4   Valores    12 non-null     object
dtypes: object(5)
memory usage: 728.0+ bytes


In [34]:
# Distinct values found in the 'Tipo' column of the data dictionary.
tipos = df_dict.loc[:, 'Tipo'].unique()
tipos

array(['Qualitativa', 'Quantitativa', 'Inútil'], dtype=object)

In [35]:
target_variable = 'survived'


def _variables_by_subtype(dictionary, subtype, target):
    """Return the names of the usable predictor variables of one subtype.

    Parameters
    ----------
    dictionary : pandas.DataFrame
        Data dictionary with the columns 'Variável', 'Tipo' and 'Subtipo'.
    subtype : str
        Value of 'Subtipo' to select (for example 'Nominal').
    target : str
        Name of the target variable; it is never included in the result.

    Returns
    -------
    list
        Values of 'Variável' for the rows whose 'Subtipo' equals ``subtype``,
        whose 'Tipo' is not 'Inútil' and whose name differs from ``target``.
    """
    selected = (
        (dictionary['Subtipo'] == subtype)
        & (dictionary['Tipo'] != 'Inútil')
        & (dictionary['Variável'] != target)
    )
    return dictionary.loc[selected, 'Variável'].to_list()


# Variables whose 'Tipo' is 'Inútil' in the dictionary.
useless_variables = df_dict.loc[df_dict['Tipo'] == 'Inútil', 'Variável'].to_list()

# One list of predictor names per subtype; a single helper replaces the five
# copy-pasted query chains so the filtering rule lives in one place.
nominal_variables = _variables_by_subtype(df_dict, 'Nominal', target_variable)
ordinal_variables = _variables_by_subtype(df_dict, 'Ordinal', target_variable)
continuous_variables = _variables_by_subtype(df_dict, 'Contínua', target_variable)
discrete_variables = _variables_by_subtype(df_dict, 'Discreta', target_variable)


In [36]:
# Show the variable names assigned to each group, one group per line.
print(
    f'Useless: {useless_variables}',
    f'Nominal: {nominal_variables}',
    f'Ordinal: {ordinal_variables}',
    f'Continuous {continuous_variables}',
    f'Discrete: {discrete_variables}',
    sep='\n',
)

Useless: ['embarked', 'class', 'alive']
Nominal: ['sex', 'who', 'adult_male', 'deck', 'embark_town', 'alone']
Ordinal: ['pclass']
Continuous ['age', 'fare']
Discrete: ['sibsp', 'parch']


In [37]:
# Columns of df that do not belong to any of the four feature groups.
set(df.columns).difference(
    nominal_variables,
    ordinal_variables,
    discrete_variables,
    continuous_variables,
)

{'alive', 'class', 'embarked', 'survived'}

### Pré-processamento

Vamos usar o **SimpleImputer** do scikit-learn para preencher (imputar) os valores faltantes dos dados do tipo **nominal, ordinal e contínuo**. Para os dados discretos, iremos usar o **KNNImputer**. Por isso, será necessário criar uma estratégia de imputação para cada tipo de variável.

In [38]:
# Missing-value handling: one imputer per variable type.

# Nominal variables: fill with the most frequent value.
nominal_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# Ordinal variables: fill with the median.
ordinal_imputer = SimpleImputer(strategy='median', missing_values=np.nan)
# Continuous variables: fill with the mean.
continuous_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Discrete variables: fill using KNNImputer with its default settings.
discrete_imputer = KNNImputer(missing_values=np.nan)

Agora, serão criados os Pipelines.  
Nesse momento, iremos unir o tratamento para dados faltantes com a codificação/normalização das variáveis.

In [39]:
# Each pipeline first fills missing values ("missing") and then encodes or
# scales the result, depending on the variable type.

# Nominal: impute, then one-hot encode into a dense array.
nominal_preprocessor = Pipeline([
    ("missing", nominal_imputer),
    ("encoding", OneHotEncoder(sparse_output=False)),
])

# Ordinal: impute, then ordinal-encode.
ordinal_preprocessor = Pipeline([
    ("missing", ordinal_imputer),
    ("encoding", OrdinalEncoder()),
])

# Continuous: impute, then standardise.
continuous_preprocessor = Pipeline([
    ("missing", continuous_imputer),
    ("normalization", StandardScaler()),
])

# Discrete: impute, then standardise.
discrete_preprocessor = Pipeline([
    ("missing", discrete_imputer),
    ("normalization", StandardScaler()),
])

In [40]:
# Route each group of columns to the pipeline built for its variable type.
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_preprocessor, nominal_variables),
        ("ordinal", ordinal_preprocessor, ordinal_variables),
        ("continuous", continuous_preprocessor, continuous_variables),
        ("discrete", discrete_preprocessor, discrete_variables),
    ],
)

In [41]:
# Display the assembled ColumnTransformer as the cell output.
preprocessor

In [42]:
# Target column on its own; the features are every other column except the
# ones listed as useless.
y = df[target_variable]
X = df.drop(columns=[target_variable, *useless_variables])

In [43]:
# Fit the preprocessor on X and apply it to the same data.
X_transformed = preprocessor.fit(X).transform(X)

In [28]:
# Preview the first rows of the transformed feature matrix. The cell previously
# held only the unfinished name `X_`, which no visible cell defines and which
# would raise NameError on a fresh run.
X_transformed[:5]

In [29]:
# Run the model's predict method on the transformed feature matrix.
# NOTE(review): `model` is not defined or imported in any cell visible here, and this
# cell's execution count (29) is lower than the preceding cell's (43). Confirm where
# the model is created and fitted; otherwise this fails on Restart & Run All.
y_hat = model.predict(X_transformed)

In [30]:
# Display the predictions.
# NOTE(review): this echoes the whole array; consider showing a slice such as y_hat[:10].
y_hat

array([ 0.15625  ,  0.8984375,  0.5546875,  0.8515625,  0.0546875,
        0.1171875,  0.4609375,  0.4296875,  0.59375  ,  0.890625 ,
        0.5390625,  0.6953125,  0.1015625, -0.1484375,  0.6328125,
        0.5859375,  0.40625  ,  0.1953125,  0.609375 ,  0.5859375,
        0.1796875,  0.3359375,  0.6796875,  0.4296875,  0.5234375,
        0.328125 ,  0.1171875,  0.2734375,  0.5859375,  0.0703125,
        0.328125 ,  1.0390625,  0.5859375,  0.0703125,  0.46875  ,
        0.359375 ,  0.1171875,  0.1015625,  0.5703125,  0.7578125,
        0.5703125,  0.7421875,  0.1171875,  0.8203125,  0.6171875,
        0.0703125,  0.1875   ,  0.5859375,  0.109375 ,  0.6484375,
        0.3359375,  0.1015625,  1.0234375,  0.7421875,  0.4296875,
        0.3203125,  0.6953125,  0.1171875,  0.7578125,  0.1953125,
        0.140625 ,  0.8671875,  0.3671875,  0.375    ,  0.3671875,
        0.1328125,  0.8359375,  0.1015625,  0.3046875,  0.0625   ,
        0.1796875,  0.2421875,  0.25     ,  0.1953125,  0.0859

In [31]:
# Dimensions of the prediction array.
np.shape(y_hat)

(891,)

In [22]:
# Mean squared difference between the predictions and the observed target values.
np.square(y_hat - y.to_numpy()).mean()

0.13413746295419474

## 08

### Codificação de variáveis

Efetuar toda e qualquer transformação necessária para o funcionamento do modelo, seja ela a codificação de variáveis qualitativas (nominais ou ordinais), temporais ou textuais.