In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-processamento
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Locations of the raw Titanic data and of its data dictionary,
# relative to the notebook's working directory.
RAW_DATA_PATH = '../data/raw/data_titanic.csv'
DICTIONARY_PATH = '../data/external/dictionary.csv'

df = pd.read_csv(RAW_DATA_PATH)
df_dict = pd.read_csv(DICTIONARY_PATH)

# Print the dictionary's column names, non-null counts and dtypes.
df_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Variável   15 non-null     object
 1   Descrição  15 non-null     object
 2   Tipo       15 non-null     object
 3   Subtipo    15 non-null     object
 4   Valores    12 non-null     object
dtypes: object(5)
memory usage: 728.0+ bytes


In [3]:
# Distinct values found in the dictionary's 'Tipo' column.
tipos = df_dict.loc[:, 'Tipo'].unique()
tipos

array(['Qualitativa', 'Quantitativa', 'Inútil'], dtype=object)

In [4]:
target_variable = 'survived'


def _features_of_subtype(subtype):
    """Return the feature names whose `Subtipo` in `df_dict` equals `subtype`.

    Rows whose `Tipo` is 'Inútil' and the row describing the target variable
    are excluded, so the result only contains usable predictor columns.

    Parameters
    ----------
    subtype : str
        Value to match against the `Subtipo` column (e.g. 'Nominal').

    Returns
    -------
    list
        Matching entries of the `Variável` column, in dictionary order.
    """
    mask = (
        df_dict['Subtipo'].eq(subtype)
        & df_dict['Tipo'].ne('Inútil')
        & df_dict['Variável'].ne(target_variable)
    )
    return df_dict.loc[mask, 'Variável'].to_list()


# Variables the dictionary flags as useless.
useless_variables = df_dict.loc[df_dict['Tipo'].eq('Inútil'), 'Variável'].to_list()

# One list of predictors per statistical subtype; a single helper replaces
# the four copy-pasted queries so the filtering rule lives in one place.
nominal_variables = _features_of_subtype('Nominal')
ordinal_variables = _features_of_subtype('Ordinal')
continuous_variables = _features_of_subtype('Contínua')
discrete_variables = _features_of_subtype('Discreta')


In [5]:
# Summary of the variable groups derived from the data dictionary.
# Every label ends with a colon so the five lines are formatted consistently
# (the 'Continuous' label was previously missing it).
print(f'Useless: {useless_variables}')
print(f'Nominal: {nominal_variables}')
print(f'Ordinal: {ordinal_variables}')
print(f'Continuous: {continuous_variables}')
print(f'Discrete: {discrete_variables}')

Useless: ['embarked', 'class', 'alive']
Nominal: ['sex', 'who', 'adult_male', 'deck', 'embark_town', 'alone']
Ordinal: ['pclass']
Continuous ['age', 'fare']
Discrete: ['sibsp', 'parch']


In [6]:
# Columns of `df` that are not in any of the four feature lists.
set(df.columns).difference(
    nominal_variables,
    ordinal_variables,
    discrete_variables,
    continuous_variables,
)

{'alive', 'class', 'embarked', 'survived'}

### Pré-processamento

Vamos usar o **SimpleImputer** do scikit-learn para imputar (preencher) os valores faltantes das variáveis dos tipos **nominal, ordinal e contínuo** — com a moda, a mediana e a média, respectivamente. Para as variáveis discretas, iremos usar o **KNNImputer**. Por isso, será necessário criar uma estratégia de imputação para cada tipo de variável.

In [7]:
# Missing-value handling: one imputer per variable type.

# Nominal -> most frequent value.
nominal_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

# Ordinal -> median.
ordinal_imputer = SimpleImputer(strategy='median', missing_values=np.nan)

# Continuous -> mean.
continuous_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Discrete -> KNN-based imputation with the estimator's default settings.
discrete_imputer = KNNImputer(missing_values=np.nan)

Agora, serão criados os Pipelines.  
Nesse momento, iremos unir o tratamento para dados faltantes com a codificação/normalização das variáveis.

In [8]:
# Each pipeline runs two steps in order: fill missing values with the
# matching imputer, then encode (categorical) or standardize (numeric).

# Nominal: impute, then one-hot encode into a dense array.
# NOTE(review): OneHotEncoder is left with its default handling of unknown
# categories; confirm this is what you want before transforming unseen data.
nominal_preprocessor = Pipeline([
    ("missing", nominal_imputer),
    ("encoding", OneHotEncoder(sparse_output=False)),
])

# Ordinal: impute, then map the ordered categories to integer codes.
ordinal_preprocessor = Pipeline([
    ("missing", ordinal_imputer),
    ("encoding", OrdinalEncoder()),
])

# Continuous: impute, then standardize.
continuous_preprocessor = Pipeline([
    ("missing", continuous_imputer),
    ("normalization", StandardScaler()),
])

# Discrete: impute, then standardize.
discrete_preprocessor = Pipeline([
    ("missing", discrete_imputer),
    ("normalization", StandardScaler()),
])

In [9]:
# Route each group of columns to its own pipeline. The order of the entries
# determines the order of the column blocks in the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_preprocessor, nominal_variables),
        ("ordinal", ordinal_preprocessor, ordinal_variables),
        ("continuous", continuous_preprocessor, continuous_variables),
        ("discrete", discrete_preprocessor, discrete_variables),
    ],
)

In [10]:
# Show the assembled ColumnTransformer as the cell output.
preprocessor

In [11]:
# Feature matrix: every column except the target and the ones flagged useless.
X = df.drop(columns=[target_variable, *useless_variables])

# Target vector.
y = df.loc[:, target_variable]

In [12]:
# Fit the preprocessor on X and transform X in a single pass.
# `fit_transform` leaves `preprocessor` fitted exactly as a separate `fit`
# would, but avoids transforming the data a second time afterwards.
X_transformed = preprocessor.fit_transform(X)

In [13]:
# Inspect the transformed feature matrix produced by the preprocessor.
X_transformed

array([[ 0.        ,  1.        ,  0.        , ..., -0.50244517,
         0.43279337, -0.47367361],
       [ 1.        ,  0.        ,  0.        , ...,  0.78684529,
         0.43279337, -0.47367361],
       [ 1.        ,  0.        ,  0.        , ..., -0.48885426,
        -0.4745452 , -0.47367361],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.17626324,
         0.43279337,  2.00893337],
       [ 0.        ,  1.        ,  0.        , ..., -0.04438104,
        -0.4745452 , -0.47367361],
       [ 0.        ,  1.        ,  0.        , ..., -0.49237783,
        -0.4745452 , -0.47367361]])