In [None]:
import seaborn as sns
import pandas as pd
from sklearn.impute import SimpleImputer

Load the sample `penguins` dataset that ships with seaborn.

In [None]:
# Load seaborn's example 'penguins' dataset and display it.
# Later cells modify this frame in place (imputation and encoding).
penguins = sns.load_dataset('penguins')
penguins

In [None]:
# Count the missing values in each column.
penguins.isna().sum()

`df.select_dtypes(include='number').columns` will return the columns with numerical data types.

In [None]:
# Split the column labels by dtype: numeric columns vs. everything else.
num_cols = penguins.select_dtypes(include='number').columns
cat_cols = penguins.select_dtypes(exclude='number').columns

# Show both groups as plain lists, one per line.
print(num_cols.tolist(), cat_cols.tolist(), sep='\n')

Common sklearn steps:
1. Import the class
2. Instantiate the class
3. Fit and transform the data

Concept:
- Use `SimpleImputer` for missing data
- Use `OneHotEncoder`/`OrdinalEncoder` for categorical data

In [None]:
# Numeric imputer: replaces missing values with the column mean
# ('mean' is SimpleImputer's default strategy, spelled out here for clarity).
num_imp = SimpleImputer(strategy='mean')

# Categorical imputer: replaces missing values with the most frequent value.
cat_imp = SimpleImputer(strategy='most_frequent')

In [None]:
# Numeric measurement columns whose gaps are filled by the numeric imputer.
num_cols = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Fit the imputer on these columns, then write the filled values back in place.
imputed_values = num_imp.fit_transform(penguins[num_cols])
penguins[num_cols] = imputed_values
penguins

In [None]:
# Fill the missing `sex` values using cat_imp (strategy='most_frequent') and
# write the result back into the frame, then display it.
# The double brackets select a one-column DataFrame rather than a Series, so
# the imputer receives 2-D input.
penguins[['sex']] = cat_imp.fit_transform(penguins[['sex']])
penguins

In [None]:
# Display the frame again to inspect it after the imputation cells above.
penguins

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
# Fit an OrdinalEncoder on the `sex` column and overwrite the column with the
# encoded values (one numeric code per category).
# sex_enc is kept so the fitted encoder stays available after this cell.
sex_enc = OrdinalEncoder()
penguins[['sex']] = sex_enc.fit_transform(penguins[['sex']])

In [None]:
# One-hot encode the remaining categorical columns.
# drop='first' removes the first category of each column so the dummy
# columns are not perfectly collinear.
cat_cols = ['species', 'island']
cat_enc = OneHotEncoder(drop='first')

# fit_transform returns a sparse matrix; densify it before building a frame.
dummy_cols = cat_enc.fit_transform(penguins[cat_cols]).toarray()

# Label each dummy column with the encoder's own feature names instead of
# anonymous integers (0, 1, 2, ...). Mixed int/str column labels are
# unreadable and are rejected by scikit-learn estimators downstream.
# Reusing penguins.index keeps the rows aligned when the two frames are
# concatenated along axis=1.
dummy_df = pd.DataFrame(
    dummy_cols,
    columns=cat_enc.get_feature_names_out(cat_cols),
    index=penguins.index,
)
dummy_df

In [None]:
# Remove the original categorical columns, then append their one-hot
# encoded replacements side by side.
penguins_without_cats = penguins.drop(columns=cat_cols)
clean_df = pd.concat([penguins_without_cats, dummy_df], axis=1)
clean_df

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
import numpy as np

# Display-only settings: render floats with two decimal places in both
# NumPy array output and pandas DataFrame output. The stored values are
# not rounded.
np.set_printoptions(precision=2)
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Standardize the numeric columns and write the scaled values back in place.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(clean_df[num_cols])
clean_df[num_cols] = scaled_values
clean_df

### Pipeline

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [82]:
# Reload the raw data so the pipeline performs every preprocessing step
# itself, rather than working on the frame modified by the cells above.
df = sns.load_dataset('penguins')
num_cols = df.select_dtypes('number').columns
cat_cols = df.select_dtypes(exclude='number').columns

# Numeric branch: impute with SimpleImputer's default strategy, then standardize.
# `steps` is a list of (name, estimator) tuples, as the Pipeline API documents.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ],
)

# Categorical branch: impute with the most frequent value, then one-hot
# encode, dropping the first category of each column.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first')),
    ],
)

# Route each group of columns to its branch; the branch outputs are
# concatenated column-wise in the order listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
)
preprocessor

In [83]:
# Fit every step of the preprocessor on the raw frame and return the
# transformed feature matrix in a single call.
preprocessor.fit_transform(df)

array([[-0.89,  0.79, -1.42, ...,  0.  ,  1.  ,  1.  ],
       [-0.81,  0.13, -1.07, ...,  0.  ,  1.  ,  0.  ],
       [-0.67,  0.43, -0.42, ...,  0.  ,  1.  ,  0.  ],
       ...,
       [ 1.19, -0.74,  1.51, ...,  0.  ,  0.  ,  1.  ],
       [ 0.24, -1.2 ,  0.79, ...,  0.  ,  0.  ,  0.  ],
       [ 1.1 , -0.53,  0.86, ...,  0.  ,  0.  ,  1.  ]])