In [None]:
"""

Pipeline

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html

apply several preprocessors sequentially

1. Execute the code 
   (in Jupyter, split it into multiple cells)

2. Understand what is happening

3. Explain to the rest of the group what you did
"""

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline

df = pd.read_csv('penguins_simple.csv', sep=';')

In [7]:
df

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...
328,Gentoo,47.2,13.7,214.0,4925.0,FEMALE
329,Gentoo,46.8,14.3,215.0,4850.0,FEMALE
330,Gentoo,50.4,15.7,222.0,5750.0,MALE
331,Gentoo,45.2,14.8,212.0,5200.0,FEMALE


In [2]:
# define a pipeline
impute_and_encode = make_pipeline(
    SimpleImputer(strategy='most_frequent'), 
    OneHotEncoder(sparse=False)
)

In [3]:
cols = df[['Sex']]

In [8]:
impute_and_encode.fit(cols)     

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder(sparse=False))])

In [9]:
        # apply .fit() of each preprocessor sequentially
t = impute_and_encode.transform(cols)   # apply . transform() sequentially

In [5]:
print(t.shape)
print()

(333, 2)



In [6]:
# create a DataFrame
cols_transformed = pd.DataFrame(t, columns = impute_and_encode[1].get_feature_names())
print(cols_transformed.head())

   x0_FEMALE  x0_MALE
0        0.0      1.0
1        1.0      0.0
2        1.0      0.0
3        1.0      0.0
4        0.0      1.0
