In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer

# Pipeline

#### Building a Pipeline of Transformers with `make_pipeline`

In [4]:
from sklearn.pipeline import make_pipeline

In [5]:
# Chain an imputer and a power transformer, both left at their default settings.
pipeline = make_pipeline(
    SimpleImputer(),
    PowerTransformer(),
)

In [6]:
# Single-feature toy input: four one-element rows, the third holding a missing value.
X = [[value] for value in (1, 2, np.nan, 3)]

In [7]:
# Fit every step of the pipeline on X and display the transformed array.
pipeline.fit_transform(X)

array([[-1.43683574],
       [ 0.02299616],
       [ 0.02299616],
       [ 1.39084342]])

#### Using the `Pipeline` Object

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
# Same two transformers as before, but with explicit step names so that their
# parameters can be addressed later as <step name>__<parameter>.
pipeline = Pipeline(
    steps=[
        ("cleaning", SimpleImputer()),
        ("power_transform", PowerTransformer()),
    ]
)

In [19]:
# Reconfigure the named steps through the <step name>__<parameter> convention:
# median imputation, and no standardization in the power transform.
pipeline.set_params(
    cleaning__strategy="median",
    power_transform__standardize=False,
)

In [20]:
# Refit with the updated step parameters and display the transformed array.
pipeline.fit_transform(X)

array([[0.92947764],
       [1.77008448],
       [1.77008448],
       [2.55772433]])

# Using Pipeline and ColumnTransformer Together

In [56]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
# Location of the datasets folder, two levels above the notebook's working directory.
data_directory = os.path.join(
    "../../",
    "Datasets",
)

In [44]:
# Load the sample dataset from the data directory. The file path is assembled
# with os.path.join, the same way data_directory itself is built, instead of
# concatenating a hard-coded "/" separator by hand.
df = pd.read_csv(os.path.join(data_directory, "sample_dataset.csv"))

In [42]:
# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
cat_pipe = Pipeline(
    steps=[
        ("cleaner", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder()),
    ]
)

# Route each column to a branch by dtype: non-object columns get median
# imputation, object columns go through cat_pipe.
transformer = ColumnTransformer(
    transformers=[
        (
            "numerical",
            SimpleImputer(strategy="median"),
            make_column_selector(dtype_exclude="object"),
        ),
        (
            "categorical",
            cat_pipe,
            make_column_selector(dtype_include="object"),
        ),
    ]
)

In [49]:
# Label the transformed matrix with the original column names plus two extra
# names for the additional one-hot columns.
# NOTE(review): ColumnTransformer emits columns in transformer order (numerical
# branch first, then the encoded categorical branch), so these labels only line
# up if the object column(s) sit last in df and the one-hot output is exactly
# two columns wider than the object columns it replaces -- confirm, or derive
# the names from transformer.get_feature_names_out().
column_list = [*df.columns, "Encoded1", "Encoded2"]
tdf = pd.DataFrame(data=transformer.fit_transform(df), columns=column_list)

In [50]:
# Inspect the "area error" column of the transformed frame.
tdf[["area error"]]

0      0.006399
1      0.005225
2      0.006150
3      0.009110
4      0.006382
         ...   
564    0.010300
565    0.006382
566    0.005903
567    0.006522
568    0.007189
Name: area error, Length: 569, dtype: float64

In [51]:
# Reconfigure nested steps by path: <transformer name>__<step name>__<parameter>
# for the categorical pipeline, <transformer name>__<parameter> for the
# numerical imputer.
transformer.set_params(
    categorical__cleaner__strategy="constant",
    categorical__cleaner__fill_value="A",
    numerical__strategy="mean",
)

In [52]:
# Refit the transformer and label the result with the original column names
# plus two extra names for the additional one-hot columns.
# NOTE(review): the labels assume the encoded categorical columns come last and
# are exactly two columns wider than the object columns they replace; a
# fill value that introduces a new category would change that width -- confirm,
# or derive the names from transformer.get_feature_names_out().
column_list = [*df.columns, "Encoded1", "Encoded2"]
ndf = pd.DataFrame(data=transformer.fit_transform(df), columns=column_list)

In [55]:
# Show the imputed numeric column next to the two extra encoded columns.
# NOTE(review): this displays tdf, the frame produced before the set_params
# call, while the freshly refit ndf is never inspected -- presumably ndf was
# intended here; confirm.
tdf[["area error","Encoded1", "Encoded2"]]

Unnamed: 0,area error,Encoded1,Encoded2
0,0.006399,0.0,0.0
1,0.005225,0.0,0.0
2,0.006150,0.0,0.0
3,0.009110,0.0,0.0
4,0.006382,0.0,0.0
...,...,...,...
564,0.010300,0.0,0.0
565,0.006382,0.0,0.0
566,0.005903,0.0,0.0
567,0.006522,0.0,0.0
