### Import the Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression, f_classif

### Load Dataset

In [10]:
# Read the sample dataset, then split it positionally:
# every column except the last is a feature, the last column is the target.
df = pd.read_csv("../../Datasets/sample_dataset.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]

### Transformations

**Numerical Variables:**
* Fill missing values with the median
* Standardization

**Categorical Variables:**
* Fill missing values with the most frequent value
* One-hot encoding

In [11]:
# Numerical branch: fill blanks with the column median, then standardize.
# The step names ('cleaner', 'transformer') are part of the parameter paths
# used with set_params, so they must stay stable.
num_pipe = Pipeline(steps=[
    ('cleaner', SimpleImputer(strategy="median")),
    ('transformer', StandardScaler()),
])

# Categorical branch: fill blanks with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes a category that was not present during fit as an
# all-zero row at transform time; the default ("error") would raise instead, which
# makes the fitted pipeline unusable on new data containing unseen categories.
cat_pipe = Pipeline(steps=[
    ('cleaner', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
])

In [12]:
# Route columns to the matching sub-pipeline by dtype:
#   - every non-object column  -> num_pipe
#   - every object column      -> cat_pipe
# sparse_threshold=0 forces the stacked result to be a dense array. With the
# default (0.3) the one-hot block can make the combined output a sparse matrix,
# and the PCA step that follows in the final pipeline has not always accepted
# sparse input, so whether the notebook runs would depend on the data's density.
transformer = ColumnTransformer(
    [
        ('numerical', num_pipe, make_column_selector(dtype_exclude="object")),
        ('categorical', cat_pipe, make_column_selector(dtype_include="object")),
    ],
    sparse_threshold=0,
)

In [13]:
# Display the (unfitted) ColumnTransformer so its two branches can be inspected.
transformer

### Apply PCA

In [14]:
# Reduce the preprocessed feature matrix to 10 principal components.
# random_state pins the result when the randomized SVD solver is used
# (svd_solver="auto" may select it depending on the data's shape), so a
# Restart & Run All reproduces the same components.
pca = PCA(n_components=10, random_state=42)

### Feature Selection

In [15]:
# Univariate selection: keep the 5 inputs with the highest f_classif score
# against the target.
selector = SelectKBest(score_func=f_classif, k=5)

### Final Pipeline

In [16]:
# Assemble the end-to-end flow: preprocessing -> PCA -> feature selection.
# The step names form the prefixes of nested parameter paths
# (e.g. "transformation__..."), so they are kept exactly as they are.
pipeline = Pipeline(
    steps=[
        ('transformation', transformer),
        ('pca', pca),
        ('feature_selection', selector),
    ]
)

In [17]:
# Fit every step on (X, y) and show the transformed features.
# The returned array is only displayed here, not stored in a variable.
pipeline.fit_transform(X, y)

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]])

In [22]:
# Apply the already-fitted pipeline (median imputation for numerical columns)
# to X and keep the result as X1 for later comparison.
X1 = pipeline.transform(X)
X1

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]])

# Pipeline Flow

In [24]:
# Display the full pipeline to visualise the order of its steps.
pipeline

In [25]:
# Switch the numerical imputer from median to mean. The double-underscore path
# walks the step names: transformation -> numerical -> cleaner -> strategy.
# NOTE: this modifies `pipeline` in place; it must be refit before the new
# strategy affects any transformed output.
pipeline.set_params(transformation__numerical__cleaner__strategy="mean")

In [26]:
# Refit the pipeline with the updated imputation strategy and store the
# transformed features as X2.
X2 = pipeline.fit_transform(X, y)

In [30]:
# Compare the median-imputed output (X1) with the mean-imputed output (X2)
# element by element and show the entries that agree.
# np.isclose compares within a tolerance, so values that differ only by
# floating-point round-off still count as equal; exact `==` on floats would
# miss them. An empty result means no entry of X1 matches X2.
X1[np.isclose(X1, X2)]

array([], dtype=float64)