# Preprocessing and Pipelines

## Penguins

In [1]:
# imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# load the Palmer Penguins dataset through seaborn's dataset loader;
# `penguins` holds the full table, including the `species` column used as the target below
penguins = sns.load_dataset("penguins")

In [3]:
# separate the target (species) from the predictors (every remaining column)
y = penguins["species"]
X = penguins.drop(columns="species")

In [4]:
# hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# check train data: rows keep their original index labels from the full table,
# and pandas elides the middle rows of the rendered frame
X_train

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
66,Biscoe,35.5,16.2,195.0,3350.0,Female
229,Biscoe,46.8,15.4,215.0,5150.0,Male
7,Torgersen,39.2,19.6,195.0,4675.0,Male
140,Dream,40.2,17.1,193.0,3400.0,Female
323,Biscoe,49.1,15.0,228.0,5500.0,Male
...,...,...,...,...,...,...
188,Dream,47.6,18.3,195.0,3850.0,Female
71,Torgersen,39.7,18.4,190.0,3900.0,Male
106,Biscoe,38.6,17.2,199.0,3750.0,Female
270,Biscoe,46.6,14.2,210.0,4850.0,Female


In [6]:
# check column types: the dtype of each column determines whether it is treated
# as a numeric or a categorical feature in the next cell
X_train.dtypes

island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [7]:
# find numeric and categorical features by dtype
# "number" matches every numeric dtype (any int/float width), not just int64/float64,
# so a numeric column stored with another width is not silently left out
numeric_features = X_train.select_dtypes(include="number").columns
# text columns are stored as object here; "category" is included as well so a
# column converted to pandas categorical is still picked up instead of being dropped
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns

In [8]:
# numeric branch: fill missing values with the column median, then standardize
numeric_transformer = Pipeline(
    [
        ("Median Imputer", SimpleImputer(strategy="median")),
        ("Standardization", StandardScaler()),
    ]
)

In [9]:
# categorical branch: fill missing values with the most frequent level, then one-hot encode;
# handle_unknown="ignore" keeps levels unseen during fitting from raising at transform time
categorical_transformer = Pipeline(
    [
        ("Modal Imputer", SimpleImputer(strategy="most_frequent")),
        ("One-Hot Encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [10]:
# combine both branches: each transformer is applied only to its own column list,
# and any column in neither list is dropped
preprocessor = ColumnTransformer(
    [
        ("Numeric Transformer", numeric_transformer, numeric_features),
        ("Categorical Transformer", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [11]:
# chain preprocessing and the k-nearest-neighbors classifier into a single estimator;
# the step names are the prefixes used when tuning parameters below
pipeline = Pipeline(
    steps=[
        ("Preprocessor", preprocessor),
        ("Classifier", KNeighborsClassifier()),
    ]
)

In [12]:
# define the parameter grid for grid search
# keys are "<pipeline step name>__<parameter>", so both entries tune the "Classifier" step;
# 10 neighbor counts x 2 distance metrics = 20 candidate settings
param_grid = {
    "Classifier__n_neighbors": range(1, 11),
    "Classifier__metric": ["euclidean", "manhattan"],
}

In [13]:
# configure the search: every candidate in param_grid is scored by
# 5-fold cross-validated accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

In [14]:
# perform grid search on the training data only; the test split is not touched here
grid_search.fit(X_train, y_train)

In [15]:
# view full results: a dict of arrays (fit/score timings, parameters, test scores)
# with one entry per candidate parameter setting
grid_search.cv_results_

{'mean_fit_time': array([0.00643125, 0.00397925, 0.00366354, 0.00373359, 0.00315251,
        0.00375562, 0.00552173, 0.00359554, 0.00384922, 0.0034997 ,
        0.00411458, 0.00441518, 0.0034461 , 0.00356698, 0.00515375,
        0.00802546, 0.00381446, 0.00345292, 0.00489764, 0.00416098]),
 'std_fit_time': array([1.69049527e-03, 4.46982060e-04, 1.40579245e-04, 4.94853447e-04,
        2.60011157e-04, 4.66949550e-04, 3.65931510e-03, 1.82068916e-04,
        4.80522160e-04, 1.55564014e-04, 8.09756565e-04, 1.71250875e-03,
        4.15380044e-04, 4.20961015e-04, 1.87455755e-03, 4.17949744e-03,
        3.78272366e-04, 3.99456366e-05, 2.16990240e-03, 6.96342832e-04]),
 'mean_score_time': array([0.00675173, 0.003336  , 0.00333886, 0.0027328 , 0.0024868 ,
        0.00284486, 0.00276699, 0.00272207, 0.00291538, 0.00267825,
        0.00300837, 0.00433283, 0.00263305, 0.00267334, 0.00405083,
        0.00294447, 0.00293798, 0.00268226, 0.0028688 , 0.00289617]),
 'std_score_time': array([3.15806344e-

In [16]:
# print the best parameters found, i.e. the candidate with the highest
# cross-validated accuracy
print(grid_search.best_params_)

{'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 5}


In [17]:
# evaluate the model on the held-out test split, which was not used during the search
print(f"Test Accuracy: {grid_search.score(X_test, y_test)}")

Test Accuracy: 1.0


In [18]:
# make predictions on the raw test data: no manual imputing, scaling or encoding is needed
# because the preprocessing steps are part of the pipeline that was searched over
grid_search.predict(X_test)

array(['Chinstrap', 'Chinstrap', 'Gentoo', 'Chinstrap', 'Gentoo',
       'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie',
       'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Adelie', 'Gentoo',
       'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie',
       'Adelie', 'Adelie', 'Gentoo', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Gentoo', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Gentoo',
       'Gentoo', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Chinstrap', 'Adelie', 'Gentoo', 'Adelie',
       'Gentoo', 'Chinstrap', 'Adelie', 'Adelie', 'Adelie'], dtype=object)

In [19]:
# check the parameter grid: one dict per candidate, in the same order as the scores below
grid_search.cv_results_["params"]

[{'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 1},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 2},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 3},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 4},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 5},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 6},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 7},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 8},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 9},
 {'Classifier__metric': 'euclidean', 'Classifier__n_neighbors': 10},
 {'Classifier__metric': 'manhattan', 'Classifier__n_neighbors': 1},
 {'Classifier__metric': 'manhattan', 'Classifier__n_neighbors': 2},
 {'Classifier__metric': 'manhattan', 'Classifier__n_neighbors': 3},
 {'Classifier__metric': 'manhattan', 'Classifier__n_neighbors': 4},
 {'Classifier__metric': 'manhattan', 'Classifie

In [20]:
# check the results for each set of parameters: mean accuracy across the 5 folds,
# aligned position-by-position with cv_results_["params"]
grid_search.cv_results_["mean_test_score"]

array([0.98909091, 0.98909091, 0.98909091, 0.99272727, 0.99636364,
       0.98181818, 0.98545455, 0.98181818, 0.98909091, 0.98181818,
       0.98909091, 0.98909091, 0.98545455, 0.98909091, 0.98545455,
       0.98181818, 0.98181818, 0.98181818, 0.98181818, 0.98181818])

In [21]:
# view a subset of the results as a data frame for ease of reading:
# one row per candidate, with readable column names and its mean CV accuracy
results_df = (
    pd.DataFrame(grid_search.cv_results_["params"])
    .rename(
        columns={
            "Classifier__metric": "Distance",
            "Classifier__n_neighbors": "Neighbors",
        }
    )
    .assign(**{"5-Fold CV Accuracy": grid_search.cv_results_["mean_test_score"]})
)
results_df

Unnamed: 0,Distance,Neighbors,5-Fold CV Accuracy
0,euclidean,1,0.989091
1,euclidean,2,0.989091
2,euclidean,3,0.989091
3,euclidean,4,0.992727
4,euclidean,5,0.996364
5,euclidean,6,0.981818
6,euclidean,7,0.985455
7,euclidean,8,0.981818
8,euclidean,9,0.989091
9,euclidean,10,0.981818


## Imputation

In [22]:
# toy single-column feature array with one missing value (np.nan) to demonstrate imputation
# NOTE: this rebinds `X`, which earlier held the penguin features; the penguin cells
# further down use X_train, which is unaffected
X = np.array([[5.3], [4.2], [1.1], [np.nan], [6.3]])
X

array([[5.3],
       [4.2],
       [1.1],
       [nan],
       [6.3]])

In [23]:
# learn the median of the observed values, then use it to fill the missing entry
num_imp = SimpleImputer(strategy="median").fit(X)
num_imp.transform(X)

array([[5.3 ],
       [4.2 ],
       [1.1 ],
       [4.75],
       [6.3 ]])

In [24]:
# transform new data with the already-fitted imputer: the missing value is filled with
# the median learned from the earlier array (4.75), not one recomputed from this input
num_imp.transform(np.array([[np.nan], [1.1]]))

array([[4.75],
       [1.1 ]])

## Scaling

In [25]:
# toy array with two features on very different scales (units vs. thousands)
# to demonstrate standardization
X = np.array([[5.3, 1000], [4.2, 2000], [1.1, 3000], [4.6, 5000], [6.3, 1500]])
X

array([[5.3e+00, 1.0e+03],
       [4.2e+00, 2.0e+03],
       [1.1e+00, 3.0e+03],
       [4.6e+00, 5.0e+03],
       [6.3e+00, 1.5e+03]])

In [26]:
# learn each column's mean and standard deviation, then rescale both columns
# so they end up on a comparable scale
scaler = StandardScaler().fit(X)
scaler.transform(X)

array([[ 0.57091614, -1.06066017],
       [-0.05709161, -0.35355339],
       [-1.82693165,  0.35355339],
       [ 0.17127484,  1.76776695],
       [ 1.14183228, -0.70710678]])

## Categorical Encoding

In [27]:
# toy array of two string-valued categorical features (animal, color)
# to demonstrate categorical encoding
X = np.array([["dog", "brown"], ["cat", "black"], ["dog", "tan"], ["dog", "brown"], ["cat", "tan"]])
X

array([['dog', 'brown'],
       ['cat', 'black'],
       ['dog', 'tan'],
       ['dog', 'brown'],
       ['cat', 'tan']], dtype='<U5')

In [28]:
# one-hot encoding: one indicator column per level of each feature
# (here 2 animal levels + 3 color levels = 5 columns)
one_hot = OneHotEncoder()
one_hot.fit(X)
# the encoder is already fitted above, so only transform here instead of fitting a second time;
# the result is sparse, so convert it to a dense array for printing
print(one_hot.transform(X).toarray())

[[0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 0.]
 [0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]]


In [29]:
# dummy encoding: drop="first" removes the first level of each feature,
# leaving 1 animal column + 2 color columns
dummy = OneHotEncoder(drop="first")
dummy.fit(X)
# the encoder is already fitted above, so only transform here instead of fitting a second time;
# the result is sparse, so convert it to a dense array for printing
print(dummy.transform(X).toarray())

[[1. 1. 0.]
 [0. 0. 0.]
 [1. 0. 1.]
 [1. 1. 0.]
 [0. 0. 1.]]


## Applying Preprocessor to Penguins

In [30]:
# the raw penguin training features, shown again before the preprocessor is applied to them
X_train

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
66,Biscoe,35.5,16.2,195.0,3350.0,Female
229,Biscoe,46.8,15.4,215.0,5150.0,Male
7,Torgersen,39.2,19.6,195.0,4675.0,Male
140,Dream,40.2,17.1,193.0,3400.0,Female
323,Biscoe,49.1,15.0,228.0,5500.0,Male
...,...,...,...,...,...,...
188,Dream,47.6,18.3,195.0,3850.0,Female
71,Torgersen,39.7,18.4,190.0,3900.0,Male
106,Biscoe,38.6,17.2,199.0,3750.0,Female
270,Biscoe,46.6,14.2,210.0,4850.0,Female


In [31]:
# fit the standalone preprocessor (imputers, scaler, encoder) on the training features only
preprocessor.fit(X_train)

In [32]:
# apply the fitted preprocessor; the result is a plain numeric array in which the leading
# columns hold standardized values and the trailing columns hold 0/1 indicators
X_train_processed = preprocessor.transform(X_train)
X_train_processed

array([[-1.51801278, -0.43762006, -0.43245586, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.5390362 , -0.84265336,  0.97162162, ...,  0.        ,
         0.        ,  1.        ],
       [-0.84446577,  1.28377144, -0.43245586, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.95368961,  0.06867155, -0.15164037, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.50262825, -1.4502033 ,  0.62060225, ...,  0.        ,
         1.        ,  0.        ],
       [-1.11752537, -0.53887839, -1.27490235, ...,  0.        ,
         1.        ,  0.        ]])