# Custom Transformer and Transformation Pipeline

Wayne Huynh

# Import Data

In [1]:
import pandas as pd

# Load the dataset from a CSV file (path is relative to the working directory)
fileName = "CustomTransformerData.csv"
custom_transform = pd.read_csv(filepath_or_buffer=fileName)
custom_transform

Unnamed: 0,x1,x2,x3,x4,x5
0,1.5,2.354153,COLD,593,0.75
1,2.5,3.314048,WARM,340,2.083333
2,3.5,4.021604,COLD,551,4.083333
3,4.5,,COLD,2368,6.75
4,5.5,5.847601,WARM,2636,10.083333
5,6.5,7.22991,WARM,2779,14.083333
6,7.5,7.997255,HOT,1057,18.75
7,8.5,9.203947,COLD,819,24.083333
8,9.5,10.335348,WARM,3349,
9,10.5,11.112142,HOT,3235,36.75


# Create Numeric and Categorical DataFrames

In [2]:
# Collect the numeric columns (x1, x2, x4, x5) into their own 'data_num' DataFrame
data_num = custom_transform.loc[:, ['x1', 'x2', 'x4', 'x5']]
data_num

Unnamed: 0,x1,x2,x4,x5
0,1.5,2.354153,593,0.75
1,2.5,3.314048,340,2.083333
2,3.5,4.021604,551,4.083333
3,4.5,,2368,6.75
4,5.5,5.847601,2636,10.083333
5,6.5,7.22991,2779,14.083333
6,7.5,7.997255,1057,18.75
7,8.5,9.203947,819,24.083333
8,9.5,10.335348,3349,
9,10.5,11.112142,3235,36.75


In [3]:
# Collect the single categorical column (x3) into its own 'data_cat' DataFrame
data_cat = custom_transform.loc[:, ['x3']]
data_cat

Unnamed: 0,x3
0,COLD
1,WARM
2,COLD
3,COLD
4,WARM
5,WARM
6,HOT
7,COLD
8,WARM
9,HOT


# Create Custom Transformer

In [4]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Create a custom transformer class
class Assignment4Transformer(BaseEstimator, TransformerMixin):
    """Append the engineered feature ``x1**3 / x5`` and optionally drop ``x4``.

    The transformer is positional: it treats column 0 of ``X`` as x1,
    column 1 as x2, column 2 as x4 and the last column as x5, which matches
    the ``['x1', 'x2', 'x4', 'x5']`` ordering used in this notebook.

    Parameters
    ----------
    drop_x4 : bool, default=True
        If True, the output keeps columns 0, 1 and the last column (x4 is
        left out). If False, columns 0, 1, 2 and the last column are kept.
    """

    def __init__(self, drop_x4=True):
        # Store the constructor argument unchanged under the same name so the
        # get_params / set_params / clone machinery of BaseEstimator can find it.
        self.drop_x4 = drop_x4

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used inside a Pipeline.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` with ``x1**3 / x5`` appended last.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            NumPy array, pandas DataFrame, or anything ``np.asarray`` can
            convert to a 2-D array.

        Returns
        -------
        numpy.ndarray
            The kept columns followed by one extra column holding
            ``x1**3 / x5``.
        """
        # Positional slicing such as X[:, 0] only works on NumPy arrays.
        # Coercing here lets the transformer also accept DataFrames (e.g. when
        # it is used on its own or receives pandas output from a previous
        # step). An ndarray input is passed through without copying.
        X = np.asarray(X)

        x1_cubed_divided_by_x5 = (X[:, 0] ** 3) / X[:, -1]

        # Column 2 is x4; leave it out unless the caller asked to keep it.
        kept_columns = [0, 1, -1] if self.drop_x4 else [0, 1, 2, -1]
        X_transformed = X[:, kept_columns]

        return np.column_stack((X_transformed, x1_cubed_divided_by_x5))

# Create Transformation Pipeline for Numerical Features

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing pipeline: impute -> custom feature engineering -> standardize
num_pipeline = Pipeline(steps=[
    # Fill missing values using SimpleImputer with the 'mean' strategy
    ('imputer', SimpleImputer(strategy='mean')),
    # Run the custom transformer (configured to drop x4)
    ('custom_trans', Assignment4Transformer(drop_x4=True)),
    # Standardize the resulting columns with StandardScaler
    ('std_scaler', StandardScaler()),
])

### Run Pipeline and Create Transformed Numeric Data

In [6]:
# Fit num_pipeline on the data_num DataFrame and keep the transformed result
data_num_trans = num_pipeline.fit_transform(X=data_num)

### One-Hot Encode Categorical Features

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate OneHotEncoder: drop the first category and return a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# name first and fall back to the legacy one on older installations.
try:
    cat_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(drop='first', sparse=False)
# Call fit_transform method and pass the data_cat DataFrame
data_cat_OHE = cat_encoder.fit_transform(data_cat)

# Construct a Column Transformer

In [8]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each branch of the ColumnTransformer
num_features = ['x1', 'x2', 'x4', 'x5']
cat_features = ['x3']

# Send the numeric columns through num_pipeline and the categorical column through cat_encoder
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_encoder, cat_features),
])

# Fit on, and transform, the full custom_transform DataFrame
data_trans = full_pipeline.fit_transform(custom_transform)

# Round Array to 2 Decimal Places

In [9]:
# Round every value to 2 decimal places, then display the array
data_trans = np.round(data_trans, 2)
data_trans

array([[-1.64, -1.73, -1.2 , -1.59,  0.  ,  0.  ],
       [-1.45, -1.53, -1.16, -1.4 ,  0.  ,  1.  ],
       [-1.25, -1.38, -1.1 , -1.21,  0.  ,  0.  ],
       [-1.06,  0.  , -1.03, -1.02,  0.  ,  0.  ],
       [-0.87, -0.99, -0.93, -0.83,  0.  ,  1.  ],
       [-0.67, -0.7 , -0.82, -0.64,  0.  ,  1.  ],
       [-0.48, -0.53, -0.69, -0.45,  1.  ,  0.  ],
       [-0.29, -0.28, -0.54, -0.26,  0.  ,  0.  ],
       [-0.1 , -0.04,  0.  , -0.61,  0.  ,  1.  ],
       [ 0.1 ,  0.13, -0.18,  0.13,  1.  ,  0.  ],
       [ 0.29,  0.27,  0.03,  0.32,  0.  ,  1.  ],
       [ 0.48,  0.45,  0.26,  0.51,  0.  ,  1.  ],
       [ 0.67,  0.76,  0.5 ,  0.7 ,  0.  ,  0.  ],
       [ 0.87,  0.88,  0.76,  0.89,  1.  ,  0.  ],
       [ 1.06,  0.  ,  1.05,  1.08,  1.  ,  0.  ],
       [ 1.25,  1.42,  1.35,  1.27,  0.  ,  1.  ],
       [ 1.45,  1.55,  1.67,  1.46,  1.  ,  0.  ],
       [ 1.64,  1.71,  2.01,  1.65,  1.  ,  0.  ]])