In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why a cell with two joblib.dump calls shows two outputs).
InteractiveShell.ast_node_interactivity = 'all'

import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.selection import DropFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

import transformers as tfr

## Data

In [19]:
# Directory holding the train/test parquet splits.
# The original notebook repeated a machine-specific absolute path four times;
# it is now defined once and can be overridden on another machine by setting
# the BANK_MARKETING_DATA_DIR environment variable. The fallback is the
# original location, so behaviour is unchanged where that path exists.
DATA_DIR = Path(os.environ.get(
    'BANK_MARKETING_DATA_DIR',
    '/Users/kenwu/Desktop/Python/Python for Machine Learning/tree_based_models/projects/bank_marketing_project/data/train_test',
))

# Features are kept as DataFrames; each target file is read as a DataFrame and
# flattened to a 1-D NumPy array via .to_numpy().ravel().
train_X = pd.read_parquet(DATA_DIR / 'train_X.parquet')
train_y = pd.read_parquet(DATA_DIR / 'train_y.parquet').to_numpy().ravel()
test_X = pd.read_parquet(DATA_DIR / 'test_X.parquet')
test_y = pd.read_parquet(DATA_DIR / 'test_y.parquet').to_numpy().ravel()

## Pipeline

In [4]:
# Columns handed to the count encoder (the `variables` argument of the final
# pipeline step).
cat_col = [
    'age',
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

# Columns handed to the StandardScaler inside the ColumnTransformer step.
num_col = [
    'campaign',
    'pdays',
    'previous',
    'emp_var_rate',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'nr_employed',
]

In [26]:
# Feature-preparation pipeline; the steps run in the order listed.
preprocessor = Pipeline(steps=[
    # Remove the `duration` column before any other step sees the frame.
    ('drop_duration', DropFeatures(features_to_drop='duration')),
    # Convert columns to categorical dtype; some of them serve as group-by keys
    # in the next step.
    ('convert_cat_dtype', FunctionTransformer(tfr.CatTransformer)),
    ('num_engineer', FunctionTransformer(tfr.NumTransformer)),
    # Standardise only the columns in `num_col`; all other columns pass through
    # untouched. Output is kept as a pandas DataFrame.
    (
        'scaler_columns',
        ColumnTransformer(
            transformers=[
                ('scaler', StandardScaler().set_output(transform='pandas'), num_col),
            ],
            remainder='passthrough',
        ).set_output(transform='pandas'),
    ),
    # Presumably strips the prefixes ColumnTransformer adds to column names;
    # verify against tfr.RestoreColNames.
    # NOTE(review): the step name is spelled 'retore_col_names'; it is left
    # as-is because the name is a runtime key (named_steps / set_params).
    ('retore_col_names', FunctionTransformer(tfr.RestoreColNames)),
    # Replace each category in `cat_col` by its count; unseen categories are
    # encoded as 0.
    (
        'count_encoder',
        CountFrequencyEncoder(
            variables=cat_col,
            encoding_method='count',
            unseen='encode',
        ),
    ),
])

# Encoder for the target labels; created unfitted here.
label_encoder = LabelEncoder()

Write to disk:

In [27]:
# Target directory for the serialised pipeline objects. It is created if
# missing, so the cell also succeeds on a fresh checkout where
# ../outputs/pipelines does not exist yet (joblib.dump does not create
# parent directories).
PIPELINE_DIR = '../outputs/pipelines'
os.makedirs(PIPELINE_DIR, exist_ok=True)

# NOTE(review): in notebook order both objects are written before any fit call,
# so the saved artefacts are unfitted definitions — confirm that whatever loads
# them fits them first.
# Each call is a top-level expression, so its return value (a list containing
# the file name) is echoed as cell output.
joblib.dump(preprocessor, f'{PIPELINE_DIR}/preprocessor.joblib')
joblib.dump(label_encoder, f'{PIPELINE_DIR}/label_encoder.joblib')

['../outputs/pipelines/preprocessor.joblib']

['../outputs/pipelines/label_encoder.joblib']

Testing on train and test data

In [14]:
# Fit every pipeline step on the training features and display the transformed
# frame as the cell output.
preprocessor.fit_transform(train_X)

Unnamed: 0,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,age,job,...,euribor3m_max_by_poutcome,nr_employed_max_by_poutcome,campaign_last_by_poutcome,pdays_last_by_poutcome,previous_last_by_poutcome,emp_var_rate_last_by_poutcome,cons_price_idx_last_by_poutcome,cons_conf_idx_last_by_poutcome,euribor3m_last_by_poutcome,nr_employed_last_by_poutcome
0,-0.204920,0.199069,1.661423,-1.198013,-0.867549,-1.427606,-1.277386,-0.937277,846,7391,...,4.968,5195.799805,1,999,1,-1.8,92.892998,-46.200001,1.334,5099.100098
1,0.157065,0.199069,-0.351442,0.838199,-0.229622,0.949995,0.773188,0.844693,1454,8300,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098
2,-0.566904,0.199069,-0.351442,-1.134381,1.103263,0.042184,-1.605893,-2.422252,48,1363,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098
3,-0.566904,0.199069,1.661423,-1.198013,-1.182186,-1.233076,-1.348850,-0.937277,1132,5388,...,4.968,5195.799805,1,999,1,-1.8,92.892998,-46.200001,1.334,5099.100098
4,-0.566904,0.199069,-0.351442,-1.198013,-0.867549,-1.427606,-1.277386,-0.937277,531,2372,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,-0.566904,0.199069,1.661423,-1.198013,-1.182186,-1.233076,-1.318305,-0.937277,1390,7391,...,4.968,5195.799805,1,999,1,-1.8,92.892998,-46.200001,1.334,5099.100098
32946,-0.566904,0.199069,-0.351442,0.647305,0.721214,0.885151,0.712098,0.332202,928,5388,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098
32947,8.844697,0.199069,-0.351442,0.838199,0.589819,-0.476566,0.776070,0.844693,542,8300,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098
32948,-0.566904,0.199069,-0.351442,-0.116275,-0.651451,-0.325264,0.261986,0.398506,1487,3203,...,5.045,5228.100098,1,999,0,1.4,93.444000,-36.099998,4.965,5228.100098


In [15]:
# Run the test features through the pipeline without re-fitting it.
# NOTE(review): the *_by_poutcome columns differ between the train and test
# outputs for what looks like the same group (e.g. euribor3m_last_by_poutcome
# is 4.965 in the train output and 4.857 in the test output). The group
# statistics appear to be recomputed on whatever frame is passed in rather than
# learned during fit — confirm this is intended.
preprocessor.transform(test_X)

Unnamed: 0,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,age,job,...,euribor3m_max_by_poutcome,nr_employed_max_by_poutcome,campaign_last_by_poutcome,pdays_last_by_poutcome,previous_last_by_poutcome,emp_var_rate_last_by_poutcome,cons_price_idx_last_by_poutcome,cons_conf_idx_last_by_poutcome,euribor3m_last_by_poutcome,nr_employed_last_by_poutcome
0,-0.566904,0.199069,-0.351442,-0.116275,-0.651451,-0.325264,0.306363,0.398506,1390,7391,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
1,-0.566904,0.199069,-0.351442,-0.116275,-0.651451,-0.325264,0.261986,0.398506,1130,2372,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
2,-0.566904,0.199069,-0.351442,0.647305,0.721214,0.885151,0.711521,0.332202,1380,8300,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
3,-0.204920,0.199069,-0.351442,0.838199,1.535459,-0.282035,0.770883,0.844693,928,5388,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
4,-0.204920,0.199069,-0.351442,0.838199,0.589819,-0.476566,0.773188,0.844693,1132,3203,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8233,0.881034,0.199069,-0.351442,0.647305,0.721214,0.885151,0.716132,0.332202,846,8300,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
8234,-0.566904,-5.038845,3.674288,-1.134381,0.826665,0.150256,-1.662373,-2.422252,1390,2372,...,4.191,5195.799805,1,3,2,-1.7,94.055000,-39.799999,0.737,4991.600098
8235,-0.566904,0.199069,-0.351442,0.838199,1.535459,-0.282035,0.717285,0.844693,693,828,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000
8236,-0.566904,0.199069,-0.351442,0.838199,0.589819,-0.476566,0.771459,0.844693,542,1363,...,5.045,5228.100098,4,999,0,1.1,93.994003,-36.400002,4.857,5191.000000


In [21]:
# Learn the label-to-integer mapping from the training target and display the
# encoded training labels.
label_encoder.fit_transform(train_y)

array([0, 0, 1, ..., 0, 0, 0])

In [22]:
# Encode the test target with the already-fitted encoder (no re-fitting) and
# display the result.
label_encoder.transform(test_y)

array([0, 0, 0, ..., 0, 0, 0])