# **Encoding with Decision Trees**

In [1]:
pip install feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 5.1 MB/s 
Collecting statsmodels>=0.11.1
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 60.0 MB/s 
Installing collected packages: statsmodels, feature-engine
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed feature-engine-1.3.0 statsmodels-0.13.2


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.encoding import DecisionTreeEncoder

In [3]:
# Read credit_approval_uci_2.csv into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Hold out 30% of the observations for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),  # every column except the target
    data["target"],  # the target column
    random_state=0,
    test_size=0.3,
)

# (rows, columns) of the train and test predictors
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [5]:
# Configure the decision tree encoder (nothing is learned until fit is called)
tree_encoder = DecisionTreeEncoder(
    variables=None,
    encoding_method="arbitrary",  # how the strings are converted to numbers
    regression=False,
    param_grid=None,  # defaults to optimizing the tree depth
    scoring="roc_auc",  # metric optimised during the grid search
    cv=3,  # cross-validation used by the grid search
    random_state=10,
)

In [6]:
# Learn the encoding from the training partition only
tree_encoder.fit(X=X_train, y=y_train)

DecisionTreeEncoder(random_state=10, regression=False, scoring='roc_auc')

In [7]:
# The encoder was created with variables=None, so show which columns
# the fitted encoder will encode
tree_encoder.variables_

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [8]:
# Show the fitted pipeline that the encoder uses internally to encode
# the categorical features
tree_encoder.encoder_

Pipeline(steps=[('categorical_encoder',
                 OrdinalEncoder(encoding_method='arbitrary', errors='raise',
                                variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9',
                                           'A10', 'A12', 'A13'])),
                ('tree_discretiser',
                 DecisionTreeDiscretiser(param_grid={'max_depth': [1, 2, 3, 4]},
                                         random_state=10, regression=False,
                                         scoring='roc_auc',
                                         variables=['A1', 'A4', 'A5', 'A6',
                                                    'A7', 'A9', 'A10', 'A12',
                                                    'A13']))])

In [9]:
# Apply the fitted encoder to both partitions (train first, then test)
X_train_enc, X_test_enc = (
    tree_encoder.transform(subset) for subset in (X_train, X_test)
)

In [10]:
# Preview the encoded variables in the training set
X_train_enc.loc[:, tree_encoder.variables_].head(n=5)

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
596,0.472222,0.512397,0.512397,0.451613,0.418773,0.785156,0.70283,0.445455,0.464853
303,0.472222,0.512397,0.512397,0.567308,0.418773,0.070485,0.250923,0.452471,0.464853
204,0.438806,0.226087,0.226087,0.567308,0.418773,0.785156,0.70283,0.452471,0.464853
351,0.438806,0.226087,0.226087,0.105263,0.146341,0.070485,0.250923,0.452471,0.464853
118,0.438806,0.512397,0.512397,0.328358,0.418773,0.785156,0.70283,0.445455,0.464853


In [11]:
# Preview the encoded variables in the test set
X_test_enc.loc[:, tree_encoder.variables_].head(n=5)

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
14,0.472222,0.512397,0.512397,0.567308,0.418773,0.785156,0.70283,0.445455,0.464853
586,0.438806,0.512397,0.512397,0.791667,0.578947,0.785156,0.70283,0.445455,0.464853
140,0.472222,0.512397,0.512397,0.7,0.578947,0.785156,0.70283,0.445455,0.464853
492,0.438806,0.512397,0.512397,0.328358,0.418773,0.785156,0.70283,0.452471,0.464853
350,0.472222,0.512397,0.512397,0.25,0.2,0.070485,0.250923,0.445455,0.464853
