# **Binary Encoding**

In [1]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
     |████████████████████████████████| 80 kB 3.1 MB/s
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders.binary import BinaryEncoder

  import pandas.util.testing as tm


In [3]:
# Read the data set from credit_approval_uci_2.csv and preview its first five rows
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Hold out 30% of the rows as the test set; the remaining 70% form the train set.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),  # every column except the label
    data["target"],  # the label column
    test_size=0.3,
    random_state=0,
)

# Confirm the size of each partition
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [5]:
# Which distinct labels does A7 take in the train set?
X_train.loc[:, "A7"].unique()

array(['v', 'ff', 'h', 'dd', 'z', 'bb', 'j', 'Missing', 'n', 'o'],
      dtype=object)

In [6]:
# Configure a binary encoder that acts on the A7 column only
# (per the category_encoders docs, drop_invariant=True drops zero-variance columns)
encoder = BinaryEncoder(
    cols=["A7"],
    drop_invariant=True,
)

In [7]:
# Learn the A7 category mapping from the train set only
encoder.fit(X=X_train)

BinaryEncoder(cols=['A7'], drop_invariant=True)

In [8]:
# Inspect what the fitted encoder stores: the underlying BaseNEncoder and its
# mapping for A7. In the output, each integer code 1-10 gets a 4-column binary
# pattern (A7_0..A7_3), while codes -1 and -2 map to all zeros (presumably the
# placeholders for unseen / missing values - confirm in the category_encoders docs).
encoder.base_n_encoder

BaseNEncoder(cols=['A7'], drop_invariant=True,
             mapping=[{'col': 'A7',
                       'mapping':      A7_0  A7_1  A7_2  A7_3
 1      0     0     0     1
 2      0     0     1     0
 3      0     0     1     1
 4      0     1     0     0
 5      0     1     0     1
 6      0     1     1     0
 7      0     1     1     1
 8      1     0     0     0
 9      1     0     0     1
 10     1     0     1     0
-1      0     0     0     0
-2      0     0     0     0}])

In [9]:
# Apply the fitted encoder to both partitions (train first, then test)
X_train_enc, X_test_enc = map(encoder.transform, (X_train, X_test))

In [10]:
# Preview the encoded train set: A7 is replaced by the columns A7_0..A7_3
X_train_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7_0,A7_1,A7_2,A7_3,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,0,0,0,1,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,0,0,0,1,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,0,0,0,1,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,0,0,1,0,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,0,0,0,1,14.0,t,t,6,t,g,360.0,1332


In [11]:
# Preview the encoded test set: A7 is replaced by the columns A7_0..A7_3
X_test_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7_0,A7_1,A7_2,A7_3,A8,A9,A10,A11,A12,A13,A14,A15
14,a,45.83,10.5,u,g,q,0,0,0,1,5.0,t,t,7,t,g,0.0,0
586,b,64.08,20.0,u,g,x,0,0,1,1,17.5,t,t,9,t,g,0.0,1000
140,a,31.25,3.75,u,g,cc,0,0,1,1,0.625,t,t,9,t,g,181.0,0
492,b,39.25,9.5,u,g,m,0,0,0,1,6.5,t,t,14,f,g,240.0,4607
350,a,26.17,2.0,u,g,j,0,1,1,1,0.0,f,f,0,t,g,276.0,1
