# **One-Hot Encoding**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Let's load the data set
data = pd.read_csv("credit_approval_uci_2.csv")
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Let's separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=["target"], axis=1),  # predictors
    data["target"],  # target
    test_size=0.3,  # percentage of observations in test set
    random_state=0,  # seed to ensure reproducibility
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

## **One-hot encoding with pandas**

In [4]:
# Let's inspect the unique values of A4
X_train["A4"].unique()

array(['u', 'y', 'Missing', 'l'], dtype=object)

In [5]:
# let's one-hot encode A4
dummies = pd.get_dummies(X_train["A4"], drop_first=False)
dummies.head()

Unnamed: 0,Missing,l,u,y
596,0,0,1,0
303,0,0,1,0
204,0,0,0,1
351,0,0,0,1
118,0,0,1,0


In [6]:
# let's one hot encode A4 into k-1 variables
dummies = pd.get_dummies(X_train["A4"], drop_first=True)
dummies.head()

Unnamed: 0,l,u,y
596,0,1,0
303,0,1,0
204,0,0,1
351,0,0,1
118,0,1,0


In [7]:
# Now let's encode all cateogrical variables simultaneously
# into k-1: train set
X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
596,46.08,3.0,2.375,8,396.0,4159,1,0,0,1,...,0,0,0,1,0,1,1,1,0,0
303,15.92,2.875,0.085,0,120.0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
204,36.33,2.125,0.085,1,50.0,1187,0,1,0,0,...,0,0,0,1,0,1,1,0,0,0
351,22.17,0.585,0.0,0,100.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
118,57.83,7.04,14.0,6,360.0,1332,0,1,0,1,...,0,0,0,1,0,1,1,1,0,0


In [8]:
# and now in the test set.
X_test_enc = pd.get_dummies(X_test, drop_first=True)
X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,45.83,10.5,5.0,7,0.0,0,1,0,0,1,...,0,0,0,1,0,1,1,1,0,0
586,64.08,20.0,17.5,9,0.0,1000,0,1,0,1,...,0,0,0,0,0,1,1,1,0,0
140,31.25,3.75,0.625,9,181.0,0,1,0,0,1,...,0,0,0,0,0,1,1,1,0,0
492,39.25,9.5,6.5,14,240.0,4607,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0
350,26.17,2.0,0.0,0,276.0,1,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [9]:
# Add one-hot encoded variables to the original dataset.
X_test_enc = pd.concat([X_test, X_test_enc], axis=1)

# Drop the categorical variables
X_test_enc.drop(
    labels=X_test_enc.select_dtypes(include="O").columns,
    axis=1,
    inplace=True,
)

# Show data
X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A2.1,A3.1,A8.1,A11.1,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,45.83,10.5,5.0,7,0.0,0,45.83,10.5,5.0,7,...,0,0,0,1,0,1,1,1,0,0
586,64.08,20.0,17.5,9,0.0,1000,64.08,20.0,17.5,9,...,0,0,0,0,0,1,1,1,0,0
140,31.25,3.75,0.625,9,181.0,0,31.25,3.75,0.625,9,...,0,0,0,0,0,1,1,1,0,0
492,39.25,9.5,6.5,14,240.0,4607,39.25,9.5,6.5,14,...,0,0,0,1,0,1,1,0,0,0
350,26.17,2.0,0.0,0,276.0,1,26.17,2.0,0.0,0,...,1,0,0,0,0,0,0,1,0,0


## **One-hot encoding with Scikit-learn**

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# we create and train the encoder
encoder = OneHotEncoder(
    drop="first",  # to return k-1
    sparse=False,
)

In [12]:
# Make a list with the categorical variables
vars_categorical = X_train.select_dtypes(include="O").columns.to_list()
vars_categorical

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [13]:
# fit the encoder to the train set:
# it will learn the categories to encode.
encoder.fit(X_train[vars_categorical])

OneHotEncoder(drop='first', sparse=False)

In [14]:
encoder.categories_

[array(['Missing', 'a', 'b'], dtype=object),
 array(['Missing', 'l', 'u', 'y'], dtype=object),
 array(['Missing', 'g', 'gg', 'p'], dtype=object),
 array(['Missing', 'aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm',
        'q', 'r', 'w', 'x'], dtype=object),
 array(['Missing', 'bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'],
       dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [15]:
# Encode variables in the train and test sets
X_train_enc = encoder.transform(X_train[vars_categorical])
X_test_enc = encoder.transform(X_test[vars_categorical])

In [16]:
# Scikit-learn returns a Numpy array
X_test_enc

array([[1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [17]:
# Obtain the binary variable names
encoder.get_feature_names_out()

array(['A1_a', 'A1_b', 'A4_l', 'A4_u', 'A4_y', 'A5_g', 'A5_gg', 'A5_p',
       'A6_aa', 'A6_c', 'A6_cc', 'A6_d', 'A6_e', 'A6_ff', 'A6_i', 'A6_j',
       'A6_k', 'A6_m', 'A6_q', 'A6_r', 'A6_w', 'A6_x', 'A7_bb', 'A7_dd',
       'A7_ff', 'A7_h', 'A7_j', 'A7_n', 'A7_o', 'A7_v', 'A7_z', 'A9_t',
       'A10_t', 'A12_t', 'A13_p', 'A13_s'], dtype=object)

In [18]:
# Transform the array to a pandas dataframe
X_test_enc = pd.DataFrame(X_test_enc)

# Add the variable names
X_test_enc.columns = encoder.get_feature_names_out()

# Show dataset
X_test_enc.head()

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
# Replace index in transformed dataset by
# the index in the original dataset.
X_test_enc.index = X_test.index
X_test_enc.head()

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
586,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
140,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
492,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
350,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
# Add the one-hot encoded variables to the
# original dataset.
X_test_enc = pd.concat([X_test, X_test_enc], axis=1)

# Drop the categorical variables
X_test_enc.drop(labels=vars_categorical, axis=1, inplace=True)

# Show data
X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,45.83,10.5,5.0,7,0.0,0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
586,64.08,20.0,17.5,9,0.0,1000,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
140,31.25,3.75,0.625,9,181.0,0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
492,39.25,9.5,6.5,14,240.0,4607,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
350,26.17,2.0,0.0,0,276.0,1,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## **One-hot encoding with Feature-engine**

In [21]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 24.0 MB/s eta 0:00:01[K     |██▌                             | 20 kB 18.3 MB/s eta 0:00:01[K     |███▊                            | 30 kB 11.4 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.6 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 4.7 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 5.6 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 5.5 MB/s eta 0:00:01[K     |██████████                      | 81 kB 5.5 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 6.1 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 5.2 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 5.2 MB

In [22]:
from feature_engine.encoding import OneHotEncoder

In [23]:
# let's create an encoder to return k-1 binary variables
ohe_enc = OneHotEncoder(drop_last=True)

In [24]:
# fit the encoder to the train set: it will learn the variables and
# categories to encode
ohe_enc.fit(X_train)

OneHotEncoder(drop_last=True)

In [25]:
# we can see which variables the encoder will encode
ohe_enc.variables_

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [26]:
# The categories that will be encoded
ohe_enc.encoder_dict_

{'A1': ['a', 'b'],
 'A10': ['t'],
 'A12': ['t'],
 'A13': ['g', 's'],
 'A4': ['u', 'y', 'Missing'],
 'A5': ['g', 'p', 'Missing'],
 'A6': ['c',
  'q',
  'w',
  'ff',
  'm',
  'i',
  'e',
  'cc',
  'x',
  'd',
  'k',
  'j',
  'Missing',
  'aa'],
 'A7': ['v', 'ff', 'h', 'dd', 'z', 'bb', 'j', 'Missing', 'n'],
 'A9': ['t']}

In [27]:
# let's transform train and test set
X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [28]:
# let's inspect the encoded train set
X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
596,46.08,3.0,2.375,8,396.0,4159,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
303,15.92,2.875,0.085,0,120.0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
204,36.33,2.125,0.085,1,50.0,1187,0,1,0,1,...,0,0,0,0,0,1,1,0,1,0
351,22.17,0.585,0.0,0,100.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
118,57.83,7.04,14.0,6,360.0,1332,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0


In [29]:
# let's inspect the encoded test set
X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
14,45.83,10.5,5.0,7,0.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
586,64.08,20.0,17.5,9,0.0,1000,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0
140,31.25,3.75,0.625,9,181.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
492,39.25,9.5,6.5,14,240.0,4607,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
350,26.17,2.0,0.0,0,276.0,1,1,0,1,0,...,0,0,1,0,0,0,0,1,1,0


In [30]:
# The name of the variables in the transformed data
ohe_enc.get_feature_names_out()

['A2',
 'A3',
 'A8',
 'A11',
 'A14',
 'A15',
 'A1_a',
 'A1_b',
 'A4_u',
 'A4_y',
 'A4_Missing',
 'A5_g',
 'A5_p',
 'A5_Missing',
 'A6_c',
 'A6_q',
 'A6_w',
 'A6_ff',
 'A6_m',
 'A6_i',
 'A6_e',
 'A6_cc',
 'A6_x',
 'A6_d',
 'A6_k',
 'A6_j',
 'A6_Missing',
 'A6_aa',
 'A7_v',
 'A7_ff',
 'A7_h',
 'A7_dd',
 'A7_z',
 'A7_bb',
 'A7_j',
 'A7_Missing',
 'A7_n',
 'A9_t',
 'A10_t',
 'A12_t',
 'A13_g',
 'A13_s']

In [31]:
# The name of the binary variables
ohe_enc.get_feature_names_out(ohe_enc.variables_)

['A1_a',
 'A1_b',
 'A4_u',
 'A4_y',
 'A4_Missing',
 'A5_g',
 'A5_p',
 'A5_Missing',
 'A6_c',
 'A6_q',
 'A6_w',
 'A6_ff',
 'A6_m',
 'A6_i',
 'A6_e',
 'A6_cc',
 'A6_x',
 'A6_d',
 'A6_k',
 'A6_j',
 'A6_Missing',
 'A6_aa',
 'A7_v',
 'A7_ff',
 'A7_h',
 'A7_dd',
 'A7_z',
 'A7_bb',
 'A7_j',
 'A7_Missing',
 'A7_n',
 'A9_t',
 'A10_t',
 'A12_t',
 'A13_g',
 'A13_s']

## **One-hot encoding with Category Encoders**

In [33]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 17.4 MB/s eta 0:00:01[K     |████████                        | 20 kB 12.0 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 6.7 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 6.3 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 3.8 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [34]:
from category_encoders.one_hot import OneHotEncoder

In [35]:
# let's create the encoder to return k-1 binary variables
# Category Encoders always returns k-1 dummies
ohe_enc = OneHotEncoder(use_cat_names=True)

In [37]:
# fit the encoder to the train set: it will learn the variables and
# categories to encode
ohe_enc.fit(X_train)

OneHotEncoder(cols=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'],
              use_cat_names=True)

In [38]:
# The variables that will be encoded
ohe_enc.cols

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [39]:
# The names of the new variables
ohe_enc.get_feature_names()

['A1_a',
 'A1_b',
 'A1_Missing',
 'A2',
 'A3',
 'A4_u',
 'A4_y',
 'A4_Missing',
 'A4_l',
 'A5_g',
 'A5_p',
 'A5_Missing',
 'A5_gg',
 'A6_c',
 'A6_q',
 'A6_w',
 'A6_ff',
 'A6_m',
 'A6_i',
 'A6_e',
 'A6_cc',
 'A6_x',
 'A6_d',
 'A6_k',
 'A6_j',
 'A6_Missing',
 'A6_aa',
 'A6_r',
 'A7_v',
 'A7_ff',
 'A7_h',
 'A7_dd',
 'A7_z',
 'A7_bb',
 'A7_j',
 'A7_Missing',
 'A7_n',
 'A7_o',
 'A8',
 'A9_t',
 'A9_f',
 'A10_t',
 'A10_f',
 'A11',
 'A12_t',
 'A12_f',
 'A13_g',
 'A13_s',
 'A13_p',
 'A14',
 'A15']

In [40]:
# let's transform train and test set
X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [41]:
# let's inspect the encoded test set
X_test_enc.head()

Unnamed: 0,A1_a,A1_b,A1_Missing,A2,A3,A4_u,A4_y,A4_Missing,A4_l,A5_g,...,A10_t,A10_f,A11,A12_t,A12_f,A13_g,A13_s,A13_p,A14,A15
14,1,0,0,45.83,10.5,1,0,0,0,1,...,1,0,7,1,0,1,0,0,0.0,0
586,0,1,0,64.08,20.0,1,0,0,0,1,...,1,0,9,1,0,1,0,0,0.0,1000
140,1,0,0,31.25,3.75,1,0,0,0,1,...,1,0,9,1,0,1,0,0,181.0,0
492,0,1,0,39.25,9.5,1,0,0,0,1,...,1,0,14,0,1,1,0,0,240.0,4607
350,1,0,0,26.17,2.0,1,0,0,0,1,...,0,1,0,1,0,1,0,0,276.0,1
