# **Encoding Subset of Categories**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Let's load the data set.
data = pd.read_csv("credit_approval_uci_2.csv")
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Let's separate the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=["target"], axis=1),  # predictors
    data["target"],  # target
    test_size=0.3,  # percentage of observations in test set
    random_state=0,  # seed to ensure reproducibility
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

## **One-hot encoding with pandas**

In [4]:
# Let's inspect the values of A6.
X_train["A6"].unique()

array(['c', 'q', 'w', 'ff', 'm', 'i', 'e', 'cc', 'x', 'd', 'k', 'j',
       'Missing', 'aa', 'r'], dtype=object)

In [5]:
# Make a list with the categories for which
# we want dummy variables.
categories = ["aa", "cc", "ff"]

In [6]:
# Add the dummy variables.
for category in categories:
    X_train[f"A6_{category}"] = np.where(X_train["A6"] == category, 1, 0)
    X_test[f"A6_{category}"] = np.where(X_test["A6"] == category, 1, 0)

In [7]:
# We can see the binary variables at the right
# of the dataframe.
X_test.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A6_aa,A6_cc,A6_ff
14,a,45.83,10.5,u,g,q,v,5.0,t,t,7,t,g,0.0,0,0,0,0
586,b,64.08,20.0,u,g,x,h,17.5,t,t,9,t,g,0.0,1000,0,0,0
140,a,31.25,3.75,u,g,cc,h,0.625,t,t,9,t,g,181.0,0,0,1,0
492,b,39.25,9.5,u,g,m,v,6.5,t,t,14,f,g,240.0,4607,0,0,0
350,a,26.17,2.0,u,g,j,j,0.0,f,f,0,t,g,276.0,1,0,0,0


## **One-hot encoding with Scikit-learn**

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# Let's separate the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=["target"], axis=1),  # predictors
    data["target"],  # target
    test_size=0.3,  # percentage of observations in test set
    random_state=0,  # seed to ensure reproducibility
)

In [10]:
# Set up the encoder to create binary variables for some
# categories in 2 of the variables.
encoder = OneHotEncoder(
    categories=[["aa", "cc", "ff"], ["ff", "dd", "bb"]],
    handle_unknown="ignore",
    sparse=False,
)

In [11]:
# Indicate which variables to encode:
ct = ColumnTransformer(
    [("encoder", encoder, ["A6", "A7"])],
    remainder="passthrough",
)

# Find the categories to encode:
ct.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder',
                                 OneHotEncoder(categories=[['aa', 'cc', 'ff'],
                                                           ['ff', 'dd', 'bb']],
                                               handle_unknown='ignore',
                                               sparse=False),
                                 ['A6', 'A7'])])

In [12]:
# Return dummy variables of indicated
# categories.
X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [13]:
# Scikit-learn returns a Numpy array
X_test_enc

array([[0.0, 0.0, 0.0, ..., 'g', 0.0, 0],
       [0.0, 0.0, 0.0, ..., 'g', 0.0, 1000],
       [0.0, 1.0, 0.0, ..., 'g', 181.0, 0],
       ...,
       [0.0, 0.0, 0.0, ..., 'g', 132.0, 2],
       [0.0, 0.0, 0.0, ..., 'g', 100.0, 0],
       [0.0, 0.0, 0.0, ..., 'g', 112.0, 0]], dtype=object)

In [14]:
# Obtain the binary variable names.
ct.get_feature_names_out()

array(['encoder__A6_aa', 'encoder__A6_cc', 'encoder__A6_ff',
       'encoder__A7_ff', 'encoder__A7_dd', 'encoder__A7_bb',
       'remainder__A1', 'remainder__A2', 'remainder__A3', 'remainder__A4',
       'remainder__A5', 'remainder__A8', 'remainder__A9',
       'remainder__A10', 'remainder__A11', 'remainder__A12',
       'remainder__A13', 'remainder__A14', 'remainder__A15'], dtype=object)

In [15]:
# Transform the array to a pandas dataframe
X_test_enc = pd.DataFrame(X_test_enc)

# Add the variable names:
X_test_enc.columns = ct.get_feature_names_out()

# Show dataset
X_test_enc.head()

Unnamed: 0,encoder__A6_aa,encoder__A6_cc,encoder__A6_ff,encoder__A7_ff,encoder__A7_dd,encoder__A7_bb,remainder__A1,remainder__A2,remainder__A3,remainder__A4,remainder__A5,remainder__A8,remainder__A9,remainder__A10,remainder__A11,remainder__A12,remainder__A13,remainder__A14,remainder__A15
0,0.0,0.0,0.0,0.0,0.0,0.0,a,45.83,10.5,u,g,5.0,t,t,7,t,g,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,b,64.08,20.0,u,g,17.5,t,t,9,t,g,0.0,1000
2,0.0,1.0,0.0,0.0,0.0,0.0,a,31.25,3.75,u,g,0.625,t,t,9,t,g,181.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,b,39.25,9.5,u,g,6.5,t,t,14,f,g,240.0,4607
4,0.0,0.0,0.0,0.0,0.0,0.0,a,26.17,2.0,u,g,0.0,f,f,0,t,g,276.0,1
