# Comparison of Categorical Variable Encodings

In this lecture, we will compare the performance of the different feature categorical encoding techniques we learned so far.

We will compare:

- One hot encoding
- Replacing labels by the count
- Ordering labels according to target
- Mean Encoding
- WoE

Using the titanic dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import (
    CountFrequencyEncoder,
    MeanEncoder,
    OrdinalEncoder,
    WoEEncoder,
)

In [2]:
# let's load the titanic dataset

# we will only use these columns in the demo
cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "sex",
    "cabin",
    "embarked",
    "survived",
]

data = pd.read_csv("../titanic.csv", usecols=cols)

data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B5,S
1,1,1,male,0.9167,1,2,151.55,C22,S
2,1,0,female,2.0,1,2,151.55,C22,S
3,1,0,male,30.0,1,2,151.55,C22,S
4,1,0,female,25.0,1,2,151.55,C22,S


In [3]:
# let's check for missing data

data.isnull().sum()

pclass         0
survived       0
sex            0
age          263
sibsp          0
parch          0
fare           1
cabin       1014
embarked       2
dtype: int64

In [4]:
# Drop observations with NA in Fare and embarked

data.dropna(subset=["fare", "embarked"], inplace=True)

In [5]:
# Now we extract the first letter of the cabin

data["cabin"] = data["cabin"].astype(str).str[0]

data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B,S
1,1,1,male,0.9167,1,2,151.55,C,S
2,1,0,female,2.0,1,2,151.55,C,S
3,1,0,male,30.0,1,2,151.55,C,S
4,1,0,female,25.0,1,2,151.55,C,S


In [6]:
# drop observations with cabin = T, they are too few

data = data[data["cabin"] != "T"]

In [7]:
# Let's divide into train and test set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels="survived", axis=1),  # predictors
    data["survived"],  # target
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((913, 8), (392, 8))

In [8]:
# Let's replace null values in age by the mean

value = X_train["age"].mean()

X_train["age"].fillna(value, inplace=True)
X_test["age"].fillna(value, inplace=True)

In [9]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
402,2,female,30.0,1,0,13.8583,n,C
698,3,male,18.0,0,0,8.6625,n,S
1291,3,male,29.79847,0,0,8.7125,n,S
1229,3,male,27.0,0,0,8.6625,n,S
118,1,male,29.79847,0,0,26.55,D,S


In [10]:
# let's check that we have no missing data after NA imputation

X_train.isnull().sum()

pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [11]:
X_test.isnull().sum()

pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

## One Hot Encoding

In [12]:
X_train_OHE = pd.get_dummies(X_train, drop_first=True)
X_test_OHE = pd.get_dummies(X_test, drop_first=True)

In [13]:
X_train_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_n,embarked_Q,embarked_S
402,2,30.0,1,0,13.8583,0,0,0,0,0,0,0,1,0,0
698,3,18.0,0,0,8.6625,1,0,0,0,0,0,0,1,0,1
1291,3,29.79847,0,0,8.7125,1,0,0,0,0,0,0,1,0,1
1229,3,27.0,0,0,8.6625,1,0,0,0,0,0,0,1,0,1
118,1,29.79847,0,0,26.55,1,0,0,1,0,0,0,0,0,1


In [14]:
X_test_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_n,embarked_Q,embarked_S
586,2,29.0,1,0,26.0,0,0,0,0,0,0,0,1,0,1
200,1,46.0,0,0,75.2417,1,0,1,0,0,0,0,0,0,0
831,3,40.0,1,6,46.9,1,0,0,0,0,0,0,1,0,1
1149,3,29.79847,0,0,7.7208,0,0,0,0,0,0,0,1,1,0
393,2,25.0,0,0,31.5,1,0,0,0,0,0,0,1,0,1


## Count encoding

In [15]:
enc = CountFrequencyEncoder()

X_train_count = enc.fit_transform(X_train)
X_test_count = enc.transform(X_test)

In [16]:
X_train_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
402,2,326,30.0,1,0,13.8583,702,184
698,3,587,18.0,0,0,8.6625,702,647
1291,3,587,29.79847,0,0,8.7125,702,647
1229,3,587,27.0,0,0,8.6625,702,647
118,1,587,29.79847,0,0,26.55,33,647


In [17]:
X_test_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
586,2,326,29.0,1,0,26.0,702,647
200,1,587,46.0,0,0,75.2417,70,184
831,3,587,40.0,1,6,46.9,702,647
1149,3,326,29.79847,0,0,7.7208,702,82
393,2,587,25.0,0,0,31.5,702,647


## Ordered Integer Encoding

In [18]:
enc = OrdinalEncoder()

X_train_ordered = enc.fit_transform(X_train, y_train)
X_test_ordered = enc.transform(X_test)

In [19]:
X_train_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
402,2,1,30.0,1,0,13.8583,0,2
698,3,0,18.0,0,0,8.6625,0,0
1291,3,0,29.79847,0,0,8.7125,0,0
1229,3,0,27.0,0,0,8.6625,0,0
118,1,0,29.79847,0,0,26.55,5,0


In [20]:
X_test_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
586,2,1,29.0,1,0,26.0,0,0
200,1,0,46.0,0,0,75.2417,2,2
831,3,0,40.0,1,6,46.9,0,0
1149,3,1,29.79847,0,0,7.7208,0,1
393,2,0,25.0,0,0,31.5,0,0


## Mean Encoding

In [21]:
enc = MeanEncoder()

X_train_mean = enc.fit_transform(X_train, y_train)
X_test_mean = enc.transform(X_test)

In [22]:
X_train_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
402,2,0.730061,30.0,1,0,13.8583,0.292023,0.516304
698,3,0.173765,18.0,0,0,8.6625,0.292023,0.332303
1291,3,0.173765,29.79847,0,0,8.7125,0.292023,0.332303
1229,3,0.173765,27.0,0,0,8.6625,0.292023,0.332303
118,1,0.173765,29.79847,0,0,26.55,0.69697,0.332303


In [23]:
X_test_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
586,2,0.730061,29.0,1,0,26.0,0.292023,0.332303
200,1,0.173765,46.0,0,0,75.2417,0.6,0.516304
831,3,0.173765,40.0,1,6,46.9,0.292023,0.332303
1149,3,0.730061,29.79847,0,0,7.7208,0.292023,0.365854
393,2,0.173765,25.0,0,0,31.5,0.292023,0.332303


## Weight of evidence

In [24]:
enc = WoEEncoder()

X_train_woe = enc.fit_transform(X_train, y_train)
X_test_woe = enc.transform(X_test)

In [25]:
X_train_woe.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
402,2,1.516874,30.0,1,0,13.8583,-0.36364,0.587181
698,3,-1.037236,18.0,0,0,8.6625,-0.36364,-0.175847
1291,3,-1.037236,29.79847,0,0,8.7125,-0.36364,-0.175847
1229,3,-1.037236,27.0,0,0,8.6625,-0.36364,-0.175847
118,1,-1.037236,29.79847,0,0,26.55,1.354849,-0.175847


In [26]:
X_test_woe.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
586,2,1.516874,29.0,1,0,26.0,-0.36364,-0.175847
200,1,-1.037236,46.0,0,0,75.2417,0.927405,0.587181
831,3,-1.037236,40.0,1,6,46.9,-0.36364,-0.175847
1149,3,1.516874,29.79847,0,0,7.7208,-0.36364,-0.028106
393,2,-1.037236,25.0,0,0,31.5,-0.36364,-0.175847


## Random Forest Performance

In [27]:
# create a function to build random forests and compare performance in train and test set


def run_randomForests(X_train, X_test, y_train, y_test):

    rf = RandomForestClassifier(n_estimators=10, random_state=39, max_depth=3)
    rf.fit(X_train, y_train)

    print("Train set")
    pred = rf.predict_proba(X_train)
    print("Random Forests roc-auc: {}".format(roc_auc_score(y_train, pred[:, 1])))

    print("Test set")
    pred = rf.predict_proba(X_test)
    print("Random Forests roc-auc: {}".format(roc_auc_score(y_test, pred[:, 1])))

In [28]:
# OHE
run_randomForests(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Random Forests roc-auc: 0.839300379837799
Test set
Random Forests roc-auc: 0.8077464026831115


In [29]:
# counts
run_randomForests(X_train_count, X_test_count, y_train, y_test)

Train set
Random Forests roc-auc: 0.8543630017452007
Test set
Random Forests roc-auc: 0.8149275127123229


In [30]:
# ordered labels
run_randomForests(X_train_ordered, X_test_ordered, y_train, y_test)

Train set
Random Forests roc-auc: 0.8605738630530746
Test set
Random Forests roc-auc: 0.8271529806339935


In [31]:
# mean encoding
run_randomForests(X_train_mean, X_test_mean, y_train, y_test)

Train set
Random Forests roc-auc: 0.8605174006775486
Test set
Random Forests roc-auc: 0.8270988856431896


In [32]:
# woe encoding
run_randomForests(X_train_woe, X_test_woe, y_train, y_test)

Train set
Random Forests roc-auc: 0.8605174006775486
Test set
Random Forests roc-auc: 0.8270988856431896


Comparing the roc_auc values on the test sets, we can see that one hot encoding has the worst performance. This makes sense because trees do not perform well in datasets with big feature spaces.

The remaining encodings returned similar performances.

### Logistic Regression Performance

In [33]:
def run_logistic(X_train, X_test, y_train, y_test):

    # function to train and test the performance of logistic regression
    logit = LogisticRegression(random_state=44, C=0.01, max_iter=100)
    logit.fit(X_train, y_train)

    print("Train set")
    pred = logit.predict_proba(X_train)
    print("Logistic Regression roc-auc: {}".format(roc_auc_score(y_train, pred[:, 1])))

    print("Test set")
    pred = logit.predict_proba(X_test)
    print("Logistic Regression roc-auc: {}".format(roc_auc_score(y_test, pred[:, 1])))

In [34]:
# OHE
run_logistic(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8287932450467097
Test set
Logistic Regression roc-auc: 0.8013902412636589


In [35]:
# counts
run_logistic(X_train_count, X_test_count, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7899009341956678
Test set
Logistic Regression roc-auc: 0.7391810018392297


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# ordered labels
run_logistic(X_train_ordered, X_test_ordered, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8223924648393389
Test set
Logistic Regression roc-auc: 0.8006870063832089


In [37]:
# mean encoding
run_logistic(X_train_mean, X_test_mean, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7791217534134072
Test set
Logistic Regression roc-auc: 0.7481878178080709


In [38]:
# woe encoding
run_logistic(X_train_woe, X_test_woe, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.850844369161277
Test set
Logistic Regression roc-auc: 0.8202964405496052


For Logistic regression, the best performances was obtained with weight of evidence, followed by one hot encoding.