# **Target Mean Encoding**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# read the credit approval data set from disk
# and take a look at its first rows
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# split the data into a training and a testing set
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),  # predictors: every column but the target
    data["target"],  # target
    test_size=0.3,  # fraction of the observations assigned to the test set
    random_state=0,  # fixed seed, so that the split is reproducible
)

# number of rows and columns in each set
X_train.shape, X_test.shape

((483, 15), (207, 15))

## **Target mean encoding with pandas**

In [4]:
# mean value of the target for each category of the variable A7,
# calculated with the training set
target_mean_per_category = y_train.groupby(X_train["A7"]).mean()

# the same values as a dictionary of category -> target mean
mapping = target_mean_per_category.to_dict()
mapping

{'Missing': 1.0,
 'bb': 0.5128205128205128,
 'dd': 0.4,
 'ff': 0.14634146341463414,
 'h': 0.6039603960396039,
 'j': 0.2,
 'n': 0.6666666666666666,
 'o': 0.0,
 'v': 0.4187725631768953,
 'z': 0.7142857142857143}

In [5]:
# replace the labels with the target mean
# assign() returns new dataframes instead of writing into the output of
# train_test_split: this avoids pandas' SettingWithCopyWarning and keeps
# the original labels in X_train and X_test, so the cell can be re-run
# (mapping an already encoded column a second time returns only NaN)
# NOTE: categories that are not in the mapping (for example, categories
# seen only in the test set) are encoded as NaN by map()
X_train_enc = X_train.assign(A7=X_train["A7"].map(mapping))
X_test_enc = X_test.assign(A7=X_test["A7"].map(mapping))

## **Target mean encoding with Feature-engine**

In [6]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 18.3 MB/s eta 0:00:01[K     |██▌                             | 20 kB 24.9 MB/s eta 0:00:01[K     |███▊                            | 30 kB 11.2 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.0 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 4.7 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 5.5 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████                      | 81 kB 5.6 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 6.3 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 5.3 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 5.3 MB

In [7]:
from feature_engine.encoding import MeanEncoder

In [8]:
# split the data again, so that the encoder
# receives the original category labels
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),  # predictors: every column but the target
    data["target"],  # target
    test_size=0.3,  # fraction of the observations assigned to the test set
    random_state=0,  # fixed seed, so that the split is reproducible
)

In [9]:
# set up the encoder: with variables=None, it finds
# the categorical variables automatically
mean_enc = MeanEncoder(
    variables=None,
)

In [10]:
# fit() learns the number that will replace each category,
# that is, the mean value of the target per category
# this is why fit() takes the target in addition to the predictors,
# as the fit() method of any Scikit-learn predictor class does
mean_enc.fit(X=X_train, y=y_train)

MeanEncoder()

In [11]:
# variables that will be encoded: the categorical variables
# that the encoder found in the train set during fit
mean_enc.variables_

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [12]:
# encoder_dict_ holds, for each encoded variable, the mapping
# from each category to the target mean learned during fit
mean_enc.encoder_dict_

{'A1': {'Missing': 0.5, 'a': 0.4722222222222222, 'b': 0.4388059701492537},
 'A10': {'f': 0.25092250922509224, 't': 0.7028301886792453},
 'A12': {'f': 0.4524714828897338, 't': 0.44545454545454544},
 'A13': {'g': 0.46485260770975056, 'p': 1.0, 's': 0.21052631578947367},
 'A4': {'Missing': 1.0,
  'l': 1.0,
  'u': 0.512396694214876,
  'y': 0.22608695652173913},
 'A5': {'Missing': 1.0,
  'g': 0.512396694214876,
  'gg': 1.0,
  'p': 0.22608695652173913},
 'A6': {'Missing': 1.0,
  'aa': 0.4411764705882353,
  'c': 0.45161290322580644,
  'cc': 0.7,
  'd': 0.19047619047619047,
  'e': 0.6190476190476191,
  'ff': 0.10526315789473684,
  'i': 0.2682926829268293,
  'j': 0.25,
  'k': 0.2894736842105263,
  'm': 0.4230769230769231,
  'q': 0.625,
  'r': 1.0,
  'w': 0.5,
  'x': 0.7916666666666666},
 'A7': {'Missing': 1.0,
  'bb': 0.5128205128205128,
  'dd': 0.4,
  'ff': 0.14634146341463414,
  'h': 0.6039603960396039,
  'j': 0.2,
  'n': 0.6666666666666666,
  'o': 0.0,
  'v': 0.4187725631768953,
  'z': 0.714

In [13]:
# replace the categories with the learned target means
# in both the train and the test set
X_train_enc = mean_enc.transform(X=X_train)
X_test_enc = mean_enc.transform(X=X_test)

In [14]:
# first rows of the encoded train set
X_train_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0.472222,46.08,3.0,0.512397,0.512397,0.451613,0.418773,2.375,0.785156,0.70283,8,0.445455,0.464853,396.0,4159
303,0.472222,15.92,2.875,0.512397,0.512397,0.625,0.418773,0.085,0.070485,0.250923,0,0.452471,0.464853,120.0,0
204,0.438806,36.33,2.125,0.226087,0.226087,0.5,0.418773,0.085,0.785156,0.70283,1,0.452471,0.464853,50.0,1187
351,0.438806,22.17,0.585,0.226087,0.226087,0.105263,0.146341,0.0,0.070485,0.250923,0,0.452471,0.464853,100.0,0
118,0.438806,57.83,7.04,0.512397,0.512397,0.423077,0.418773,14.0,0.785156,0.70283,6,0.445455,0.464853,360.0,1332


In [15]:
# first rows of the encoded test set
X_test_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,0.472222,45.83,10.5,0.512397,0.512397,0.625,0.418773,5.0,0.785156,0.70283,7,0.445455,0.464853,0.0,0
586,0.438806,64.08,20.0,0.512397,0.512397,0.791667,0.60396,17.5,0.785156,0.70283,9,0.445455,0.464853,0.0,1000
140,0.472222,31.25,3.75,0.512397,0.512397,0.7,0.60396,0.625,0.785156,0.70283,9,0.445455,0.464853,181.0,0
492,0.438806,39.25,9.5,0.512397,0.512397,0.423077,0.418773,6.5,0.785156,0.70283,14,0.452471,0.464853,240.0,4607
350,0.472222,26.17,2.0,0.512397,0.512397,0.25,0.2,0.0,0.070485,0.250923,0,0.445455,0.464853,276.0,1


## **Target mean encoding with Category Encoders**

In [16]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 25.3 MB/s eta 0:00:01[K     |████████                        | 20 kB 27.2 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 19.7 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 13.5 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 6.5 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 6.1 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 4.0 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [17]:
from category_encoders.target_encoder import TargetEncoder

In [18]:
# the encoded value of a category is a blend of the posterior, that is,
# the target mean of the category as estimated in the previous cells
# using pandas, and the prior, that is, the target mean over the train set
# the weight of the posterior follows an S-shaped curve of the number of
# observations in the category: it is 0.5 when the category has
# min_samples_leaf observations, approaches 1 for categories with many
# more observations and approaches 0 for categories with far fewer
# smoothing controls how steep that curve is
mean_enc = TargetEncoder(
    cols=None,
    min_samples_leaf=25,
    smoothing=1.0,
)

In [19]:
# fit() learns the number that will replace
# each category of each categorical variable
mean_enc.fit(X=X_train, y=y_train)

TargetEncoder(cols=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'],
              min_samples_leaf=25)

In [20]:
# encode the categories of both the train and the test set
X_train_enc = mean_enc.transform(X=X_train)
X_test_enc = mean_enc.transform(X=X_test)

In [21]:
# first rows of the encoded train set
X_train_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0.472222,46.08,3.0,0.512397,0.512397,0.451613,0.418773,2.375,0.785156,0.70283,8,0.445455,0.464853,396.0,4159
303,0.472222,15.92,2.875,0.512397,0.512397,0.625,0.418773,0.085,0.070485,0.250923,0,0.452471,0.464853,120.0,0
204,0.438806,36.33,2.125,0.226087,0.226087,0.5,0.418773,0.085,0.785156,0.70283,1,0.452471,0.464853,50.0,1187
351,0.438806,22.17,0.585,0.226087,0.226087,0.105264,0.146341,0.0,0.070485,0.250923,0,0.452471,0.464853,100.0,0
118,0.438806,57.83,7.04,0.512397,0.512397,0.430123,0.418773,14.0,0.785156,0.70283,6,0.445455,0.464853,360.0,1332


In [22]:
# first rows of the encoded test set
X_test_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,0.472222,45.83,10.5,0.512397,0.512397,0.625,0.418773,5.0,0.785156,0.70283,7,0.445455,0.464853,0.0,0
586,0.438806,64.08,20.0,0.512397,0.512397,0.541359,0.60396,17.5,0.785156,0.70283,9,0.445455,0.464853,0.0,1000
140,0.472222,31.25,3.75,0.512397,0.512397,0.698322,0.60396,0.625,0.785156,0.70283,9,0.445455,0.464853,181.0,0
492,0.438806,39.25,9.5,0.512397,0.512397,0.430123,0.418773,6.5,0.785156,0.70283,14,0.452471,0.464853,240.0,4607
350,0.472222,26.17,2.0,0.512397,0.512397,0.449275,0.449275,0.0,0.070485,0.250923,0,0.445455,0.464853,276.0,1


## **M-estimate encoding with Category Encoders**

In [23]:
from category_encoders.m_estimate import MEstimateEncoder

In [24]:
# set up the encoder
# m is the weight given to the prior, that is, the target mean
# over all the observations passed to fit()
# the bigger the value of m, the closer the encoded value
# of each category gets to the prior
mean_enc = MEstimateEncoder(
    cols=None,
    m=1.0,
)

In [25]:
# fit() learns the number that will replace
# each category of each categorical variable
mean_enc.fit(X=X_train, y=y_train)

MEstimateEncoder(cols=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'])

In [26]:
# encode the categories of both the train and the test set
X_train_enc = mean_enc.transform(X=X_train)
X_test_enc = mean_enc.transform(X=X_test)

In [27]:
# first rows of the encoded train set
X_train_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0.472064,46.08,3.0,0.512223,0.512223,0.451588,0.418882,2.375,0.783849,0.70164,8,0.445472,0.464817,396.0,4159
303,0.472064,15.92,2.875,0.512223,0.512223,0.621917,0.418882,0.085,0.072146,0.251652,0,0.452459,0.464817,120.0,0
204,0.438837,36.33,2.125,0.228011,0.228011,0.498965,0.418882,0.085,0.783849,0.70164,1,0.452459,0.464817,50.0,1187
351,0.438837,22.17,0.585,0.228011,0.228011,0.114084,0.153554,0.0,0.072146,0.251652,0,0.452459,0.464817,100.0,0
118,0.438837,57.83,7.04,0.512223,0.512223,0.424047,0.418882,14.0,0.783849,0.70164,6,0.445472,0.464817,360.0,1332


In [28]:
# first rows of the encoded test set
X_test_enc.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,0.472064,45.83,10.5,0.512223,0.512223,0.621917,0.418882,5.0,0.783849,0.70164,7,0.445472,0.464817,0.0,0
586,0.438837,64.08,20.0,0.512223,0.512223,0.777971,0.602444,17.5,0.783849,0.70164,9,0.445472,0.464817,0.0,1000
140,0.472064,31.25,3.75,0.512223,0.512223,0.691912,0.602444,0.625,0.783849,0.70164,9,0.445472,0.464817,181.0,0
492,0.438837,39.25,9.5,0.512223,0.512223,0.424047,0.418882,6.5,0.783849,0.70164,14,0.452459,0.464817,240.0,4607
350,0.472064,26.17,2.0,0.512223,0.512223,0.272142,0.241546,0.0,0.072146,0.251652,0,0.445472,0.464817,276.0,1
