# **Weight of Evidence**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the credit approval data set. The path is relative, so the CSV is
# resolved against the kernel's current working directory.
data = pd.read_csv("credit_approval_uci_2.csv")
# Preview the first rows: predictors A1..A15 plus the `target` column.
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Split into training and testing sets: every column except `target`
# is a predictor, 30% of the rows are held out for testing, and the
# fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data["target"],
    test_size=0.3,
    random_state=0,
)

# Sanity-check the sizes of both partitions.
X_train.shape, X_test.shape

((483, 15), (207, 15))

## **Weight of Evidence with pandas**

In [4]:
# Build the complement of the target (0 where y_train is 1, otherwise 1),
# aligned on y_train's index so it can be grouped together with X_train.
# Summing this flag counts the negative cases.
neg_y_train = pd.Series(data=np.where(y_train.eq(1), 0, 1), index=y_train.index)

In [5]:
# Class totals in the training set: summing the target counts the
# observations with value 1, summing its complement counts those with 0.
total_pos, total_neg = y_train.sum(), neg_y_train.sum()
total_pos, total_neg

(217, 266)

In [6]:
# Numerator and denominator of the WoE formula for variable A1:
#   pos = positives within the category / all positives
#   neg = negatives within the category / all negatives
pos = y_train.groupby(X_train["A1"]).sum().div(total_pos)
neg = neg_y_train.groupby(X_train["A1"]).sum().div(total_neg)
pos, neg

(A1
 Missing    0.009217
 a          0.313364
 b          0.677419
 Name: target, dtype: float64, A1
 Missing    0.007519
 a          0.285714
 b          0.706767
 dtype: float64)

In [7]:
# Calculate the WoE per category: ln(share of positives / share of negatives).
# The ratio is undefined when a category holds no positive or no negative
# observations (the log would silently evaluate to +/-inf or NaN), so fail
# loudly instead of propagating non-finite encodings into the data.
undefined_woe = (pos == 0) | (neg == 0)
if undefined_woe.any():
    raise ValueError(
        "WoE is undefined for categories without positive or negative "
        f"cases: {list(undefined_woe[undefined_woe].index)}"
    )

woe = np.log(pos / neg)
woe

A1
Missing    0.203599
a          0.092373
b         -0.042410
dtype: float64

In [8]:
# Replace every A1 category with its WoE value. The mapping was learned on
# the training set and is applied unchanged to the test set.
# The column is overwritten in place, so run this cell only once per split:
# mapping the already-encoded floats again would find no match in `woe`.
for frame in (X_train, X_test):
    frame["A1"] = frame["A1"].map(woe)

In [9]:
# Inspect the encoded variable: A1 now holds the WoE value of each row's
# category instead of the original label.
X_train["A1"].head()

596    0.092373
303    0.092373
204   -0.042410
351   -0.042410
118   -0.042410
Name: A1, dtype: float64

## **Weight of Evidence with Feature-engine**

In [10]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 27.3 MB/s eta 0:00:01[K     |██▌                             | 20 kB 14.5 MB/s eta 0:00:01[K     |███▊                            | 30 kB 10.9 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.5 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 4.9 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 5.8 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 5.9 MB/s eta 0:00:01[K     |██████████                      | 81 kB 4.4 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 5.4 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 5.4 MB

In [11]:
from feature_engine.encoding import WoEEncoder

In [12]:
# Re-create the split from the untouched `data` frame: the pandas section
# above overwrote A1 in X_train / X_test, and the encoder needs the original
# category labels. Same 30% hold-out and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [13]:
# Set up Feature-engine's weight of evidence encoder, restricted to
# three of the variables (A1, A9 and A12); the WoE values themselves
# are learned later, when the encoder is fitted.
woe_enc = WoEEncoder(variables=["A1", "A9", "A12"])

In [14]:
# Find the WoE values for each category, using the training set only
# (predictors and target) so nothing is learned from the test set.
woe_enc.fit(X_train, y_train)

WoEEncoder(variables=['A1', 'A9', 'A12'])

In [15]:
# List the variables that the fitted encoder will transform.
woe_enc.variables_

['A1', 'A9', 'A12']

In [16]:
# The fitted encoder stores the learned WoE values as a nested dict:
# variable -> category -> WoE. The A1 entries match the values obtained
# with the manual pandas calculation earlier in the notebook.
woe_enc.encoder_dict_

{'A1': {'Missing': 0.20359895524123955,
  'a': 0.09237332013101507,
  'b': -0.04241042080997339},
 'A12': {'f': 0.012909148776768313, 't': -0.015454610821445114},
 'A9': {'f': -2.3756704559950457, 't': 1.4995706780678444}}

In [17]:
# Encode both partitions with the mapping learned on the training set.
# The results go into new variables, so X_train / X_test keep their
# original category labels.
X_train_enc, X_test_enc = woe_enc.transform(X_train), woe_enc.transform(X_test)

In [18]:
# Inspect the encoded training set: A1, A9 and A12 now hold WoE values,
# while the remaining columns are unchanged.
X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0.092373,46.08,3.0,u,g,c,v,2.375,1.499571,t,8,-0.015455,g,396.0,4159
303,0.092373,15.92,2.875,u,g,q,v,0.085,-2.37567,f,0,0.012909,g,120.0,0
204,-0.04241,36.33,2.125,y,p,w,v,0.085,1.499571,t,1,0.012909,g,50.0,1187
351,-0.04241,22.17,0.585,y,p,ff,ff,0.0,-2.37567,f,0,0.012909,g,100.0,0
118,-0.04241,57.83,7.04,u,g,m,v,14.0,1.499571,t,6,-0.015455,g,360.0,1332


In [19]:
# Inspect the encoded test set: the same three columns carry the WoE
# values that were learned from the training set.
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,0.092373,45.83,10.5,u,g,q,v,5.0,1.499571,t,7,-0.015455,g,0.0,0
586,-0.04241,64.08,20.0,u,g,x,h,17.5,1.499571,t,9,-0.015455,g,0.0,1000
140,0.092373,31.25,3.75,u,g,cc,h,0.625,1.499571,t,9,-0.015455,g,181.0,0
492,-0.04241,39.25,9.5,u,g,m,v,6.5,1.499571,t,14,0.012909,g,240.0,4607
350,0.092373,26.17,2.0,u,g,j,j,0.0,-2.37567,f,0,-0.015455,g,276.0,1


## **Weight of Evidence with Category Encoders**

In [20]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 25.8 MB/s eta 0:00:01[K     |████████                        | 20 kB 14.7 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 10.5 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 9.5 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 5.8 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 4.1 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [21]:
from category_encoders.woe import WOEEncoder

In [22]:
# Set up the Category Encoders weight of evidence encoder for the same
# three variables. Note that this rebinds `woe_enc`, replacing the
# Feature-engine encoder created earlier in the notebook.
woe_enc = WOEEncoder(cols=["A1", "A9", "A12"])

In [23]:
# Find the WoE values for each category, again learning from the
# training set only.
woe_enc.fit(X_train, y_train)

WOEEncoder(cols=['A1', 'A9', 'A12'])

In [24]:
# The WoE values are stored in the `mapping` attribute, one Series per
# variable. Each Series is indexed by integer codes rather than by the
# original category labels (presumably assigned by an internal ordinal
# encoding, with the negative codes reserved for unknown / missing
# values -- TODO confirm against the library docs).
# The values are slightly different from those from Feature-engine
# because of the regularization term; if we set regularization=0,
# the values should be identical.
woe_enc.mapping

{'A1': A1
  1    0.092216
  2   -0.042619
  3    0.201915
 -1    0.000000
 -2    0.000000
 dtype: float64, 'A12': A12
  1   -0.015149
  2    0.012673
 -1    0.000000
 -2    0.000000
 dtype: float64, 'A9': A9
  1    1.484831
  2   -2.321458
 -1    0.000000
 -2    0.000000
 dtype: float64}

In [25]:
# Encode both partitions with the Category Encoders mapping learned on the
# training set. This rebinds X_train_enc / X_test_enc, replacing the
# Feature-engine results from the previous section.
X_train_enc, X_test_enc = woe_enc.transform(X_train), woe_enc.transform(X_test)