# **Replacing Categories by Counts or Frequency**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the credit-approval CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),  # predictors: every column except the label
    data["target"],  # label
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of each partition
X_train.shape, X_test.shape

((483, 15), (207, 15))

## **Count encoding with pandas**

In [4]:
# Tally how often each A7 category occurs in the training set, then turn
# the tally into a plain {category: count} dictionary.
a7_tally = X_train["A7"].value_counts()
counts = a7_tally.to_dict()
counts

{'Missing': 4,
 'bb': 39,
 'dd': 5,
 'ff': 41,
 'h': 101,
 'j': 5,
 'n': 3,
 'o': 1,
 'v': 277,
 'z': 7}

In [5]:
# Replace each A7 category with the count learned from the training set.
# Every training category is a key of `counts`, so the train column maps cleanly.
X_train["A7"] = X_train["A7"].map(counts)

# Series.map returns NaN for values that are not keys of the dict, so a
# category that appears only in the test set would silently become NaN and
# turn the column into floats. Such a category was observed 0 times in the
# training data, so encode it as 0 and keep the integer dtype.
X_test["A7"] = X_test["A7"].map(counts).fillna(0).astype("int64")

In [6]:
# Show the first 10 training rows of A7: after the mapping above, the
# column holds the per-category counts instead of the category labels.
X_train["A7"].head(10)

596    277
303    277
204    277
351     41
118    277
247    101
652    277
513      5
230    277
250      7
Name: A7, dtype: int64

In [7]:
# For frequency encoding, compute the share of training rows in each
# A6 category (normalize=True) rather than the raw count, and keep the
# result as a {category: frequency} dictionary.
a6_shares = X_train["A6"].value_counts(normalize=True)
frequencies = a6_shares.to_dict()
frequencies

{'Missing': 0.008281573498964804,
 'aa': 0.07039337474120083,
 'c': 0.19254658385093168,
 'cc': 0.062111801242236024,
 'd': 0.043478260869565216,
 'e': 0.043478260869565216,
 'ff': 0.07867494824016563,
 'i': 0.08488612836438923,
 'j': 0.016563146997929608,
 'k': 0.07867494824016563,
 'm': 0.053830227743271224,
 'q': 0.11594202898550725,
 'r': 0.002070393374741201,
 'w': 0.09937888198757763,
 'x': 0.049689440993788817}

In [8]:
# Replace each A6 category with the frequency learned from the training set.
# Every training category is a key of `frequencies`, so the train column
# maps cleanly.
X_train["A6"] = X_train["A6"].map(frequencies)

# Series.map returns NaN for values that are not keys of the dict, so a
# category that appears only in the test set would silently become NaN.
# Such a category has a training frequency of 0, so encode it as 0.0.
X_test["A6"] = X_test["A6"].map(frequencies).fillna(0.0)

## **Count encoding with Feature-engine**

In [9]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 15.4 MB/s eta 0:00:01[K     |██▌                             | 20 kB 10.0 MB/s eta 0:00:01[K     |███▊                            | 30 kB 9.6 MB/s eta 0:00:01[K     |█████                           | 40 kB 8.6 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 3.1 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 3.6 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 4.2 MB/s eta 0:00:01[K     |██████████                      | 81 kB 4.1 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 4.6 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 5.0 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 5.0 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 5.0 MB/

In [10]:
from feature_engine.encoding import CountFrequencyEncoder

In [11]:
# Split again from the untouched `data` frame: the pandas section above
# overwrote A6 and A7 in X_train / X_test in place, so fresh copies with the
# original category labels are needed. Same 30% hold-out and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),  # predictors: every column except the label
    data["target"],  # label
    test_size=0.3,
    random_state=0,
)

In [13]:
# Configure feature-engine's count/frequency encoder.
#   encoding_method="count" -> replace categories by their counts
#                              (use encoding_method="frequency" for frequencies)
#   variables=None          -> select all categorical variables
count_enc = CountFrequencyEncoder(encoding_method="count", variables=None)

In [14]:
# Fit the encoder on the training set only, so the per-category counts
# it learns come from the training data.
count_enc.fit(X_train)

CountFrequencyEncoder()

In [15]:
# Because variables=None was passed, the encoder selected the categorical
# variables itself; variables_ lists the columns it found.
count_enc.variables_

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [16]:
# The encoder_dict_ attribute holds the learned mapping: for each encoded
# variable, a {category: count} dictionary.
count_enc.encoder_dict_

{'A1': {'Missing': 4, 'a': 144, 'b': 335},
 'A10': {'f': 271, 't': 212},
 'A12': {'f': 263, 't': 220},
 'A13': {'g': 441, 'p': 4, 's': 38},
 'A4': {'Missing': 4, 'l': 1, 'u': 363, 'y': 115},
 'A5': {'Missing': 4, 'g': 363, 'gg': 1, 'p': 115},
 'A6': {'Missing': 4,
  'aa': 34,
  'c': 93,
  'cc': 30,
  'd': 21,
  'e': 21,
  'ff': 38,
  'i': 41,
  'j': 8,
  'k': 38,
  'm': 26,
  'q': 56,
  'r': 1,
  'w': 48,
  'x': 24},
 'A7': {'Missing': 4,
  'bb': 39,
  'dd': 5,
  'ff': 41,
  'h': 101,
  'j': 5,
  'n': 3,
  'o': 1,
  'v': 277,
  'z': 7},
 'A9': {'f': 227, 't': 256}}

In [17]:
# Apply the fitted encoder to both partitions (train first, then test).
X_train_enc, X_test_enc = (
    count_enc.transform(split) for split in (X_train, X_test)
)

In [18]:
# Preview the encoded training set: the categorical columns now hold
# counts in place of the category labels.
X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,144,46.08,3.0,363,363,93,277,2.375,256,212,8,220,441,396.0,4159
303,144,15.92,2.875,363,363,56,277,0.085,227,271,0,263,441,120.0,0
204,335,36.33,2.125,115,115,48,277,0.085,256,212,1,263,441,50.0,1187
351,335,22.17,0.585,115,115,38,41,0.0,227,271,0,263,441,100.0,0
118,335,57.83,7.04,363,363,26,277,14.0,256,212,6,220,441,360.0,1332


In [19]:
# Preview the encoded test set; it was transformed with the encoder
# fitted on the training set.
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,144,45.83,10.5,363,363,56,277,5.0,256,212,7,220,441,0.0,0
586,335,64.08,20.0,363,363,24,101,17.5,256,212,9,220,441,0.0,1000
140,144,31.25,3.75,363,363,30,101,0.625,256,212,9,220,441,181.0,0
492,335,39.25,9.5,363,363,26,277,6.5,256,212,14,263,441,240.0,4607
350,144,26.17,2.0,363,363,8,5,0.0,227,271,0,220,441,276.0,1


## **Count encoding with Category Encoders**

In [20]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 18.2 MB/s eta 0:00:01[K     |████████                        | 20 kB 11.5 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 9.2 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 8.0 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 3.8 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [21]:
from category_encoders.count import CountEncoder

In [22]:
# Set up the Category Encoders count encoder; cols=None makes it encode
# all categorical variables with counts.
# NOTE: this rebinds `count_enc`, replacing the feature-engine encoder
# created earlier in the notebook.
count_enc = CountEncoder(cols=None)
# for frequency encoding ==> set parameter normalize=True

In [23]:
# Fit the encoder on the training set only, so the counts it learns
# come from the training data.
count_enc.fit(X_train)

CountEncoder(cols=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'],
             combine_min_nan_groups=True)

In [24]:
# Because cols=None was passed, the encoder found the categorical
# variables automatically; after fitting, cols lists them.
count_enc.cols

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [25]:
# The encoding parameters are stored in the `mapping` attribute:
# one entry per encoded variable, holding the count of each category.
count_enc.mapping

{'A1': b          335
 a          144
 Missing      4
 Name: A1, dtype: int64, 'A10': f    271
 t    212
 Name: A10, dtype: int64, 'A12': f    263
 t    220
 Name: A12, dtype: int64, 'A13': g    441
 s     38
 p      4
 Name: A13, dtype: int64, 'A4': u          363
 y          115
 Missing      4
 l            1
 Name: A4, dtype: int64, 'A5': g          363
 p          115
 Missing      4
 gg           1
 Name: A5, dtype: int64, 'A6': c          93
 q          56
 w          48
 i          41
 ff         38
 k          38
 aa         34
 cc         30
 m          26
 x          24
 e          21
 d          21
 j           8
 Missing     4
 r           1
 Name: A6, dtype: int64, 'A7': v          277
 h          101
 ff          41
 bb          39
 z            7
 dd           5
 j            5
 Missing      4
 n            3
 o            1
 Name: A7, dtype: int64, 'A9': t    256
 f    227
 Name: A9, dtype: int64}

In [26]:
# Encode both partitions with the fitted encoder (train first, then test).
X_train_enc, X_test_enc = map(count_enc.transform, (X_train, X_test))

In [27]:
# Preview the encoded training set: the categorical columns now hold
# counts in place of the category labels.
X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,144,46.08,3.0,363,363,93,277,2.375,256,212,8,220,441,396.0,4159
303,144,15.92,2.875,363,363,56,277,0.085,227,271,0,263,441,120.0,0
204,335,36.33,2.125,115,115,48,277,0.085,256,212,1,263,441,50.0,1187
351,335,22.17,0.585,115,115,38,41,0.0,227,271,0,263,441,100.0,0
118,335,57.83,7.04,363,363,26,277,14.0,256,212,6,220,441,360.0,1332


In [28]:
# Preview the encoded test set; it was transformed with the encoder
# fitted on the training set.
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,144,45.83,10.5,363,363,56,277,5.0,256,212,7,220,441,0.0,0
586,335,64.08,20.0,363,363,24,101,17.5,256,212,9,220,441,0.0,1000
140,144,31.25,3.75,363,363,30,101,0.625,256,212,9,220,441,181.0,0
492,335,39.25,9.5,363,363,26,277,6.5,256,212,14,263,441,240.0,4607
350,144,26.17,2.0,363,363,8,5,0.0,227,271,0,220,441,276.0,1
