# Target Encoder

### Imports

In [1]:
import pandas as pd
import numpy as np

### Synthetic Dataset

In [2]:
# Seed passed to np.random.seed(...) before each random draw below, so the synthetic data is reproducible.
seed = 321

In [3]:
# Seed NumPy's global RNG, then draw 20 binary labels (high=2 is exclusive, so values are 0 or 1).
np.random.seed(seed)
target = [*np.random.randint(low=0, high=2, size=20)]

In [4]:
# Labels that the synthetic `genre` column is sampled from.
genre = [
    "Sci Fi",
    "Drama",
    "Romance",
    "Fantasy",
    "Nonfiction",
]

In [5]:
# Re-seed with the same seed, sample 20 positions into `genre`, then look up the labels.
np.random.seed(seed)
genre_positions = np.random.randint(low=0, high=len(genre), size=20)
genres = [genre[pos] for pos in genre_positions]

In [6]:
# Two-column frame: the categorical feature and its binary target.
df = pd.DataFrame(dict(genre=genres, target=target))

In [7]:
# Display the synthetic dataset.
df

Unnamed: 0,genre,target
0,Nonfiction,0
1,Romance,0
2,Nonfiction,0
3,Drama,1
4,Sci Fi,1
5,Drama,0
6,Sci Fi,1
7,Romance,0
8,Sci Fi,0
9,Nonfiction,0


### Using Probabilities
Compute only the posterior for each category, i.e. the conditional probability $P(\text{target} = 1 \mid \text{genre})$, by counting rows per genre and target class.

In [8]:
# Build one record per genre holding the number of rows in that genre and
# how many of those rows fall into each target class.
categories = df['genre'].unique()
targets = df['target'].unique()
cat_list = []
for cat in categories:
    aux_df = df[df['genre'] == cat]
    counts = aux_df['target'].value_counts()
    aux_dict = {'category': cat, 'count': counts.sum()}
    for t in targets:
        # value_counts() has no entry for a class that never occurs in this
        # genre, so counts[t] would raise KeyError; record 0 in that case.
        aux_dict['target_' + str(t)] = counts[t] if t in counts.index else 0
    cat_list.append(aux_dict)

In [9]:
# Turn the per-genre records into a table: one row per genre, one column per record key.
cat_list = pd.DataFrame(data=cat_list)

In [10]:
# Share of rows with target == 1 within each genre.
positive_rows = cat_list['target_1']
total_rows = cat_list['count']
cat_list['genre_encoded_prob'] = positive_rows / total_rows

In [11]:
# Display the per-genre counts together with the encoded probability.
cat_list

Unnamed: 0,category,count,target_0,target_1,genre_encoded_prob
0,Nonfiction,4,3,1,0.25
1,Romance,5,4,1,0.2
2,Drama,3,1,2,0.666667
3,Sci Fi,4,2,2,0.5
4,Fantasy,4,1,3,0.75


In [12]:
# Attach each row's genre probability. Mapping `genre` through a
# genre-indexed Series yields the same values as a left join on `genre`,
# and assign() overwrites the column when it already exists, so the cell can
# be re-run safely (a repeated join fails on the overlapping column name).
prob_by_genre = cat_list.set_index('category')['genre_encoded_prob']
df = df.assign(genre_encoded_prob = df['genre'].map(prob_by_genre))
df

Unnamed: 0,genre,target,genre_encoded_prob
0,Nonfiction,0,0.25
1,Romance,0,0.2
2,Nonfiction,0,0.25
3,Drama,1,0.666667
4,Sci Fi,1,0.5
5,Drama,0,0.666667
6,Sci Fi,1,0.5
7,Romance,0,0.2
8,Sci Fi,0,0.5
9,Nonfiction,0,0.25


### Using the mean
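Compute the same posteriors directly as the per-genre mean of the target (for a 0/1 target, the mean equals $P(\text{target} = 1 \mid \text{genre})$).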

In [13]:
# Row count and mean of the target for every genre; the result is indexed by genre.
stats = df.groupby('genre')['target'].agg(['count', 'mean'])
stats

Unnamed: 0_level_0,count,mean
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Drama,3,0.666667
Fantasy,4,0.75
Nonfiction,4,0.25
Romance,5,0.2
Sci Fi,4,0.5


In [14]:
# Attach the per-genre target mean to every row. Using assign() + map()
# instead of join() + rename() keeps the cell idempotent: a repeated
# join/rename would silently append a second 'genre_encoded_mean' column,
# whereas assign() overwrites the existing one.
df = df.assign(genre_encoded_mean = df['genre'].map(stats['mean']))
df

Unnamed: 0,genre,target,genre_encoded_prob,genre_encoded_mean
0,Nonfiction,0,0.25,0.25
1,Romance,0,0.2,0.2
2,Nonfiction,0,0.25,0.25
3,Drama,1,0.666667,0.666667
4,Sci Fi,1,0.5,0.5
5,Drama,0,0.666667,0.666667
6,Sci Fi,1,0.5,0.5
7,Romance,0,0.2,0.2
8,Sci Fi,0,0.5,0.5
9,Nonfiction,0,0.25,0.25


### With Smoothing
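The `category_encoders` `TargetEncoder` smooths each category mean toward the prior (the global target mean): $\text{encoded} = \text{prior}\,(1 - \lambda) + \text{mean}\,\lambda$, where $\lambda = 1 / \left(1 + e^{-(n - \text{min\_samples\_leaf}) / \text{smoothing}}\right)$ and $n$ is the category count.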

In [15]:
# Parameters of the sigmoid weight computed in the next cells.
# smoothing_factor divides the exponent: larger values give a flatter sigmoid.
smoothing_factor = 1.0
# min_samples_leaf is subtracted from the category count: a category with
# exactly this many rows gets weight 0.5.
min_samples_leaf = 1

In [16]:
# Prior: mean of the target over the whole dataset.
target_column = df['target']
prior = target_column.mean()

In [17]:
# Sigmoid weight per genre: 0.5 when count == min_samples_leaf, approaching 1 as the count grows.
exponent = -(stats['count'] - min_samples_leaf) / smoothing_factor
smoove = 1 / (1 + np.exp(exponent))

In [18]:
# Blend the prior and the per-genre mean using the sigmoid weight.
prior_term = prior * (1 - smoove)
genre_term = stats['mean'] * smoove
smoothing = prior_term + genre_term

In [19]:
# Name the smoothed values so the name can serve as the column label when attached to `df`.
encoded_name = 'genre_encoded_smoothing'
encoded = pd.Series(data=smoothing, name=encoded_name)
encoded

genre
Drama         0.640839
Fantasy       0.735772
Nonfiction    0.259485
Romance       0.204497
Sci Fi        0.497629
Name: genre_encoded_smoothing, dtype: float64

In [20]:
# Attach the smoothed encoding to every row under the Series' own name.
# assign() + map() gives the same values as a left join on `genre` but
# overwrites the column when it already exists, so the cell can be re-run
# (a repeated join fails on the overlapping column name).
df = df.assign(**{encoded.name: df['genre'].map(encoded)})
df

Unnamed: 0,genre,target,genre_encoded_prob,genre_encoded_mean,genre_encoded_smoothing
0,Nonfiction,0,0.25,0.25,0.259485
1,Romance,0,0.2,0.2,0.204497
2,Nonfiction,0,0.25,0.25,0.259485
3,Drama,1,0.666667,0.666667,0.640839
4,Sci Fi,1,0.5,0.5,0.497629
5,Drama,0,0.666667,0.666667,0.640839
6,Sci Fi,1,0.5,0.5,0.497629
7,Romance,0,0.2,0.2,0.204497
8,Sci Fi,0,0.5,0.5,0.497629
9,Nonfiction,0,0.25,0.25,0.259485


### scikit-learn-contrib `category_encoders`

https://contrib.scikit-learn.org/category_encoders/targetencoder.html

In [21]:
from category_encoders import TargetEncoder

In [22]:
# Pass the smoothing parameters explicitly rather than relying on library
# defaults, so the encoder uses the same values as the manual computation
# above whichever category_encoders version is installed (the defaults are
# not the same in every release).
encoder = TargetEncoder(min_samples_leaf = min_samples_leaf, smoothing = smoothing_factor)

In [23]:
# Fit the encoder on genre vs. target and store the transformed values next to the manual encodings.
library_encoded = encoder.fit_transform(df['genre'], df['target'])
df['genre_encoded_sklearn'] = library_encoded

In [24]:
# Display all encodings side by side for comparison.
df

Unnamed: 0,genre,target,genre_encoded_prob,genre_encoded_mean,genre_encoded_smoothing,genre_encoded_sklearn
0,Nonfiction,0,0.25,0.25,0.259485,0.259485
1,Romance,0,0.2,0.2,0.204497,0.204497
2,Nonfiction,0,0.25,0.25,0.259485,0.259485
3,Drama,1,0.666667,0.666667,0.640839,0.640839
4,Sci Fi,1,0.5,0.5,0.497629,0.497629
5,Drama,0,0.666667,0.666667,0.640839,0.640839
6,Sci Fi,1,0.5,0.5,0.497629,0.497629
7,Romance,0,0.2,0.2,0.204497,0.204497
8,Sci Fi,0,0.5,0.5,0.497629,0.497629
9,Nonfiction,0,0.25,0.25,0.259485,0.259485
