# Multiclass Target Encoder

### Imports

In [1]:
import pandas as pd
import numpy as np

### Synthetic Dataset

In [2]:
# Seed the legacy global NumPy RNG so the 20 class labels (0, 1 or 2) are reproducible.
np.random.seed(999)
target = [*np.random.randint(low=0, high=3, size=20)]

In [3]:
# The three levels of the categorical feature; list position is the integer code
# used when sampling the `genres` column.
genre = [
    "Romance",
    "Fantasy",
    "Nonfiction",
]

In [4]:
# Re-seed so the genre column is reproducible independently of the target column,
# then draw 20 random positions into `genre` and look up the corresponding names.
np.random.seed(123)
genres = [
    genre[position]
    for position in np.random.randint(low=0, high=len(genre), size=20)
]

In [5]:
# Assemble the toy dataset: one categorical feature and the multiclass label.
df = pd.DataFrame(data=dict(genre=genres, target=target))

In [6]:
# Bare last expression: displays the synthetic frame (columns `genre` and `target`).
df

Unnamed: 0,genre,target
0,Nonfiction,0
1,Fantasy,0
2,Nonfiction,1
3,Nonfiction,1
4,Romance,0
5,Nonfiction,1
6,Nonfiction,1
7,Fantasy,0
8,Nonfiction,1
9,Fantasy,1


## Manual

### Using the mean - wrong way
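Calculating only the posteriors. This is wrong for a multiclass target: averaging the class labels (0, 1, 2) treats them as ordered numeric values rather than as distinct categories.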

In [7]:
# Per-genre row count and mean of the raw numeric label.
stats = df.groupby('genre')['target'].agg(['count', 'mean'])
stats

Unnamed: 0_level_0,count,mean
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Fantasy,7,0.714286
Nonfiction,9,1.0
Romance,4,0.5


In [8]:
# Look up, for every row, the mean of the raw numeric target for that row's genre.
# Assigning the column directly (instead of re-binding `df` to a join + rename)
# keeps this cell idempotent: re-running it overwrites the column rather than
# appending a second, duplicate `genre_encoded_mean` column.
df['genre_encoded_mean'] = df['genre'].map(stats['mean'])
df

Unnamed: 0,genre,target,genre_encoded_mean
0,Nonfiction,0,1.0
1,Fantasy,0,0.714286
2,Nonfiction,1,1.0
3,Nonfiction,1,1.0
4,Romance,0,0.5
5,Nonfiction,1,1.0
6,Nonfiction,1,1.0
7,Fantasy,0,0.714286
8,Nonfiction,1,1.0
9,Fantasy,1,0.714286


### Category Encoders (scikit-learn-contrib) - wrong way

https://contrib.scikit-learn.org/category_encoders/targetencoder.html

In [9]:
from category_encoders import TargetEncoder

In [10]:
# Target encoder constructed with its default settings.
# NOTE(review): the encoded values shown further down differ slightly from the raw
# group means, presumably because of the encoder's default smoothing; confirm
# against the library documentation.
encoder = TargetEncoder()

In [11]:
# Fit on the raw multiclass label and encode the same rows in one step; the label
# is passed as-is, so it is treated as a single numeric target.
genre_encoded = encoder.fit_transform(df['genre'], df['target'])
df['genre_encoded_sklearn'] = genre_encoded

In [12]:
# Show the manual group-mean encoding next to the TargetEncoder output.
df

Unnamed: 0,genre,target,genre_encoded_mean,genre_encoded_sklearn
0,Nonfiction,0,1.0,0.999933
1,Fantasy,0,0.714286,0.714498
2,Nonfiction,1,1.0,0.999933
3,Nonfiction,1,1.0,0.999933
4,Romance,0,0.5,0.514228
5,Nonfiction,1,1.0,0.999933
6,Nonfiction,1,1.0,0.999933
7,Fantasy,0,0.714286,0.714498
8,Nonfiction,1,1.0,0.999933
9,Fantasy,1,0.714286,0.714498


### Using Probabilities
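Calculating only the posteriors, i.e. the conditional probability $P(\text{target} = t \mid \text{genre})$ for each class $t$, estimated as the fraction of rows of that genre whose target equals $t$.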

In [13]:
# Remove the columns produced by the two "wrong way" demonstrations before
# building the per-class encodings. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the columns are already gone.
df = df.drop(columns=['genre_encoded_mean', 'genre_encoded_sklearn'], errors='ignore')

In [14]:
# Build one record per genre holding the number of rows and, for every class
# label, how many of those rows carry that label.
categories = df['genre'].unique()
targets = df['target'].unique()
cat_list = []
for genre_name in categories:
    # Class frequencies among the rows belonging to this genre.
    class_counts = df.loc[df['genre'] == genre_name, 'target'].value_counts()
    record = {'category': genre_name, 'count': sum(class_counts)}
    # Labels that never occur for this genre get an explicit zero count.
    record.update(
        {'target_' + str(label): class_counts.get(label, 0) for label in targets}
    )
    cat_list.append(record)

In [15]:
# Turn the per-genre records into a table (one row per genre, one column per key).
cat_list = pd.DataFrame(data=cat_list)

In [16]:
# Convert each per-class count into a conditional probability by dividing it by
# the number of rows of that genre.
for label in targets:
    class_count = cat_list['target_' + str(label)]
    cat_list['genre_encoded_prob_target_' + str(label)] = class_count.div(cat_list['count'])

In [17]:
# Per-genre summary: row count, raw class counts and the derived probabilities.
cat_list

Unnamed: 0,category,count,target_0,target_1,target_2,genre_encoded_prob_target_0,genre_encoded_prob_target_1,genre_encoded_prob_target_2
0,Nonfiction,9,2,5,2,0.222222,0.555556,0.222222
1,Fantasy,7,3,3,1,0.428571,0.428571,0.142857
2,Romance,4,3,0,1,0.75,0.0,0.25


In [18]:
# Attach the per-genre class probabilities to every row of `df`.
prob_columns = ['genre_encoded_prob_target_' + str(t) for t in targets]
# Lookup table indexed by genre name, restricted to the probability columns.
probs_by_genre = cat_list.set_index('category')[prob_columns]
# Dropping any previously attached probability columns first makes the cell
# idempotent; a plain join would raise ValueError on overlapping column names
# when the cell is executed a second time.
df = df.drop(columns=prob_columns, errors='ignore').join(probs_by_genre, on='genre', how='left')
df

Unnamed: 0,genre,target,genre_encoded_prob_target_0,genre_encoded_prob_target_1,genre_encoded_prob_target_2
0,Nonfiction,0,0.222222,0.555556,0.222222
1,Fantasy,0,0.428571,0.428571,0.142857
2,Nonfiction,1,0.222222,0.555556,0.222222
3,Nonfiction,1,0.222222,0.555556,0.222222
4,Romance,0,0.75,0.0,0.25
5,Nonfiction,1,0.222222,0.555556,0.222222
6,Nonfiction,1,0.222222,0.555556,0.222222
7,Fantasy,0,0.428571,0.428571,0.142857
8,Nonfiction,1,0.222222,0.555556,0.222222
9,Fantasy,1,0.428571,0.428571,0.142857


### Using the library - right way

In [19]:
from category_encoders import TargetEncoder

In [20]:
# One-vs-rest target encoding: fit a separate TargetEncoder per class label,
# each time against a binary indicator of "row belongs to this class".
targets = df['target'].unique()
for t in targets:
    # Vectorized 0/1 indicator (replaces a row-wise apply with a lambda).
    target_aux = (df['target'] == t).astype(int)
    # A fresh encoder per class so no fitted state is shared between classes.
    encoder = TargetEncoder()
    df['genre_encoded_sklearn_target_' + str(t)] = encoder.fit_transform(df['genre'], target_aux)

In [21]:
# Compare the manual probabilities with the per-class TargetEncoder columns.
df

Unnamed: 0,genre,target,genre_encoded_prob_target_0,genre_encoded_prob_target_1,genre_encoded_prob_target_2,genre_encoded_sklearn_target_0,genre_encoded_sklearn_target_1,genre_encoded_sklearn_target_2
0,Nonfiction,0,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
1,Fantasy,0,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998
2,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
3,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
4,Romance,0,0.75,0.0,0.25,0.733401,0.01897,0.247629
5,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
6,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
7,Fantasy,0,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998
8,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
9,Fantasy,1,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998


### Using the mean - right way

In [22]:
from category_encoders import TargetEncoder

In [23]:
# One-vs-rest mean encoding: for each class label, the per-genre mean of the 0/1
# indicator "row belongs to this class", i.e. the fraction of that genre's rows
# carrying the label.
targets = df['target'].unique()
for t in targets:
    is_class_t = (df['target'] == t).astype(int)
    # transform('mean') broadcasts each genre's mean back onto its rows, so no
    # temporary column, join or rename is needed and re-running the cell simply
    # overwrites the column instead of appending a duplicate.
    df['genre_encoded_mean_target_' + str(t)] = is_class_t.groupby(df['genre']).transform('mean')

In [24]:
# Final comparison of the three per-class encodings (probabilities, library, mean).
df

Unnamed: 0,genre,target,genre_encoded_prob_target_0,genre_encoded_prob_target_1,genre_encoded_prob_target_2,genre_encoded_sklearn_target_0,genre_encoded_sklearn_target_1,genre_encoded_sklearn_target_2,genre_encoded_mean_target_0,genre_encoded_mean_target_1,genre_encoded_mean_target_2
0,Nonfiction,0,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
1,Fantasy,0,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
2,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
3,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
4,Romance,0,0.75,0.0,0.25,0.733401,0.01897,0.247629,0.75,0.0,0.25
5,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
6,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
7,Fantasy,0,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
8,Nonfiction,1,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
9,Fantasy,1,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
