In [1]:
import os
import numpy as np
import pandas as pd

# Ordinal Encoding

In [2]:
# Toy data set (built with pandas) that the ordinal-encoding demo works on.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
    Class=[1, 1, 1, 0, 1, 0, 0, 1],
)

# Column order follows the dict's insertion order: Size, Color, Class.
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,Size,Color,Class
0,small,red,1
1,small,green,1
2,large,black,1
3,medium,white,0
4,large,blue,1
5,large,red,0
6,small,green,0
7,medium,black,1


In [3]:
# Ordinal encoding with scikit-learn.
from sklearn.preprocessing import OrdinalEncoder

# Columns whose string values are replaced by numeric codes.
categorical_cols = ["Size", "Color"]

# Fit the encoder on the two columns (fit() returns the encoder itself).
# NOTE(review): no explicit `categories=` is given; the displayed result maps
# large -> 0, medium -> 1, small -> 2, which is not the natural size order.
enc = OrdinalEncoder().fit(df[categorical_cols])

# Overwrite the string columns in `df` with their numeric codes.
# This mutates `df`, so the cell is meant to run once on the raw frame.
df[categorical_cols] = enc.transform(df[categorical_cols])
df

Unnamed: 0,Size,Color,Class
0,2.0,3.0,1
1,2.0,2.0,1
2,0.0,0.0,1
3,1.0,4.0,0
4,0.0,1.0,1
5,0.0,3.0,0
6,2.0,2.0,0
7,1.0,0.0,1


# One-Hot Encoding

In [4]:
# Fresh copy of the raw toy data for the one-hot section (the ordinal demo
# overwrote the string columns of the earlier frame).
data = {
    'Size': ['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
    'Color': ['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
    'Class': [1, 1, 1, 0, 1, 0, 0, 1],
}

column_order = list(data.keys())  # Size, Color, Class
df = pd.DataFrame(data, columns=column_order)
df

Unnamed: 0,Size,Color,Class
0,small,red,1
1,small,green,1
2,large,black,1
3,medium,white,0
4,large,blue,1
5,large,red,0
6,small,green,0
7,medium,black,1


In [5]:
# One-hot encoding with pandas.
# `Size` and `Color` are replaced by indicator columns that all share the
# "One" prefix; `Class` is left as it is.
# dtype=int pins the indicators to 0/1 integers: without it, pandas 2.0+
# produces bool columns that display as True/False instead of 0/1.
# `df` is overwritten, so this cell must run on the raw frame built just above.
df = pd.get_dummies(df, prefix="One", columns=['Size', 'Color'], dtype=int)
df

Unnamed: 0,Class,One_large,One_medium,One_small,One_black,One_blue,One_green,One_red,One_white
0,1,0,0,1,0,0,0,1,0
1,1,0,0,1,0,0,1,0,0
2,1,1,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0,0,1
4,1,1,0,0,0,1,0,0,0
5,0,1,0,0,0,0,0,1,0
6,0,0,0,1,0,0,1,0,0
7,1,0,1,0,1,0,0,0,0


In [6]:
# One-hot encoding with scikit-learn.
from sklearn.preprocessing import OneHotEncoder

# Rebuild the raw frame: the pandas cell above replaced `df` with its
# dummy-encoded version.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
    Class=[1, 1, 1, 0, 1, 0, 0, 1],
)
df = pd.DataFrame(data, columns=list(data))

# Fit and transform the two categorical columns; `.toarray()` is called on the
# encoder's result before it is wrapped in a DataFrame.
enc = OneHotEncoder(handle_unknown='ignore')
onehot_matrix = enc.fit_transform(df[['Size', 'Color']])
enc_df = pd.DataFrame(onehot_matrix.toarray())

# The indicator columns are named by position (0, 1, 2, ...) and are attached
# to the original frame by index.
df = df.join(enc_df)
df




Unnamed: 0,Size,Color,Class,0,1,2,3,4,5,6,7
0,small,red,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,small,green,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,large,black,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,medium,white,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,large,blue,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,large,red,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,small,green,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7,medium,black,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


# Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# Raw toy data for the label-encoding demo.
data = {'Size': ['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
        'Color': ['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
        'Class': [1, 1, 1, 0, 1, 0, 0, 1]}

df = pd.DataFrame(data, columns=['Size', 'Color', 'Class'])

# One LabelEncoder per column. Refitting a single shared instance on every
# column keeps only the classes of the last column it saw, so the mapping for
# the earlier column could no longer be inspected or inverted.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels (y); it is applied to feature columns here purely as a demonstration.
encoders = {}
for col in ["Size", "Color"]:
    encoders[col] = LabelEncoder()
    # Replace the string column with its integer codes.
    df[col] = encoders[col].fit_transform(df[col])

# Backward-compatible alias: previously `enc` ended up fitted on the last
# encoded column ("Color").
enc = encoders["Color"]

df

Unnamed: 0,Size,Color,Class
0,2,3,1
1,2,2,1
2,0,0,1
3,1,4,0
4,0,1,1
5,0,3,0
6,2,2,0
7,1,0,1


# Helmert Encoding

In [8]:
# Single-column frame with four distinct sizes for the Helmert demo.
data = dict(
    Size=['small', 'small', 'small', 'small',
          'medium', 'medium', 'medium',
          'large', 'large',
          'x-large'],
)
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,Size
0,small
1,small
2,small
3,small
4,medium
5,medium
6,medium
7,large
8,large
9,x-large


In [9]:
import category_encoders as ce

# Helmert-encode the `Size` column.
enc = ce.HelmertEncoder()
size_column = df['Size']

# `df` is replaced by the encoder output (columns intercept, Size_0, ...), so
# the original `Size` column is no longer in `df` after this cell has run.
df = enc.fit_transform(size_column)
df

Unnamed: 0,intercept,Size_0,Size_1,Size_2
0,1,-1.0,-1.0,-1.0
1,1,-1.0,-1.0,-1.0
2,1,-1.0,-1.0,-1.0
3,1,-1.0,-1.0,-1.0
4,1,1.0,-1.0,-1.0
5,1,1.0,-1.0,-1.0
6,1,1.0,-1.0,-1.0
7,1,0.0,2.0,-1.0
8,1,0.0,2.0,-1.0
9,1,0.0,0.0,3.0


# Binary Encoding

In [10]:
import category_encoders as ce

# Raw toy data for the binary-encoding demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
    Class=[1, 1, 1, 0, 1, 0, 0, 1],
)
df = pd.DataFrame(data, columns=list(data))

# Only these columns are binary-encoded; in the displayed result `Class`
# comes through unchanged.
categorical_cols = ['Color', 'Size']
enc = ce.BinaryEncoder(cols=categorical_cols)

# The encoded result goes into a new frame; `df` keeps the raw values.
df_binary = enc.fit_transform(df)

df_binary

Unnamed: 0,Size_0,Size_1,Color_0,Color_1,Color_2,Class
0,0,1,0,0,1,1
1,0,1,0,1,0,1
2,1,0,0,1,1,1
3,1,1,1,0,0,0
4,1,0,1,0,1,1
5,1,0,0,0,1,0
6,0,1,0,1,0,0
7,1,1,0,1,1,1


# Frequency Encoding

In [11]:
# Raw toy data for the frequency-encoding demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black'],
    Class=[1, 1, 1, 0, 1, 0, 0, 1],
)
df = pd.DataFrame(data, columns=list(data))

# Relative frequency of each colour: rows per colour divided by total rows.
row_count = len(df)
frequency = df.groupby('Color').size().div(row_count)

# Look up each row's colour in the frequency table and store it in a new column.
df['Frequency'] = df['Color'].map(frequency)
df

Unnamed: 0,Size,Color,Class,Frequency
0,small,red,1,0.25
1,small,green,1,0.25
2,large,black,1,0.25
3,medium,white,0,0.125
4,large,blue,1,0.125
5,large,red,0,0.25
6,small,green,0,0.25
7,medium,black,1,0.25


# Mean Encoding

In [12]:
# Ten-row toy data set with a binary `Target` column for the mean-encoding demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 1, 1, 0, 1, 0, 0, 1, 1, 0],
)

# Column order follows the dict's insertion order: Size, Color, Target.
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,Size,Color,Target
0,small,red,1
1,small,green,1
2,large,black,1
3,medium,white,0
4,large,blue,1
5,large,red,0
6,small,green,0
7,medium,black,1
8,small,red,1
9,medium,red,0


In [13]:
from category_encoders.target_encoder import TargetEncoder

# Library-based variant, left disabled in this notebook:
#TE_encoder = TargetEncoder(drop_invariant=True)
#df = TE_encoder.fit_transform(df['Color'], df['Target'])
#df

# Manual mean encoding: average `Target` within each `Color` group.
mean_encoding = df['Target'].groupby(df['Color']).mean()
print(mean_encoding)

# Attach each row's per-colour mean as a new column on `df`.
df['Mean_encoding'] = df['Color'].map(mean_encoding)
df



Color
black    1.0
blue     1.0
green    0.5
red      0.5
white    0.0
Name: Target, dtype: float64


Unnamed: 0,Size,Color,Target,Mean_encoding
0,small,red,1,0.5
1,small,green,1,0.5
2,large,black,1,1.0
3,medium,white,0,0.0
4,large,blue,1,1.0
5,large,red,0,0.5
6,small,green,0,0.5
7,medium,black,1,1.0
8,small,red,1,0.5
9,medium,red,0,0.5


# Sum Encoding

In [14]:
from category_encoders import SumEncoder

# Raw toy data for the sum-encoding demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# Encode only the `Size` column; `Target` is handed to fit_transform as y.
sum_encoder = SumEncoder()
size_feature, target = df['Size'], df['Target']
df_encoded = sum_encoder.fit_transform(size_feature, target)

df_encoded

Unnamed: 0,intercept,Size_0,Size_1
0,1,1.0,0.0
1,1,1.0,0.0
2,1,0.0,1.0
3,1,-1.0,-1.0
4,1,0.0,1.0
5,1,0.0,1.0
6,1,1.0,0.0
7,1,-1.0,-1.0
8,1,1.0,0.0
9,1,-1.0,-1.0


# Weight of Evidence

In [15]:
from category_encoders import WOEEncoder

# Raw toy data for the weight-of-evidence demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# The regularization term mostly exists to prevent division by zero; it is
# switched off (0) for this demo.
# NOTE(review): confirm that 0 is safe for categories containing only one
# target class.
woe = WOEEncoder(random_state=42, regularization=0)

# Encode `Size` against the binary `Target`.
size_feature, target = df['Size'], df['Target']
df_encoded = woe.fit_transform(size_feature, target)
df_encoded


Unnamed: 0,Size
0,0.0
1,0.0
2,0.693147
3,-0.693147
4,0.693147
5,0.693147
6,0.0
7,-0.693147
8,0.0
9,-0.693147


# Probability Ratio Encoding

In [16]:
# Raw toy data for the probability-ratio demo.
data = {'Size': ['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
        'Color': ['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
        'Target': [1, 0, 1, 0, 1, 0, 0, 1, 1, 0]}

df = pd.DataFrame(data, columns=['Size', 'Color', 'Target'])

# Value substituted for P(target != 1) when it is exactly 0, so that the ratio
# below never divides by zero.
EPSILON = 0.00001

# Probability of the target being 1, per colour.
probability_encoding_1 = df.groupby('Color')['Target'].mean()

# Probability of the target not being 1, per colour.
probability_encoding_0 = 1 - probability_encoding_1

# Replace exact zeros with EPSILON. Series.mask keeps the colour index, so the
# division below is aligned by label instead of relying on the positional
# order of a bare NumPy array.
probability_encoding_0 = probability_encoding_0.mask(probability_encoding_0 == 0, EPSILON)

# Probability ratio P(target = 1) / P(target != 1), indexed by colour.
df_encoded = probability_encoding_1 / probability_encoding_0

# Map the per-colour ratio back onto every row.
df['Proba_Ratio'] = df['Color'].map(df_encoded)
df

Unnamed: 0,Size,Color,Target,Proba_Ratio
0,small,red,1,1.0
1,small,green,0,0.0
2,large,black,1,100000.0
3,medium,white,0,0.0
4,large,blue,1,100000.0
5,large,red,0,1.0
6,small,green,0,0.0
7,medium,black,1,100000.0
8,small,red,1,1.0
9,medium,red,0,1.0


# Hashing Encoding

In [34]:
import category_encoders as ce

# Raw toy data for the hashing-encoding demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# n_components is the number of bits wanted in the hash value; the displayed
# result has three columns (col_0 .. col_2).
encoder_purpose = ce.HashingEncoder(n_components=3, hash_method="sha256")

# Hash-encode the "Size" feature only.
size_feature = df['Size']
df_encoded = encoder_purpose.fit_transform(size_feature)
df_encoded

Unnamed: 0,col_0,col_1,col_2
0,0,1,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0
5,1,0,0
6,0,1,0
7,0,1,0
8,0,1,0
9,0,1,0


# Backward difference encoding

In [35]:
import category_encoders as ce

# Raw toy data for the backward-difference demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# Backward-difference encode `Size`; only that column is passed to the encoder.
encoder = ce.BackwardDifferenceEncoder(cols=['Size'])
size_feature = df['Size']
df_encoded = encoder.fit_transform(size_feature)
df_encoded

Unnamed: 0,intercept,Size_0,Size_1
0,1,-0.666667,-0.333333
1,1,-0.666667,-0.333333
2,1,0.333333,-0.333333
3,1,0.333333,0.666667
4,1,0.333333,-0.333333
5,1,0.333333,-0.333333
6,1,-0.666667,-0.333333
7,1,0.333333,0.666667
8,1,-0.666667,-0.333333
9,1,0.333333,0.666667


# Leave One Out Encoder

In [36]:
import category_encoders as ce

# Raw toy data for the leave-one-out demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# Leave-one-out encode `Color` against `Target`.
# Feature and target go through a single fit_transform call so the encoder
# sees the target while transforming the training rows.
encoder = ce.LeaveOneOutEncoder(cols=['Color'])
color_feature, target = df['Color'], df['Target']
df_encoded = encoder.fit_transform(color_feature, target)
df_encoded

Unnamed: 0,Color
0,0.333333
1,0.0
2,1.0
3,0.5
4,0.5
5,0.666667
6,0.0
7,1.0
8,0.333333
9,0.666667


# James-Stein Encoder

In [37]:
import category_encoders as ce

# Raw toy data for the James-Stein demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# James-Stein encode `Color`, with `Target` supplied as y.
encoder = ce.JamesSteinEncoder(cols=['Color'])
color_feature, target = df['Color'], df['Target']
df_encoded = encoder.fit_transform(color_feature, target)

df_encoded


Unnamed: 0,Color
0,0.5
1,0.0
2,1.0
3,0.0
4,1.0
5,0.5
6,0.0
7,1.0
8,0.5
9,0.5


# M-Estimator Encoding

In [39]:
# Raw toy data for the M-estimate demo.
data = dict(
    Size=['small', 'small', 'large', 'medium', 'large', 'large', 'small', 'medium', 'small', 'medium'],
    Color=['red', 'green', 'black', 'white', 'blue', 'red', 'green', 'black', 'red', 'red'],
    Target=[1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
)
df = pd.DataFrame(data, columns=list(data))

# M-estimate encode `Color`, with `Target` supplied as y.
# `ce` (category_encoders) is not imported in this cell; it must already have
# been imported by an earlier cell.
encoder = ce.MEstimateEncoder(cols=['Color'])
color_feature, target = df['Color'], df['Target']
df_encoded = encoder.fit_transform(color_feature, target)
df_encoded

      Color
0  0.500000
1  0.166667
2  0.833333
3  0.250000
4  0.750000
5  0.500000
6  0.166667
7  0.833333
8  0.500000
9  0.500000
