In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy data: one nominal column named 'color' whose last entry is missing.
df = pd.Series(['red', 'green', 'blue', 'red', np.nan], name='color').to_frame()
df

Unnamed: 0,color
0,red
1,green
2,blue
3,red
4,


# --- Manual Way ---

This approach also lets you explicitly specify the categories and their corresponding numerical values, which is especially useful for encoding ordinal data.

Use this method if the data contain NaN values.

### Forward Mapping

In [3]:
# Give every distinct, non-missing label a consecutive integer code.
# np.unique returns the labels sorted, so the codes follow that order.
observed_labels = np.unique(df['color'].dropna())
class_mapping = dict(zip(observed_labels, range(len(observed_labels))))
class_mapping

{'blue': 0, 'green': 1, 'red': 2}

In [4]:
# Swap each label for its integer code. The row with the missing color has no
# entry in class_mapping and stays NaN, so the column is displayed as floats.
encoded_colors = df['color'].map(class_mapping)
df['color'] = encoded_colors
df

Unnamed: 0,color
0,2.0
1,1.0
2,0.0
3,2.0
4,


### Inverse Mapping

In [5]:
# Flip class_mapping around: integer code -> original label.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
inv_class_mapping

{0: 'blue', 1: 'green', 2: 'red'}

In [6]:
# Translate the codes back to labels. The float codes (2.0, 1.0, ...) still
# match the integer keys of inv_class_mapping, and the NaN row stays NaN.
decoded_colors = df['color'].map(inv_class_mapping)
df['color'] = decoded_colors
df

Unnamed: 0,color
0,red
1,green
2,blue
3,red
4,


# --- Label Encoder ---

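Suitable for nominal categories. Note that scikit-learn intends `LabelEncoder` for encoding target labels (`y`) rather than input features. DOES NOT HANDLE NaN VALUES!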

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Same toy column as before, but without the missing value.
df = pd.Series(['red', 'green', 'blue', 'red'], name='color').to_frame()
df

Unnamed: 0,color
0,red
1,green
2,blue
3,red


### Forward Mapping

In [9]:
# Learn the label -> integer mapping and apply it in a single step.
# The fitted encoder is kept so the codes can be decoded again later.
encoder = LabelEncoder()
df['color'] = encoder.fit_transform(df['color'])
df

Unnamed: 0,color
0,2
1,1
2,0
3,2


### Inverse Mapping

In [10]:
# Decode the integer codes back to the original string labels
# with the encoder that was fitted on this column.
original_labels = encoder.inverse_transform(df['color'])
df['color'] = original_labels
df

Unnamed: 0,color
0,red
1,green
2,blue
3,red


# Category_Encoders Package

If you have a column with values car, bus, and truck (NOMINAL DATA) you should first encode this nominal data using OrdinalEncoder. Then encode it again using one of the methods appropriate to nominal data that we’ll explore below.

Stopping after you use OrdinalEncoder is a bad idea. Your machine learning algorithm will treat the variable as continuous and assume the values are on a meaningful scale.

In contrast, if your column values are truly ordinal, that means that the integer assigned to each value is meaningful. Assignment should be done with intention. Say your column had the string values “First”, “Third”, and “Second” in it. Those values should be mapped to the corresponding integers by passing OrdinalEncoder a list of dicts via its `mapping` parameter, e.g. `mapping=[{'col': 'rank', 'mapping': {'First': 1, 'Second': 2, 'Third': 3}}]`.

In [11]:
# pip install category_encoders
import category_encoders as ce

In [12]:
# Training data: one categorical feature (with a missing entry in row 4)
# and an integer target.
df_train = pd.DataFrame(
    dict(
        col_1=['a', 'c', 'a', 'a', np.nan, 'b', 'b'],
        target=[1, 2, 0, 0, 0, 1, 1],
    )
)
df_train

Unnamed: 0,col_1,target
0,a,1
1,c,2
2,a,0
3,a,0
4,,0
5,b,1
6,b,1


In [13]:
# Test data with the same layout as the training frame; row 2 has a missing
# category.
df_test = pd.DataFrame(
    dict(
        col_1=['c', 'b', np.nan, 'b', 'a'],
        target=[0, 2, 0, 1, 1],
    )
)
df_test

Unnamed: 0,col_1,target
0,c,0
1,b,2
2,,0
3,b,1
4,a,1


# Ordinal Encoder

Can also be used as a label encoder.

https://contrib.scikit-learn.org/category_encoders/ordinal.html

In [14]:
# Ordinal-encode only 'col_1'. With handle_missing='return_nan' the missing
# entries are left as NaN in the encoded output (see the results below).
encoder = ce.OrdinalEncoder(cols=['col_1'], handle_missing='return_nan')

### Train Set

In [15]:
# Fit the ordinal encoder on the training column and display its encoding.
encoder.fit_transform(df_train['col_1'])

Unnamed: 0,col_1
0,1.0
1,2.0
2,1.0
3,1.0
4,
5,4.0
6,4.0


### Test Set

In [16]:
# Reuse the encoder fitted on the training column (no refit), so the test
# categories receive the same codes as in the training output.
encoder.transform(df_test['col_1'])

Unnamed: 0,col_1
0,2.0
1,4.0
2,
3,4.0
4,1.0


# Target Encoder

https://contrib.scikit-learn.org/category_encoders/targetencoder.html

In [17]:
# Target encoder restricted to 'col_1'; missing entries are returned as NaN.
encoder = ce.TargetEncoder(
    cols=['col_1'],
    handle_missing='return_nan',
)

In [18]:
# Fit on the training column together with the target and display the
# resulting encoding of the training rows.
encoder.fit_transform(df_train['col_1'], df_train['target'])

Unnamed: 0,col_1
0,0.378744
1,0.714286
2,0.378744
3,0.378744
4,
5,0.92316
6,0.92316


In [19]:
# Encode the test column with the already-fitted encoder; no target is passed.
encoder.transform(df_test['col_1'])

Unnamed: 0,col_1
0,0.714286
1,0.92316
2,
3,0.92316
4,0.378744


# Leave One Out Encoder

https://contrib.scikit-learn.org/category_encoders/leaveoneout.html

In [20]:
# Leave-one-out encoder restricted to 'col_1'; missing entries are returned
# as NaN and the random state is fixed.
encoder = ce.LeaveOneOutEncoder(
    cols=['col_1'],
    handle_missing='return_nan',
    random_state=0,
)

In [21]:
# Fit on the training column and target. In the output below, rows sharing a
# category can receive different values on the training set (e.g. the 'a'
# rows), unlike the single per-category value produced for the test set.
encoder.fit_transform(df_train['col_1'], df_train['target'])

Unnamed: 0,col_1
0,0.0
1,0.714286
2,0.5
3,0.5
4,
5,1.0
6,1.0


In [22]:
# Encode the test column with the already-fitted encoder; no target is passed,
# so each category maps to one value.
encoder.transform(df_test['col_1'])

Unnamed: 0,col_1
0,0.714286
1,1.0
2,
3,1.0
4,0.333333


# CatBoost Encoder

https://contrib.scikit-learn.org/category_encoders/catboost.html

In [23]:
# CatBoost encoder restricted to 'col_1'; missing entries are returned as NaN
# and the random state is fixed.
encoder = ce.CatBoostEncoder(
    cols=['col_1'],
    handle_missing='return_nan',
    random_state=0,
)

In [24]:
# Fit on the training column and target. In the output below, rows sharing a
# category receive different values on the training set, so the result
# depends on row order.
encoder.fit_transform(df_train['col_1'], df_train['target'])

Unnamed: 0,col_1
0,0.714286
1,0.714286
2,0.857143
3,0.571429
4,
5,0.714286
6,0.857143


In [25]:
# Encode the test column with the already-fitted encoder; no target is passed,
# so each category maps to one value.
encoder.transform(df_test['col_1'])

Unnamed: 0,col_1
0,0.714286
1,0.904762
2,
3,0.904762
4,0.428571
