# Encoding Categorical Values

This tutorial is from

http://pbpython.com/categorical-encoding.html

All credit goes to the original author, except for the additional work I have done.

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# The data file ships without a header row, so the column names are listed here,
# in file order
headers = [
    "symboling",
    "normalized_losses",
    "make",
    "fuel_type",
    "aspiration",
    "num_doors",
    "body_style",
    "drive_wheels",
    "engine_location",
    "wheel_base",
    "length",
    "width",
    "height",
    "curb_weight",
    "engine_type",
    "num_cylinders",
    "engine_size",
    "fuel_system",
    "bore",
    "stroke",
    "compression_ratio",
    "horsepower",
    "peak_rpm",
    "city_mpg",
    "highway_mpg",
    "price",
]

In [3]:
# Read the headerless CSV file, apply the column names from `headers`, and
# convert "?" to NaN. The file is fetched from the canonical UCI repository over
# HTTPS instead of a third-party plain-HTTP mirror.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=headers,
    na_values="?",
)

In [4]:
# Preview the first five rows to confirm the column names were applied
df.head(n=5)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [5]:
# Check the data types: the columns reported as `object` hold the categorical
# (string) values that the rest of this notebook encodes
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [6]:
# Only the categorical values are encoded here, so keep just the object-typed
# columns; .copy() gives an independent frame so later edits do not touch df
obj_df = df.select_dtypes(include="object").copy()
obj_df.head(n=5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [7]:
# Show every row that has a missing value in at least one column
rows_with_nulls = obj_df.isna().any(axis="columns")
obj_df.loc[rows_with_nulls]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [8]:
# Count how often each num_doors value occurs, to find the most common one
obj_df["num_doors"].value_counts()

four    114
two      89
Name: num_doors, dtype: int64

In [9]:
# For the sake of simplicity, just fill the missing num_doors values with the
# string "four" (the most common value); it is mapped to the number 4 further down
obj_df = obj_df.fillna({"num_doors": "four"})

## Approach #1 - Find and Replace

In [10]:
# List the distinct num_cylinders values so each one can be mapped to a number
obj_df["num_cylinders"].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num_cylinders, dtype: int64

In [11]:
# Word-to-number lookup, keyed by column, for the two columns whose categories
# are spelled-out counts (num_doors and num_cylinders)
cleanup_nums = {
    "num_doors": {
        "two": 2,
        "four": 4,
    },
    "num_cylinders": {
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "eight": 8,
        "twelve": 12,
    },
}

In [12]:
# Map the spelled-out counts to numbers using the per-column lookup.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate a frame that earlier cells already displayed.
# infer_objects() makes the object -> int64 conversion explicit: recent pandas
# versions deprecate the implicit downcasting that replace() used to perform.
obj_df = obj_df.replace(cleanup_nums).infer_objects()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


In [13]:
# The nice benefit of this approach is that pandas “knows” the types of the replacement values,
# so num_doors and num_cylinders are now int64 instead of object
obj_df.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

## Approach #2 - Label Encoding

For example, the body_style column contains 5 different values. We could choose to encode it like this:
<br>
convertible -> 0 
<br>
hardtop -> 1
<br>
hatchback -> 2
<br>
sedan -> 3
<br>
wagon -> 4

In [15]:
# Converting a column to the pandas category dtype gives each distinct value an
# integer code, which can then be used for label encoding
obj_df = obj_df.astype({"body_style": "category"})
print(obj_df.dtypes)
obj_df["body_style"].value_counts()

make                 object
fuel_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
dtype: object


sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body_style, dtype: int64

In [16]:
# Expose the integer code behind each category (via the cat.codes accessor)
# as a new body_style_cat column
obj_df = obj_df.assign(body_style_cat=obj_df["body_style"].cat.codes)
print(obj_df.head())
print(obj_df.dtypes)

          make fuel_type aspiration  num_doors   body_style drive_wheels  \
0  alfa-romero       gas        std          2  convertible          rwd   
1  alfa-romero       gas        std          2  convertible          rwd   
2  alfa-romero       gas        std          2    hatchback          rwd   
3         audi       gas        std          4        sedan          fwd   
4         audi       gas        std          4        sedan          4wd   

  engine_location engine_type  num_cylinders fuel_system  body_style_cat  
0           front        dohc              4        mpfi               0  
1           front        dohc              4        mpfi               0  
2           front        ohcv              6        mpfi               2  
3           front         ohc              4        mpfi               3  
4           front         ohc              5        mpfi               3  
make                 object
fuel_type            object
aspiration           object
num_doors

The nice aspect of this approach is that you get the benefits of pandas categories (compact data size, ability to order, plotting support), and the values can easily be converted to numeric values for further analysis.

## Approach #3 - One Hot Encoding

Label encoding has the advantage that it is straightforward, but it has the disadvantage that the numeric values can be “misinterpreted” by the algorithms. 
For example, the value of 0 is obviously less than the value of 4 but does that really correspond to the data set in real life? Does a wagon have “4X” more weight in our calculation than the convertible? 
In this example, I don’t think so.

In [17]:
# One-hot encode drive_wheels: one indicator column per distinct value.
# dtype="uint8" pins the indicators to 0/1 integers (the historical default);
# pandas 2.x would otherwise return boolean True/False columns.
pd.get_dummies(obj_df, columns=["drive_wheels"], dtype="uint8").head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,2,0,0,1
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,3,0,1,0
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,3,1,0,0


In [18]:
# Proper naming will make the rest of the analysis just a little bit easier:
# `prefix` supplies a short prefix for each encoded column's indicator names.
# dtype="uint8" pins the indicators to 0/1 integers (the historical default);
# pandas 2.x would otherwise return boolean True/False columns.
pd.get_dummies(
    obj_df,
    columns=["body_style", "drive_wheels"],
    prefix=["body", "drive"],
    dtype="uint8",
).head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0


The other concept to keep in mind is that get_dummies returns the full dataframe so you will need to filter out the objects using select_dtypes when you are ready to do the final analysis.

## Approach #4 - Custom Binary Encoding

Depending on the data set, you may be able to use some combination of label encoding and one hot encoding to 
create a binary column that meets your needs for further analysis.

In [19]:
# Flag every engine type whose name contains "ohc" (ohc, ohcf, ohcv, dohc, ...)
# with 1 and all other engine types with 0
engine_types = obj_df["engine_type"]
print(engine_types.value_counts())
is_ohc = engine_types.str.contains("ohc")
obj_df["OHC_Code"] = np.where(is_ohc, 1, 0)
obj_df[["make", "engine_type", "OHC_Code"]].head(10)

ohc      148
ohcf      15
ohcv      13
l         12
dohc      12
rotor      4
dohcv      1
Name: engine_type, dtype: int64


Unnamed: 0,make,engine_type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1
5,audi,ohc,1
6,audi,ohc,1
7,audi,ohc,1
8,audi,ohc,1
9,audi,ohc,1


## Scikit-Learn

In addition to the pandas approach, scikit-learn provides similar functionality. 
Using pandas is a little simpler, but it is important to be aware of how to execute the processes in scikit-learn.

For instance, if we want to do a label encoding on the make of the car, we need to instantiate a LabelEncoder object and fit_transform the data:

In [20]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of make labels first, then map each make to its integer code
lb_make = LabelEncoder().fit(obj_df["make"])
obj_df["make_code"] = lb_make.transform(obj_df["make"])
obj_df[["make", "make_code"]].head(11)

Unnamed: 0,make,make_code
0,alfa-romero,0
1,alfa-romero,0
2,alfa-romero,0
3,audi,1
4,audi,1
5,audi,1
6,audi,1
7,audi,1
8,audi,1
9,audi,1


Scikit-learn also supports binary encoding by using the LabelBinarizer. 
We use a similar process as above to transform the data but the process of creating a 
pandas DataFrame adds a couple of extra steps.

In [21]:
from sklearn.preprocessing import LabelBinarizer

# Learn the body_style classes, then expand the column into one indicator
# column per class; classes_ supplies the column names for the DataFrame
lb_style = LabelBinarizer().fit(obj_df["body_style"])
lb_results = lb_style.transform(obj_df["body_style"])
pd.DataFrame(data=lb_results, columns=lb_style.classes_).head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0


The next step would be to join this data back to the original dataframe

## Advanced Approaches

There are even more advanced algorithms for categorical encoding.
<br>
http://www.willmcginnis.com/2015/11/29/beyond-one-hot-an-exploration-of-categorical-variables/
<br>
The other nice aspect is that the author of the article has created a scikit-learn contrib package called categorical-encoding
<br>
http://contrib.scikit-learn.org/categorical-encoding/

Here is a brief introduction to using the library for some other types of encoding. 
For the first example, we will try doing a Backward Difference encoding.
<br>
First we get a clean dataframe and setup the BackwardDifferenceEncoder :

In [23]:
import category_encoders as ce

# Get a new clean dataframe: re-select the object columns from the raw df, which
# drops the columns added to obj_df above and undoes the earlier num_doors /
# num_cylinders cleanup
obj_df = df.select_dtypes(include=['object']).copy()

In [24]:
# Specify the columns to encode, then fit the encoder on the dataframe.
# fit() is the last expression in the cell, so the fitted encoder's repr
# (including the learned engine_type mapping) is displayed as the output.
# NOTE(review): verbose=1 is passed to fit() rather than to the constructor —
# confirm that the installed category_encoders version actually uses it.
encoder = ce.backward_difference.BackwardDifferenceEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)

BackwardDifferenceEncoder(cols=['engine_type'], drop_invariant=False,
                          handle_missing='value', handle_unknown='value',
                          mapping=[{'col': 'engine_type',
                                    'mapping':     engine_type_0  engine_type_1  engine_type_2  engine_type_3  engine_type_4  \
 1      -0.857143      -0.714286      -0.571429      -0.428571      -0.285714   
 2       0.142857      -0.714286      -0.571429      -0.428571      -0.285714   
 3       0.142857       0.285714      -0.571429      -0.428571      -0.2...
 4       0.142857       0.285714       0.428571      -0.428571      -0.285714   
 5       0.142857       0.285714       0.428571       0.571429      -0.285714   
 6       0.142857       0.285714       0.428571       0.571429       0.714286   
 7       0.142857       0.285714       0.428571       0.571429       0.714286   
-1       0.000000       0.000000       0.000000       0.000000       0.000000   
-2       0.000000       0.0

In [25]:
# Only display columns 8-13 (the six encoded engine_type_* columns) for brevity
encoder.transform(obj_df).iloc[:,8:14].head()

Unnamed: 0,engine_type_0,engine_type_1,engine_type_2,engine_type_3,engine_type_4,engine_type_5
0,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
1,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
2,0.142857,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
3,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857
4,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857


The interesting thing is that you can see that the results are not the standard 1’s and 0’s we saw in the earlier encoding examples.

If we try a polynomial encoding, we get a different distribution of values used to encode the columns:

In [26]:
# Swap in a polynomial encoder for the same column, fit it, and show
# columns 8-13 of the transformed frame (the encoded engine_type_* columns)
encoder = ce.polynomial.PolynomialEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)
poly_encoded = encoder.transform(obj_df)
poly_encoded.iloc[:, 8:14].head()

Unnamed: 0,engine_type_0,engine_type_1,engine_type_2,engine_type_3,engine_type_4,engine_type_5
0,-0.566947,0.5455447,-0.408248,0.241747,-0.109109,0.032898
1,-0.566947,0.5455447,-0.408248,0.241747,-0.109109,0.032898
2,-0.377964,9.521795000000001e-17,0.408248,-0.564076,0.436436,-0.197386
3,-0.188982,-0.3273268,0.408248,0.080582,-0.545545,0.493464
4,-0.188982,-0.3273268,0.408248,0.080582,-0.545545,0.493464


There are several different algorithms included in this package and the best way to learn is to try them out and see if it helps you with the accuracy of your analysis. 
<br>
The code shown above should give you guidance on how to plug in the other approaches and see what kind of results you get.