# One Hot Encoder

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# One categorical feature (a single column) with four samples and three
# distinct labels: A, B and C.
X = np.array(
    [
        ["A"],
        ["A"],
        ["B"],
        ["C"],
    ]
)
X

array([['A'],
       ['A'],
       ['B'],
       ['C']], dtype='<U1')

In [3]:
# One-hot encoder created with no arguments, i.e. with its default settings.
enc = OneHotEncoder()

In [4]:
# Learn the categories from X and encode it in a single step, then expand the
# result to a dense matrix so each one-hot column can be read directly.
onehot_encoded = enc.fit_transform(X)
onehot_encoded.todense()

matrix([[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

#### Removing one dummy Variable

In [9]:
# drop='first' omits the column of the first category of each feature, so the
# three labels in X are represented by two dummy columns instead of three.
enc = OneHotEncoder(drop='first')
# Use .toarray() rather than .todense(): the former yields a plain ndarray,
# whereas .todense() yields np.matrix, which NumPy no longer recommends and
# which is awkward to feed back into other estimators.
Y = enc.fit_transform(X).toarray()

#### Removing one dummy variable from binary features only

In [12]:
# drop="if_binary" removes one dummy column only for features that have
# exactly two categories; features with more categories keep all columns.
enc = OneHotEncoder(drop="if_binary")
# Y (the dummy-encoded output from the drop='first' cell) is used as the input
# here: each of its columns holds only 0/1, so every column is a binary
# feature. np.asarray guarantees a plain ndarray whether Y is an ndarray or an
# np.matrix, without copying when it already is an ndarray.
# .toarray() keeps the result a plain ndarray as well (not np.matrix).
Z = enc.fit_transform(np.asarray(Y)).toarray()

#### Error Handling


In [18]:
# Training data for this section: only the labels A and C occur.
Y = np.array(
    [
        ["A"],
        ["A"],
        ["A"],
        ["C"],
    ]
)
# handle_unknown='ignore' makes transform() tolerate labels that were not
# present when the encoder was fitted, instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(Y)

In [20]:
# Z contains "B", a label that is absent from the data the encoder was
# fitted on (Y).
Z = np.array(
    [
        ["A"],
        ["B"],
        ["A"],
        ["C"],
    ]
)
# Baseline first: re-encode the training data Y itself.
encoded_training = enc.transform(Y)
encoded_training.todense()

matrix([[1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.]])

In [21]:
# Encode Z, whose second row ("B") is a label the encoder was not fitted on;
# compare that row in the output with the rows for the known labels A and C.
enc.transform(Z).todense()

matrix([[1., 0.],
        [0., 0.],
        [1., 0.],
        [0., 1.]])

In [22]:
# "B" was not seen during fit, so with handle_unknown='ignore' it is encoded as all zeros (0 in both the A and the C column).

#### Defining the OneHotEncoder categories manually

In [23]:
# Three samples with two categorical features each.
X = [
    ["A", "X"],
    ["B", "Y"],
    ["C", "Z"],
]

In [29]:
# Explicit category lists, one per feature, in the order the output columns
# should appear. The first list declares "D" up front; the second list
# deliberately leaves out "Z".
first_feature_categories = ["A", "B", "C", "D"]
second_feature_categories = ["X", "Y"]
enc = OneHotEncoder(
    categories=[first_feature_categories, second_feature_categories],
    handle_unknown="ignore",
)

In [30]:
# The output has 4 + 2 = 6 columns: A, B, C, D for the first feature and X, Y
# for the second. "D" gets a column even though it never occurs in X, and "Z"
# (not a declared category) leaves both second-feature columns at 0.
enc.fit_transform(X).todense()

matrix([[1., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0.]])

In [33]:
# "D" is a declared category that never occurred in the fitted data; "Z" is
# not declared for the second feature.
Y = [["D", "Z"]]
encoded_row = enc.transform(Y)
encoded_row.todense()

matrix([[0., 0., 0., 1., 0., 0.]])

# Ordinal Encoder

In [34]:
from sklearn.preprocessing import OrdinalEncoder

In [36]:
# A single ordered (ordinal) feature with the levels Low < Medium < High.
X = [
    ["High"],
    ["Low"],
    ["Low"],
    ["Medium"],
]
np.array(X)

array([['High'],
       ['Low'],
       ['Low'],
       ['Medium']], dtype='<U6')

In [38]:
# The position of a label in this list becomes its encoded value
# (Low -> 0, Medium -> 1, High -> 2), so the list must be written in rank order.
level_order = ["Low", "Medium", "High"]
enc = OrdinalEncoder(categories=[level_order])
enc.fit_transform(X)

array([[2.],
       [0.],
       [0.],
       [1.]])

In [39]:
# Two features per sample: a priority level and a letter grade.
Y = [
    ["High", "A"],
    ["Low", "C"],
    ["Low", "B"],
    ["Medium", "C"],
]
np.array(Y)

array([['High', 'A'],
       ['Low', 'C'],
       ['Low', 'B'],
       ['Medium', 'C']], dtype='<U6')

In [41]:
# One ordered category list per feature; the first feature is listed from
# High down to Low this time.
priority_order = ["High", "Medium", "Low"]
letter_order = ["A", "B", "C"]
enc = OrdinalEncoder(categories=[priority_order, letter_order])

In [43]:
enc.fit_transform(Y)
# The encoded value of a label is its position in the category list. With the
# first feature now listed as High, Medium, Low, High encodes to 0 and Low to 2
# (the reverse of the earlier Low, Medium, High ordering).

array([[0., 0.],
       [2., 2.],
       [2., 1.],
       [1., 2.]])

# Label Encoder

In [44]:
from sklearn.preprocessing import LabelEncoder

In [45]:
# Flat (1-D) list of target labels: A through D.
X = list("ABCD")
X

['A', 'B', 'C', 'D']

In [46]:
# Label encoder created with no arguments; it is fitted on the label list in the next cell.
enc = LabelEncoder()

In [47]:
# Fit on the labels and return one integer code per label, in a 1-D array.
enc.fit_transform(X)

array([0, 1, 2, 3], dtype=int64)

In [48]:
# Round trip: encode the labels to integer codes, then map the codes back to
# the original labels.
label_codes = enc.fit_transform(X)
enc.inverse_transform(label_codes)

array(['A', 'B', 'C', 'D'], dtype='<U1')