# Encoding


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the iris dataset through seaborn's dataset loader.
df = sns.load_dataset('iris')
# Preview five rows; a fixed random_state keeps the preview identical on re-runs.
df.sample(n=5, random_state=1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
98,5.1,2.5,3.0,1.1,versicolor
75,6.6,3.0,4.4,1.4,versicolor
16,5.4,3.9,1.3,0.4,setosa
131,7.9,3.8,6.4,2.0,virginica


In [3]:
# Label encoding: replace each species name with an integer code.
# sort=True orders the unique classes, so the codes index into class_names.
y, class_names = pd.factorize(df['species'], sort=True)

class_names

Index(['setosa', 'versicolor', 'virginica'], dtype='object')

In [4]:
# Rows 45-54 straddle the setosa -> versicolor boundary, so two codes appear.
y[45:55]

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

## Keras: to_categorical

In [5]:
from tensorflow.keras.utils import to_categorical

# One-hot vector for class index 1 out of 3 classes, cast to int for display.
to_categorical(1, num_classes=3).astype('int')

array([0, 1, 0])

In [6]:
# categories: sorted distinct labels; ids: for each row, the index of its label in categories.
categories, ids = np.unique(df['species'], return_inverse=True)

In [7]:
# Distinct species names, in the sorted order produced by np.unique.
categories

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [8]:
# The raw string labels that the encodings in this notebook are derived from.
df.species

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [9]:
# Per-row index into `categories` (the inverse mapping returned by np.unique).
ids

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [10]:
from tensorflow.keras.utils import to_categorical

def one_hot_encode(arr):
    """One-hot encode an array of class labels.

    Parameters
    ----------
    arr : array-like of str or int
        One class label per sample.

    Returns
    -------
    The result of ``to_categorical``: one row per element of ``arr`` and one
    column per distinct label, with columns following the sorted order of
    the distinct labels as returned by ``np.unique``.
    """
    # return_inverse gives, for every element, the index of its label
    # within the sorted array of distinct labels.
    labels, codes = np.unique(arr, return_inverse=True)
    n_classes = len(labels)
    return to_categorical(codes, num_classes=n_classes)

# One-hot encode the species column; cast to int so rows print as 0/1.
y_1h = one_hot_encode(df.species).astype('int')

# A cell renders only its final expression, so an earlier bare slice here
# would be computed and silently discarded; preview a single slice instead.
y_1h[130:138]

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]])

In [11]:
# Spot check: original species label of row 14.
df.iloc[14].species

'setosa'

In [12]:
# Label-encoded class of row 14.
y[14]

0

In [13]:
# One-hot vector of row 14.
y_1h[14]

array([1, 0, 0])

In [14]:
# Spot check: original species label of row 98.
df.iloc[98].species

'versicolor'

In [15]:
# Label-encoded class of row 98.
y[98]

1

In [16]:
# One-hot vector of row 98.
y_1h[98]

array([0, 1, 0])

In [17]:
# Side-by-side comparison of the raw label, its integer code and its one-hot
# vector for one row of each species.
print('  data      Label-Encoded   one-hot')
print('-----------------------------------')
for k in (14, 98, 131):
    print(f'{df.loc[k].species:17} {y[k]:<8}  {y_1h[k]}')

  data      Label-Encoded   one-hot
-----------------------------------
setosa            0         [1 0 0]
versicolor        1         [0 1 0]
virginica         2         [0 0 1]


## Pandas: get_dummies

In [18]:
# First five integer label codes produced by pd.factorize earlier in the notebook.
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [19]:
# One indicator column per distinct code in y.
# pandas >= 2.0 returns boolean dummies by default; pinning the dtype keeps
# the 0/1 integer output on every pandas version.
y_1hgd = pd.get_dummies(y, dtype='uint8')

In [20]:
# Row 130 as a 2-D NumPy array (a one-row slice keeps the row dimension).
y_1hgd.iloc[130:131].to_numpy()

array([[0, 0, 1]], dtype=uint8)

## Scikit-learn: OneHotEncoder

In [21]:
from sklearn.preprocessing import OneHotEncoder

# Encoder with default settings.
enc = OneHotEncoder()

# Turn the species column into a single-column 2-D array (one row per
# sample), which is the shape passed to the encoder afterwards.
y2 = df.species.to_numpy().reshape(-1, 1)

y2[:5]

array([['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa']], dtype=object)

In [22]:
# Fit the encoder on the species column and encode it in one step, then
# convert the result to a dense array of integer 0/1 values.
y_1hsk = (
    enc.fit_transform(y2)
    .toarray()
    .astype('int')
)

In [23]:
# scikit-learn one-hot vector of row 14.
y_1hsk[14]

array([1, 0, 0])

In [24]:
# scikit-learn one-hot vector of row 131.
y_1hsk[131]

array([0, 0, 1])

In [25]:
# Original species label of row 131, to compare with its one-hot vector.
df.iloc[131].species

'virginica'