# One Hot Encoding

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("data.csv")

In [3]:
# Preview the first rows to see which columns are categorical.
df.head()

Unnamed: 0,Id,Colour,Country
0,1,Red,USA
1,2,Blue,UK
2,3,Green,Canada
3,4,Blue,USA
4,5,Blue,USA


In [4]:
# Preview the last rows as a quick check that the file was read to the end.
df.tail()

Unnamed: 0,Id,Colour,Country
295,296,Red,Canada
296,297,Green,UK
297,298,Green,UK
298,299,Red,Canada
299,300,Red,Canada


In [5]:
# Inspect column dtypes: the string-valued (object) columns are the ones
# that need encoding before they can be used as numeric features.
df.dtypes

Id          int64
Colour     object
Country    object
dtype: object

In [6]:
# List the distinct colour values (returned in order of first appearance).
df["Colour"].unique()

array(['Red', 'Blue', 'Green'], dtype=object)

In [7]:
# List the distinct country values (returned in order of first appearance).
df["Country"].unique()

array(['USA', 'UK', 'Canada'], dtype=object)

In [8]:
# Create the encoder with its default settings. The transformed result is
# converted with .toarray() in the cells below, i.e. it is not returned as a
# plain dense ndarray by default.
ohe = OneHotEncoder()

In [9]:
# Show the encoder's repr to confirm it was built with default parameters.
print(ohe)

OneHotEncoder()


In [10]:
# Fit the encoder on the two categorical columns and transform them in one
# step, then convert the result to a dense array for display. Each row gets
# one indicator column per category (3 colours + 3 countries = 6 columns).
ohe.fit_transform(df[["Colour", "Country"]]).toarray()

array([[0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.]])

In [11]:
# Same fit + transform as the preview cell above, but keep the dense one-hot
# matrix so it can be turned into a labelled DataFrame later.
feature_arry = ohe.fit_transform(df[["Colour", "Country"]]).toarray()

In [12]:
# Print the encoded matrix; NumPy abbreviates the middle rows with "...".
print(feature_arry)

[[0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0.]
 ...
 [0. 1. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 0. 1. 1. 0. 0.]]


In [13]:
# Categories learned during fit: one array per encoded input column, in the
# order the columns were passed. The columns of the encoded matrix follow
# this same order (all Colour categories first, then all Country categories).
ohe.categories_

[array(['Blue', 'Green', 'Red'], dtype=object),
 array(['Canada', 'UK', 'USA'], dtype=object)]

In [14]:
# Keep a reference to the learned categories (a list holding one array per
# encoded column); it is flattened into a single label array further down.
feature_labels = ohe.categories_

In [15]:
# Flatten the per-column category arrays into one 1-D array of labels.
# np.concatenate is used rather than np.array(...).ravel() because the latter
# only flattens correctly when every encoded column happens to have the same
# number of categories; with ragged category lists NumPy cannot build a 2-D
# array from them. Reading ohe.categories_ directly also keeps this cell
# re-runnable after feature_labels has been rebound further down.
np.concatenate(ohe.categories_)

array(['Blue', 'Green', 'Red', 'Canada', 'UK', 'USA'], dtype=object)

In [16]:
# Build the 1-D array of column labels straight from the fitted encoder.
# np.concatenate joins the per-column category arrays end to end, so it works
# even when the encoded columns have different numbers of categories (where
# np.array(...).ravel() would not produce a flat label list). Sourcing the
# labels from ohe.categories_ instead of the previous value of feature_labels
# makes the cell idempotent: it no longer depends on what feature_labels
# currently holds.
feature_labels = np.concatenate(ohe.categories_)

In [17]:
# Print the flattened labels; their order matches the encoded matrix columns.
print(feature_labels)

['Blue' 'Green' 'Red' 'Canada' 'UK' 'USA']


In [18]:
# Preview the dense one-hot matrix as a DataFrame whose columns are named
# after the categories they indicate.
pd.DataFrame(data=feature_arry, columns=feature_labels)

Unnamed: 0,Blue,Green,Red,Canada,UK,USA
0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
295,0.0,0.0,1.0,1.0,0.0,0.0
296,0.0,1.0,0.0,0.0,1.0,0.0
297,0.0,1.0,0.0,0.0,1.0,0.0
298,0.0,0.0,1.0,1.0,0.0,0.0


In [19]:
# Store the labelled one-hot matrix: one row per input row, one column per
# category.
features = pd.DataFrame(data=feature_arry, columns=feature_labels)

In [20]:
# Print the plain-text repr of the encoded frame, which also reports its
# overall shape at the bottom.
print(features)

     Blue  Green  Red  Canada   UK  USA
0     0.0    0.0  1.0     0.0  0.0  1.0
1     1.0    0.0  0.0     0.0  1.0  0.0
2     0.0    1.0  0.0     1.0  0.0  0.0
3     1.0    0.0  0.0     0.0  0.0  1.0
4     1.0    0.0  0.0     0.0  0.0  1.0
..    ...    ...  ...     ...  ...  ...
295   0.0    0.0  1.0     1.0  0.0  0.0
296   0.0    1.0  0.0     0.0  1.0  0.0
297   0.0    1.0  0.0     0.0  1.0  0.0
298   0.0    0.0  1.0     1.0  0.0  0.0
299   0.0    0.0  1.0     1.0  0.0  0.0

[300 rows x 6 columns]


In [21]:
# Preview the first rows of the encoded features.
features.head()

Unnamed: 0,Blue,Green,Red,Canada,UK,USA
0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0


In [22]:
# Preview the original columns side by side with their one-hot indicators.
# Rows are matched on index label, which lines up here because both frames
# use the same default integer index.
pd.concat(objs=[df, features], axis="columns")

Unnamed: 0,Id,Colour,Country,Blue,Green,Red,Canada,UK,USA
0,1,Red,USA,0.0,0.0,1.0,0.0,0.0,1.0
1,2,Blue,UK,1.0,0.0,0.0,0.0,1.0,0.0
2,3,Green,Canada,0.0,1.0,0.0,1.0,0.0,0.0
3,4,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
4,5,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
295,296,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0
296,297,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
297,298,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
298,299,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0


In [23]:
# Combine the original data with its one-hot indicator columns into a new
# frame, leaving df itself untouched. Rows are matched on index label.
df_new = pd.concat(objs=[df, features], axis="columns")

In [24]:
# Check the combined frame: each row's indicator columns should agree with
# its Colour and Country values.
df_new.head()

Unnamed: 0,Id,Colour,Country,Blue,Green,Red,Canada,UK,USA
0,1,Red,USA,0.0,0.0,1.0,0.0,0.0,1.0
1,2,Blue,UK,1.0,0.0,0.0,0.0,1.0,0.0
2,3,Green,Canada,0.0,1.0,0.0,1.0,0.0,0.0
3,4,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
4,5,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
