## One Hot Encoding - variables with many categories 

This notebook illustrates how we can use one hot encoding for 120 categorical features. We used KDD Cup Orange Challenege with Ensemble Selection method where one categorical features equals top 10 categorical features. 

In [1]:
#Importing Libraries and dataset
import pandas as pd
import numpy as np

dataset = pd.read_csv('mercedesbenz.csv',usecols = ['X1','X2','X3','X4','X5','X6'])
dataset.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [2]:
#Lets count the unique categorical values in each categorical feature
for col in dataset.columns:
    print(col, ': ', len(dataset[col].unique()),'labels')

X1 :  27 labels
X2 :  44 labels
X3 :  7 labels
X4 :  4 labels
X5 :  29 labels
X6 :  12 labels


In [3]:
#lets see how many columns will come if we apply one hot encoding now
pd.get_dummies(dataset,drop_first=True).shape  #this means (117 - 5)-> 112 new columns will add

(4209, 117)

112 new columns will be added which does not look good. Hence, we are using following techniques used by KDD 2009 cup : "Winning the KDD Cup Orange Challenge with Ensemble Selection".

Basically we will take top 10 most frequent categorical values of 1 categorical feature.

In [8]:
#Lets find top 10 most frequent categories for varible X1
dataset.X1.value_counts().sort_values(ascending=False).head(20)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
f      23
y      23
j      22
Name: X1, dtype: int64

In [9]:
#Lets make a list with the most frequent categories of the variable
top_10_X1 = [x for x in dataset.X1.value_counts().sort_values(ascending=False).head(10).index]
top_10_X1

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [10]:
#Now lets make 10 binary variables
for label in top_10_X1:
    dataset[label] = np.where(dataset['X1']==label,1,0)

dataset[['X1']+top_10_X1].head(10)

Unnamed: 0,X1,aa,s,b,l,v,r,i,a,c,o
0,v,0,0,0,0,1,0,0,0,0,0
1,t,0,0,0,0,0,0,0,0,0,0
2,w,0,0,0,0,0,0,0,0,0,0
3,t,0,0,0,0,0,0,0,0,0,0
4,v,0,0,0,0,1,0,0,0,0,0
5,b,0,0,1,0,0,0,0,0,0,0
6,r,0,0,0,0,0,1,0,0,0,0
7,l,0,0,0,1,0,0,0,0,0,0
8,s,0,1,0,0,0,0,0,0,0,0
9,b,0,0,1,0,0,0,0,0,0,0


In [12]:
#Lets do the same for X2
top_10_X2 = [ x for x in dataset.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10_X2

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [13]:
for label in top_10_X2:
    dataset[label] = np.where(dataset['X2']==label,1,0)

dataset[['X2']+top_10_X2].head(20)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [14]:
#Now lets get the set of dummy varibles, for all categorical variables

def one_hot_top_x(df,variable,top_x_labels):
    for label in top_x_labels:
        df[variable+' '+label] = np.where(data[variable]==label,1,0)

data = pd.read_csv('mercedesbenz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

#Lets apply for X1
one_hot_top_x(data,'X1',top_10_X1)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1 aa,X1 s,X1 b,X1 l,X1 v,X1 r,X1 i,X1 a,X1 c,X1 o
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [15]:
#Lets apply for X2
one_hot_top_x(data,'X2',top_10_X2)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1 aa,X1 s,X1 b,X1 l,...,X2 as,X2 ae,X2 ai,X2 m,X2 ak,X2 r,X2 n,X2 s,X2 f,X2 e
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In the similar fashion, we can apply to every categorical features.