In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load only the six categorical columns used in this demonstration.
data = pd.read_csv(
    "Data/mercedesbenz.csv",
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [4]:
# Preview the first five rows of the loaded columns.
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [5]:
# Count how many distinct labels each variable has.

for column_name, column_values in data.items():
    print(column_name, " :", len(column_values.unique()), 'labels')

X1  : 27 labels
X2  : 44 labels
X3  : 7 labels
X4  : 4 labels
X5  : 29 labels
X6  : 12 labels


In [6]:
# Examine how many columns we would get after plain one-hot encoding.

data.pipe(pd.get_dummies, drop_first=True).shape

(4209, 117)

We can see that from 6 initial categorical variables we end up with 117 new variables. This is the curse of dimensionality in action.


What we can do instead:

In the winning solution of the KDD Cup 2009, "Winning the KDD Cup Orange Challenge with Ensemble Selection", the authors limit one-hot encoding to the
10 most frequent labels of each variable. This means that they create one binary variable for each of the 10 most frequent labels
only. This is equivalent to grouping all the other labels under a new category, which in this case is dropped. Thus, the
10 new dummy variables indicate whether one of the 10 most frequent labels is present (1) or not (0) for a particular observation.



In [8]:
# Inspect the 20 most frequent categories of X2; the first 10 are the ones we keep.

(
    data['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)

X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
z       19
ag      19
Name: count, dtype: int64

In [10]:
# Collect the 10 most frequent X2 labels into a plain Python list.

top_10 = list(
    data['X2'].value_counts().sort_values(ascending=False).head(10).index
)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [15]:
# Create one 0/1 indicator column for each of the 10 most frequent X2 labels.

for category in top_10:
    is_category = data['X2'] == category
    data[category] = np.where(is_category, 1, 0)

data[['X2', *top_10]].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [18]:
# Reusable helper so the same top-label encoding can be applied to X1..X6.

def one_hot_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every ``label`` a new column named ``<variable>_<label>`` is created,
    holding 1 where ``df[variable] == label`` and 0 elsewhere. Labels that are
    not listed get no column, so rows carrying them are all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    variable : str
        Name of the column in ``df`` to encode.
    top_x_labels : iterable
        Labels of ``variable`` that should each receive an indicator column.

    Returns
    -------
    None
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in, not the notebook-global
        # ``data``; otherwise the helper breaks for any other DataFrame.
        df[f"{variable}_{label}"] = np.where(df[variable] == label, 1, 0)

# Reload the raw columns so the indicator columns added earlier are discarded.
data = pd.read_csv(
    "Data/mercedesbenz.csv",
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode X2 using its 10 most frequent categories.
one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [20]:
# Determine the 10 most frequent categories of X1.

top_10 = list(
    data['X1'].value_counts().sort_values(ascending=False).head(10).index
)

# Add the corresponding one-hot indicator columns for X1.
one_hot_top_x(df=data, variable='X1', top_x_labels=top_10)
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Advantages:

1. Straightforward to implement
2. Does not require hours of variable exploration
3. Does not massively expand the feature space (number of columns in the dataset)

Disadvantages

1. Does not add any information that may make the variables more predictive.
2. Does not keep the information of the ignored labels.


Because it is not unusual for categorical variables to have a few dominating categories while the remaining labels add mostly
noise, this is a quite simple and straightforward approach that may be useful on many occasions.

