In [1]:
import pandas as pd
import numpy as np

In [2]:
# Let's load the dataset for demonstration, reading only the categorical variables

In [4]:
# read only the six categorical columns X1..X6 from the training file
data = pd.read_csv(
    "train.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [6]:
# report the cardinality (number of distinct labels) of every variable;
# dropna=False counts a missing value as a label, exactly like len(unique())
for col in data:
    print(col, ": ", data[col].nunique(dropna=False), ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [7]:
# let's examine how many columns we would obtain after one hot encoding ALL the
# labels of these variables (drop_first=True asks for k-1 dummies per variable)
pd.get_dummies(data, drop_first=True).shape

(4209, 117)

In [9]:
# let's look at the 20 most frequent categories for the variable X2
# (we will keep only the top 10 of them for the encoding below)
data.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [11]:
# keep the labels of the 10 most frequent categories of X2 as a plain list
top_10 = data["X2"].value_counts().sort_values(ascending=False).head(10).index.tolist()
print(top_10)

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']


In [13]:
# build one 0/1 indicator column per frequent label; each new column is
# named after the label itself
for label in top_10:
    data[label] = np.where(data['X2'].eq(label), 1, 0)

# show the original variable next to its 10 indicator columns
print(data[['X2', *top_10]].head(40))

    X2  as  ae  ai  m  ak  r  n  s  f  e
0   at   0   0   0  0   0  0  0  0  0  0
1   av   0   0   0  0   0  0  0  0  0  0
2    n   0   0   0  0   0  0  1  0  0  0
3    n   0   0   0  0   0  0  1  0  0  0
4    n   0   0   0  0   0  0  1  0  0  0
5    e   0   0   0  0   0  0  0  0  0  1
6    e   0   0   0  0   0  0  0  0  0  1
7   as   1   0   0  0   0  0  0  0  0  0
8   as   1   0   0  0   0  0  0  0  0  0
9   aq   0   0   0  0   0  0  0  0  0  0
10   r   0   0   0  0   0  1  0  0  0  0
11   e   0   0   0  0   0  0  0  0  0  1
12  ai   0   0   1  0   0  0  0  0  0  0
13   e   0   0   0  0   0  0  0  0  0  1
14  as   1   0   0  0   0  0  0  0  0  0
15  ak   0   0   0  0   1  0  0  0  0  0
16   e   0   0   0  0   0  0  0  0  0  1
17  as   1   0   0  0   0  0  0  0  0  0
18   m   0   0   0  1   0  0  0  0  0  0
19  ak   0   0   0  0   1  0  0  0  0  0
20   m   0   0   0  1   0  0  0  0  0  0
21  as   1   0   0  0   0  0  0  0  0  0
22   a   0   0   0  0   0  0  0  0  0  0
23   m   0   0  

In [14]:
# get whole set of dummy variables for all the categorical variables

def one_hot_top_x(df, variable, top_x_labels):
    '''Add one binary (0/1) indicator column per label in ``top_x_labels``.

    For every label a column named ``<variable>_<label>`` is written to
    ``df``. It holds 1 on the rows where ``df[variable]`` equals the label
    and 0 elsewhere. Labels that are not listed get no column, so rows
    holding them are 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    variable : str
        Name of the column of ``df`` to encode.
    top_x_labels : iterable
        Labels to encode (typically the most frequent ones); its length
        sets how many indicator columns are created.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``variable`` is not a column of ``df``.
    '''
    # Read from the frame that was passed in, not from a notebook-level
    # global, so the function also works on any other frame (e.g. a test set).
    source = df[variable]
    for label in top_x_labels:
        # str() keeps the column name buildable when labels are not strings
        df[variable + '_' + str(label)] = np.where(source == label, 1, 0)

In [16]:
# read the data again so we start from the six original columns only
data = pd.read_csv(
    "train.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)

In [17]:
# add the indicator columns for the 10 most frequent categories of X2
one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [18]:
# keep the labels of the 10 most frequent categories of X1 as a plain list
top_10 = data["X1"].value_counts().sort_values(ascending=False).head(10).index.tolist()

In [19]:
# add the indicator columns for the 10 most frequent categories of X1
one_hot_top_x(df=data, variable='X1', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## One-hot encoding of the top categories

### Advantages

- Straightforward to implement
- Does not require hours of variable exploration
- Does not massively expand the feature space

### Disadvantages
- Does not add any information that may make the variable more predictive
- Does not keep the information of the ignored labels: every category outside the top ones gets the same all-zeros encoding