## Handle categorical features

### 1) One-hot encoding

In [27]:
import numpy as np
import pandas as pd

In [3]:
# Read only the "Sex" column of titanic.csv; usecols limits which columns are parsed.
df = pd.read_csv("titanic.csv", usecols=["Sex"])

In [5]:
# Preview the first rows of the loaded column.
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [7]:
# One-hot encode the categorical column. drop_first=True drops the first
# category level, so k categories become k-1 indicator columns (here only
# Sex_male remains). dtype=int pins the indicators to 0/1 integers: since
# pandas 2.0 the get_dummies default dtype is bool, which would otherwise
# show True/False instead of the 1/0 values displayed in this notebook.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [8]:
# Re-bind df to only the "Embarked" column of the same CSV for the next example.
df = pd.read_csv("titanic.csv", usecols=["Embarked"])

In [9]:
# Inspect the distinct values; unique() also reports NaN, which reveals missing entries.
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# Drop rows with a missing Embarked value so that NaN is not carried into the
# encoding step. The result is rebound to df instead of using inplace=True,
# which avoids mutating the loaded frame in place.
df = df.dropna()

In [11]:
# Preview the first rows after the rows with missing values were removed.
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [13]:
# One-hot encode Embarked. With drop_first=True the first level (C) is dropped
# and acts as the baseline: a row with Embarked == "C" has 0 in both
# Embarked_Q and Embarked_S. dtype=int pins the indicators to 0/1 integers:
# since pandas 2.0 the get_dummies default dtype is bool, which would
# otherwise show True/False instead of the 1/0 values displayed here.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


---

### Performing one-hot encoding with many categories in a feature

In [20]:
# Load only the columns X0 through X6 of mercedesbenz.csv; the preview below
# shows they hold string category codes.
df = pd.read_csv("mercedesbenz.csv", usecols=["X0", "X1", "X2", "X3", "X4", "X5", "X6"])

In [21]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [22]:
# Print the frequency table of every loaded column to see how many distinct
# categories each feature has and how unevenly they are distributed.
for column_name, column_values in df.items():
    print(column_values.value_counts())

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
ai     34
m      34
e      32
ba     27
at     25
a      21
ax     19
aq     18
i      18
am     18
u      17
aw     16
l      16
ad     14
k      11
b      11
au     11
r      10
as     10
bc      6
ao      4
c       3
q       2
aa      2
ab      1
ac      1
g       1
Name: X0, dtype: int64
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
f      23
y      23
j      22
n      19
k      17
p       9
g       6
ab      3
q       3
d       3
Name: X1, dtype: int64
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z    

In [23]:
# Count the distinct categories in X0. dropna=False makes nunique() count a
# missing value as its own entry, exactly as len(unique()) would.
df["X0"].nunique(dropna=False)

47

In [31]:
# Collect the 10 most frequent categories of X1 as a plain list, most
# frequent first. value_counts() already returns the counts in descending
# order, so no additional sort is needed. Bracket access is used because
# attribute access (df.X1) fails for column names that clash with DataFrame
# attributes or are not valid identifiers.
lst_10 = df["X1"].value_counts().head(10).index.tolist()

In [32]:
# Add one 0/1 indicator column per entry of lst_10: 1 where X1 equals that
# category and 0 for every other row, so categories outside the list end up
# as all zeros.
for category in lst_10:
    is_category = df["X1"] == category
    df[category] = is_category.astype(int)

In [33]:
# Add the source column to the selection list so the indicators can be shown
# next to the original X1 values. The membership check keeps the cell
# idempotent: re-running it does not append "X1" a second time, which would
# duplicate the column in the df[lst_10] view.
# NOTE: after this cell lst_10 is no longer purely a category list; re-running
# the indicator loop over it would overwrite the X1 column itself.
if "X1" not in lst_10:
    lst_10.append("X1")

In [34]:
# Show the indicator columns together with the original X1 values so each
# row's encoding can be checked against its category.
df.loc[:, lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r
