## Feature Engineering - Handling Categorical Features - Day 3

In [1]:
import pandas as pd 
import numpy as np 

## Handle Categorical Features

### One Hot Encoding

In [4]:
df = pd.read_csv('Datasets/titanic.csv',usecols=['Sex'])

In [5]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [6]:
df.isnull().sum()

Sex    0
dtype: int64

In [None]:
## One Hot Encoding: get_dummies builds one indicator column per category
## (here Sex_female and Sex_male); dtype=int makes the indicators integer 0/1.
pd.get_dummies(df,dtype=int).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [11]:
## drop_first=True removes the first category level (Sex_female), leaving k-1
## indicator columns; the dropped level is implied when the remaining columns are 0.
pd.get_dummies(df,drop_first=True,dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [12]:
df = pd.read_csv('Datasets/titanic.csv',usecols=['Embarked'])

In [14]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [15]:
df.isnull().sum()

Embarked    2
dtype: int64

In [16]:
## Discard the rows with a missing Embarked value and rebind df to the cleaned frame
df = df.dropna()

In [17]:
pd.get_dummies(df,drop_first=False,dtype=int).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [18]:
pd.get_dummies(df,drop_first=True,dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [19]:
#### One Hot Encoding with many categories in a feature

In [20]:
df = pd.read_csv('Datasets/mercedes.csv')

In [21]:
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [23]:
df.shape

(4209, 378)

In [24]:
## Reload the dataset keeping only columns X0-X5 (usecols loads just the listed columns)
df = pd.read_csv('Datasets/mercedes.csv',usecols=['X0','X1','X2','X3','X4','X5'])

In [25]:
## Print the frequency table of every loaded column, one after another
for column_name in df.columns:
    frequencies = df[column_name].value_counts()
    print(frequencies)

X0
z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
aq     18
am     18
i      18
u      17
aw     16
l      16
ad     14
au     11
k      11
b      11
r      10
as     10
bc      6
ao      4
c       3
aa      2
q       2
ac      1
g       1
ab      1
Name: count, dtype: int64
X1
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
d       3
q       3
ab      3
Name: count, dtype: int64
X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
a

In [26]:
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [27]:
## Print the distinct values of every column, each followed by a blank line
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(distinct_values, end="\n\n")

['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']

['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']

['a' 'e' 'c' 'f' 'd' 'b' 'g']

['d' 'b' 'c' 'a']

['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']



In [28]:
## Number of distinct categories per column, labelled by column name so that each
## count can be matched to its feature (a loop of bare print() calls loses the labels).
df.nunique()

47

27

44

7

4

29



In [29]:
## X0 has 47 distinct categories; X1, the feature encoded below, has 27

In [31]:
## Top 10 most frequently occurring categories in X1.
## value_counts() already returns the counts in descending order, so no extra sort is needed.
df['X1'].value_counts().head(10)

X1
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: count, dtype: int64

In [33]:
## Labels of the 10 most frequent X1 categories (value_counts() is already sorted descending)
lst_10 = df['X1'].value_counts().head(10).index

In [35]:
lst_10 = list(lst_10)

In [36]:
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [37]:
## One-hot encode only the top categories; rows holding any other category get 0 everywhere.
## lst_10 is later extended with 'X1' for display, so that entry is skipped here:
## otherwise re-running this cell would overwrite the original X1 column with 0s.
for category in lst_10:
    if category == 'X1':
        continue
    ## 1 where the row's X1 value equals this category, else 0
    df[category] = np.where(df['X1']==category,1,0)

In [38]:
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,1
4206,0,0,0,0,1,0,0,0,0,0
4207,0,0,0,0,0,1,0,0,0,0


In [39]:
## Add the source column so it can be displayed next to its indicator columns.
## Guarded so that re-running this cell does not append 'X1' a second time.
if 'X1' not in lst_10:
    lst_10.append('X1')

In [40]:
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


In [41]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,0,0,0,0,1,0,0,0,0,0


In [None]:
## This way we keep indicator columns for only the top 10 categories

In [42]:
df[lst_10].head()

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v


In [43]:
## Random spot-check of 10 rows; random_state fixes the draw so the output is reproducible on re-run
df[lst_10].sample(10, random_state=42)

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
2897,0,0,0,1,0,0,0,0,0,0,l
1534,0,0,0,0,0,0,1,0,0,0,i
2892,1,0,0,0,0,0,0,0,0,0,aa
822,1,0,0,0,0,0,0,0,0,0,aa
3450,0,0,0,1,0,0,0,0,0,0,l
2471,0,0,0,1,0,0,0,0,0,0,l
934,0,0,0,0,0,0,0,0,0,0,e
352,0,0,0,0,0,0,1,0,0,0,i
1738,0,0,0,0,0,1,0,0,0,0,r
49,0,0,0,0,0,0,0,0,0,0,j
