## Handling Categorical Feature

### one-Hot Encoding

In [2]:
import pandas as pd

In [4]:
df=pd.read_csv('titanic.csv',usecols=['Sex'])

In [5]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [6]:
pd.get_dummies(df).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [7]:
# We can decide column category based on single column in this case. Male -0, Female -1
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [19]:
df=pd.read_csv('titanic.csv',usecols=['Embarked'])

In [20]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [21]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
df['Embarked'].isnull().sum()

2

In [23]:
df.loc[df['Embarked'].isnull()]

Unnamed: 0,Embarked
61,
829,


In [24]:
df[df['Embarked'].isnull()]

Unnamed: 0,Embarked
61,
829,


In [26]:
pd.get_dummies(df).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [27]:
df.dropna(inplace=True)

In [28]:
pd.get_dummies(df).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [29]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### Onehotencoding with many categories in a feature

In the winning solution of the KDD 2009 cup: "Winning the KDD Cup Orange Challenge with Ensemble Selection" (http://www.mtome.com/Publications/CiML/CiML-v3-book.pdf), the authors limit one hot encoding to the 10 most frequent labels of the variable. This means that they would make one binary variable for each of the 10 most frequent labels only. This is equivalent to grouping all the other labels under a new category, that in this case will be dropped. Thus, the 10 new dummy variables indicate if one of the 10 most frequent labels is present (1) or not (0) for a particular observation.

In [14]:
df=pd.read_csv('mercedes.csv',usecols=['X0','X1','X2','X3','X4','X5'])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


In [15]:
df['X0'].value_counts()

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
aq     18
i      18
am     18
u      17
l      16
aw     16
ad     14
k      11
b      11
au     11
as     10
r      10
bc      6
ao      4
c       3
aa      2
q       2
g       1
ab      1
ac      1
Name: X0, dtype: int64

In [16]:
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [17]:
df.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5'], dtype='object')

In [18]:
for col in df.columns:
    print(len(df[col].unique())) # No. of unique values in the column

47
27
44
7
4
29


In [19]:
df['X0'].value_counts().sort_values(ascending=False).head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [20]:
df['X0'].value_counts().sort_values(ascending=False).head(10).index

Index(['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w'], dtype='object')

In [23]:
list_10 = df['X0'].value_counts().sort_values(ascending=False).head(10).index
list_10 = list(list_10) 
list_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [27]:
import numpy as np
for categories in list_10:
    df[categories] = np.where(df['X1']== categories,1,0)

In [28]:
df[list_10]

Unnamed: 0,z,ak,y,ay,t,x,o,f,n,w
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,1,0,0,0
4206,0,0,0,0,0,0,0,0,0,0
4207,0,0,0,0,0,0,0,0,0,0


In [29]:
list_10.append('X1')

In [30]:
df[list_10]

Unnamed: 0,z,ak,y,ay,t,x,o,f,n,w,X1
0,0,0,0,0,0,0,0,0,0,0,v
1,0,0,0,0,1,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,1,w
3,0,0,0,0,1,0,0,0,0,0,t
4,0,0,0,0,0,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,1,0,0,0,o
4206,0,0,0,0,0,0,0,0,0,0,v
4207,0,0,0,0,0,0,0,0,0,0,r


### Count Or Frequency Encoding

In [70]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [71]:
columns=[1,3,5,6,7,8,9,13]

In [75]:
train_set=train_set[columns]

In [76]:
train_set.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [77]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [78]:
for col in train_set.columns:
    print(col,':',len(train_set[col].unique()),' labels')

Employment : 9  labels
Degree : 16  labels
Status : 7  labels
Designation : 15  labels
family_job : 6  labels
Race : 5  labels
Sex : 2  labels
Country : 42  labels


In [79]:
country_map = train_set['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' Greece': 29,
 ' France': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Thailand': 18,
 ' Laos': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Hungary': 13,
 ' Honduras': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [80]:
train_set['Country']=train_set['Country'].map(country_map)
train_set.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['Country']=train_set['Country'].map(country_map)


Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170



#### Advantages
- Easy To Use
- Not increasing feature space 

##### Disadvantages
- It will provide same weight if the frequencies are same

### Target Guided Ordinal Encoding

- Ordering the labels according to the target
- Replace the labels by the joint probability of being 1 or 0

In [82]:
import pandas as pd
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [83]:
df['Cabin'].fillna('Missing',inplace=True)

In [84]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [88]:
print(type(df['Cabin']))

<class 'numpy.ndarray'>


In [90]:
df['Cabin']=df['Cabin'].astype(str).str[0]

In [91]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [92]:
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [94]:
df.groupby('Cabin')['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [95]:
df.groupby('Cabin')['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [97]:
ordinal_label = df.groupby('Cabin')['Survived'].mean().sort_values().index
ordinal_label

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [101]:
for i,k in enumerate(ordinal_label):
    print(i,', k: ',k) 

0 , k:  T
1 , k:  M
2 , k:  A
3 , k:  G
4 , k:  C
5 , k:  F
6 , k:  B
7 , k:  E
8 , k:  D


In [102]:
ordinal_label2 = {k:i for i,k in enumerate(ordinal_label)}
ordinal_label2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [103]:
df['cabin_ordinal_labels']=df['Cabin'].map(ordinal_label2)

In [104]:
df.head()

Unnamed: 0,Survived,Cabin,cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [107]:
mean_ordinal = df.groupby('Cabin')['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [108]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)

In [109]:
df.head()

Unnamed: 0,Survived,Cabin,cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
