## Handle categorical features

### 1) One-hot encoding

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read only the "Sex" column from titanic.csv.
df = pd.read_csv("titanic.csv", usecols=["Sex"])

In [3]:
# Preview the first rows of the loaded column.
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
# One-hot encode the frame. drop_first=True keeps k-1 indicator columns,
# so the dropped level is implied when every remaining indicator is 0.
# dtype=int pins 0/1 integers: pandas >= 2.0 returns booleans by default.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [5]:
# Read only the "Embarked" column from titanic.csv (replaces the previous df).
df = pd.read_csv("titanic.csv", usecols=["Embarked"])

In [6]:
# Distinct values of the column; missing values show up as NaN.
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
# Drop rows with a missing value before encoding. Reassigning instead of
# using inplace=True keeps the operation explicit and chainable.
df = df.dropna()

In [8]:
# Preview the column after the rows with missing values were removed.
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [9]:
# One-hot encode the frame with k-1 indicator columns (drop_first=True);
# a row of all zeros stands for the dropped first level.
# dtype=int pins 0/1 integers: pandas >= 2.0 returns booleans by default.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#

### Performing one-hot encoding with many categories in a feature

In [10]:
# Columns to read from mercedesbenz.csv for the many-categories example.
feature_columns = ["X0", "X1", "X2", "X3", "X4", "X5", "X6"]
df = pd.read_csv("mercedesbenz.csv", usecols=feature_columns)

In [11]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [12]:
# Print the frequency table of every column to see how many categories
# each one has and how the rows are spread across them.
for column_name, column_values in df.items():
    print(column_values.value_counts())

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
i      18
am     18
aq     18
u      17
aw     16
l      16
ad     14
au     11
k      11
b      11
as     10
r      10
bc      6
ao      4
c       3
q       2
aa      2
ac      1
ab      1
g       1
Name: X0, dtype: int64
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
f      23
y      23
j      22
n      19
k      17
p       9
g       6
q       3
d       3
ab      3
Name: X1, dtype: int64
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z    

In [13]:
# Number of distinct values in X0 (NaN, if present, counts as one value).
df["X0"].nunique(dropna=False)

47

In [14]:
# Labels of the ten most frequent X1 categories, most frequent first.
# value_counts() already returns the counts in descending order, so the
# extra sort_values() call was redundant (and, being an unstable sort,
# could reorder categories with equal counts).
lst_10 = df["X1"].value_counts().head(10).index.tolist()

In [15]:
# Create one 0/1 indicator column per entry of lst_10; rows whose X1 value
# is not in the list end up with 0 in every indicator column.
for label in lst_10:
    df[label] = (df["X1"] == label).astype(int)

In [16]:
# Add the source column to the list so it can be displayed next to its
# indicator columns. Guarded so that re-running this cell does not append
# "X1" a second time.
if "X1" not in lst_10:
    lst_10.append("X1")

In [17]:
# Show the columns named in lst_10.
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


 #

### 2) Ordinal Number Encoding

In [18]:
import datetime

In [19]:
# Current local date and time (naive, no timezone); the rest of the
# section builds its example dates relative to this value.
today_date = datetime.datetime.now()
today_date

datetime.datetime(2021, 10, 24, 13, 14, 11, 12390)

In [20]:
# Example: the timestamp three days before today_date.
today_date - datetime.timedelta(days=3)

datetime.datetime(2021, 10, 21, 13, 14, 11, 12390)

In [21]:
# Fifteen consecutive timestamps: today_date and the 14 days before it,
# newest first.
days = []
for days_back in range(15):
    days.append(today_date - datetime.timedelta(days=days_back))

In [22]:
# Single-column frame holding the generated timestamps under "Day".
data = pd.DataFrame({"Day": days})

In [23]:
# Display the generated dates.
data

Unnamed: 0,Day
0,2021-10-24 13:14:11.012390
1,2021-10-23 13:14:11.012390
2,2021-10-22 13:14:11.012390
3,2021-10-21 13:14:11.012390
4,2021-10-20 13:14:11.012390
5,2021-10-19 13:14:11.012390
6,2021-10-18 13:14:11.012390
7,2021-10-17 13:14:11.012390
8,2021-10-16 13:14:11.012390
9,2021-10-15 13:14:11.012390


In [24]:
# Derive the weekday name of each timestamp; this is the categorical
# feature that gets ordinal-encoded below.
data["weekday"] = data["Day"].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2021-10-24 13:14:11.012390,Sunday
1,2021-10-23 13:14:11.012390,Saturday
2,2021-10-22 13:14:11.012390,Friday
3,2021-10-21 13:14:11.012390,Thursday
4,2021-10-20 13:14:11.012390,Wednesday


In [25]:
# Ordinal code for each weekday name: Monday -> 1 through Sunday -> 7.
weekday_names = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
dictionary = {name: position for position, name in enumerate(weekday_names, start=1)}
print(dictionary)

{'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}


In [26]:
# Replace each weekday name with its ordinal code from `dictionary`.
# Names missing from the mapping would become NaN.
data["weekday_ordinal"] = data["weekday"].map(dictionary)
# End the cell with the frame itself so the notebook renders it as a rich
# table instead of the plain-text print() output.
data

                          Day    weekday  weekday_ordinal
0  2021-10-24 13:14:11.012390     Sunday                7
1  2021-10-23 13:14:11.012390   Saturday                6
2  2021-10-22 13:14:11.012390     Friday                5
3  2021-10-21 13:14:11.012390   Thursday                4
4  2021-10-20 13:14:11.012390  Wednesday                3
5  2021-10-19 13:14:11.012390    Tuesday                2
6  2021-10-18 13:14:11.012390     Monday                1
7  2021-10-17 13:14:11.012390     Sunday                7
8  2021-10-16 13:14:11.012390   Saturday                6
9  2021-10-15 13:14:11.012390     Friday                5
10 2021-10-14 13:14:11.012390   Thursday                4
11 2021-10-13 13:14:11.012390  Wednesday                3
12 2021-10-12 13:14:11.012390    Tuesday                2
13 2021-10-11 13:14:11.012390     Monday                1
14 2021-10-10 13:14:11.012390     Sunday                7


#

### 3) Count or Frequency Encoding

In [27]:
# Load the UCI "adult" data, which has no header row (header=None gives
# integer column labels). The fields are separated by ", ", so
# skipinitialspace=True strips the leading blank that would otherwise be
# kept on every string value (e.g. " United-States", " ?").
train_set = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    skipinitialspace=True,
)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [28]:
 columns = [1, 3, 5, 6, 7, 8, 9, 13]

In [29]:
# Keep only the selected columns. The explicit copy makes the result an
# independent frame, so the later column assignment does not raise
# SettingWithCopyWarning.
train_set = train_set[columns].copy()

In [30]:
# Give the eight kept columns readable names (positional assignment).
train_set.columns = ["Employment", "Degree", "Status", "Designation", "Family_Job", "Race", "Sex", "Country"]

In [31]:
# Preview the renamed columns.
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [32]:
# Report how many distinct labels each column holds (NaN, if present,
# counts as a label).
for feature in train_set.columns:
    label_count = train_set[feature].nunique(dropna=False)
    print(feature, ":", label_count, "labels")
    

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_Job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [33]:
# Frequency of each country label, most frequent first.
train_set["Country"].value_counts()

 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [34]:
# Build the label -> occurrence-count lookup used for count encoding.
country_map = train_set["Country"].value_counts().to_dict()

In [35]:
# Count encoding: overwrite each country label with the number of rows
# that carry that label.
train_set["Country"] = train_set["Country"].map(country_map)

In [36]:
# Inspect the first 20 rows; "Country" now holds counts instead of names.
train_set.head(20)

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


#### Advantages
- Easy to implement.
- Does not increase the dimensionality of the feature space.

#### Disadvantages
- Categories that occur with the same frequency receive the same encoded value, so the model can no longer tell them apart.

#

### 4) Target Guided Ordinal Encoding
- Ordering the labels according to the target.
- Replace each label with its rank when the categories are sorted by the mean of the target (the probability of the target being 1).

In [37]:
# Read the categorical feature ("Cabin") together with the target
# ("Survived") from titanic.csv.
df = pd.read_csv("titanic.csv", usecols=["Cabin", "Survived"])
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
5,0,
6,0,E46
7,0,
8,1,
9,1,


In [38]:
# Treat a missing cabin as its own category. Assign the result back:
# calling fillna(inplace=True) on df["Cabin"] is a chained assignment that
# emits a FutureWarning in pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [39]:
# Collapse each cabin value to its first character.
cabin_text = df["Cabin"].astype(str)
df["Cabin"] = cabin_text.str.get(0)

In [40]:
# Preview the reduced cabin categories.
df.head(10)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [41]:
# Number of rows per cabin category.
df["Cabin"].value_counts()

M    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [42]:
# Mean of "Survived" per cabin category, i.e. which category has the
# highest probability of surviving.
df.groupby("Cabin")["Survived"].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [43]:
# Cabin categories ordered from lowest to highest mean of "Survived".
survival_rate = df.groupby(["Cabin"])["Survived"].mean()
ordinal_labels = survival_rate.sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [44]:
# Map each category to its position (starting at 0) in ordinal_labels.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))

In [45]:
# Target-guided ordinal encoding: replace each cabin category with its
# rank from ordinal_labels2, then display the frame.
df["Cabin_ordinal_labels"] = df["Cabin"].map(ordinal_labels2)
df

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
...,...,...,...
886,0,M,1
887,1,B,6
888,0,M,1
889,1,C,4


#

### 5) Mean Encoding

In [46]:
# Lookup of cabin category -> mean of "Survived" for that category.
# NOTE(review): the means are computed over every row of df; if the data is
# later split into train/test this would leak the target - confirm.
mean_ordinal = df.groupby(["Cabin"])["Survived"].mean().to_dict()

In [47]:
# Display the category -> mean lookup.
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [48]:
# Mean encoding: replace each cabin category with its mean of "Survived".
df["mean_ordinal_encode"] = df["Cabin"].map(mean_ordinal)

In [49]:
# Compare the rank-based and mean-based encodings side by side.
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


#

### 6) Probability Ratio Encoding

In [50]:
# Reload "Cabin" and "Survived" from titanic.csv to start from the raw
# values again (discards the columns added to df above).
df = pd.read_csv("titanic.csv", usecols=["Cabin", "Survived"])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [51]:
# Replace NaN with the placeholder category "Missing". Assign the result
# back: calling fillna(inplace=True) on df["Cabin"] is a chained assignment
# that emits a FutureWarning in pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [52]:
# Confirm that the missing cabins now read "Missing".
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [53]:
# Distinct raw cabin values, before reducing them to one character.
df["Cabin"].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [54]:
# Collapse each cabin value to its first character, then preview the frame.
cabin_text = df["Cabin"].astype(str)
df["Cabin"] = cabin_text.str.get(0)
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [55]:
# Distinct cabin categories after the reduction to one character.
df["Cabin"].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [57]:
# Mean of "Survived" per cabin category (a Series indexed by category).
prob_df = df.groupby("Cabin")["Survived"].mean()

In [58]:
# Turn the result into a DataFrame so further columns can be added to it.
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [59]:
# Complement of the "Survived" mean for each category.
prob_df["Died"] = 1 - prob_df["Survived"]

In [60]:
# Preview the two probability columns.
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [61]:
# Ratio of the "Survived" column to the "Died" column per category.
# NOTE: a category whose "Died" value is 0 would produce inf here
# (floating-point division by zero).
prob_df["Probability_ration"] = prob_df["Survived"] / prob_df["Died"]

In [62]:
# Preview the frame including the new ratio column.
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ration
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [64]:
# Lookup of cabin category -> ratio value.
probability_encoded = prob_df["Probability_ration"].to_dict()

In [65]:
# Probability-ratio encoding: replace each cabin category with its ratio.
df["Cabin_encoded"] = df["Cabin"].map(probability_encoded)

In [66]:
# Display the frame with the encoded column.
df

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
