##### Handle Categorical Features
###### One Hot Encoding

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('titanic.csv',usecols=['Sex'])

In [3]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
pd.get_dummies(df)

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [5]:
#Keeping both dummy columns makes them perfectly collinear (Sex_female = 1 - Sex_male):
#this is the dummy variable trap. drop_first=True drops the first level to avoid it.
pd.get_dummies(df,drop_first=True)

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [6]:
df=pd.read_csv('titanic.csv',usecols=['Embarked'])
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [7]:
#see how many levels (distinct values, NaN included) this variable contains
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
#Embarked has missing entries; keep only the rows where it is present
df=df.dropna()

In [9]:
pd.get_dummies(df).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [10]:
#convert to one-hot encoding and remove one feature
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#### Disadvantage: curse of dimensionality
The more levels a categorical variable has, the more features one-hot encoding creates.
A variable with n levels produces n-1 features (n features if `drop_first` is not used).

In [11]:
#### One-hot encoding with many categories in a feature

In [12]:
df=pd.read_csv('mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])

In [13]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [14]:
#number of distinct categories in each variable
for column_name in df.columns:
    distinct_values=df[column_name].unique()
    print(len(distinct_values))

47
27
44
7
4
29
12


In [15]:
#One way to handle a high-cardinality variable: keep only the 10 most frequent
#categories and experiment with those (approach noted here as coming from a KDD cup challenge).
#NOTE: value_counts() already returns counts in descending order, so the extra sort is redundant.
df.X1.value_counts().sort_values(ascending=False).head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [16]:
#names of the 10 most frequent X1 categories, most frequent first
x1_counts=df['X1'].value_counts().sort_values(ascending=False)
lst_10=x1_counts.index[:10].tolist()

In [17]:
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [18]:
import numpy as np

#add one 0/1 indicator column per frequent X1 category;
#rows whose X1 is not in lst_10 get 0 in every new column
for label in lst_10:
    is_label=df['X1']==label
    df[label]=np.where(is_label,1,0)

In [19]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [20]:
!pip install jovian --upgrade
import jovian
jovian.commit(project='Handling Categorical Features')

Requirement already up-to-date: jovian in c:\users\vipul\anaconda3\lib\site-packages (0.2.16)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..
[jovian] Please enter your API key ( from https://jovian.ml/ ):
API KEY: ········
[jovian] Creating a new project "vipul0036vipul/Handling Categorical Features"
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/vipul0036vipul/handling-categorical-features


'https://jovian.ml/vipul0036vipul/handling-categorical-features'

### Ordinal Number Encoding

In [35]:
#the cells below call datetime.datetime / datetime.timedelta, so the module must be
#bound under its full name; with only the dt alias they raise NameError on a fresh kernel
import datetime
import datetime as dt

In [36]:
today_date=datetime.datetime.today()

In [37]:
today_date

datetime.datetime(2020, 7, 24, 20, 29, 45, 799305)

In [38]:
#subtract a timedelta to move back in time; timedelta(3) is a span of 3 days
today_date-datetime.timedelta(3)

datetime.datetime(2020, 7, 21, 20, 29, 45, 799305)

In [39]:
#the 15 most recent dates, newest first: today, yesterday, ... 14 days ago
days=[]
for days_back in range(15):
    days.append(today_date-datetime.timedelta(days_back))
days

[datetime.datetime(2020, 7, 24, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 23, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 22, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 21, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 20, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 19, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 18, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 17, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 16, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 15, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 14, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 13, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 12, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 11, 20, 29, 45, 799305),
 datetime.datetime(2020, 7, 10, 20, 29, 45, 799305)]

In [40]:
import pandas as pd

#single-column frame holding the generated dates
data=pd.DataFrame(days,columns=['Day'])

In [41]:
data

Unnamed: 0,Day
0,2020-07-24 20:29:45.799305
1,2020-07-23 20:29:45.799305
2,2020-07-22 20:29:45.799305
3,2020-07-21 20:29:45.799305
4,2020-07-20 20:29:45.799305
5,2020-07-19 20:29:45.799305
6,2020-07-18 20:29:45.799305
7,2020-07-17 20:29:45.799305
8,2020-07-16 20:29:45.799305
9,2020-07-15 20:29:45.799305


In [50]:
data['weekday']=data['Day'].dt.day_name()

In [51]:
data.head()

Unnamed: 0,Day,weekday
0,2020-07-24 20:29:45.799305,Friday
1,2020-07-23 20:29:45.799305,Thursday
2,2020-07-22 20:29:45.799305,Wednesday
3,2020-07-21 20:29:45.799305,Tuesday
4,2020-07-20 20:29:45.799305,Monday


In [52]:
#ordinal code per weekday name: Monday=1 ... Sunday=7
weekday_names=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
dictionary={name:position for position,name in enumerate(weekday_names,start=1)}

In [53]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [54]:
data['weekday_ordinal']=data['weekday'].map(dictionary)

In [56]:
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2020-07-24 20:29:45.799305,Friday,5
1,2020-07-23 20:29:45.799305,Thursday,4
2,2020-07-22 20:29:45.799305,Wednesday,3
3,2020-07-21 20:29:45.799305,Tuesday,2
4,2020-07-20 20:29:45.799305,Monday,1


### Count or Frequency Encoding

In [57]:
#adult.data has no header row and separates fields with ", " (comma + space);
#skipinitialspace=True strips that space so categories read as 'Private', not ' Private'
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None,skipinitialspace=True)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [58]:
columns=[1,3,5,6,7,8,9,13]

In [59]:
train_set=train_set[columns]

In [61]:
train_set.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [62]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [71]:
#cardinality (number of distinct values) of every feature
for feature in train_set.columns:
    n_levels=len(train_set[feature].unique())
    print(feature,':',n_levels)

Employment : 9
Degree : 16
Status : 7
Designation : 15
family_job : 6
Race : 5
Sex : 2
Country : 42


In [74]:
country_map=train_set['Country'].value_counts().to_dict()

In [75]:
train_set['Country']=train_set['Country'].map(country_map)

In [77]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


### Advantages
* Easy To Use
* Not increasing feature space
### Disadvantages
* Categories with the same frequency receive the same encoded value, so the model cannot tell them apart

### Target Guided Ordinal Encoding
* 1. Ordering the labels according to the target
* 2. Replace each label by its position in that ordering (mean encoding, further below, uses the probability of the target being 1 instead)

In [88]:
import pandas as pd
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [89]:
#fill missing value with keyword 'Missing'
#assign the result back: fillna(inplace=True) on df['Cabin'] is chained assignment,
#which is deprecated and leaves df unchanged under pandas copy-on-write
df['Cabin']=df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [90]:
#keep only the first character of each cabin value (e.g. 'C85' -> 'C', 'Missing' -> 'M')
cabin_text=df['Cabin'].astype(str)
df['Cabin']=cabin_text.str.get(0)
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [94]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [108]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [110]:
df.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [105]:
ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [106]:
#rank each cabin letter by its mean survival: the lowest mean gets 0, the highest gets the largest rank
ordinal_labels2=dict(zip(ordinal_labels,range(len(ordinal_labels))))
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [107]:
#replace each cabin letter with its rank in the ordering by mean survival rate
#(the ranks come from the Survived mean per cabin, not from category frequency)
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### Mean Encoding

In [111]:
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [112]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [114]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


#### Advantage 
* It captures information about the target within the label, which yields more predictive features.
* It creates a monotonic relationship between the variable and the target.

#### DisAdvantage
* It is prone to overfitting


In [115]:
#re-upload the notebook to the same Jovian project (relies on jovian imported in an earlier cell)
jovian.commit(project='Handling Categorical Features')

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..
[jovian] Updating notebook "vipul0036vipul/handling-categorical-features" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/vipul0036vipul/handling-categorical-features


'https://jovian.ml/vipul0036vipul/handling-categorical-features'

### Probability Ratio Encoding

* 1. Probability of Survived based on cabin ---> categorical Feature
* 2. Probability of not Survived ---> (1-prob(survived))
* 3. Prob(Survived) / Prob(Not Survived)
* 4. Dictionary to map cabin with probability
* 5. Replace the categorical feature with the mapped probability ratio

In [130]:
import pandas as pd
df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [131]:
#Replace NaN with keyword 'Missing'
#assign the result back: fillna(inplace=True) on df['Cabin'] is chained assignment,
#which is deprecated and leaves df unchanged under pandas copy-on-write
df['Cabin']=df['Cabin'].fillna('Missing')

In [132]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [128]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [135]:
df['Cabin']=df['Cabin'].astype(str).str[0]

In [136]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [139]:
prob_df=df.groupby(['Cabin'])['Survived'].mean()

In [140]:
prob_df=pd.DataFrame(prob_df)

In [141]:
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [142]:
prob_df['Died']=1-prob_df['Survived']

In [144]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [146]:
#odds of surviving per cabin letter: P(survived)/P(died)
#NOTE(review): a category where everyone survived has Died == 0, which makes this ratio inf - confirm none occur
prob_df['Probability_ratio']=prob_df['Survived']/prob_df['Died']

In [147]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [150]:
probability_encoded=prob_df['Probability_ratio'].to_dict()

In [151]:
df['Cabin_encoder']=df['Cabin'].map(probability_encoded)

In [152]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encoder
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274


In [153]:
#re-upload the notebook to the same Jovian project (relies on jovian imported in an earlier cell)
jovian.commit(project='Handling Categorical Features')

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..
[jovian] Updating notebook "vipul0036vipul/handling-categorical-features" on https://jovian.ml/
[jovian] Uploading notebook..
[jovian] Capturing environment..
[jovian] Committed successfully! https://jovian.ml/vipul0036vipul/handling-categorical-features


'https://jovian.ml/vipul0036vipul/handling-categorical-features'