# Data Encoding - Utkarsh Gaikwad

1. Nominal / One Hot Encoding
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding

### 1. Nominal / One Hot Encoding

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Build a one-column frame of colour labels to use as the encoding example
colors = ['red', 'blue', 'green', 'green', 'red', 'blue']
df = pd.DataFrame({'color': colors})

In [3]:
# Display the colour frame created in the previous cell
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [5]:
# Create an instance of scikit-learn's OneHotEncoder (OHE) with its default settings
encoder = OneHotEncoder()

In [9]:
# Learn the categories of the 'color' column and encode it in one step,
# then convert the encoder's output to a regular NumPy array
encoded_colors = encoder.fit_transform(df[['color']])
ohe_values = encoded_colors.toarray()

In [10]:
# Wrap the one-hot array in a DataFrame whose columns carry the feature names
# generated by the fitted encoder. pandas is already imported in the first
# cell, so it is not re-imported here. Reusing df's index keeps the rows
# aligned when the two frames are concatenated column-wise further down.
df_ohe = pd.DataFrame(
    ohe_values,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)
df_ohe

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [14]:
# Encode a single new observation. The encoder was fitted on a DataFrame with
# a 'color' column, so the new value is passed as a one-row DataFrame with the
# same column name rather than a bare nested list; this keeps the input's
# feature names consistent with the ones seen during fit.
encoder.transform(pd.DataFrame({'color': ['red']})).toarray()



array([[0., 0., 1.]])

In [15]:
# Show the original column next to its one-hot encoded columns
pd.concat([df, df_ohe], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


### One Hot Encoding (OHE) with pandas `get_dummies`

In [12]:
# pandas-only alternative: one indicator column per distinct colour.
# dtype=int pins the indicators to 0/1 integers, so the result looks the same
# on pandas versions whose default indicator dtype is bool.
df_dummies = pd.get_dummies(df['color'], dtype=int)
df_dummies

Unnamed: 0,blue,green,red
0,0,0,1
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,1,0,0


### Nominal Encoding for tips dataset

In [16]:
# Load seaborn's 'tips' example dataset (seaborn is imported in the first cell
# together with the other dependencies).
# Note: this rebinds `df`, replacing the colour frame used in the cells above.
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [17]:
# Print each column's dtype and non-null count to see which columns are categorical
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [18]:
# Collect the names of every column stored with the pandas 'category' dtype
categorical_cols = df.select_dtypes(include='category').columns.tolist()
categorical_cols

['sex', 'smoker', 'day', 'time']

In [19]:
# Create a separate OneHotEncoder (OHE) instance, with default settings, for the tips data
ohe = OneHotEncoder()

In [20]:
# Fit the encoder on all categorical columns at once and convert the encoded
# result to a regular NumPy array
tips_categoricals = df[categorical_cols]
ohe_values = ohe.fit_transform(tips_categoricals).toarray()

In [21]:
# Peek at the first five encoded rows
ohe_values[:5]

array([[1., 0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 1., 0.]])

In [22]:
# Label the encoded columns with the feature names produced by the fitted encoder
ohe_feature_names = ohe.get_feature_names_out()
df_ohe = pd.DataFrame(data=ohe_values, columns=ohe_feature_names)
df_ohe.head()

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [24]:
## Using pd.get_dummies
# Encode every categorical column in one call. dtype=int pins the indicators
# to 0/1 integers, so the result looks the same on pandas versions whose
# default indicator dtype is bool.
df_dummies = pd.get_dummies(df[categorical_cols], dtype=int)
df_dummies.head()

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1
