<a href="https://colab.research.google.com/github/utkarshkant/Helpful-Python/blob/master/Encoding_Categorical_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Label Encoding

In [55]:
# necessary imports
from sklearn.preprocessing import LabelEncoder

# Sample data
data = ["Red", "Green", "Blue", "Red", "Green"]
print(data)     # Output: ['Red', 'Green', 'Blue', 'Red', 'Green']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the data
encoded_data = label_encoder.fit_transform(data)
print(encoded_data)  # Output: [2, 1, 0, 2, 1]
print(label_encoder.classes_)   # Output: ['Blue' 'Green' 'Red']

['Red', 'Green', 'Blue', 'Red', 'Green']
[2 1 0 2 1]
['Blue' 'Green' 'Red']


# 2. One-Hot Encoding or Dummy Encoding

In [56]:
# necessary imports
import pandas as pd

# Sample data
data = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Red', 'Green']})

# Perform one-hot encoding
encoded_data = pd.get_dummies(data, columns=['Color'])

In [57]:
data

Unnamed: 0,Color
0,Red
1,Green
2,Blue
3,Red
4,Green


In [58]:
encoded_data

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0


## Dummy Variable Trap

In [59]:
import pandas as pd

# Sample data
data = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Red', 'Green']})

# Perform one-hot encoding
encoded_data = pd.get_dummies(data, columns=['Color'], drop_first=True)

In [60]:
encoded_data

Unnamed: 0,Color_Green,Color_Red
0,0,1
1,1,0
2,0,0
3,0,1
4,1,0


# 3. Binary Encoding

In [61]:
!pip install category_encoders -q

In [62]:
# necessary imports
import category_encoders as ce
import pandas as pd

# Sample data
data = pd.DataFrame({'Country': ['USA', 'Canada', 'UK', 'USA', 'UK']})

# Initialize the binary encoder
encoder = ce.BinaryEncoder(cols=['Country'])

# Fit and transform the data
encoded_data = encoder.fit_transform(data)

In [63]:
data

Unnamed: 0,Country
0,USA
1,Canada
2,UK
3,USA
4,UK


In [64]:
encoded_data

Unnamed: 0,Country_0,Country_1
0,0,1
1,1,0
2,1,1
3,0,1
4,1,1


# 4. Ordinal Encoding

In [65]:
# necessary imports
import category_encoders as ce
import pandas as pd

# Sample data
data = pd.DataFrame({"Education Level": ["High School", "Bachelor's Degree", "Master's Degree", "PhD", "Associate's Degree"]})

# Define the ordinal encoding mapping
education_mapping = {
    'High School': 0,
    "Associate's Degree": 1,
    "Bachelor's Degree": 2,
    "Master's Degree": 3,
    'PhD': 4
}

# Perform ordinal encoding
encoder = ce.OrdinalEncoder(mapping=[{'col': 'Education Level', 'mapping': education_mapping}])
encoded_data = encoder.fit_transform(data)

In [66]:
data

Unnamed: 0,Education Level
0,High School
1,Bachelor's Degree
2,Master's Degree
3,PhD
4,Associate's Degree


In [67]:
encoded_data

Unnamed: 0,Education Level
0,0
1,2
2,3
3,4
4,1


# 5. Frequency Encoding or Count Encoding

In [68]:
# imports
import pandas as pd

# Sample data
data = pd.DataFrame({'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles', 'Chicago', 'Chicago', 'New York', 'New York']})

# frequency encoding
frequency_encoding = data['City'].value_counts().to_dict()
data['Encoded_City'] = data['City'].map(frequency_encoding)

In [69]:
data

Unnamed: 0,City,Encoded_City
0,New York,4
1,Los Angeles,2
2,Chicago,3
3,New York,4
4,Los Angeles,2
5,Chicago,3
6,Chicago,3
7,New York,4
8,New York,4


In [70]:
data

Unnamed: 0,City,Encoded_City
0,New York,4
1,Los Angeles,2
2,Chicago,3
3,New York,4
4,Los Angeles,2
5,Chicago,3
6,Chicago,3
7,New York,4
8,New York,4


In [71]:
encoded_data

Unnamed: 0,Education Level
0,0
1,2
2,3
3,4
4,1


# 6. Target Encoding or Mean Encoding

In [72]:
# imports
import category_encoders as ce
import pandas as pd

# Sample data
data = pd.DataFrame({'Region': ['North', 'South', 'East', 'West', 'North', 'South'],
                     'Churn': [0, 1, 0, 1, 0, 1]})

# Perform target encoding
encoder = ce.TargetEncoder(cols=['Region'])
encoded_data = encoder.fit_transform(data, data['Churn'])

In [73]:
data

Unnamed: 0,Region,Churn
0,North,0
1,South,1
2,East,0
3,West,1
4,North,0
5,South,1


In [74]:
encoded_data

Unnamed: 0,Region,Churn
0,0.429074,0
1,0.570926,1
2,0.434946,0
3,0.565054,1
4,0.429074,0
5,0.570926,1


# 7. Feature Hashing or Hashing Trick

In [75]:
import category_encoders as ce
import pandas as pd

# Sample data
data = pd.DataFrame({'Product Category': ['A', 'B', 'C', 'A', 'C', 'D', 'E', 'D', 'C', 'A']})

# Perform feature hashing with three columns
encoder = ce.HashingEncoder(cols=['Product Category'], n_components=3)
encoded_data = encoder.fit_transform(data)

In [76]:
data

Unnamed: 0,Product Category
0,A
1,B
2,C
3,A
4,C
5,D
6,E
7,D
8,C
9,A


In [77]:
encoded_data

Unnamed: 0,col_0,col_1,col_2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,0,0,1
6,0,1,0
7,0,0,1
8,0,1,0
9,0,1,0
