In [1]:
from sklearn.preprocessing import LabelEncoder

labels = ['cat', 'dog', 'fish', 'dog', 'cat']
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

print(f"Encoded Labels : {encoded_labels}")  # Output: [0 1 2 1 0]

# To get back the original labels
original_labels = le.inverse_transform(encoded_labels)
print(f"Reversed Encoded Labels : {original_labels}")  # Output: ['cat' 'dog' 'fish' 'dog' 'cat']

Encoded Labels : [0 1 2 1 0]
Reversed Encoded Labels : ['cat' 'dog' 'fish' 'dog' 'cat']


In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Original data
colors = np.array(['red', 'green', 'blue', 'green']).reshape(-1, 1)

# Fit OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
encoded_colors = encoder.fit_transform(colors)

print("One-hot encoded:\n", encoded_colors)

# Reverse back to original labels
original_colors = encoder.inverse_transform(encoded_colors)
print("\nInverse transformed:\n", original_colors)

One-hot encoded:
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]

Inverse transformed:
 [['red']
 ['green']
 ['blue']
 ['green']]


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# LabelEncoder example (for target labels)
labels = ['cat', 'dog', 'fish']
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)
print("Label encoded:", encoded_labels)

# OneHotEncoder example (for categorical features)
colors = np.array(['red', 'green', 'blue', 'green']).reshape(-1, 1)
ohe = OneHotEncoder(sparse_output=False)
encoded_colors = ohe.fit_transform(colors)
print("\nOne-hot encoded:\n", encoded_colors)

Label encoded: [0 1 2]
One-hot encoded:
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [4]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

sizes = np.array(['small', 'medium', 'large', 'medium']).reshape(-1, 1)

encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
encoded_sizes = encoder.fit_transform(sizes)
print(encoded_sizes)
# Output:
# [[0.]
#  [1.]
#  [2.]
#  [1.]]

[[0.]
 [1.]
 [2.]
 [1.]]


In [5]:
from sklearn.preprocessing import LabelBinarizer

# Example labels (multiclass target)
labels = ['cat', 'dog', 'fish', 'dog', 'cat']

# Initialize LabelBinarizer
lb = LabelBinarizer()

# Fit and transform labels to one-hot binary matrix
binary_labels = lb.fit_transform(labels)

print("Original labels:")
print(labels)
print("\nOne-hot encoded binary matrix:")
print(binary_labels)

# Show classes found by the encoder
print("\nClasses:")
print(lb.classes_)

# Inverse transform back to original labels
original_labels = lb.inverse_transform(binary_labels)
print("\nInverse transformed labels:")
print(original_labels)

Original labels:
['cat', 'dog', 'fish', 'dog', 'cat']

One-hot encoded binary matrix:
[[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]]

Classes:
['cat' 'dog' 'fish']

Inverse transformed labels:
['cat' 'dog' 'fish' 'dog' 'cat']


In [6]:
# Example with binary labels
binary_example = ['yes', 'no', 'yes', 'no']
lb_binary = LabelBinarizer()
binary_encoded = lb_binary.fit_transform(binary_example)
print("\nBinary labels one-hot encoded:")
print(binary_encoded)

print("\nBinary classes:")
print(lb_binary.classes_)

print("\nInverse transformed binary labels:")
print(lb_binary.inverse_transform(binary_encoded))


Binary labels one-hot encoded:
[[1]
 [0]
 [1]
 [0]]

Binary classes:
['no' 'yes']

Inverse transformed binary labels:
['yes' 'no' 'yes' 'no']


In [7]:
from sklearn.preprocessing import FunctionTransformer

def to_uppercase(X):
    return np.char.upper(X)

func_transformer = FunctionTransformer(to_uppercase)

data = np.array(['cat', 'dog', 'fish']).reshape(-1, 1)
transformed_data = func_transformer.fit_transform(data)

print("\nCustom transformed data (to uppercase):\n", transformed_data)


Custom transformed data (to uppercase):
 [['CAT']
 ['DOG']
 ['FISH']]


In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder, FunctionTransformer
import numpy as np

# Sample data
labels = ['cat', 'dog', 'fish', 'dog', 'cat']          # For LabelEncoder & LabelBinarizer
colors = np.array(['red', 'green', 'blue', 'green']).reshape(-1, 1)  # For OneHotEncoder
sizes = np.array(['small', 'medium', 'large', 'medium']).reshape(-1, 1)  # For OrdinalEncoder
binary_labels = ['yes', 'no', 'yes', 'no']             # For binary LabelBinarizer

print("=== LabelEncoder ===")
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)
print("Original labels:", labels)
print("Encoded labels:", encoded_labels)
print("Inverse transform:", le.inverse_transform(encoded_labels))


=== LabelEncoder ===
Original labels: ['cat', 'dog', 'fish', 'dog', 'cat']
Encoded labels: [0 1 2 1 0]
Inverse transform: ['cat' 'dog' 'fish' 'dog' 'cat']


In [11]:
print("\n=== OneHotEncoder ===")
ohe = OneHotEncoder(sparse_output=False)
encoded_colors = ohe.fit_transform(colors)
print("Original colors:\n", colors.flatten())
print("One-hot encoded colors:\n", encoded_colors)
print("Inverse transform:\n", ohe.inverse_transform(encoded_colors))


=== OneHotEncoder ===
Original colors:
 ['red' 'green' 'blue' 'green']
One-hot encoded colors:
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
Inverse transform:
 [['red']
 ['green']
 ['blue']
 ['green']]


In [12]:
print("\n=== LabelBinarizer ===")
lb = LabelBinarizer()
binary_matrix = lb.fit_transform(labels)
print("Original labels:", labels)
print("Binarized labels:\n", binary_matrix)
print("Inverse transform:", lb.inverse_transform(binary_matrix))


=== LabelBinarizer ===
Original labels: ['cat', 'dog', 'fish', 'dog', 'cat']
Binarized labels:
 [[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]]
Inverse transform: ['cat' 'dog' 'fish' 'dog' 'cat']


In [13]:
print("\nBinary labels example with LabelBinarizer:")
lb_binary = LabelBinarizer()
binary_encoded = lb_binary.fit_transform(binary_labels)
print("Original binary labels:", binary_labels)
print("Binarized binary labels:\n", binary_encoded)
print("Inverse transform:", lb_binary.inverse_transform(binary_encoded))


Binary labels example with LabelBinarizer:
Original binary labels: ['yes', 'no', 'yes', 'no']
Binarized binary labels:
 [[1]
 [0]
 [1]
 [0]]
Inverse transform: ['yes' 'no' 'yes' 'no']


In [14]:
print("\n=== OrdinalEncoder ===")
ordinal_encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
encoded_sizes = ordinal_encoder.fit_transform(sizes)
print("Original sizes:\n", sizes.flatten())
print("Ordinal encoded sizes:\n", encoded_sizes)
print("Inverse transform:\n", ordinal_encoder.inverse_transform(encoded_sizes))


=== OrdinalEncoder ===
Original sizes:
 ['small' 'medium' 'large' 'medium']
Ordinal encoded sizes:
 [[0.]
 [1.]
 [2.]
 [1.]]
Inverse transform:
 [['small']
 ['medium']
 ['large']
 ['medium']]


In [15]:
print("\n=== FunctionTransformer ===")
def to_uppercase(X):
    return np.char.upper(X)

func_transformer = FunctionTransformer(to_uppercase)
data = np.array(['cat', 'dog', 'fish']).reshape(-1, 1)
transformed_data = func_transformer.fit_transform(data)
print("Original data:\n", data.flatten())
print("Custom transformed data (uppercase):\n", transformed_data.flatten())


=== FunctionTransformer ===
Original data:
 ['cat' 'dog' 'fish']
Custom transformed data (uppercase):
 ['CAT' 'DOG' 'FISH']


In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
import numpy as np

X = np.array([
    ['red', 10],
    ['green', 20],
    ['blue', 15],
    ['green', 25]
])

# Apply OneHotEncoder only to the first column (categorical)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False), [0])  # column 0 is categorical
    ],
    remainder='passthrough'  # pass through other columns unchanged
)

pipeline = make_pipeline(preprocessor, LogisticRegression())

# pipeline.fit(X, y)  # Fit with features X and target y

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import category_encoders as ce

# Create a simple dataset
df = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red', 'Green', 'Red', 'Blue'],
    'Size': ['S', 'M', 'L', 'M', 'L', 'S', 'M', 'L'],
    'Target': [1, 0, 1, 0, 1, 0, 1, 0]
})

X = df[['Color', 'Size']]
y = df['Target']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 1. OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
X_ohe = pd.DataFrame(ohe.fit_transform(X_train), columns=ohe.get_feature_names_out(X_train.columns))

# 2. OrdinalEncoder
oe = OrdinalEncoder()
X_ordinal = pd.DataFrame(oe.fit_transform(X_train), columns=X_train.columns)

# 3. TargetEncoder
te = ce.TargetEncoder()
X_target = te.fit_transform(X_train, y_train)

# 4. Frequency/CountEncoder
fe = ce.CountEncoder()
X_freq = fe.fit_transform(X_train)

# 5. BinaryEncoder
be = ce.BinaryEncoder()
X_binary = be.fit_transform(X_train)

# 6. HashingEncoder
he = ce.HashingEncoder(n_components=6)
X_hash = he.fit_transform(X_train)

In [4]:
# OneHotEncoder:
X_ohe.head()

Unnamed: 0,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
#OrdinalEncoder:
X_ordinal.head()

Unnamed: 0,Color,Size
0,2.0,2.0
1,0.0,0.0
2,1.0,0.0
3,2.0,0.0
4,0.0,1.0


In [6]:
#TargetEncoder:
X_target.head()

Unnamed: 0,Color,Size
0,0.718155,0.710036
7,0.572099,0.666667
2,0.710036,0.666667
4,0.718155,0.666667
3,0.572099,0.643025


In [7]:
#FrequencyEncoder:
X_freq.head()

Unnamed: 0,Color,Size
0,3,1
7,2,3
2,1,3
4,3,3
3,2,2


In [8]:
#BinaryEncoder:
X_binary.head()

Unnamed: 0,Color_0,Color_1,Size_0,Size_1
0,0,1,0,1
7,1,0,1,0
2,1,1,1,0
4,0,1,1,0
3,1,0,1,1


In [9]:
#HashingEncoder:
X_hash.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
0,0,0,0,0,1,1
7,0,0,0,2,0,0
2,0,0,0,2,0,0
4,0,0,0,1,0,1
3,0,0,1,1,0,0
