# 1. Unordered Categorical Data: One-Hot Encoding

### Preparation

In [1]:
# Import modules
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

## 1.1. Basics

In [4]:
# Generate data
feature = np.array((['a, b, c, d'],
                   ['a, b, c, d'],
                   ['i, j, k, l'],
                   ['m, n, o, p']))

feature

array([['a, b, c, d'],
       ['a, b, c, d'],
       ['i, j, k, l'],
       ['m, n, o, p']], dtype='<U10')

In [7]:
# Load One-Hot Encoder
one_hot = LabelBinarizer()

one_hot

In [8]:
# Transform to One-Hot Encoding
one_hot.fit_transform(feature)

one_hot.classes_

array(['a, b, c, d', 'i, j, k, l', 'm, n, o, p'], dtype='<U10')

## 1.2. Multi-class

In [17]:
# Generate data
multiclass_feature = [('apple', 'banana'),
                     ('pineapple', 'apple'),
                     ('mango', 'greenMango'),
                     ('Cherry', 'banana'),
                     ('apple', 'banana')]

multiclass_feature

[('apple', 'banana'),
 ('pineapple', 'apple'),
 ('mango', 'greenMango'),
 ('Cherry', 'banana'),
 ('apple', 'banana')]

In [41]:
# Generate One-Hot encoder
one_hot_multiclass = MultiLabelBinarizer()

# Transfer to One-Hot Encoding 
one_hot_multiclass.fit_transform(multiclass_feature)

# Check class
one_hot_data = one_hot_multiclass.classes_

print(one_hot_data)

['Cherry' 'apple' 'banana' 'greenMango' 'mango' 'pineapple']


## 1.3. One-Hot Encoding with LabelBinarizer from Scikit-Learn

In [19]:
# Import modules
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Generate data
str_feature = np.array([
                            ['Morning!', 1],
                           ['Night!', 4],
                           ['Afternoon!', 2],
                           ['Morning!', 1],
                           ['Evening!', 3]
                       ])

str_feature

array([['Morning!', '1'],
       ['Night!', '4'],
       ['Afternoon!', '2'],
       ['Morning!', '1'],
       ['Evening!', '3']], dtype='<U11')

In [86]:
# Dense Array Switching
one_hot_encoder = OneHotEncoder(sparse=False)  # Dense Array

one_hot_encoder


In [40]:
# One Hot encoder
one_hot_encoder.fit_transform(str_feature)
one_hot_encoder_data = one_hot_encoder.categories_

print(one_hot_encoder_data)

[array(['Afternoon!', 'Evening!', 'Morning!', 'Night!'], dtype='<U11'), array(['1', '2', '3', '4'], dtype='<U11')]




# 2. Ordered Categorical Data

## 2.1. Basics

In [31]:
# Import modules
import pandas as pd

In [32]:
# Create a dataframe
dataframe = pd.DataFrame({
    'Score': ['Low', 'Low', 'Medium', 'Medium', 'High']
})

dataframe

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High


In [33]:
# Generate a maaping dictionary
# -> Used to use for image labelling
scale_mapper = {
    'Low': 1,
    'Medium': 2,
    'High': 3
}

scale_mapper

{'Low': 1, 'Medium': 2, 'High': 3}

In [34]:
# Change featuers to integers
data = dataframe['Score'].replace(scale_mapper)

data

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

## 2.2. OrdinalEncoder

In [35]:
# Import modules
from sklearn.preprocessing import OrdinalEncoder

In [36]:
# Generate an array
features_array = np.array([['Low', 10],
                          ['High', 50],
                          ['Medium', 30]])

features_array

array([['Low', '10'],
       ['High', '50'],
       ['Medium', '30']], dtype='<U11')

In [39]:
# OrdinalEncoder()
ordinal_encoder = OrdinalEncoder()

ordinal_encoder.fit_transform(features_array)
ordinal_encoder_data = ordinal_encoder.categories_

print('Ordinal Encoder >> ', ordinal_encoder_data)

Ordinal Encoder >>  [array(['High', 'Low', 'Medium'], dtype='<U11'), array(['10', '30', '50'], dtype='<U11')]


## 2.3. DictVectorizer

In [42]:
# Import modules
from sklearn.feature_extraction import DictVectorizer

In [43]:
# Generate data
data_dict = [{'Red':2, 'Blue':4},
            {'Red':4, 'Blue':3},
            {'Red':1, 'Yellow':2},
            {'Red':1, 'Yellow':2}]

data_dict

[{'Red': 2, 'Blue': 4},
 {'Red': 4, 'Blue': 3},
 {'Red': 1, 'Yellow': 2},
 {'Red': 1, 'Yellow': 2}]

In [56]:
# Generate DictVectorizer
dictvectorizer = DictVectorizer(sparse=False)

dictvectorizer

In [58]:
# Transform Dictionary to Features
feature_dict = dictvectorizer.fit_transform(data_dict)

feature_dict

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 1., 2.]])

In [60]:
# Check names of features
feature_dict_name = dictvectorizer.get_feature_names_out()

feature_dict_name

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [62]:
dict_data = pd.DataFrame(feature_dict, columns=feature_dict_name)

dict_data

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,1.0,2.0


# 3. Replace Missing Values

## 3.1. Use K-Nearest Neighbor (KNN) Algorithm

- K-Nearest Neighbor
  - Predict missing values by using KNN (K-Nearest Neighbor)

In [63]:
# Import modules
from sklearn.neighbors import KNeighborsClassifier

In [64]:
# Generate Characteristic matrix with Categorical characteristics
x = np.array([[0, 2.10, 1.45],
             [1, 1.18, 1.33],
             [0, 1.22, 1.27],
             [1, -0.20, -1.15]])

x

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.2 , -1.15]])

In [65]:
# Add missing values
x_with_nan = np.array([[np.nan, 0.87, 1.33],
                      [np.nan, -0.67, -0.22]])

x_with_nan

array([[  nan,  0.87,  1.33],
       [  nan, -0.67, -0.22]])

In [71]:
# Modelling
clf = KNeighborsClassifier(3,  # k=3
                           weights='distance')

clf

In [70]:
# Check
print(x[:, 0], '\n')  # Label
print(x[:,1:])  # Data

[0. 1. 0. 1.] 

[[ 2.1   1.45]
 [ 1.18  1.33]
 [ 1.22  1.27]
 [-0.2  -1.15]]


In [73]:
# Fit model
train_model = clf.fit(x[:,1:], x[:,0])

train_model

In [75]:
# Prediction
imputed_values = train_model.predict(x_with_nan[:, 1:])

imputed_values

array([0., 1.])

In [76]:
# Merge imputed classes and original dataset
x_with_imputed = np.hstack((imputed_values.reshape(-1, 1), x_with_nan[:, 1:]))

data = np.vstack((x_with_imputed, x))

data

array([[ 0.  ,  0.87,  1.33],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.2 , -1.15]])

## 3.2. SimpleImputer

- Simple Imputer
  - Replace Missing Values with other values
  - Options
      - mean
      - median
      - most_frequent

In [77]:
# Import modules
from sklearn.impute import SimpleImputer

In [82]:
# Generate Characteristic matrix with Categorical characteristics
x = np.array([[0, 2.10, 1.45],
             [1, 1.18, 1.33],
             [0, 1.22, 1.27],
             [1, -0.20, -1.15]])

x

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.2 , -1.15]])

In [83]:
# Add missing values
x_with_nan = np.array([[np.nan, 0.87, 1.33],
                      [np.nan, -0.67, -0.22]])

x_with_nan

array([[  nan,  0.87,  1.33],
       [  nan, -0.67, -0.22]])

In [85]:
# Combine 'x' with missing values
x_complete = np.vstack((x_with_nan, x))

x_complete

array([[  nan,  0.87,  1.33],
       [  nan, -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.2 , -1.15]])

In [84]:
# Use Imputer
imputer = SimpleImputer(strategy="most_frequent")
data_imputer = imputer.fit_transform(x_complete)

data_imputer

array([[ 0.  ,  0.87,  1.33],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.2 , -1.15]])