In [44]:
# Turning string-valued data into numbers
# String features come in two flavours: ordinal (ordered) and nominal (unordered).
# Each flavour gets its own string-to-number transformation in the cells below.
import pandas as pd
# toy dataset, written column by column: two string features, one numeric
# feature and a string class label
df = pd.DataFrame({
    'color': ['green', 'red', 'blue', 'yellow'],
    'size': ['M', 'S', 'L', 'XL'],
    'price': [10.1, 9.1, 12.9, 14.8],
    'label': ['class2', 'class3', 'class1', 'class1'],
})
print(df)

    color size  price   label
0   green    M   10.1  class2
1     red    S    9.1  class3
2    blue    L   12.9  class1
3  yellow   XL   14.8  class1


In [45]:
# ordinal feature
# ---------------

# Series.map() with a dictionary swaps every value for the entry stored under
# that key, so each size string is replaced by its numerical rank
size_mapping = dict(XL=4, L=3, M=2, S=1)
df['size'] = df['size'].map(size_mapping)
print(df, '\n')

# flip keys and values to build the reverse lookup, then map the ranks back
# to the original size strings
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'] = df['size'].map(inv_size_mapping)
print(df, '\n')

    color  size  price   label
0   green     2   10.1  class2
1     red     1    9.1  class3
2    blue     3   12.9  class1
3  yellow     4   14.8  class1 

    color size  price   label
0   green    M   10.1  class2
1     red    S    9.1  class3
2    blue    L   12.9  class1
3  yellow   XL   14.8  class1 



In [46]:
# nominal feature and label
# -------------------------
import numpy as np

# class labels have no order, so each unique label simply receives its
# position in np.unique()'s sorted output; .map() applies the lookup as before
label_mapping = {name: code for code, name in enumerate(np.unique(df['label']))}
df['label'] = df['label'].map(label_mapping)
print(df, '\n')

# inverse again: swap keys and values to restore the original label strings
inv_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))
df['label'] = df['label'].map(inv_label_mapping)
print(df, '\n')

# the same encoding via scikit-learn's LabelEncoder
# (note: after this step df['label'] stays integer-encoded)
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
df['label'] = LE.fit_transform(df['label'].values)
print(df)

    color size  price  label
0   green    M   10.1      1
1     red    S    9.1      2
2    blue    L   12.9      0
3  yellow   XL   14.8      0 

    color size  price   label
0   green    M   10.1  class2
1     red    S    9.1  class3
2    blue    L   12.9  class1
3  yellow   XL   14.8  class1 

    color size  price  label
0   green    M   10.1      1
1     red    S    9.1      2
2    blue    L   12.9      0
3  yellow   XL   14.8      0


In [47]:
# nominal problem 1
# -----------------
# machine learning algorithms treat integer-coded features as ordinal;
# a nominal feature must not pick up an artificial order when it is encoded

# one-hot encoder
# turns a nominal feature into one binary indicator column per category
# (applied here to the single 'color' column)
from sklearn.preprocessing import OneHotEncoder
OHEncoder = OneHotEncoder()
X = df.loc[:, ['color', 'size', 'price']].values
# X[:, [0]] keeps the color column two-dimensional, as the encoder expects;
# each color is now represented by four 0/1 indicator values
OHE_outcome = OHEncoder.fit_transform(X[:, [0]]).toarray()
print(OHE_outcome, '\n')

# Column Transformer
# each entry has the shape ('name', transformer or 'passthrough', [columns]),
# which lets different columns be handled differently in one call
from sklearn.compose import ColumnTransformer
CT = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(), [0]),      # one-hot encode the color column
    ('nothing', 'passthrough', [1, 2]),    # 'passthrough' leaves size and price untouched
])
CT_outcome = CT.fit_transform(X)
print(CT_outcome, '\n')

# pd.get_dummies() pandas function
# expands every string column into binary indicator columns and leaves the
# numeric column as it is
pd_outcome = pd.get_dummies(df[['color', 'size', 'price']])
print(pd_outcome)

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]] 

[[0.0 1.0 0.0 0.0 'M' 10.1]
 [0.0 0.0 1.0 0.0 'S' 9.1]
 [1.0 0.0 0.0 0.0 'L' 12.9]
 [0.0 0.0 0.0 1.0 'XL' 14.8]] 

   price  color_blue  color_green  color_red  color_yellow  size_L  size_M  \
0   10.1           0            1          0             0       0       1   
1    9.1           0            0          1             0       0       0   
2   12.9           1            0          0             0       1       0   
3   14.8           0            0          0             1       0       0   

   size_S  size_XL  
0       0        0  
1       1        0  
2       0        0  
3       0        1  


In [48]:
# nominal problem 2
# -----------------
# Multicollinearity: the indicator columns of one feature are highly
# correlated, which causes trouble for methods that need a matrix inverse

# dropping one indicator column per feature loses no information,
# because the dropped category is implied when all remaining columns are 0
pd_outcome_drop1 = pd.get_dummies(
    df[['price', 'color', 'size']],
    drop_first=True,
)
print(pd_outcome_drop1, '\n')

# the scikit-learn equivalent: drop='first' removes the first category's column
OHEncoder_drop1 = OneHotEncoder(drop='first', categories='auto')
CT_drop1 = ColumnTransformer(transformers=[
    ('onehot', OHEncoder_drop1, [0]),
    ('nothing', 'passthrough', [1, 2]),
])
CT_outcome_drop1 = CT_drop1.fit_transform(X)
print(CT_outcome_drop1)

   price  color_green  color_red  color_yellow  size_M  size_S  size_XL
0   10.1            1          0             0       1       0        0
1    9.1            0          1             0       0       1        0
2   12.9            0          0             0       0       0        0
3   14.8            0          0             1       0       0        1 

[[1.0 0.0 0.0 'M' 10.1]
 [0.0 1.0 0.0 'S' 9.1]
 [0.0 0.0 0.0 'L' 12.9]
 [0.0 0.0 1.0 'XL' 14.8]]
