## Ordinal Encoder

In [1]:
import pandas as pd
# load example long jump dataset
df = pd.read_csv("./data/long_jump.csv")
df.set_index('Person', inplace = True)

In [2]:
# filter in categorical columns for demonstration
cats = ['Jersey Size', 'Shoe Size']
print(df[cats])

        Jersey Size  Shoe Size
Person                        
Thomas        small          7
Jane         medium         10
Vaughn        large         12
Vera         medium          9
Vincent       large         12
Lei-Ann       small          7


In [3]:
# import module and instatiate enc object
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

In [4]:
#fit and transform in one call and print categories
out_enc = enc.fit_transform(df[cats])
print('identified categories:')
print(enc.categories_)
print('encoded_data:')
print(out_enc)

identified categories:
[array(['large', 'medium', 'small'], dtype=object), array([ 7,  9, 10, 12])]
encoded_data:
[[2. 0.]
 [1. 2.]
 [0. 3.]
 [1. 1.]
 [0. 3.]
 [2. 0.]]


In [5]:
# overwrite categorical features in original dataframe
df[cats] = out_enc
print(df.head())

         Age  Height  Weight  Training Hours/week Jersey Color  Jersey Size  \
Person                                                                        
Thomas    12    57.5    73.4                  6.5         blue          2.0   
Jane      13    65.5    85.3                  8.9        green          1.0   
Vaughn    17    71.9   125.9                  1.1        green          0.0   
Vera      14    65.3   100.5                  7.9          red          1.0   
Vincent   18    70.1   110.7                 10.5         blue          0.0   

         Shoe Size  Long Jump  
Person                         
Thomas         0.0       19.2  
Jane           2.0       25.1  
Vaughn         3.0       14.3  
Vera           1.0       18.3  
Vincent        3.0       21.1  


## One-Hot Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse = False)

out_enc = enc.fit_transform(df[cats])
new_cols = enc.get_feature_names(cats).tolist()
print(new_cols)

['Jersey Size_0.0', 'Jersey Size_1.0', 'Jersey Size_2.0', 'Shoe Size_0.0', 'Shoe Size_1.0', 'Shoe Size_2.0', 'Shoe Size_3.0']


In [7]:
# create temporary dataframe "df_enc" for concatenation with original data
df_enc = pd.DataFrame(data= out_enc, columns = new_cols)
df_enc.index = df.index

#drop original columns and concat new encoded columns
df. drop(cats, axis=1, inplace = True)
df = pd.concat([df,df_enc], axis =1 )
print(df.columns)

Index(['Age', 'Height', 'Weight', 'Training Hours/week', 'Jersey Color',
       'Long Jump', 'Jersey Size_0.0', 'Jersey Size_1.0', 'Jersey Size_2.0',
       'Shoe Size_0.0', 'Shoe Size_1.0', 'Shoe Size_2.0', 'Shoe Size_3.0'],
      dtype='object')


## Lable Encoding

In [8]:
from sklearn import preprocessing
enc = preprocessing.LableEncoder()

out_enc = enc.fit_transform([1,2,5,2,4,2,5])
print(out_enc)

out_enc = enc.fit_transform(['blue','red','blue','green','red','red'])
print(out_enc)

AttributeError: module 'sklearn.preprocessing' has no attribute 'LableEncoder'