# Setup

In [161]:
import numpy as np
import pandas as pd

import sklearn.preprocessing 

import sklearn.model_selection
import sklearn.dummy

# https://github.com/yang-zhang/utils-python
import utils_yz.base
import utils_yz.explore
import utils_yz.preprocessing


## Get Data

In [162]:
df_train = pd.DataFrame({
    'letter': [
        'a',
        'b',
        'c',
    ],
    'animal': [
        'dog',
        'cat',
        'dog',
    ],
    'color': [
        'red',
        'green',
        'blue',
    ],
    'number': [1., 2.5, 3.],
    'target': [True, False, True]
})

df_test = pd.DataFrame({
    'letter': [
        'a',
        'b',
    ],
    'animal': [
        'dog',
        'pig',
    ],
    'color': [
        'red',
        'green',
    ],
    'number': [2.5, 3.5],
})
df_train
df_test

Unnamed: 0,animal,color,letter,number,target
0,dog,red,a,1.0,True
1,cat,green,b,2.5,False
2,dog,blue,c,3.0,True


Unnamed: 0,animal,color,letter,number
0,dog,red,a,2.5
1,pig,green,b,3.5


## Classifier

In [163]:
categorical_cols = ['letter', 'animal', 'color']
y = df_train['target']
clf = sklearn.dummy.DummyClassifier()

# pandas

## Use `get_dummies` in pandas to do one-hoe encoding

In [164]:
X, X_test = utils_yz.preprocessing.get_dummies_train_test(
    df_train, df_test, cat_cols=categorical_cols)
X
X_test

Unnamed: 0,number,letter_a,letter_b,letter_c,animal_cat,animal_dog,animal_pig,color_blue,color_green,color_red
0,1.0,1,0,0,0,1,0,0,0,1
1,2.5,0,1,0,1,0,0,0,1,0
2,3.0,0,0,1,0,1,0,1,0,0


Unnamed: 0,number,letter_a,letter_b,letter_c,animal_cat,animal_dog,animal_pig,color_blue,color_green,color_red
0,2.5,1,0,0,0,1,0,0,0,1
1,3.5,0,1,0,0,0,1,0,1,0


In [165]:
clf.fit(X, y).predict(X_test)

array([ True, False], dtype=bool)

## Do label encoding in pandas

In [217]:
utils_yz.preprocessing.label_encode(df_train, cat_cols=categorical_cols)

Unnamed: 0,animal,color,letter,number,target
0,1,2,0,1.0,True
1,0,1,1,2.5,False
2,1,0,2,3.0,True


In [218]:
utils_yz.preprocessing.label_encode(df_test, cat_cols=categorical_cols)

Unnamed: 0,animal,color,letter,number
0,0,1,0,2.5
1,1,0,1,3.5


In [208]:
df_train_label_encoded, df_test_label_encoded = utils_yz.preprocessing.label_encode_train_test(
    df_train, df_test, cat_cols=categorical_cols)
df_train_label_encoded
df_test_label_encoded

Unnamed: 0,animal,color,letter,number
0,1,2,0,1.0
1,0,1,1,2.5
2,1,0,2,3.0


Unnamed: 0,animal,color,letter,number
0,1,2,0,2.5
1,2,1,1,3.5


# sklearn: first  `LabelEncoder` then `OneHotEncoder`

## When train has all the values in test

In [166]:
col = 'letter'

### `LabelEncoder`

In [167]:
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.fit(df_train[col])
train_col_label_encoder = label_encoder.transform(df_train[col])
train_col_label_encoder

LabelEncoder()

array([0, 1, 2])

In [168]:
test_col_label_encoder = label_encoder.transform(df_test[col])
test_col_label_encoder

array([0, 1])

### `OneHotEncoder`

In [169]:
onehot_encoder = sklearn.preprocessing.OneHotEncoder()

In [170]:
onehot_encoder.fit(train_col_label_encoder.reshape(-1, 1))

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [171]:
onehot_encoder.transform(train_col_label_encoder.reshape(-1, 1)).todense()

matrix([[ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]])

In [172]:
onehot_encoder.transform(test_col_label_encoder.reshape(-1, 1)).todense()

matrix([[ 1.,  0.,  0.],
        [ 0.,  1.,  0.]])

## When train does not have all the values in test

In [173]:
col = 'animal'

### `LabelEncoder`

In [174]:
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.fit(df_train[col])
train_col_label_encoder = label_encoder.transform(df_train[col])
train_col_label_encoder

LabelEncoder()

array([1, 0, 1])

In [175]:
# error
test_col_label_encoder = label_encoder.transform(df_test[col])

ValueError: y contains new labels: ['pig']

In [176]:
label_encoder.classes_
df_test[col].unique()

array(['cat', 'dog'], dtype=object)

array(['dog', 'pig'], dtype=object)

To solve this problem, use both train and test to fit encoder.

In [177]:
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.fit(np.concatenate([df_train[col], df_test[col]]))

LabelEncoder()

In [178]:
train_col_label_encoder = label_encoder.transform(df_train[col])
train_col_label_encoder
test_col_label_encoder = label_encoder.transform(df_test[col])
test_col_label_encoder

array([1, 0, 1])

array([1, 2])

### `OneHotEncoder`

In [179]:
onehot_encoder = sklearn.preprocessing.OneHotEncoder()

In [183]:
# Use both train and test
onehot_encoder.fit(
    np.concatenate([train_col_label_encoder, test_col_label_encoder]).reshape(
        -1, 1))

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [181]:
onehot_encoder.transform(train_col_label_encoder.reshape(-1, 1)).todense()

matrix([[ 0.,  1.,  0.],
        [ 1.,  0.,  0.],
        [ 0.,  1.,  0.]])

In [182]:
onehot_encoder.transform(test_col_label_encoder.reshape(-1, 1)).todense()

matrix([[ 0.,  1.,  0.],
        [ 0.,  0.,  1.]])

# References
- http://stackoverflow.com/questions/35107559/scikit-learn-one-hot-encoding-of-string-categorical-features