In [1]:
from __future__ import print_function
%matplotlib inline
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import feature_extraction


# Turn off HTML table rendering so DataFrames are shown as plain text
pd.options.display.notebook_repr_html = False

# Set the default seaborn colour palette for the notebook
sns.set_palette([
    '#00A99D',
    '#F5CA0C',
    '#B6129F',
    '#76620C',
    '#095C57',
])

# Display the Python interpreter version this notebook runs on
sys.version

'3.5.4 | packaged by conda-forge | (default, Nov  4 2017, 10:19:47) [MSC v.1900 64 bit (AMD64)]'

## Example
We have a dataframe with four columns, of which only the `Value` column contains continuous data. To use this data for a machine learning task, we need to convert the categorical columns into binary one-hot columns. This way, each categorical value gets its own column, holding either 1 or 0 to denote whether that value applies to a particular row.

In [2]:
# Read the example dataset from CSV into a dataframe and
# preview its first five rows
df = pd.read_csv(filepath_or_buffer='data/feature-engineering.csv')
df.head(n=5)

  Gender       City Attended     Value
0   girl   New York      Yes  0.991472
1    boy     London       No  0.980504
2    boy  Amsterdam      Yes  0.969145
3    boy  Amsterdam       No  0.968502
4    man     London      Yes  0.938684

## Binary One-Hot Encoding using DictVectorizer and Pandas
A few simple steps let us extract the categorical data from the dataframe and replace it with binary one-hot encoded columns. For this we use DictVectorizer from scikit-learn's [Feature Extraction](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html) library.

In [3]:
# Create a list with one dictionary per row, each holding that row's
# categorical values keyed by column name
cat_columns = ['Gender', 'City', 'Attended']
# `orient` is the supported keyword; the older `outtype` spelling was
# deprecated and later removed from pandas, where it raises a TypeError
cat_dict = df[cat_columns].to_dict(orient='records')
cat_dict[:5]

[{'Gender': 'girl', 'City': 'New York', 'Attended': 'Yes'},
 {'Gender': 'boy', 'City': 'London', 'Attended': 'No'},
 {'Gender': 'boy', 'City': 'Amsterdam', 'Attended': 'Yes'},
 {'Gender': 'boy', 'City': 'Amsterdam', 'Attended': 'No'},
 {'Gender': 'man', 'City': 'London', 'Attended': 'Yes'}]

In [4]:
# Build a DictVectorizer and let it learn the feature-name -> column
# mapping from the per-row dictionaries
vec = feature_extraction.DictVectorizer()
vec.fit(cat_dict)

# Encode every row as a binary one-hot vector and densify the sparse
# result into a regular numpy array
cat_vector = vec.transform(cat_dict).toarray()
cat_vector[:5]

array([[ 0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.]])

In [5]:
# Construct a separate dataframe with the one-hot encoded data
df_vector = pd.DataFrame(cat_vector)

# Look up the generated column names. scikit-learn 1.0 deprecated
# get_feature_names() (removed in 1.2) in favour of
# get_feature_names_out(), so prefer the new method and fall back to
# the old one on earlier releases. tolist() turns the returned array
# into a plain list of str, matching what get_feature_names() gave.
if hasattr(vec, 'get_feature_names_out'):
    vector_columns = vec.get_feature_names_out().tolist()
else:
    vector_columns = vec.get_feature_names()
vector_columns

['Attended=No',
 'Attended=Yes',
 'City=Amsterdam',
 'City=London',
 'City=New York',
 'Gender=boy',
 'Gender=girl',
 'Gender=man',
 'Gender=woman']

In [6]:
# Give the one-hot dataframe the same row index as the original frame
# and label its columns with the feature names
df_vector.index = df.index
df_vector.columns = vector_columns

# Replace the categorical columns with their one-hot encoded
# counterparts: drop the originals, then join the encoded frame
df = df.drop(cat_columns, axis=1).join(df_vector)
df.head()

      Value  Attended=No  Attended=Yes  City=Amsterdam  City=London  \
0  0.991472            0             1               0            0   
1  0.980504            1             0               0            1   
2  0.969145            0             1               1            0   
3  0.968502            1             0               1            0   
4  0.938684            0             1               0            1   

   City=New York  Gender=boy  Gender=girl  Gender=man  Gender=woman  
0              1           0            1           0             0  
1              0           1            0           0             0  
2              0           1            0           0             0  
3              0           1            0           0             0  
4              0           0            0           1             0  

In [7]:
# Summary statistics for every column of the encoded frame; for a 0/1
# one-hot column the mean equals the share of rows in that category
df.describe()

            Value  Attended=No  Attended=Yes  City=Amsterdam  City=London  \
count  100.000000       100.00        100.00      100.000000   100.000000   
mean     0.485863         0.55          0.45        0.400000     0.320000   
std      0.307723         0.50          0.50        0.492366     0.468826   
min      0.016199         0.00          0.00        0.000000     0.000000   
25%      0.213648         0.00          0.00        0.000000     0.000000   
50%      0.454238         1.00          0.00        0.000000     0.000000   
75%      0.752019         1.00          1.00        1.000000     1.000000   
max      0.991472         1.00          1.00        1.000000     1.000000   

       City=New York  Gender=boy  Gender=girl  Gender=man  Gender=woman  
count     100.000000  100.000000   100.000000  100.000000    100.000000  
mean        0.280000    0.250000     0.250000    0.250000      0.250000  
std         0.451261    0.435194     0.435194    0.435194      0.435194  
min       