## Transforming Categorical Data

### Encoding Nominal Features, i.e. features with no intrinsic ordering

#### Single Vector: One-Hot Encoding a Feature

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# Nominal feature: one US state per observation, stored as a single column
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

# One-hot encoder for the labels
one_hot = LabelBinarizer()

# Learn the classes and one-hot encode the feature in a single step
one_hot.fit_transform(feature)

# Classes learned during fitting
one_hot.classes_

# Decode the one-hot matrix back into the original labels
one_hot.inverse_transform(one_hot.transform(feature))

# pandas alternative: one indicator column per distinct value
pd.get_dummies(feature[:, 0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


#### Encoding Multiple Classes

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each observation carries several labels at once
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delware", "Florida"),
    ("Texas", "Alabama"),
]

# Binarizer for observations that hold more than one label
one_hot_multiclass = MultiLabelBinarizer()

# Fit on the observations and one-hot encode them
one_hot_multiclass.fit_transform(multiclass_feature)

# Classes learned during fitting
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

### Encoding Ordinal Categorical Features: classes have order

#### Single target vector

In [4]:
# Ordinal feature whose categories have a natural order: Low < Medium < High
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Map each category to an integer that preserves that order
scale_mapper = {'Low': 1,
                'Medium': 2,
                'High': 3}

# Series.map is used instead of Series.replace: replacing strings with integers
# relies on replace's implicit object -> int64 downcast, which recent pandas
# releases deprecate. Note that map turns any category missing from
# scale_mapper into NaN rather than leaving it untouched.
score_encoded = dataframe["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

#### Multiple Features: Encoding Dictionaries of Features

In [5]:
from sklearn.feature_extraction import DictVectorizer

# One dictionary per observation, mapping feature name -> value
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

# sparse=False makes the vectorizer return a dense NumPy array instead of a
# SciPy sparse matrix; a key absent from an observation is encoded as 0
dict_vectorizer = DictVectorizer(sparse=False)

# Convert the list of dictionaries to a feature matrix
features = dict_vectorizer.fit_transform(data_dict)

# Column names of the feature matrix. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = dict_vectorizer.get_feature_names_out()

# Put the feature matrix into a labelled DataFrame and display it
df = pd.DataFrame(features, columns=feature_names)
df