# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></div><div class="lev1 toc-item"><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></div><div class="lev1 toc-item"><a href="#Components" data-toc-modified-id="Components-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Components</a></div><div class="lev1 toc-item"><a href="#Behavior" data-toc-modified-id="Behavior-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Behavior</a></div><div class="lev1 toc-item"><a href="#Usage" data-toc-modified-id="Usage-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Usage</a></div><div class="lev2 toc-item"><a href="#Categorical-feature" data-toc-modified-id="Categorical-feature-51"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Categorical feature</a></div><div class="lev2 toc-item"><a href="#Multicategorical-feature" data-toc-modified-id="Multicategorical-feature-52"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Multicategorical feature</a></div><div class="lev1 toc-item"><a href="#Persistence" data-toc-modified-id="Persistence-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Persistence</a></div>

# Description

One-hot encoding of categorical and multicategorical features using scikit-learn's `CountVectorizer` with a custom analyzer.

# Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Components

In [2]:
class CategoricalAnalyzer:
    """Analyzer for a single-valued categorical feature.

    Instances are callables meant to be passed as the ``analyzer`` of a
    ``CountVectorizer``: each sample is one category value, which becomes
    the sample's only token.
    """

    def __call__(self, value):
        """Return ``value`` wrapped in a one-element list.

        Raises:
            AssertionError: if ``value`` is a ``list`` (a list signals a
                multicategorical sample, which this analyzer does not accept).
        """
        got_list = isinstance(value, list)
        assert not got_list, "don't give me a list!"
        return [value]
    
class MulticategoricalAnalyzer:
    """Analyzer for a multi-valued categorical feature.

    Instances are callables meant to be passed as the ``analyzer`` of a
    ``CountVectorizer``: each sample is a list of category values, and
    every element of that list is used as a token.
    """

    def __call__(self, value):
        """Return the list ``value`` unchanged (the same object, not a copy).

        Raises:
            AssertionError: if ``value`` is not a ``list``.
        """
        got_list = isinstance(value, list)
        assert got_list, 'hey! give me a list!'
        return value

# Behavior

In [3]:
# An analyzer instance is a plain callable: one value in, a one-token list out.
CategoricalAnalyzer()('Bob')

['Bob']

In [4]:
# A multicategorical sample is already a list of tokens, so it is passed through as-is.
MulticategoricalAnalyzer()(['Minsk', 'London'])

['Minsk', 'London']

# Usage

## Categorical feature

In [5]:
# min_df=2 drops categories seen in fewer than two training samples ('Max'),
# and binary=True emits 0/1 indicators. Dropped or unseen categories
# ('Max', 'Rambo') therefore encode as all-zero rows.
(
    CountVectorizer(analyzer=CategoricalAnalyzer(), binary=True, min_df=2)
    .fit(['Peter', 'Peter', 'Bob', 'Bob', 'Bob', 'John', 'John', 'Max'])
    .transform(['Peter', 'Bob', 'Max', 'Rambo'])
    .todense()
)

matrix([[0, 0, 1],
        [1, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], dtype=int64)

## Multicategorical feature

In [6]:
# Each sample is a list of categories. With min_df=2 only categories present
# in at least two training samples are kept, and binary=True collapses
# repeated values within one sample (e.g. ['Minsk', 'Minsk']) to a single 1.
(
    CountVectorizer(analyzer=MulticategoricalAnalyzer(), binary=True, min_df=2)
    .fit([
        ['Minsk', 'Paris'],
        ['Minsk'],
        ['Minsk'],
        ['Minsk', 'Moscow', 'Houston'],
        ['Paris'],
        ['London'],
        ['Sydney'],
        ['Sydney'],
    ])
    .transform([
        ['Minsk'],
        ['Minsk', 'Paris'],
        ['Minsk', 'Minsk'],
        ['Sydney'],
        ['London', 'London', 'Houston'],
        ['New York'],
    ])
    .todense()
)

matrix([[1, 0, 0],
        [1, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0]], dtype=int64)

# Persistence

In [7]:
# Fit a vectorizer, round-trip it through pickle, and check that the learned
# vocabulary survives serialization.
# Note: pickle stores the analyzer by reference to its class, so the class
# must be importable wherever the pickled vectorizer is loaded.
v = CountVectorizer(
    analyzer=MulticategoricalAnalyzer(),
    binary=True,
    min_df=2,
)
v.fit([
    ['Minsk', 'Paris'],
    ['Minsk'],
    ['Minsk'],
    ['Minsk', 'Moscow'],
    ['Sydney'],
    ['Sydney'],
    ['Sydney'],
    ['Sydney'],
    ['Paris'],
    ['London'],
])

vp = pickle.loads(pickle.dumps(v))
assert v.vocabulary_ == vp.vocabulary_
vp.vocabulary_

{'Minsk': 0, 'Paris': 1, 'Sydney': 2}