# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></div><div class="lev1 toc-item"><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></div><div class="lev1 toc-item"><a href="#Custom-components" data-toc-modified-id="Custom-components-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Custom components</a></div><div class="lev2 toc-item"><a href="#Definition" data-toc-modified-id="Definition-31"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Definition</a></div><div class="lev2 toc-item"><a href="#Demo" data-toc-modified-id="Demo-32"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Demo</a></div><div class="lev1 toc-item"><a href="#Usage" data-toc-modified-id="Usage-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Usage</a></div><div class="lev2 toc-item"><a href="#Categorical-feature" data-toc-modified-id="Categorical-feature-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Categorical feature</a></div><div class="lev2 toc-item"><a href="#Multicategorical-feature" data-toc-modified-id="Multicategorical-feature-42"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Multicategorical feature</a></div><div class="lev1 toc-item"><a href="#Persistence" data-toc-modified-id="Persistence-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Persistence</a></div>

# Description

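One-hot encoding of categorical and multicategorical features using scikit-learn's `CountVectorizer`. A custom analyzer turns each raw value into its list of category tokens, and `binary=True` makes every output column a 0/1 indicator.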

# Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Custom components

## Definition

In [2]:
class CategoricalAnalyzer:
    """Analyzer for a single-valued categorical feature.

    Meant to be passed as the ``analyzer`` callable of ``CountVectorizer``:
    each raw sample is treated as exactly one token.
    """

    def __call__(self, value):
        """Wrap ``value`` in a one-element token list.

        Args:
            value: The sample's category; must not be a ``list``.

        Returns:
            ``[value]``.

        Raises:
            AssertionError: If ``value`` is a ``list``.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O / PYTHONOPTIMIZE. The exception
        # type and message are kept for backward compatibility.
        if isinstance(value, list):
            raise AssertionError("don't give me a list!")
        return [value]

In [3]:
class MulticategoricalAnalyzer:
    """Analyzer for a multi-valued categorical feature.

    Meant to be passed as the ``analyzer`` callable of ``CountVectorizer``:
    each raw sample is a list whose items are the sample's tokens.
    """

    def __call__(self, value):
        """Return ``value`` unchanged after checking that it is a list.

        Args:
            value: The sample's categories as a ``list``.

        Returns:
            The same list object that was passed in.

        Raises:
            AssertionError: If ``value`` is not a ``list``.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O / PYTHONOPTIMIZE. Without it a
        # bare string would be returned as-is and iterated character by
        # character by the caller. Exception type and message are kept for
        # backward compatibility.
        if not isinstance(value, list):
            raise AssertionError('hey! give me a list!')
        return value

## Demo

In [4]:
# A single categorical value is wrapped into a one-element token list.
CategoricalAnalyzer()('Bob')

['Bob']

In [5]:
# A list of categories is returned unchanged and used as the token list.
MulticategoricalAnalyzer()(['Minsk', 'London'])

['Minsk', 'London']

# Usage

## Categorical feature

In [6]:
# Fit on eight names, then one-hot encode four of them.
# min_df=2 keeps only values seen in at least two training samples, so the
# learned columns are Bob, John, Peter. 'Max' (seen once) and 'Rambo'
# (never seen) therefore encode to all-zero rows.
CountVectorizer(
    analyzer=CategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit(
    ['Peter', 'Peter', 'Bob', 'Bob', 'Bob', 'John', 'John', 'Max']
).transform(
    ['Peter', 'Bob', 'Max', 'Rambo']
).todense()

matrix([[0, 0, 1],
        [1, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], dtype=int64)

## Multicategorical feature

In [7]:
# Each sample is a list of cities. min_df=2 keeps only cities that occur in
# at least two training samples (Minsk, Paris, Sydney); Moscow, Houston and
# London are dropped. binary=True means a repeated city within one sample
# still yields 1, not a count.
CountVectorizer(
    analyzer=MulticategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit(
    [
        ['Minsk', 'Paris'], ['Minsk'], ['Minsk'],
        ['Minsk', 'Moscow', 'Houston'],
        ['Paris'], ['London'], ['Sydney'], ['Sydney'],
    ]
).transform(
    [
        ['Minsk'], ['Minsk', 'Paris'], ['Minsk', 'Minsk'],
        ['Sydney'],
        ['London', 'London', 'Houston'],
        ['New York'],
    ]
).todense()

matrix([[1, 0, 0],
        [1, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0]], dtype=int64)

# Persistence

In [8]:
# Fit a vectorizer, round-trip it through pickle, and confirm that the
# learned vocabulary survives serialization.
v = CountVectorizer(
    analyzer=MulticategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit(
    [
        ['Minsk', 'Paris'], ['Minsk'], ['Minsk'], ['Minsk', 'Moscow'],
        ['Sydney'], ['Sydney'], ['Sydney'], ['Sydney'],
        ['Paris'], ['London'],
    ]
)
vp = pickle.loads(pickle.dumps(v))
# The restored copy must map every category to the same column index.
assert vp.vocabulary_ == v.vocabulary_
vp.vocabulary_

{'Minsk': 0, 'Paris': 1, 'Sydney': 2}