# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Components" data-toc-modified-id="Components-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Components</a></div><div class="lev1 toc-item"><a href="#Behavior" data-toc-modified-id="Behavior-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Behavior</a></div><div class="lev1 toc-item"><a href="#Usage" data-toc-modified-id="Usage-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Usage</a></div><div class="lev2 toc-item"><a href="#Categorical" data-toc-modified-id="Categorical-31"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Categorical</a></div><div class="lev2 toc-item"><a href="#Multicategorical" data-toc-modified-id="Multicategorical-32"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Multicategorical</a></div><div class="lev1 toc-item"><a href="#Persistence-test" data-toc-modified-id="Persistence-test-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Persistence test</a></div>

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Components

In [9]:
class CategoricalAnalyzer:
    """Analyzer for single-valued categorical features.

    Intended to be passed as ``analyzer=`` to ``CountVectorizer``: each raw
    "document" is one category value, and that value becomes its only token.
    """

    def __call__(self, value):
        """Wrap a single category value in a one-element token list.

        Args:
            value: The category value; anything except a ``list``.

        Returns:
            ``[value]``.

        Raises:
            AssertionError: If ``value`` is a ``list``.
        """
        # Raise explicitly instead of using `assert`: assert statements are
        # stripped under `python -O`, which would silently disable this check.
        # AssertionError is kept so the exception type callers see is unchanged.
        if isinstance(value, list):
            raise AssertionError('don\'t give me a list!')
        return [value]
    
class MulticategoricalAnalyzer:
    """Analyzer for multi-valued categorical features.

    Intended to be passed as ``analyzer=`` to ``CountVectorizer``: each raw
    "document" is already a list of category values, used as-is as the tokens.
    """

    def __call__(self, value):
        """Return the given list of category values unchanged.

        Args:
            value: A ``list`` of category values.

        Returns:
            The same ``list`` object that was passed in.

        Raises:
            AssertionError: If ``value`` is not a ``list``.
        """
        # Raise explicitly instead of using `assert`: assert statements are
        # stripped under `python -O`, which would silently disable this check.
        # AssertionError is kept so the exception type callers see is unchanged.
        if not isinstance(value, list):
            raise AssertionError('hey! give me a list!')
        return value

# Behavior

In [10]:
# Invoke the analyzer as a plain callable: a single value comes back as a one-token list.
CategoricalAnalyzer()('Bob')

['Bob']

In [11]:
# Invoke the analyzer as a plain callable: a list of values is passed through as the tokens.
MulticategoricalAnalyzer()(['Minsk', 'London'])

['Minsk', 'London']

# Usage

## Categorical

In [12]:
# 'Max' occurs only once in the fit data, so min_df=2 drops it from the vocabulary;
# 'Rambo' is never seen during fit. Both therefore encode as all-zero rows.
CountVectorizer(
    analyzer=CategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit(
    ['Peter', 'Peter', 'Bob', 'Bob', 'Bob', 'Max']
).transform(
    ['Peter', 'Bob', 'Max', 'Rambo']
).todense()

matrix([[0, 1],
        [1, 0],
        [0, 0],
        [0, 0]], dtype=int64)

## Multicategorical

In [13]:
# 'Moscow' and 'London' each occur in a single fit document, so min_df=2 drops them.
# binary=True caps every entry at 1, so ['Minsk', 'Minsk'] encodes like ['Minsk'].
CountVectorizer(
    analyzer=MulticategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit([
    ['Minsk', 'Paris'], ['Minsk'], ['Minsk', 'Moscow'],
    ['Minsk'], ['Paris'], ['London'],
]).transform([
    ['Minsk'], ['Minsk', 'Paris'], ['Minsk', 'Minsk'], ['London', 'London'],
]).todense()

matrix([[1, 0],
        [1, 1],
        [1, 0],
        [0, 0]], dtype=int64)

# Persistence test

In [14]:
# Fit a vectorizer that uses the custom analyzer on a small multicategorical corpus.
v = CountVectorizer(
    analyzer=MulticategoricalAnalyzer(),
    binary=True,
    min_df=2,
).fit([
    ['Minsk', 'Paris'], ['Minsk'], ['Minsk', 'Moscow'],
    ['Minsk'], ['Paris'], ['London'],
])

# Round-trip through pickle and check that the learned vocabulary survives intact.
vp = pickle.loads(pickle.dumps(v))
assert vp.vocabulary_ == v.vocabulary_
vp.vocabulary_

{'Minsk': 0, 'Paris': 1}