# 3 basic approaches in Bag of Words
https://towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016

In [5]:
import collections
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [30]:
from sklearn.datasets import fetch_20newsgroups
y_train = fetch_20newsgroups(subset='train')

Counting word occurrence. The reason behind of using this approach is that keyword or important signal will occur again and again. So if the number of occurrence represent the importance of word. More frequency means more importance.

In [31]:
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

In [32]:
count_vec = CountVectorizer()
count_occurs = count_vec.fit_transform([doc])

In [37]:
count_vec.build_tokenizer()

<function sklearn.feature_extraction.text.VectorizerMixin.build_tokenizer.<locals>.<lambda>(doc)>

In [10]:
count_occur_df = pd.DataFrame((count, word) for word, count in \
                              zip(count_occurs.toarray().tolist()[0], count_vec.get_feature_names()))

In [14]:
count_occur_df.columns = ['Word', 'Count']
count_occur_df.sort_values('Count', ascending=False, inplace=True)

In [16]:
count_occur_df.head()

Unnamed: 0,Word,Count
16,of,3
26,the,3
3,bow,2
0,and,1
28,way,1


## Normalized Count Occurrence
High frequency may dominate the result and causing model bias. Normalization can be apply to pipeline easily

## Hacking Count Vectorizer

In [9]:
def wm2df(wm, feat_names):
    
    # create an index for each row
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names, columns=feat_names)
    return(df)

In [10]:
corpora = ['The quick brown fox.','Jumps over the lazy dog!']

In [11]:
cvec = CountVectorizer(lowercase=False)

In [12]:
wm = cvec.fit_transform(corpora)

In [13]:
tokens = cvec.get_feature_names()

In [14]:
wm2df(wm, tokens)

Unnamed: 0,Jumps,The,brown,dog,fox,lazy,over,quick,the
Doc0,0,1,1,0,1,0,0,1,0
Doc1,1,0,0,1,0,1,1,0,1


## We can create a custom vectorize with our own analyzer, preprocessor and tokenizer

In [15]:
corpora = [
    'The quick brown fox&#x0002E;',
    'jumped over the lazy dog&#x00021;'
]

cvec = CountVectorizer()

wm = cvec.fit_transform(corpora)

tokens = cvec.get_feature_names()

pd.DataFrame(data=wm.toarray(), index=['Doc1', 'Doc2'], columns=tokens)

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the,x00021,x0002e
Doc1,1,0,1,0,0,0,1,1,0,1
Doc2,0,1,0,1,1,1,0,1,1,0


### Cleaning

removing HTML entities as a preprocessing step and lemmatizes the words as the document is tokenized:

In [16]:

import spacy
from html import unescape

# create a spaCy tokenizer
spacy.load('en')
lemmatizer = spacy.lang.en.English()

# remove html entities from docs and
# set everything to lowercase
def my_preprocessor(doc):
    return(unescape(doc).lower())

# tokenize the doc and lemmatize its tokens
def my_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens])

custom_vec = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer)

cwm = custom_vec.fit_transform(corpora)

tokens = custom_vec.get_feature_names()

wm2df(cwm, tokens)

Unnamed: 0,!,.,brown,dog,fox,jump,lazy,over,quick,the
Doc0,0,1,1,0,1,0,0,0,1,1
Doc1,1,0,0,1,0,1,1,1,0,1


The default analyzers all call the preprocessor and tokenizer, but custom analyzers will skip this. N-gram extraction and stop word filtering take place at the analyzer level, so a custom analyzer may have to reproduce these steps.

In [17]:
# instantiate a vectorizer with custom preprocessor and tokenizer,
# set to remove stop words and extract bigrams
custom_vec = CountVectorizer(preprocessor=my_preprocessor,
                             tokenizer=my_tokenizer,
                             ngram_range=(1,2),
                             stop_words='english')
cwm = custom_vec.fit_transform(corpora)
tokens = custom_vec.get_feature_names()
wm2df(cwm, tokens)

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,!,.,brown,brown fox,dog,dog !,fox,fox .,jump,jump lazy,lazy,lazy dog,quick,quick brown
Doc0,0,1,1,1,0,0,1,1,0,0,0,0,1,1
Doc1,1,0,0,0,1,1,0,0,1,1,1,1,0,0


when a user defined analyzer is used, the build_analyzer method does not call _word_ngrams, which is responsible for removing stop words and extracting n-grams. One way to circumvent this is by creating custom vectorizer classes. The concept is pretty simple, just create a new class inheriting from the base vectorizer and overwrite the build_preprocessor, build_tokenizer and/or build_analyzer methods as desired.

In [42]:


# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer): 
    
    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        
        # load stop words using CountVectorizer's built in method
        stop_words = self.get_stop_words()
        
        # create the analyzer that will be returned by this method
        def analyser(doc):
            
            # load spaCy's model for english language
            spacy.load('en')
            
            # instantiate a spaCy tokenizer
            lemmatizer = spacy.lang.en.English()
            
            # apply the preprocessing and tokenzation steps
            doc_clean = unescape(doc).lower()
            tokens = lemmatizer(doc_clean)
            lemmatized_tokens = [token.lemma_ for token in tokens]
            
            # use CountVectorizer's _word_ngrams built in method
            # to remove stop words and extract n-grams
            return(self._word_ngrams(lemmatized_tokens, stop_words))
        return(analyser)

In [44]:
%debug

custom_vec = CustomVectorizer(ngram_range=(1,2),
                              stop_words='english')
cv_fit = custom_vec.fit_transform(corpora)

wm2df(cv_fit, custom_vec.get_feature_names())

> [0;32m<ipython-input-38-7168b82ecbe2>[0m(1)[0;36m<module>[0;34m()[0m
[0;32m----> 1 [0;31m[0mcnt[0m [0;34m=[0m [0mbow[0m[0;34m([0m[0mcv_fit[0m[0;34m,[0m [0mcv[0m[0;34m)[0m[0;34m[[0m[0;34m'dog'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


Unnamed: 0,!,.,brown,brown fox,dog,dog !,fox,fox .,jump,jump lazy,lazy,lazy dog,quick,quick brown
Doc0,0,1,1,1,0,0,1,1,0,0,0,0,1,1
Doc1,1,0,0,0,1,1,0,0,1,1,1,1,0,0


In [31]:
[ print(s) for s in corpora]

The quick brown fox&#x0002E;
jumped over the lazy dog&#x00021;


[None, None]

In [34]:
print(cwm.toarray())

[[0 1 1 1 0 0 1 1 0 0 0 0 1 1]
 [1 0 0 0 1 1 0 0 1 1 1 1 0 0]]


### Do the counting

In [36]:
# faster to perform computation on the sparse array
np.asarray(cv_fit.sum(axis=0))

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [37]:
def bow(cv_fit, cv):
    
    word_list = cv.get_feature_names(); 
    count_list = cv_fit.toarray().sum(axis=0) 
    
    return dict(zip(word_list, count_list))

In [40]:
cnt = bow(cv_fit, custom_vec)['dog']
cnt

1