# Bag of words in sklearn
Reference:
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
pd.options.display.max_columns = 30
%matplotlib inline

In [2]:
# Toy corpus: seven short documents with overlapping vocabulary, used by every
# vectorizer example in this notebook.
texts = [
    "Penny bought bright blue fishes.",
    "Penny bought bright blue and orange fish.",
    "The cat ate a fish at the store.",
    "Penny went to the store. Penny ate a bug. Penny saw a fish.",
    "It meowed once at the bug, it is still meowing at the bug and the fish",
    "The cat is at the fish store. The cat is orange. The cat is meowing at the fish.",
    "Penny is a fish",
]

In [3]:
# The simplest CountVectorizer: default tokenization, no stop-word removal.
count_vectorizer = CountVectorizer()
x = count_vectorizer.fit_transform(texts)
print(x.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same vocabulary.
pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names_out())

[[0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 2 0 0]
 [0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 3 1 0 1 1 1 1]
 [1 2 0 0 0 0 2 0 1 0 1 2 1 1 1 0 0 0 1 0 3 0 0]
 [0 2 0 0 0 0 0 3 2 0 3 0 0 1 0 1 0 0 0 1 5 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0]]


Unnamed: 0,and,at,ate,blue,bought,bright,bug,cat,fish,fishes,is,it,meowed,meowing,once,orange,penny,saw,still,store,the,to,went
0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0
2,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,0,0
3,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,3,1,0,1,1,1,1
4,1,2,0,0,0,0,2,0,1,0,1,2,1,1,1,0,0,0,1,0,3,0,0
5,0,2,0,0,0,0,0,3,2,0,3,0,0,1,0,1,0,0,0,1,5,0,0
6,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [4]:
# CountVectorizer with English stop words removed: the number of features
# (columns) drops considerably compared with the default vectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
x = count_vectorizer.fit_transform(texts)
print(x.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same vocabulary.
pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names_out())

[[0 1 1 1 0 0 0 1 0 0 0 1 0 0 0]
 [0 1 1 1 0 0 1 0 0 0 1 1 0 0 0]
 [1 0 0 0 0 1 1 0 0 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0 0 0 3 1 1 1]
 [0 0 0 0 2 0 1 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 3 2 0 0 1 1 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 0 1 0 0 0]]


Unnamed: 0,ate,blue,bought,bright,bug,cat,fish,fishes,meowed,meowing,orange,penny,saw,store,went
0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0
1,0,1,1,1,0,0,1,0,0,0,1,1,0,0,0
2,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0
3,1,0,0,0,1,0,1,0,0,0,0,3,1,1,1
4,0,0,0,0,2,0,1,0,1,1,0,0,0,0,0
5,0,0,0,0,0,3,2,0,0,1,1,0,0,1,0
6,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [5]:
# CountVectorizer with custom tokenizer; boring tokenizer, nothing happened.
import re

def boring_tokenizer(str_input):
    """Split text into lowercase tokens made of ASCII letters, digits and hyphens.

    Every character other than A-Z, a-z, 0-9 or '-' is replaced by a space,
    the result is lowercased, and the string is split on whitespace.

    Args:
        str_input: The raw document text.

    Returns:
        A list of lowercase token strings (empty if nothing survives).
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    lowered = cleaned.lower()
    return lowered.split()

# Plug the custom tokenizer into CountVectorizer; stop-word filtering is
# applied to the tokens it returns.
count_vectorizer = CountVectorizer(stop_words='english', tokenizer=boring_tokenizer)
X = count_vectorizer.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns a NumPy array, so convert it to a plain
# list to keep the printed output in the familiar ['a', 'b', ...] form.
print(count_vectorizer.get_feature_names_out().tolist())

['ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'fishes', 'meowed', 'meowing', 'orange', 'penny', 'saw', 'store', 'went']


In [6]:
# Tryout stemmer
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Print the Porter stem of a few sample words, one per line, in this order.
for sample_word in ("fishes", "meowed", "oranges", "meowing", "orange", "go", "went"):
    print(porter_stemmer.stem(sample_word))

fish
meow
orang
meow
orang
go
went


In [7]:
# Stop_word + stemming tokenizer
# Stemmer instance used by stemming_tokenizer below.
porter_stemmer = PorterStemmer()


def stemming_tokenizer(str_input):
    """Tokenize text and reduce every token to its Porter stem.

    Characters other than A-Z, a-z, 0-9 or '-' are replaced by spaces, the
    text is lowercased and split on whitespace, and each resulting token is
    passed through the module-level ``porter_stemmer``.

    Args:
        str_input: The raw document text.

    Returns:
        A list of stemmed, lowercase token strings.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    return [porter_stemmer.stem(token) for token in cleaned.lower().split()]

# NOTE: stop words are matched against the tokens the tokenizer returns, i.e.
# after stemming. A stop word whose stem is not itself in the 'english' list
# (for example "once" -> "onc") therefore survives as a feature, and
# scikit-learn emits a warning that the stop words may be inconsistent with
# the preprocessing.
count_vectorizer = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer)
X = count_vectorizer.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns a NumPy array, so convert it to a plain
# list to keep the printed output in the familiar ['a', 'b', ...] form.
print(count_vectorizer.get_feature_names_out().tolist())

['ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'meow', 'onc', 'orang', 'penni', 'saw', 'store', 'went']


  'stop_words.' % sorted(inconsistent))


### Term frequency = how often a term occurs in a document, here expressed as the share of that document's terms (term count / total terms in the document)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Same usage as CountVectorizer, with one special setting: use_idf=False, so
# only term frequencies are computed. norm='l1' scales each row to sum to 1,
# which makes it easy to read every value as the term's share of its document.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same vocabulary.
df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df

Unnamed: 0,ate,blue,bought,bright,bug,cat,fish,meow,onc,orang,penni,saw,store,went
0,0.0,0.2,0.2,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0
1,0.0,0.166667,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0
2,0.25,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0
3,0.111111,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.333333,0.111111,0.111111,0.111111
4,0.0,0.0,0.0,0.0,0.333333,0.0,0.166667,0.333333,0.166667,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.375,0.25,0.125,0.0,0.125,0.0,0.0,0.125,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0


### Inverse document frequency = how much information the term provides - the more documents a term shows up in, the less important it is.

In [10]:
# use_idf=True re-weights each term frequency by the term's inverse document
# frequency, so terms that occur in many documents (e.g. "fish") are
# down-weighted. norm='l1' still scales each row to sum to 1.
idf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=True, norm='l1')
X = idf_vectorizer.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same vocabulary.
idf_df = pd.DataFrame(X.toarray(), columns=idf_vectorizer.get_feature_names_out())
idf_df

Unnamed: 0,ate,blue,bought,bright,bug,cat,fish,meow,onc,orang,penni,saw,store,went
0,0.0,0.235463,0.235463,0.235463,0.0,0.0,0.118871,0.0,0.0,0.0,0.174741,0.0,0.0,0.0
1,0.0,0.190587,0.190587,0.190587,0.0,0.0,0.096216,0.0,0.0,0.190587,0.141437,0.0,0.0,0.0
2,0.297654,0.0,0.0,0.0,0.0,0.297654,0.150267,0.0,0.0,0.0,0.0,0.0,0.254425,0.0
3,0.125073,0.0,0.0,0.0,0.125073,0.0,0.063142,0.0,0.0,0.0,0.278455,0.150675,0.106908,0.150675
4,0.0,0.0,0.0,0.0,0.350291,0.0,0.08842,0.350291,0.210997,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.437035,0.147088,0.145678,0.0,0.145678,0.0,0.0,0.124521,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.404858,0.0,0.0,0.0,0.595142,0.0,0.0,0.0


# Smoothing parameter in naive Bayes

In statistics, additive smoothing, also called Laplace smoothing (not to be confused with Laplacian smoothing), or Lidstone smoothing, is a technique used to smooth categorical data.
 
Given an observation $x = (x_1, \dots, x_d)$ from a multinomial distribution with $N$ trials and parameter vector $\theta = (\theta_1, \dots, \theta_d)$, a "smoothed" version of the data gives the estimator:

$\hat\theta_{i} = \frac{x_{i} + \alpha}{N + \alpha d}$

where the pseudocount $\alpha > 0$ is the smoothing parameter ($\alpha = 0$ corresponds to no smoothing). Additive smoothing is a type of shrinkage estimator, as the resulting estimate will be between the empirical estimate $x_i / N$ and the uniform probability $1/d$. Using Laplace's rule of succession, some authors have argued that $\alpha$ should be 1 (in which case the term add-one smoothing is also used), though in practice a smaller value is typically chosen.

Reference: https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf

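In scikit-learn, the smoothing parameter `alpha` of `MultinomialNB` can be tuned with a grid search:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

tuned_parameters = [{'alpha': [1, 5, 10, 20]}]
nb = GridSearchCV(MultinomialNB(), tuned_parameters, refit=True)
```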