<a href="https://colab.research.google.com/github/yashika51/Understanding-Count-Vectorizer/blob/master/CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#importing necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

### Small Example

In [0]:
# checking with smaller list of senetences
text=["Some Mobile networks send 'stay at home' text","Some customers have already received the message, while others are set to get it later in the day",]

In [3]:
print(text)

["Some Mobile networks send 'stay at home' text", 'Some customers have already received the message, while others are set to get it later in the day']


In [0]:
#initializing count vectorizer, lowercase=True turns text to lowercase if not already, this argument is optional and by default it is true
cv=CountVectorizer(text,lowercase=True)

In [0]:
#fit transform on our text
c=cv.fit_transform(text)


In [6]:
#Checking the vocabulary, note these are not number of occurences but position in the sparse vector
cv.vocabulary_


{'already': 0,
 'are': 1,
 'at': 2,
 'customers': 3,
 'day': 4,
 'get': 5,
 'have': 6,
 'home': 7,
 'in': 8,
 'it': 9,
 'later': 10,
 'message': 11,
 'mobile': 12,
 'networks': 13,
 'others': 14,
 'received': 15,
 'send': 16,
 'set': 17,
 'some': 18,
 'stay': 19,
 'text': 20,
 'the': 21,
 'to': 22,
 'while': 23}

### Now playing with more set of sentences


In [0]:
document=["devastating social and economic consequences of COVID-19",
          "investment and initiatives already ongoing around the world to expedite deployment of innovative COVID-19",
          "We commit to the shared aim of equitable global access to innovative tools for COVID-19 for all",
          "We ask the global community and political leaders to support this landmark collaboration, and for donors",
           "In the fight against COVID-19, no one should be left behind"
]

In [0]:
#Initializing
cv_doc=CountVectorizer(document)

#Fitting on our document
cv_vector=cv_doc.fit_transform(document)


In [9]:
#checking the vocabulary
cv_doc.vocabulary_

{'19': 0,
 'access': 1,
 'against': 2,
 'aim': 3,
 'all': 4,
 'already': 5,
 'and': 6,
 'around': 7,
 'ask': 8,
 'be': 9,
 'behind': 10,
 'collaboration': 11,
 'commit': 12,
 'community': 13,
 'consequences': 14,
 'covid': 15,
 'deployment': 16,
 'devastating': 17,
 'donors': 18,
 'economic': 19,
 'equitable': 20,
 'expedite': 21,
 'fight': 22,
 'for': 23,
 'global': 24,
 'in': 25,
 'initiatives': 26,
 'innovative': 27,
 'investment': 28,
 'landmark': 29,
 'leaders': 30,
 'left': 31,
 'no': 32,
 'of': 33,
 'one': 34,
 'ongoing': 35,
 'political': 36,
 'shared': 37,
 'should': 38,
 'social': 39,
 'support': 40,
 'the': 41,
 'this': 42,
 'to': 43,
 'tools': 44,
 'we': 45,
 'world': 46}

In [10]:
# let's check the length of our vocabulary
len(cv_doc.get_feature_names()) 

47

In [11]:
#In case you are wondering what get_feature_names would return :)
cv_doc.get_feature_names()

['19',
 'access',
 'against',
 'aim',
 'all',
 'already',
 'and',
 'around',
 'ask',
 'be',
 'behind',
 'collaboration',
 'commit',
 'community',
 'consequences',
 'covid',
 'deployment',
 'devastating',
 'donors',
 'economic',
 'equitable',
 'expedite',
 'fight',
 'for',
 'global',
 'in',
 'initiatives',
 'innovative',
 'investment',
 'landmark',
 'leaders',
 'left',
 'no',
 'of',
 'one',
 'ongoing',
 'political',
 'shared',
 'should',
 'social',
 'support',
 'the',
 'this',
 'to',
 'tools',
 'we',
 'world']

In [12]:
#checking the shape of the count vector, (5,47) means 5 rows(sentences) and 47 columns(unique words)
cv_vector.shape

(5, 47)

In [13]:
#check if any stop_words present, empty set tells no stop words are present
cv_doc.stop_words_

set()

# Let's see some more examples experimenting with multiple parameters

## Count Vectorizer with predefined and default lits of stop_words


In [14]:
cv1=CountVectorizer(document,stop_words='english')
#Lets test cv1 on our doc
cv1_doc=cv1.fit_transform(document)
#after removing stop_words now number of unique words reduced from 47 to 30 and shape returned is (5,30)
cv1_doc.shape

(5, 30)

### Count Vectorizer with custom stop words, you can specify any word here that is unwanted in your corpus

In [15]:
#passing the list of stop_words
cv2=CountVectorizer(document,stop_words=['the','we','should','this','to'])
#checking the stop_words
cv2.stop_words

['the', 'we', 'should', 'this', 'to']

In [16]:
#Lets test cv1 on our doc
cv2_doc=cv1.fit_transform(document)

#after removing stop_words now number of unique words reduced from 47 to 42 and shape returned is (5,42)
cv2_doc.shape

(5, 30)

In [19]:
#Are there any stop_words internally used by Count vectorizer?
cv2.stop_words

['the', 'we', 'should', 'this', 'to']

## What does min_df do?

min_df considers words that are only present in minimum of 2 documents. We can also pass a proportion instead of absolute number.


For example, min_df=0.25 ignores words that are present in less than 25% of the document

In [20]:
#new initialization with min_df=2
cv3=CountVectorizer(document, min_df=2)
cv3_doc=cv3.fit_transform(document)
# you will see a lot of words here
cv3.stop_words_

{'access',
 'against',
 'aim',
 'all',
 'already',
 'around',
 'ask',
 'be',
 'behind',
 'collaboration',
 'commit',
 'community',
 'consequences',
 'deployment',
 'devastating',
 'donors',
 'economic',
 'equitable',
 'expedite',
 'fight',
 'in',
 'initiatives',
 'investment',
 'landmark',
 'leaders',
 'left',
 'no',
 'one',
 'ongoing',
 'political',
 'shared',
 'should',
 'social',
 'support',
 'this',
 'tools',
 'world'}

In [21]:
#We can see a lot of thw words are removed as they are present in less than 2 documents, this is the vocabulary we have now
cv3.vocabulary_

{'19': 0,
 'and': 1,
 'covid': 2,
 'for': 3,
 'global': 4,
 'innovative': 5,
 'of': 6,
 'the': 7,
 'to': 8,
 'we': 9}

In [22]:
#Woah! only 10 out of 47 left
cv3_doc.shape

(5, 10)

## Now it's turn of max_df 

As you might have guessed it's opposite to min_df and considers words based on their presence in maximum n number of documents specified.


Let's test the proportion instead of absolute number here. If words are present in more than 50% of the document they are ignored.

In [0]:
cv4=CountVectorizer(document,max_df=0.50)
cv4_doc=cv4.fit_transform(document)

In [24]:
cv4.vocabulary_

{'access': 0,
 'against': 1,
 'aim': 2,
 'all': 3,
 'already': 4,
 'around': 5,
 'ask': 6,
 'be': 7,
 'behind': 8,
 'collaboration': 9,
 'commit': 10,
 'community': 11,
 'consequences': 12,
 'deployment': 13,
 'devastating': 14,
 'donors': 15,
 'economic': 16,
 'equitable': 17,
 'expedite': 18,
 'fight': 19,
 'for': 20,
 'global': 21,
 'in': 22,
 'initiatives': 23,
 'innovative': 24,
 'investment': 25,
 'landmark': 26,
 'leaders': 27,
 'left': 28,
 'no': 29,
 'one': 30,
 'ongoing': 31,
 'political': 32,
 'shared': 33,
 'should': 34,
 'social': 35,
 'support': 36,
 'this': 37,
 'tools': 38,
 'we': 39,
 'world': 40}

In [25]:
#these are the only words that crossed the limit of presence in 50% of document
cv4.stop_words_

{'19', 'and', 'covid', 'of', 'the', 'to'}

In [26]:
cv4_doc.shape

(5, 41)

# Now the bigger picture, let's do some custom preprocessing

## Starting with Tokenizer

In [27]:
#defining custom tokenizer, we can tokenize the document easily with libraries like nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
def tok(text):
  tokens=word_tokenize(text)
  return tokens
tok(str(document))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['[',
 "'devastating",
 'social',
 'and',
 'economic',
 'consequences',
 'of',
 'COVID-19',
 "'",
 ',',
 "'investment",
 'and',
 'initiatives',
 'already',
 'ongoing',
 'around',
 'the',
 'world',
 'to',
 'expedite',
 'deployment',
 'of',
 'innovative',
 'COVID-19',
 "'",
 ',',
 "'We",
 'commit',
 'to',
 'the',
 'shared',
 'aim',
 'of',
 'equitable',
 'global',
 'access',
 'to',
 'innovative',
 'tools',
 'for',
 'COVID-19',
 'for',
 'all',
 "'",
 ',',
 "'We",
 'ask',
 'the',
 'global',
 'community',
 'and',
 'political',
 'leaders',
 'to',
 'support',
 'this',
 'landmark',
 'collaboration',
 ',',
 'and',
 'for',
 'donors',
 "'",
 ',',
 "'In",
 'the',
 'fight',
 'against',
 'COVID-19',
 ',',
 'no',
 'one',
 'should',
 'be',
 'left',
 'behind',
 "'",
 ']']

In [28]:
cv5=CountVectorizer(document,tokenizer=tok)
cv5.fit_transform(document)
cv5.vocabulary_

{',': 0,
 'access': 1,
 'against': 2,
 'aim': 3,
 'all': 4,
 'already': 5,
 'and': 6,
 'around': 7,
 'ask': 8,
 'be': 9,
 'behind': 10,
 'collaboration': 11,
 'commit': 12,
 'community': 13,
 'consequences': 14,
 'covid-19': 15,
 'deployment': 16,
 'devastating': 17,
 'donors': 18,
 'economic': 19,
 'equitable': 20,
 'expedite': 21,
 'fight': 22,
 'for': 23,
 'global': 24,
 'in': 25,
 'initiatives': 26,
 'innovative': 27,
 'investment': 28,
 'landmark': 29,
 'leaders': 30,
 'left': 31,
 'no': 32,
 'of': 33,
 'one': 34,
 'ongoing': 35,
 'political': 36,
 'shared': 37,
 'should': 38,
 'social': 39,
 'support': 40,
 'the': 41,
 'this': 42,
 'to': 43,
 'tools': 44,
 'we': 45,
 'world': 46}

## Combinations makes more sense with n-gram

Let's check the vocabulary full of unigrams(single words) and bigrams(combinations of two words)

In [29]:
cv6=CountVectorizer(document, ngram_range=(1,2))
cv6.fit_transform(document)
cv6.vocabulary_

{'19': 0,
 '19 for': 1,
 '19 no': 2,
 'access': 3,
 'access to': 4,
 'against': 5,
 'against covid': 6,
 'aim': 7,
 'aim of': 8,
 'all': 9,
 'already': 10,
 'already ongoing': 11,
 'and': 12,
 'and economic': 13,
 'and for': 14,
 'and initiatives': 15,
 'and political': 16,
 'around': 17,
 'around the': 18,
 'ask': 19,
 'ask the': 20,
 'be': 21,
 'be left': 22,
 'behind': 23,
 'collaboration': 24,
 'collaboration and': 25,
 'commit': 26,
 'commit to': 27,
 'community': 28,
 'community and': 29,
 'consequences': 30,
 'consequences of': 31,
 'covid': 32,
 'covid 19': 33,
 'deployment': 34,
 'deployment of': 35,
 'devastating': 36,
 'devastating social': 37,
 'donors': 38,
 'economic': 39,
 'economic consequences': 40,
 'equitable': 41,
 'equitable global': 42,
 'expedite': 43,
 'expedite deployment': 44,
 'fight': 45,
 'fight against': 46,
 'for': 47,
 'for all': 48,
 'for covid': 49,
 'for donors': 50,
 'global': 51,
 'global access': 52,
 'global community': 53,
 'in': 54,
 'in the': 5

## Limiting Vocabulary size

We can mention the maximum vocabulary size we intend to keep.

In [30]:
cv7=CountVectorizer(document,max_features=20)
cv7_doc=cv7.fit_transform(document)
cv7.vocabulary_


{'19': 0,
 'aim': 1,
 'and': 2,
 'around': 3,
 'covid': 4,
 'for': 5,
 'global': 6,
 'innovative': 7,
 'investment': 8,
 'landmark': 9,
 'leaders': 10,
 'left': 11,
 'no': 12,
 'of': 13,
 'one': 14,
 'shared': 15,
 'should': 16,
 'the': 17,
 'to': 18,
 'we': 19}

In [31]:
#only 20 unique words as we demanded with max_features
cv7_doc.shape

(5, 20)

## Custom Preprocessing


Although our data is clean in this notebook, the real world data is very messy and in case you want to clean that along with Count Vectorizer you can pass your custom preprocessor as an argument to Count Vectorizer.

In [0]:
import regex as re
def custom_preprocessor(text):
  #lowering the text case
  text=text.lower() 
  # remove special chars
  text=re.sub("\\W"," ",text)
  return text

In [33]:
cv8=CountVectorizer(document,preprocessor=custom_preprocessor)
cv8.fit_transform(document)

<5x47 sparse matrix of type '<class 'numpy.int64'>'
	with 66 stored elements in Compressed Sparse Row format>

In [34]:
cv8.vocabulary_

{'19': 0,
 'access': 1,
 'against': 2,
 'aim': 3,
 'all': 4,
 'already': 5,
 'and': 6,
 'around': 7,
 'ask': 8,
 'be': 9,
 'behind': 10,
 'collaboration': 11,
 'commit': 12,
 'community': 13,
 'consequences': 14,
 'covid': 15,
 'deployment': 16,
 'devastating': 17,
 'donors': 18,
 'economic': 19,
 'equitable': 20,
 'expedite': 21,
 'fight': 22,
 'for': 23,
 'global': 24,
 'in': 25,
 'initiatives': 26,
 'innovative': 27,
 'investment': 28,
 'landmark': 29,
 'leaders': 30,
 'left': 31,
 'no': 32,
 'of': 33,
 'one': 34,
 'ongoing': 35,
 'political': 36,
 'shared': 37,
 'should': 38,
 'social': 39,
 'support': 40,
 'the': 41,
 'this': 42,
 'to': 43,
 'tools': 44,
 'we': 45,
 'world': 46}

# Before you go, let's combine most of the things together

In [0]:
cv_final=CountVectorizer(document,preprocessor=custom_preprocessor,tokenizer=tok,max_features=45,ngram_range=(1,1),stop_words='english',min_df=1)

In [36]:
cv_final_doc=cv_final.fit_transform(document)
cv_final.vocabulary_

{'19': 0,
 'access': 1,
 'aim': 2,
 'ask': 3,
 'collaboration': 4,
 'commit': 5,
 'community': 6,
 'consequences': 7,
 'covid': 8,
 'deployment': 9,
 'devastating': 10,
 'donors': 11,
 'economic': 12,
 'equitable': 13,
 'expedite': 14,
 'fight': 15,
 'global': 16,
 'initiatives': 17,
 'innovative': 18,
 'investment': 19,
 'landmark': 20,
 'leaders': 21,
 'left': 22,
 'ongoing': 23,
 'political': 24,
 'shared': 25,
 'social': 26,
 'support': 27,
 'tools': 28,
 'world': 29}

In [37]:
cv_final_doc.shape

(5, 30)