CountVectorizer with usage example

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
cat_in_the_hat_docs=[
      "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
      "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
      "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
      "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
      "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)"
     ]

cv = CountVectorizer()
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [3]:
cv.vocabulary_

{'one': 28,
 'cent': 8,
 'two': 40,
 'cents': 9,
 'old': 26,
 'new': 23,
 'all': 1,
 'about': 0,
 'money': 22,
 'cat': 7,
 'in': 16,
 'the': 37,
 'hat': 13,
 'learning': 19,
 'library': 20,
 'inside': 18,
 'your': 42,
 'outside': 30,
 'human': 15,
 'body': 4,
 'oh': 25,
 'things': 39,
 'you': 41,
 'can': 6,
 'do': 10,
 'that': 36,
 'are': 2,
 'good': 12,
 'for': 11,
 'staying': 34,
 'healthy': 14,
 'on': 27,
 'beyond': 3,
 'bugs': 5,
 'insects': 17,
 'there': 38,
 'no': 24,
 'place': 31,
 'like': 21,
 'space': 33,
 'our': 29,
 'solar': 32,
 'system': 35}

CountVectorizer With Custom StopWords

In [4]:
cv = CountVectorizer(stop_words=["all","in","the","is","and"])
count_vector=cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape

(5, 40)

CountVectorizer With Predefined StopWords

In [5]:
cv = CountVectorizer(stop_words="english")
count_vector=cv.fit_transform(cat_in_the_hat_docs)

CountVectorizer with min_df as stopwword

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text documents
documents = [
    "This is a sample document.",
    "Stop words are common in text processing.",
    "CountVectorizer is a useful tool.",
    "This document is just a sample.",
]

# Create a CountVectorizer with min_df=2
vectorizer = CountVectorizer(min_df=2)

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Print the transformed matrix and vocabulary
print(X.toarray())
print(vocabulary)


[[1 1 1 1]
 [0 0 0 0]
 [0 1 0 0]
 [1 1 1 1]]
['document' 'is' 'sample' 'this']


Working with Ngrams

In [7]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the Punkt tokenizer if you haven't already


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
sentence = "This is a sample sentence for generating bigrams."
words = word_tokenize(sentence)

# Generate bigrams (2-grams)
bigrams = list(ngrams(words, 2))
print(bigrams)


[('This', 'is'), ('is', 'a'), ('a', 'sample'), ('sample', 'sentence'), ('sentence', 'for'), ('for', 'generating'), ('generating', 'bigrams'), ('bigrams', '.')]


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text documents
documents = [
    "This is a sample document.",
    "N-grams are useful for text analysis.",
]

# Create a CountVectorizer with ngram_range=(1, 2) for unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Get the feature names (n-grams)
ngrams = vectorizer.get_feature_names_out()

# Print the transformed matrix and n-grams
print(X.toarray())
print(ngrams)


[[0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0]
 [1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 0 1 1]]
['analysis' 'are' 'are useful' 'document' 'for' 'for text' 'grams'
 'grams are' 'is' 'is sample' 'sample' 'sample document' 'text'
 'text analysis' 'this' 'this is' 'useful' 'useful for']



Limiting Vocabulary Size

In [10]:
# Sample text documents
documents = [
    "This is a sample document.",
    "Limiting vocabulary size is important for NLP tasks.",
    "You can achieve this by specifying max_features in CountVectorizer.",
]

# Create a CountVectorizer with a maximum vocabulary size
max_vocab_size = 20  # Set your desired maximum vocabulary size

vectorizer = CountVectorizer(max_features=max_vocab_size)

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Get the feature names (vocabulary)
vocabulary = vectorizer.get_feature_names_out()

# Print the transformed matrix and vocabulary
print(X.toarray())
print(vocabulary)


[[0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0]
 [1 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1]]
['achieve' 'by' 'can' 'countvectorizer' 'document' 'for' 'important' 'in'
 'is' 'limiting' 'max_features' 'nlp' 'sample' 'size' 'specifying' 'tasks'
 'this' 'vocabulary' 'you']


Extracting Counts of Words / N-Grams

In [11]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """return n-gram counts in descending order of counts"""

    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    results=[]

    # word index, count i
    for idx, count in sorted_items:

        # get the ngram name
        n_gram=feature_names[idx]

        # collect as a list of tuples
        results.append((n_gram,count))

    return results
cv = CountVectorizer(ngram_range=(1,2),max_features=100)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

#sort the counts of first book title by descending order of counts
sorted_items=sort_coo(count_vector[0].tocoo())

#Get feature names (words/n-grams). It is sorted by position in sparse matrix
feature_names=cv.get_feature_names_out()
n_grams=extract_topn_from_vector(feature_names,sorted_items,10)
n_grams

[('cent', 3),
 ('two cents', 1),
 ('two', 1),
 ('the hat', 1),
 ('the', 1),
 ('one cent', 1),
 ('one', 1),
 ('old cent', 1),
 ('old', 1),
 ('new cent', 1)]

Custom Tokenizer

In [12]:
import re

def my_tokenizer(text):
    text=re.sub("(\\W)"," \\1 ",text)
    return re.split("\\s+",text)


cv = CountVectorizer(tokenizer=my_tokenizer)
count_vector=cv.fit_transform(cat_in_the_hat_docs)
print(cv.vocabulary_)

{'one': 34, 'cent': 14, ',': 4, 'two': 47, 'cents': 15, 'old': 32, 'new': 29, ':': 5, 'all': 7, 'about': 6, 'money': 28, '(': 2, 'cat': 13, 'in': 22, 'the': 44, 'hat': 19, "'": 1, 's': 38, 'learning': 25, 'library': 26, 'inside': 24, 'your': 49, 'outside': 36, 'human': 21, 'body': 10, ')': 3, '': 0, 'oh': 31, 'things': 46, 'you': 48, 'can': 12, 'do': 16, 'that': 43, 'are': 8, 'good': 18, 'for': 17, 'staying': 41, 'healthy': 20, 'on': 33, 'beyond': 9, 'bugs': 11, 'insects': 23, 'there': 45, 'no': 30, 'place': 37, 'like': 27, 'space': 40, 'our': 35, 'solar': 39, 'system': 42}


