In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Toy corpus of four short documents used by every vectorizer in this notebook.
corpus = [
    'This is the first document.',
    'This is the second document.',
    'Third document. Document number three',
    'Number four. To repeat, number four',
]

In [11]:
# Learn the vocabulary from the corpus and count how often each word occurs
# in each document. The result is a sparse document-term matrix
# (one row per document, one column per vocabulary word).
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)

# Bare last expression: displays the matrix summary (shape, dtype, stored entries).
bag_of_words

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [9]:
# Printing a sparse matrix lists every stored entry as "(row, column)  value",
# i.e. "(document ID, word ID)  count".
# NOTE(review): the saved output of this cell looks stale -- its execution count (9)
# precedes the cells that build the corpus and matrix, and it contains column
# index 12, which cannot occur in a 4x12 matrix. Re-run the notebook top to bottom.
print(bag_of_words)

  (0, 10)	1
  (0, 3)	1
  (0, 8)	1
  (0, 1)	1
  (0, 0)	1
  (1, 3)	1
  (1, 8)	1
  (1, 0)	1
  (1, 7)	1
  (1, 6)	1
  (2, 0)	2
  (2, 9)	1
  (2, 4)	1
  (2, 11)	1
  (3, 4)	2
  (3, 2)	2
  (3, 12)	1
  (3, 5)	1


Look up the ID (column index) that the vectorizer assigned to a particular word

In [14]:
# vocabulary_ is a dict mapping each word to its column index;
# .get() returns None (instead of raising) for a word that is not in the vocabulary.
vectorizer.vocabulary_.get('number')

4

In [16]:
import pandas as pd

# Record the pandas version that produced the tables shown in this notebook.
print(pd.__version__)

2.0.3


In [19]:
# Densify the sparse count matrix and label each column with its vocabulary word,
# giving one row per document and one column per word.
term_names = vectorizer.get_feature_names_out()
dense_counts = bag_of_words.toarray()
pd.DataFrame(dense_counts, columns=term_names)


Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,1,1,0,1,0,0,0,1,0,1,0,0
1,1,0,0,1,0,0,1,1,0,1,0,0
2,2,0,0,0,1,0,0,0,1,0,1,0
3,0,0,2,0,2,1,0,0,0,0,0,1


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer associates a TF-IDF score with every word in each document of the corpus.
# NOTE: this rebinds `vectorizer` and `bag_of_words`, replacing the CountVectorizer
# objects created earlier; every cell run after this one sees the TF-IDF versions.
vectorizer = TfidfVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)

In [22]:
# Each stored entry is printed as "(document ID, word ID)  score":
#   - every document has a unique ID (the row index)
#   - every word has a unique ID (the column index)
#   - every (document ID, word ID) combination is associated with a score
print(bag_of_words)

  (0, 0)	0.3528554929793508
  (0, 1)	0.5528163151092931
  (0, 7)	0.43584673254990375
  (0, 3)	0.43584673254990375
  (0, 9)	0.43584673254990375
  (1, 6)	0.5528163151092931
  (1, 0)	0.3528554929793508
  (1, 7)	0.43584673254990375
  (1, 3)	0.43584673254990375
  (1, 9)	0.43584673254990375
  (2, 10)	0.4850008395708102
  (2, 4)	0.3823802326982809
  (2, 8)	0.4850008395708102
  (2, 0)	0.6191395067937654
  (3, 5)	0.3432724906138499
  (3, 11)	0.3432724906138499
  (3, 2)	0.6865449812276998
  (3, 4)	0.5412799489419371


In [23]:
# Tabular view of the matrix: one row per document, one column per vocabulary word.
pd.DataFrame(
    data=bag_of_words.toarray(),
    columns=vectorizer.get_feature_names_out(),
)


Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,0.352855,0.552816,0.0,0.435847,0.0,0.0,0.0,0.435847,0.0,0.435847,0.0,0.0
1,0.352855,0.0,0.0,0.435847,0.0,0.0,0.552816,0.435847,0.0,0.435847,0.0,0.0
2,0.61914,0.0,0.0,0.0,0.38238,0.0,0.0,0.0,0.485001,0.0,0.485001,0.0
3,0.0,0.0,0.686545,0.0,0.54128,0.343272,0.0,0.0,0.0,0.0,0.0,0.343272


The cells of this dataframe contain TF-IDF scores, not raw word frequencies.

## And this is our complete vocabulary, all of 12 words

In [24]:
# Mapping of word -> column index learned by the fitted vectorizer.
vectorizer.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 1,
 'document': 0,
 'second': 6,
 'third': 8,
 'number': 4,
 'three': 10,
 'four': 2,
 'to': 11,
 'repeat': 5}

If you have a very large vocabulary, you can choose to use the HashingVectorizer rather than the CountVectorizer.
Representing words by hash buckets instead of a stored vocabulary is what lets the HashingVectorizer scale to large data sets.

In [26]:
from sklearn.feature_extraction.text import HashingVectorizer

# n_features is the number of hash buckets, i.e. the number of columns in the output.
vectorizer = HashingVectorizer(n_features=8)
feature_vector = vectorizer.fit_transform(corpus)
# Entries are printed as "(document ID, bucket ID)  value".
print(feature_vector)

  (0, 0)	-0.8944271909999159
  (0, 5)	0.4472135954999579
  (0, 6)	0.0
  (1, 0)	-0.5773502691896258
  (1, 3)	0.5773502691896258
  (1, 5)	0.5773502691896258
  (1, 6)	0.0
  (2, 0)	-0.7559289460184544
  (2, 3)	0.3779644730092272
  (2, 5)	0.3779644730092272
  (2, 7)	0.3779644730092272
  (3, 0)	0.31622776601683794
  (3, 3)	0.31622776601683794
  (3, 5)	0.6324555320336759
  (3, 7)	0.6324555320336759


Notice that the word IDs range from 0 to 7 because we asked for a total of 8 hash buckets. Our vocabulary (12 words) is larger than the number of buckets, so, as expected, multiple words can hash to the same bucket.

## One disadvantage of the HashingVectorizer is that there is no way to get back to the original word from its hash bucket value.

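The frequency of each token is not represented as a raw count.
The values are in a normalized form: by default the HashingVectorizer scales each row to unit L2 norm and may flip the sign of a feature (`alternate_sign=True`), which is why some values are negative.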