In [3]:
import string
from collections import defaultdict

In [47]:
# Toy corpus: three short English sentences, one document per list entry.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question."
]

In [48]:
def preprocess_text(text):
    """Tokenize ``text`` into lowercase words with ASCII punctuation stripped.

    Every character listed in ``string.punctuation`` is deleted outright (not
    replaced by a space), so "be," yields the token "be" while "don't"
    collapses to "dont". What remains is split on runs of whitespace.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens in their original order; empty when nothing but
        punctuation and whitespace was supplied.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(punctuation_table).split()

In [49]:
# Tokenize every document in the corpus, keeping the documents in corpus order.
preprocessed_text = list(map(preprocess_text, corpus))

In [50]:
# Show the tokenized corpus: one list of tokens per document.
print(preprocessed_text)

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'], ['a', 'journey', 'of', 'a', 'thousand', 'miles', 'begins', 'with', 'a', 'single', 'step'], ['to', 'be', 'or', 'not', 'to', 'be', 'that', 'is', 'the', 'question']]


In [51]:
# The vocabulary is every distinct token across all documents; sorting gives
# each word a stable, reproducible index.
vocabulary = sorted({token for tokens in preprocessed_text for token in tokens})

In [52]:
# Display the sorted vocabulary as this cell's output.
vocabulary

['a',
 'be',
 'begins',
 'brown',
 'dog',
 'fox',
 'is',
 'journey',
 'jumps',
 'lazy',
 'miles',
 'not',
 'of',
 'or',
 'over',
 'question',
 'quick',
 'single',
 'step',
 'that',
 'the',
 'thousand',
 'to',
 'with']

In [53]:
# Build one bag-of-words vector per document. Every vector has the same length
# as `vocabulary`, and the value at index i is the number of times
# vocabulary[i] occurs in that document.
bow_vectors = []
for doc in preprocessed_text:
    # Tally how often each word appears in this document.
    word_count = defaultdict(int)
    for word in doc:
        word_count[word] += 1
    # .get() rather than indexing: indexing a defaultdict would insert every
    # vocabulary word that is absent from this document.
    bow_vectors.append([word_count.get(term, 0) for term in vocabulary])

In [54]:
# Print the vocabulary alongside the vectors so each count can be matched to
# its word by position.
print('Vocabulary: ', vocabulary)
print('Bag-of-Words Vectors: ', bow_vectors)

Vocabulary:  ['a', 'be', 'begins', 'brown', 'dog', 'fox', 'is', 'journey', 'jumps', 'lazy', 'miles', 'not', 'of', 'or', 'over', 'question', 'quick', 'single', 'step', 'that', 'the', 'thousand', 'to', 'with']
Bag-of-Words Vectors:  [[0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0], [3, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1], [0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 2, 0]]
