In [1]:
# Import NumPy, SciPy, and other libraries for utility functions

import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
import pickle

In [2]:
# Function to generate the vocabulary of the text corpus
# Input: Text corpus represented as a list of sentences
# Output: Vocabulary of the text corpus represented as a Python dictionary with (unique_word, dimension)
# key-value pairs as its elements

def fit(dataset):
    """Build the vocabulary of a text corpus.

    Parameters
    ----------
    dataset : iterable of str
        The corpus; each item is one sentence.

    Returns
    -------
    dict
        Maps every distinct whitespace-separated token that is longer than
        one character to its position in the lexicographically sorted list
        of such tokens.
    """
    # Gather each qualifying token exactly once across the whole corpus;
    # str.split() with no argument splits on any run of whitespace
    distinct_tokens = {
        token
        for sentence in dataset
        for token in sentence.split()
        if len(token) >= 2
    }

    # Sort lexicographically so the index assigned to a token is deterministic
    ordered_tokens = list(distinct_tokens)
    ordered_tokens.sort()

    # Pair every token with its rank in the sorted order
    return dict(zip(ordered_tokens, range(len(ordered_tokens))))

In [3]:
# Function to generate the Bag of Words (BOW) representation of the text corpus
# Input: The text corpus, and the vocabulary associated with it
# Output: BOW representation of the corpus represented as a Sparse Matrix in CSR Format

def transform(dataset, vocabulary, dtype=np.int64):
    """Build the Bag of Words (BOW) representation of a text corpus.

    Parameters
    ----------
    dataset : sequence of str
        The corpus; each item is one sentence. Must support ``len()``,
        because the row count of the result is taken from it.
    vocabulary : dict
        Maps words to column indices, as produced by ``fit``.
    dtype : data-type, optional
        Element type of the returned matrix. Defaults to ``np.int64`` so the
        counts are integers even when no word of the corpus is found in the
        vocabulary (an empty value list would otherwise yield a float matrix).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(dataset), len(vocabulary))`` whose entry
        ``[i, j]`` is the number of times the word with column index ``j``
        occurs in sentence ``i``.
    """
    # Coordinate-format triplets: row index, column index, and count
    rows = []
    columns = []
    values = []

    # Each row is a sentence represented as a Python string object
    for idx, row in enumerate(dataset):
        # Count every whitespace-separated word of the sentence
        word_frequencies = Counter(row.split())

        for word, frequency in word_frequencies.items():
            # Single-character words are ignored, mirroring fit
            if len(word) > 1:
                # Words missing from the vocabulary are skipped; -1 is the
                # "not found" sentinel
                col_idx = vocabulary.get(word, -1)
                if col_idx != -1:
                    rows.append(idx)
                    columns.append(col_idx)
                    values.append(frequency)

    # Creating a Sparse Matrix of the BOW in the CSR format using SciPy;
    # the explicit dtype keeps the result type independent of the data
    return csr_matrix(
        (values, (rows, columns)),
        shape=(len(dataset), len(vocabulary)),
        dtype=dtype,
    )

In [4]:
# Load (unpickle) the previously saved corpus from the file 'cleaned_strings'
# NOTE(review): presumably a list of cleaned sentence strings, as fit/transform expect — confirm
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files from a trusted source
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

# Print the number of entries in the corpus
print(len(corpus))

746


In [5]:
# Generate the vocabulary (word -> column index) from the corpus using the fit function
vocabulary = fit(corpus)

# Print the number of unique words in the vocabulary
print(len(vocabulary))

2886


In [6]:
# Generate the BOW representation of the corpus
bag_of_words = transform(corpus, vocabulary)

# Print the shape of the BOW matrix: (number of sentences, vocabulary size)
print(bag_of_words.shape)

(746, 2886)


In [7]:
# Converting the Sparse Matrix to a Dense Matrix just to get a feel for the BOW representation
# (toarray() materializes every zero entry, so this is only practical for small matrices)

print(bag_of_words.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
