# Implementing Bag of Words algorithms

In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter

In [2]:
def fit(dataset):
    """Build a word -> column-index vocabulary from a list of documents.

    Only a ``list`` is accepted. Each document is split on whitespace,
    tokens shorter than two characters are dropped, and the remaining
    distinct tokens are numbered by their position in sorted order.

    For any other input type a message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        print("You need to pass the list of documents")
        return None

    # Collect every distinct token of length >= 2 across all documents.
    tokens = {
        token
        for document in dataset
        for token in document.split()
        if len(token) >= 2
    }
    # Sorting first makes the index assignment deterministic.
    return {token: position for position, token in enumerate(sorted(tokens))}

In [3]:
# Quick check of fit() on two toy documents.
vocab = fit([
    "abc def aaa prq",
    "lmn pqr aaaaaaa aaa abbb baaa",
])
print(vocab)

{'aaa': 0, 'aaaaaaa': 1, 'abbb': 2, 'abc': 3, 'baaa': 4, 'def': 5, 'lmn': 6, 'pqr': 7, 'prq': 8}


In [4]:
def transform(dataset, vocab):
    """Encode documents as a sparse bag-of-words count matrix.

    Args:
        dataset: A ``list`` of documents; each one is split on whitespace.
        vocab: Mapping from word to column index, e.g. the result of ``fit``.

    Returns:
        A ``csr_matrix`` of shape ``(len(dataset), len(vocab))`` whose entry
        ``(i, j)`` is how many times the word with index ``j`` occurs in
        document ``i``. Words shorter than two characters and words missing
        from ``vocab`` are ignored. If ``dataset`` is not a list, a message
        is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        # Report the misuse with the same message fit() prints, instead of
        # returning None without any explanation.
        print("You need to pass the list of documents")
        return None

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(dataset):
        # Counter gives the per-document frequency of every token.
        for word, freq in Counter(row.split()).items():
            if len(word) < 2:
                continue
            # -1 marks a word that is not part of the vocabulary.
            col_index = vocab.get(word, -1)
            if col_index != -1:
                rows.append(idx)
                columns.append(col_index)
                values.append(freq)
    return csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab)))


In [5]:
# Two sample sentences used for the rest of the demonstration.
strings = [
    "the method of lagrange multipliers is the economists workhorse for solving optimization problems",
    "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly",
]
vocab = fit(strings)
print(list(vocab))  # iterating a dict yields its keys
print(transform(strings, vocab).toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]


## Comparing results with scikit-learn's CountVectorizer

In [6]:
# Reference result from scikit-learn for the same two sentences.
cv = CountVectorizer()
new_string = cv.fit_transform(strings)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(cv, "get_feature_names_out"):
    # .tolist() turns the returned array into a plain list of str for printing.
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)
print(new_string.toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]
