# Bag-of-Words
Bag-of-Words is a representation of text that describes the occurence of words within a document. It involves two things 
1. Vocabulary of known words
2. A measure of presence of known of words

The measure of occurence of known words can be 'counts', 'binary', 'frequency'. 'tf-idf'.


In [11]:
import re
import string
import os
import collections
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np


In [2]:
def load_doc(filename):
    """The documents of a file loaded """
    try:
        with open(filename, 'r') as file:
            doc = file.read()
        return doc
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(e)

In [5]:
def clean_doc(doc):
    """Tokenization, Removal of numerics, stopwords, punctuations and less frequent tokens"""
    tokens = doc.split(' ')
    # remove punctuations
    reg_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [reg_punctuation.sub('', w) for w in tokens]
    # romove numerics and stop words
    tokens = [token for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    # remove short tokens
    tokens = [token for token in tokens if len(token)>1]
    return tokens

In [6]:
def create_save_vocab(directory):
    """Vocabs is saved to a file"""
    neg_dir = directory + '/neg'
    pos_dir = directory + '/pos'
    dirs = [pos_dir, neg_dir]
    vocab_dict = collections.Counter()
    for direc in dirs:
        for filename in os.listdir(direc):
            file_path = direc + '/' + filename
            doc = load_doc(file_path)
            doc_tokens = clean_doc(doc)
            vocab_dict.update(doc_tokens)
    # remove tokens with less than minimum occurence
    tokens = [token for token, count in vocab_dict.items() if count>=2]
    data = '\n'.join(tokens)
    file = open('vocabulary.txt', 'w')
    file.write(data)
    file.close()

In [9]:
def prepare_doc(directory, vocab):
    """Preparation of documents for bag-of-words model"""
    docs = []
    for filename in os.listdir(directory):
        file_path = directory + '/' + filename
        doc = load_doc(file_path)
        cleaned_doc = clean_doc(doc)
        cleaned_doc = [token for token in cleaned_doc if token in vocab]
        docs.append(cleaned_doc)
    return docs

In [12]:
def create_tokenizer(docs):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(docs)
    return tokenizer

Keras API is used to converts docs to encoded documents vectors. Keras provides the ```Tokenizer``` class cleans the documents and transforms the document to encoded documents. First ```Tokernier``` object is created and fit on the documents to be transformed. ```Tokenizer``` calls the function ```texts_to_matrix()``` for encoding of the documents.

In [23]:
def main():
    create_save_vocab('txt_sentoken')
    file_path = 'vocabulary.txt'
    vocab = load_doc(file_path)
    vocab = set(vocab.split())
    print('Vocabulary Size: ', len(vocab))
    pos_docs = prepare_doc('txt_sentoken/pos', vocab)
    neg_docs = prepare_doc('txt_sentoken/neg', vocab)
    docs = pos_docs + neg_docs
    tokenizer = create_tokenizer(docs)
    X = tokenizer.texts_to_matrix(docs, mode='freq')
    print('Shape of the encoded Document: ', X.shape)
    print('Number of Document: ', X.shape[0])
    print('Document vector size: ', X.shape[1])
    print(X)
    

In [24]:
if __name__ == '__main__':
    main() 

Vocabulary Size:  26790
Shape of the encoded Document:  (2000, 26791)
Number of Document:  2000
Document vector size:  26791
[[0.         0.01312336 0.         ... 0.         0.         0.        ]
 [0.         0.0147929  0.00591716 ... 0.         0.         0.        ]
 [0.         0.00490196 0.00980392 ... 0.         0.         0.        ]
 ...
 [0.         0.01522843 0.00507614 ... 0.         0.         0.01015228]
 [0.         0.01153846 0.         ... 0.         0.         0.        ]
 [0.         0.01123596 0.         ... 0.         0.         0.        ]]
