## Program to parse the input document corpus, create an index, and store it in a file

In [1]:
from bs4 import BeautifulSoup
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pickle
from scipy.sparse import coo_matrix

In [2]:
# Declare the names of all files.
# Input corpus that is read and indexed.
data_file_path = ".\\data\\wiki_05"

# These are the output files.
vocabulary_index_file_path = ".\\data\\vocabulary.pkl"
document_index_file_path = ".\\data\\file_index.pkl"
# Every backslash is escaped: a bare "\i" is an invalid escape sequence that
# newer Python versions warn about.
inverted_index_file_path = ".\\data\\inverted_index.pkl"

# Import the text corpora from disk and perform pre-processing steps

In [3]:
# Read the whole corpus file into memory as UTF-8 text. The with-block closes
# the file even if the read raises.
with open(data_file_path, "rt", encoding="utf-8") as input_file:
    input_data = input_file.read()

# Remove HTML and other tags.

In [4]:
# Parse the raw corpus text into a BeautifulSoup tree so the individual
# documents can be looked up by tag name.
# NOTE(review): no parser is named here, so BeautifulSoup chooses one from
# whatever is installed and the parsed result may differ between machines.
# Consider passing the parser explicitly - confirm which one the existing
# index files were built with before pinning it.
souped_data = BeautifulSoup(input_data)

In [5]:
# Generate vocabulary, document list and inverted index.
#
# vocabulary_dict maps each token to its row index and document_dict maps each
# document id to its column index; both indices are assigned in order of first
# appearance. rows / cols / token_counts are the parallel coordinate lists the
# sparse matrix is built from: one entry per (token, document) pair, holding
# the number of times the token occurs in that document.
vocabulary_dict = dict()
document_dict = dict()
# NOTE(review): this set is built from the raw text (punctuation is not
# stripped) and is not read anywhere in this cell; it costs a full tokenizing
# pass over the corpus. Kept because later cells may rely on it - confirm.
all_unique_words = set(word_tokenize(souped_data.get_text().replace("\n", " ").lower()))
rows = []
cols = []
token_counts = []

# Compiled once, outside the loop: matches every character that is neither a
# word character nor whitespace.
punctuation_pattern = re.compile(r'[^\w\s]')

for doc in souped_data.find_all('doc'):
    doc_id = doc['id']
    # Give each document id the next free column index the first time it is seen.
    col_id = document_dict.setdefault(doc_id, len(document_dict))
    # Flatten newlines, lower-case the text and replace punctuation with spaces.
    doc_data = punctuation_pattern.sub(' ', doc.get_text().replace("\n", " ").lower())
    # Tokenize the text. This will get rid of extra spaces as well.
    tokens = word_tokenize(doc_data)
    column_data = Counter(tokens)
    for token, count in column_data.items():
        # Give each token the next free row index the first time it is seen.
        row_id = vocabulary_dict.setdefault(token, len(vocabulary_dict))
        rows.append(row_id)
        cols.append(col_id)
        token_counts.append(count)

# Pass the shape explicitly instead of letting it be inferred from the largest
# index present: otherwise trailing documents that produced no tokens would be
# missing from the matrix and its width would disagree with document_dict.
inverted_index_matrix = coo_matrix(
    (token_counts, (rows, cols)),
    shape=(len(vocabulary_dict), len(document_dict)),
)

# Store the index in a file

In [6]:
# Each output is written inside a with-block so the file is closed even if
# pickling fails part-way through.

# store vocabulary index
with open(vocabulary_index_file_path, "wb") as pickle_file:
    pickle.dump(vocabulary_dict, pickle_file)

# store document index
with open(document_index_file_path, "wb") as pickle_file:
    pickle.dump(document_dict, pickle_file)

# store term frequency matrix
with open(inverted_index_file_path, "wb") as pickle_file:
    pickle.dump(inverted_index_matrix, pickle_file)