In [60]:
import sys
import ast
import re
import collections

from collections import OrderedDict
from nltk.tokenize import WhitespaceTokenizer

def spimi_invert(documents, block_size_limit):
    """Build inverted-index blocks with single-pass in-memory indexing (SPIMI).

    Every document is tokenized with NLTK's ``WhitespaceTokenizer`` and each
    token occurrence is recorded under its term as the document's position in
    ``documents``. Repeated docIDs are kept on purpose: ``sort_terms`` turns
    them into per-document term frequencies when a block is finalised.

    After each document, if the in-memory dictionary exceeds
    ``block_size_limit``, it is sorted, written out as the next numbered block
    and reset. Whatever is left after the last document is always written as a
    final block (even if it is empty).

    Args:
        documents: Iterable of document texts. A document's position in the
            iteration order is used as its docID.
        block_size_limit: Threshold compared against
            ``sys.getsizeof(dictionary)``.

    Returns:
        None. Results are written to disk through ``write_block_to_disk``.
    """
    block_number = 0
    dictionary = {}  # term -> list of docIDs, one entry per occurrence
    tokenizer = WhitespaceTokenizer()
    for doc_id, text in enumerate(documents):
        for term in tokenizer.tokenize(text):
            # Creates the postings list on first sight of the term, then appends.
            dictionary.setdefault(term, []).append(doc_id)
        # NOTE(review): sys.getsizeof is shallow -- it measures the dict object
        # itself, not the term strings or postings lists it refers to -- so this
        # threshold is much looser than a true byte budget. Left as-is so that
        # existing block_size_limit values keep producing the same blocks.
        if sys.getsizeof(dictionary) > block_size_limit:
            write_block_to_disk(sort_terms(dictionary), block_number)
            block_number += 1
            dictionary = {}
    # Flush the remaining postings as the last block.
    write_block_to_disk(sort_terms(dictionary), block_number)
    print("SPIMI invert complete!")



In [23]:
def sort_terms(term_postings_list):
    """Return the block's terms in sorted order with term frequencies attached.

    Args:
        term_postings_list: Mapping of term -> iterable of docIDs (duplicates
            allowed).

    Returns:
        An ``OrderedDict`` whose keys follow ``sorted()`` order and whose
        values are the result of ``calculate_tftd`` on each term's postings.
    """
    print(" -- Sorting terms...")
    return OrderedDict(
        (term, calculate_tftd(list(term_postings_list[term])))
        for term in sorted(term_postings_list)
    )



In [24]:
def calculate_tftd(pl_with_duplicates):
    """Collapse a postings list with repeats into ``[docID, frequency]`` pairs.

    Args:
        pl_with_duplicates: Iterable of docIDs in which a docID appears once
            per occurrence of the term in that document.

    Returns:
        A list of two-element lists ``[docID, count]``, one per distinct docID.
    """
    frequencies = collections.Counter(pl_with_duplicates)
    return [[doc_id, count] for doc_id, count in frequencies.items()]



In [49]:
def write_block_to_disk(term_postings_list, block_number):
    """Write one index block (terms and their postings) to a text file.

    The file is named ``'HillaryEmails' + 'block-<block_number>.txt'`` relative
    to the current working directory. It holds one line per term, in the
    mapping's iteration order, formatted as ``term:<str() of its postings>``,
    e.g. ``cat:[[4, 2], [9, 1]]`` when postings are ``[docID, frequency]``
    pairs.

    Args:
        term_postings_list: Mapping of term -> postings to serialise.
        block_number: Number used in the block's file name.

    Returns:
        None.
    """
    base_path = 'HillaryEmails'
    block_name = 'block-' + str(block_number) + '.txt'
    print(" -- Writing term-positing list block: " + block_name + "...")
    # Open with 'w' rather than 'a+': a block is a complete snapshot, and
    # appending to a file left over from an earlier run would duplicate terms
    # and break the sorted order. UTF-8 matches how the corpus is read, so
    # non-ASCII tokens do not depend on the platform's default encoding. The
    # context manager closes the file even if a write fails.
    with open(base_path + block_name, 'w', encoding='utf-8') as block:
        for term, postings in term_postings_list.items():
            block.write(str(term) + ":" + str(postings) + "\n")
    print(block_name, "created")

In [63]:
import os
def listFile(d):
    """Return absolute paths of all entries in directory ``d``, sorted by name.

    ``os.listdir`` yields entries in arbitrary order; sorting makes the result
    reproducible, which matters when a path's position is later used as a
    document ID.

    Args:
        d: Path of the directory to list.

    Returns:
        A list of absolute paths, one per directory entry (files and
        sub-directories alike), ordered by entry name.
    """
    return [os.path.abspath(os.path.join(d, name)) for name in sorted(os.listdir(d))]
        
def readFile(d):
    """Return the entire contents of the UTF-8 text file at path ``d``.

    Args:
        d: Path of the file to read.

    Returns:
        The file's text as a single string.
    """
    # The context manager closes the handle; the original left it open.
    with open(d, "r", encoding='utf-8') as file:
        return file.read()

# Read every file under the corpus directory into memory, then index them all.
files = listFile("HillaryEmails")
tokens = []
file_content = []
for docs in files:
    file_content.append(readFile(docs))
spimi_invert(file_content, block_size_limit=1000000000)


 -- Sorting terms...
 -- Writing term-positing list block: block-0.txt...
block-0.txt created
SPIMI invert complete!


In [37]:
# Quick check on a two-document toy corpus with a small block size limit.
doc = ["obama", "canton"]
spimi_invert(doc, block_size_limit=1000)


 -- Sorting terms...
 -- Writing term-positing list block: block-0.txt...
SPIMI invert complete!
