# IR System using Boolean Retrieval Technique

## Function that strips the boilerplate text from the beginning and the end of the book

In [1]:
import os
import re
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Defining the function to strip Project Gutenberg boilerplate
def strip_boilerplate(text):
    """Return the body of a Project Gutenberg e-book without its boilerplate.

    The body is whatever lies between the
    ``*** START OF THE PROJECT GUTENBERG EBOOK <title> ***`` marker and the
    corresponding ``*** END OF ...`` marker (the older ``THIS PROJECT
    GUTENBERG`` wording is accepted as well).

    Args:
        text: Full contents of the e-book file.

    Returns:
        The text between the two markers with surrounding whitespace
        stripped. If the markers are not found, ``text`` is returned
        unchanged, so a caller that indexes the result still sees the real
        document rather than a placeholder message.
    """
    # The title is matched lazily (`.*?`). Under re.DOTALL a greedy `.*`
    # could run past the marker line to a later " ***" inside the book and
    # silently drop everything before it.
    start_marker = r'\*\*\* START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK .*? \*\*\*'
    end_marker = r'\*\*\* END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK .*? \*\*\*'

    # re.DOTALL lets the captured body span multiple lines
    match = re.search(f'{start_marker}(.*?){end_marker}', text, re.DOTALL)

    if match is None:
        # No recognisable boilerplate: keep the document as it is
        return text

    return match.group(1).strip()

## Text pre-processing (Documents-only), and trying the system out

In [3]:
# Defining the text cleaning function
def text_cleaner(text, stem='Stem'):
    """Turn raw text into a list of normalised tokens.

    Steps, in order: lowercase, drop URLs, delete punctuation, delete digits,
    split on whitespace, remove English stop words, and optionally stem.

    Args:
        text: The raw text (a document or a query).
        stem: Stemming is applied only when this equals the string ``'Stem'``;
            any other value returns the unstemmed tokens.

    Returns:
        A list of tokens in their original order (duplicates are kept).
    """
    # Converting text to lowercase
    text = text.lower()

    # Removing URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)

    # Deleting every character that is neither a word character nor
    # whitespace. Characters are removed, not replaced by a space, so
    # "well-known" becomes "wellknown" and "don't" becomes "dont".
    # NOTE(review): this runs before stop-word filtering, so contractions are
    # compared in their apostrophe-less form -- confirm this is intended.
    text = re.sub(r"[^\w\s]", '', text)

    # Removing digits
    text = re.sub(r"[\d]", '', text)

    # Tokenizing on whitespace
    tokens = text.split()

    # Removing stop words. The corpus is returned as a list; a set gives
    # constant-time membership tests instead of a linear scan per token,
    # which matters when cleaning whole books.
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming the words if requested
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return tokens

In [4]:
# Defining the function to read and clean all text documents in a directory
def clean_and_add_documents(directory, inverted_index, doc_id_to_filename, dictionary):
    """Read, clean and index every ``.txt`` file found directly in a directory.

    Each file is read as UTF-8, passed through ``strip_boilerplate`` and
    ``text_cleaner``, and then handed to ``add_document_to_index`` with the
    next document ID.

    Args:
        directory: Path of the directory holding the text documents.
        inverted_index: Mapping of word ID to a set of document IDs; updated
            in place.
        doc_id_to_filename: Mapping of document ID to filename; updated in
            place.
        dictionary: Mapping of word to word ID; updated in place.

    Returns:
        None. All results are written into the three mappings.

    NOTE(review): document IDs start at 0 on every call, so calling this
    twice with the same mappings reuses IDs -- confirm callers always pass
    fresh structures.
    """
    doc_id_counter = 0

    # os.listdir returns entries in arbitrary order; sorting makes the
    # filename -> document ID assignment reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        # Reading the file; it is closed before the slower cleaning and
        # indexing steps run.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            content = file.read()

        # Stripping the boilerplate if present
        content = strip_boilerplate(content)

        # Cleaning the content
        cleaned_content = text_cleaner(content)

        # Adding cleaned document to the inverted index
        add_document_to_index(cleaned_content, filename, doc_id_counter, inverted_index, doc_id_to_filename, dictionary)
        doc_id_counter += 1

In [5]:
# Defining the function to add a cleaned document to the inverted index
def add_document_to_index(content, filename, doc_id, inverted_index, doc_id_to_filename, dictionary):
    """Register one cleaned document in the index structures.

    Args:
        content: Iterable of the document's tokens.
        filename: Name stored for this document.
        doc_id: Identifier of this document.
        inverted_index: Mapping of word ID to a set of document IDs; a missing
            word ID must yield a set (e.g. ``defaultdict(set)``).
        doc_id_to_filename: Mapping of document ID to filename; updated in
            place.
        dictionary: Mapping of word to word ID; unseen words receive the next
            free ID, i.e. the current size of the mapping.

    Returns:
        None. The three mappings are modified in place.
    """
    # Remember which file this ID refers to, even if the document is empty
    doc_id_to_filename[doc_id] = filename

    for token in content:
        # setdefault keeps an existing ID, otherwise stores the next free one
        token_id = dictionary.setdefault(token, len(dictionary))
        postings = inverted_index[token_id]
        postings.add(doc_id)

In [6]:
# Defining the Boolean retrieval function to process a query
def boolean_retrieval(query, dictionary, inverted_index):
    """Return the IDs of the documents that contain every term of a query.

    The query is normalised with ``text_cleaner`` and the posting sets of the
    resulting terms are intersected (Boolean AND).

    Args:
        query: The raw query string.
        dictionary: Mapping of word to word ID.
        inverted_index: Mapping of word ID to the set of document IDs
            containing that word. It is only read, never modified.

    Returns:
        A new set of matching document IDs. The set is empty when any term is
        unknown, or when the query has no terms left after cleaning.
    """
    # Cleaning the query and splitting into words
    words = text_cleaner(query)

    result_set = None
    for word in words:
        word_id = dictionary.get(word)
        if word_id is None:
            # An unknown term can never be satisfied under AND semantics
            return set()

        word_set = inverted_index[word_id]
        if result_set is None:
            # Copy the first posting set: the in-place `&=` below would
            # otherwise shrink the set stored in the index itself and
            # corrupt the results of later queries.
            result_set = set(word_set)
        else:
            result_set &= word_set

    # No terms at all (empty query or only stop words): return an empty set
    # instead of None so callers can always iterate over the result.
    if result_set is None:
        return set()
    return result_set

In [7]:
# Defining the function to get filenames of the documents matching the query
def get_filenames(doc_ids, doc_id_to_filename):
    """Look up the filename stored for each document ID.

    Args:
        doc_ids: Iterable of document IDs.
        doc_id_to_filename: Mapping of document ID to filename.

    Returns:
        A list of filenames, in the iteration order of ``doc_ids``.

    Raises:
        KeyError: If an ID is missing from ``doc_id_to_filename``.
    """
    filenames = []
    for doc_id in doc_ids:
        filenames.append(doc_id_to_filename[doc_id])
    return filenames

In [8]:
# Defining the main function to process the documents and run queries
def main():
    """Index the documents under ``./dataset/`` and print two sample queries.

    For each query, the matching document IDs are printed first, followed by
    the corresponding filenames.
    """
    # Directory holding the text documents
    directory = './dataset/'

    # Index structures filled by clean_and_add_documents
    inverted_index = defaultdict(set)
    dictionary = {}
    doc_id_to_filename = {}

    clean_and_add_documents(directory, inverted_index, doc_id_to_filename, dictionary)

    queries = ("sailor", "Machiavelli Florentines faint")

    # Every retrieval runs before anything is printed
    results = [boolean_retrieval(query, dictionary, inverted_index) for query in queries]

    for query, doc_ids in zip(queries, results):
        print(f"Document IDs for query '{query}':", doc_ids)
        print(f"Document names for query '{query}':", get_filenames(doc_ids, doc_id_to_filename))


if __name__ == "__main__":
    main()


Document IDs for query 'sailor': {0, 2}
Document names for query 'sailor': ['pg1184.txt', 'pg6130.txt']
Document IDs for query 'Machiavelli Florentines faint': {0}
Document names for query 'Machiavelli Florentines faint': ['pg1184.txt']
