# IR System using Boolean Retrieval Technique

## Function that strips the boilerplate text from the beginning and the end of the book

In [12]:
def strip_boilerplate(text):
    """Return the body of a Project Gutenberg e-book without its boilerplate.

    The body is whatever lies between the
    ``*** START OF THE PROJECT GUTENBERG EBOOK <title> ***`` line and the
    first following ``*** END OF THE PROJECT GUTENBERG EBOOK <title> ***``
    line, with surrounding whitespace stripped.

    Args:
        text: Full text of the e-book, including header and footer.

    Returns:
        The text between the two markers, or the literal string
        "Boilerplate markers not found." when either marker is missing.
    """
    # The title part of each marker is restricted to a single line
    # ([^\n]*). A plain ".*" combined with re.DOTALL is greedy across
    # newlines and would swallow body text up to the last " ***" that
    # appears before the end marker.
    start_marker = r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^\n]* \*\*\*'
    end_marker = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^\n]* \*\*\*'

    # re.DOTALL is still required so the lazy body group can span lines.
    match = re.search(f'{start_marker}(.*?){end_marker}', text, re.DOTALL)

    if match is None:
        return "Boilerplate markers not found."

    # Return the text between the markers
    return match.group(1).strip()

## Text pre-processing (Documents-only), and trying the system out

In [None]:
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict

# Text cleaning function
def text_cleaner(text, stem='Stem'):
    """Normalize raw text into a list of tokens for indexing or querying.

    The text is lowercased, stripped of URLs, punctuation, underscores and
    digits, split on whitespace, filtered against NLTK's English stop-word
    list and (optionally) stemmed with the Porter stemmer.

    Args:
        text: Raw document or query text.
        stem: Pass the string 'Stem' (the default) to apply Porter stemming;
            any other value leaves the tokens unstemmed.

    Returns:
        List of cleaned tokens, in their original order (duplicates kept).
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)

    # Remove punctuation. "_" counts as a word character for \w, so it is
    # listed explicitly; otherwise "_word_" style emphasis markup would
    # yield tokens that never match the plain word in a query.
    text = re.sub(r"[^\w\s]|_", '', text)

    # Remove numbers
    text = re.sub(r"\d", '', text)

    # Tokenize text
    tokens = text.split()

    # Remove stop words (a set gives O(1) membership tests per token,
    # instead of scanning the whole stop-word list every time)
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Stem the words if requested
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return tokens

# Function to read and clean all text documents in a directory
def clean_and_add_documents(directory, inverted_index, doc_id_to_filename, dictionary):
    """Clean every ``.txt`` file in *directory* and add it to the index.

    Args:
        directory: Path of the folder that holds the text documents.
        inverted_index: Mapping of word ID -> set of document IDs; updated
            in place.
        doc_id_to_filename: Mapping of document ID -> filename; updated in
            place.
        dictionary: Mapping of word -> word ID; updated in place.
    """
    # Continue numbering after any documents that are already registered so
    # a second call on the same structures does not overwrite earlier IDs.
    # With an empty mapping this starts at 0.
    doc_id_counter = max(doc_id_to_filename, default=-1) + 1

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # filename -> document ID assignment reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        # Read content of each file; the handle is released before the
        # (potentially slow) cleaning and indexing steps.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            content = file.read()

        # Cleaning the content
        cleaned_content = text_cleaner(content)

        # Adding cleaned document to the inverted index
        add_document_to_index(cleaned_content, filename, doc_id_counter, inverted_index, doc_id_to_filename, dictionary)
        doc_id_counter += 1

# Function to add a cleaned document to the inverted index
def add_document_to_index(content, filename, doc_id, inverted_index, doc_id_to_filename, dictionary):
    """Register one cleaned document in the index structures.

    Args:
        content: Iterable of the document's cleaned tokens.
        filename: Name recorded for the document.
        doc_id: Identifier under which the document is stored.
        inverted_index: Mapping of word ID -> set of document IDs; updated
            in place.
        doc_id_to_filename: Mapping of document ID -> filename; updated in
            place.
        dictionary: Mapping of word -> word ID; updated in place.
    """
    # Remember which file this document ID refers to
    doc_id_to_filename[doc_id] = filename

    for token in content:
        # Unseen words get the next free ID (the current vocabulary size);
        # known words keep the ID they already have.
        token_id = dictionary.setdefault(token, len(dictionary))

        # Record that this word occurs in the document
        inverted_index[token_id].add(doc_id)

# Boolean retrieval function to process a query
def boolean_retrieval(query, dictionary, inverted_index):
    """Return the IDs of the documents that contain every term of *query*.

    The query is cleaned with ``text_cleaner`` and its terms are combined
    with a Boolean AND.

    Args:
        query: Free-text query string.
        dictionary: Mapping of word -> word ID.
        inverted_index: Mapping of word ID -> set of document IDs.

    Returns:
        A new set of matching document IDs. The set is empty when any term
        is unknown, when no document contains all terms, or when the query
        has no terms left after cleaning.
    """
    # Clean the query and split into words
    words = text_cleaner(query)

    result_set = None
    # Retrieve documents for each word in the query
    for word in words:
        word_id = dictionary.get(word)
        if word_id is None:
            return set()  # Word not found, return empty set

        postings = inverted_index[word_id]
        if result_set is None:
            # Copy the posting set: the in-place "&=" below must never
            # modify the set that is stored inside the index.
            result_set = set(postings)
        else:
            result_set &= postings  # Intersection of result sets for each word

    # An empty / all-stop-word query has no terms; return an empty set
    # rather than None so callers can always iterate the result.
    return result_set if result_set is not None else set()

# Function to get filenames of the documents matching the query
def get_filenames(doc_ids, doc_id_to_filename):
    """Translate document IDs into their filenames.

    Args:
        doc_ids: Iterable of document IDs.
        doc_id_to_filename: Mapping of document ID -> filename.

    Returns:
        List of filenames, in the iteration order of *doc_ids*.

    Raises:
        KeyError: If an ID is missing from *doc_id_to_filename*.
    """
    filenames = []
    for identifier in doc_ids:
        filenames.append(doc_id_to_filename[identifier])
    return filenames

# Main function to process the documents and run queries
def main():
    """Index the documents under ./dataset/ and print results for two sample queries."""
    # Index structures: word ID -> document IDs, document ID -> filename,
    # and word -> word ID
    inverted_index = defaultdict(set)
    doc_id_to_filename = {}
    dictionary = {}

    # Clean and index all text documents found in the dataset directory
    clean_and_add_documents('./dataset/', inverted_index, doc_id_to_filename, dictionary)

    # Sample queries: a single term and a multi-term (AND) query
    queries = ("sailor", "Machiavelli Florentines faint")

    # Run every query first, then report, mirroring retrieve-then-print order
    results = [boolean_retrieval(text, dictionary, inverted_index) for text in queries]

    for text, doc_ids in zip(queries, results):
        print(f"Document IDs for query '{text}':", doc_ids)
        print(f"Document names for query '{text}':", get_filenames(doc_ids, doc_id_to_filename))

if __name__ == "__main__":
    main()