In [6]:
# First Prepare 5 Document
documents = {
    1: "The quick brown fox jumps over the lazy dog.",
    2: "A journey of a thousand miles begins with a single step.",
    3: "To be or not to be, that is the question.",
    4: "All that glitters is not gold.",
    5: "Ask not what your country can do for you, ask what you can do for your country."
}


In [7]:
import re

# Preprocess and clean the text by removing punctuation and converting to lowercase
def preprocess_text(text):
    return re.sub(r'[^\w\s]', '', text).lower()

# Build a dictionary of unique terms (vocabulary)
def build_vocabulary(documents):
    vocabulary = set()
    for doc_id, text in documents.items():
        words = preprocess_text(text).split()
        vocabulary.update(words)
    return sorted(vocabulary)

# Display vocabulary in columns
def display_vocabulary(vocab, num_columns=5):
    for i, word in enumerate(vocab):
        print(f"{word:<15}", end=" ")  # Adjust column width
        if (i + 1) % num_columns == 0:
            print()  # Break line after 'num_columns' words

# Sample documents
documents = {
    1: "The quick brown fox jumps over the lazy dog.",
    2: "A journey of a thousand miles begins with a single step.",
    3: "To be or not to be, that is the question.",
    4: "All that glitters is not gold.",
    5: "Ask not what your country can do for you, ask what you can do for your country."
}

vocabulary = build_vocabulary(documents)

print("Vocabulary:")
display_vocabulary(vocabulary)


Vocabulary:
a               all             ask             be              begins          
brown           can             country         do              dog             
for             fox             glitters        gold            is              
journey         jumps           lazy            miles           not             
of              or              over            question        quick           
single          step            that            the             thousand        
to              what            with            you             your            


In [9]:
from collections import defaultdict

# Build an inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    for doc_id, text in documents.items():
        words = preprocess_text(text).split()
        for word in set(words):  # Use set to avoid duplicate words in the same document
            inverted_index[word].append(doc_id)
    return inverted_index

# Display inverted index in column format
def display_inverted_index(index, num_columns=3):
    for i, (word, doc_ids) in enumerate(index.items()):
        print(f"{word:<15}: {doc_ids}", end=" ")
        if (i + 1) % num_columns == 0:
            print()  # Break line after 'num_columns' words

inverted_index = build_inverted_index(documents)

print("\nInverted Index:")
display_inverted_index(inverted_index)



Inverted Index:
dog            : [1] brown          : [1] jumps          : [1] 
over           : [1] lazy           : [1] quick          : [1] 
fox            : [1] the            : [1, 3] thousand       : [2] 
step           : [2] of             : [2] a              : [2] 
miles          : [2] begins         : [2] with           : [2] 
single         : [2] journey        : [2] is             : [3, 4] 
question       : [3] to             : [3] or             : [3] 
be             : [3] that           : [3, 4] not            : [3, 4, 5] 
all            : [4] glitters       : [4] gold           : [4] 
for            : [5] ask            : [5] you            : [5] 
do             : [5] country        : [5] what           : [5] 
can            : [5] your           : [5] 

In [5]:
# Implement Boolean Retrival
# Boolean AND query
def boolean_and(query, inverted_index):
    query_terms = query.split()
    result_sets = [set(inverted_index[term]) for term in query_terms if term in inverted_index]
    if result_sets:
        return set.intersection(*result_sets)
    return set()

# Boolean OR query
def boolean_or(query, inverted_index):
    query_terms = query.split()
    result_sets = [set(inverted_index[term]) for term in query_terms if term in inverted_index]
    if result_sets:
        return set.union(*result_sets)
    return set()

# Boolean NOT query (inverts the result of an AND query)
def boolean_not(query, inverted_index, all_doc_ids):
    result = boolean_and(query, inverted_index)
    return all_doc_ids - result

# Example usage
all_doc_ids = set(documents.keys())

and_result = boolean_and("quick brown", inverted_index)
or_result = boolean_or("quick question", inverted_index)
not_result = boolean_not("quick", inverted_index, all_doc_ids)

print("Boolean AND result:", and_result)
print("Boolean OR result:", or_result)
print("Boolean NOT result:", not_result)


Boolean AND result: {1}
Boolean OR result: {1, 3}
Boolean NOT result: {2, 3, 4, 5}
