In [None]:
import re
from collections import defaultdict

# Toy corpus: maps an integer document ID (starting at 1) to that document's raw text.
documents = dict(
    enumerate(
        (
            "This is the first document document. It is about inverted indexing.",
            "The second document discusses document retrieval and indexing.",
            "Document three is the final document in this collection.",
        ),
        start=1,
    )
)

# Tokenizer shared by indexing and querying
def tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lowercased first, then every maximal run of ``\\w``
    characters (letters, digits, underscore) is returned as one token,
    in order of appearance. Punctuation and whitespace are discarded.
    """
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

# Inverted index creation
def create_inverted_index(documents):
    """Build an inverted index from tokens to the documents containing them.

    Args:
        documents: Mapping of document ID to document text.

    Returns:
        A ``defaultdict(list)`` keyed by token. Each value is the list of
        IDs of the documents in which the token occurs, in the iteration
        order of ``documents``; a document ID appears at most once per
        token, however many times the token occurs in that document.
    """
    inverted_index = defaultdict(list)

    for doc_id, text in documents.items():
        # dict.fromkeys drops repeated tokens while keeping first-occurrence
        # order, so a word used several times in one document contributes
        # that document's ID to its posting list only once.
        for token in dict.fromkeys(tokenize(text)):
            inverted_index[token].append(doc_id)

    return inverted_index

# Query processing
def retrieve_documents(query, inverted_index):
    """Return the set of document IDs matching any token of *query*.

    The query is tokenized with ``tokenize``; tokens absent from
    ``inverted_index`` are ignored. The result is the union of the
    posting lists of the remaining tokens (an empty set if none match).
    """
    posting_lists = (
        inverted_index[term]
        for term in tokenize(query)
        if term in inverted_index
    )
    return set().union(*posting_lists)

# Example usage: index the sample corpus once, then run a query against it.
inverted_index = create_inverted_index(documents)

# Query example
query = "invereted indexing"
results = retrieve_documents(query, inverted_index)

print("Query:", query)
print("Results:")
# `results` is a set, whose iteration order is unspecified; sort the IDs so
# the printed listing is deterministic.
for doc_id in sorted(results):
    print(f"Document {doc_id}: {documents[doc_id]}")


Query: invereted indexing
Results:
Document 1: This is the first document document. It is about inverted indexing.
Document 2: The second document discusses document retrieval and indexing.
