2. Implement a program for retrieval of documents using inverted files.

In [7]:
import re
from collections import defaultdict

In [8]:
# Sample document collection: document IDs are assigned sequentially from 1,
# in the order the texts are listed.
documents = dict(
    enumerate(
        (
            "the quick brown fox jumps over the lazy dog",
            "never jump over the lazy dog quickly",
            "foxes are quick and smart animals",
            "python programming is fun and smart",
        ),
        start=1,
    )
)


In [9]:
# 1. Build Inverted Index
# Maps each lower-cased token to the set of IDs of the documents containing it.
inverted_index = defaultdict(set)

for doc_id, text in documents.items():
    # Tokens are runs of word characters, lower-cased one by one
    # (no stopword removal or stemming at this stage).
    words = list(map(str.lower, re.findall(r"\w+", text)))
    for word in words:
        # A set posting absorbs repeated occurrences within one document.
        inverted_index[word].add(doc_id)


In [10]:
# 2. Print the inverted index (sample)
# Terms appear in the order they were first inserted; each posting list is
# shown as a sorted list of document IDs.
print("\nInverted Index (word: docIDs):")
for word in inverted_index:
    print(f"{word}: {sorted(inverted_index[word])}")



Inverted Index (word: docIDs):
the: [1, 2]
quick: [1, 3]
brown: [1]
fox: [1]
jumps: [1]
over: [1, 2]
lazy: [1, 2]
dog: [1, 2]
never: [2]
jump: [2]
quickly: [2]
foxes: [3]
are: [3]
and: [3, 4]
smart: [3, 4]
animals: [3]
python: [4]
programming: [4]
is: [4]
fun: [4]


In [11]:
# 3. Document Retrieval: Search for documents containing ALL query terms
def search(query, index=None):
    """Return the IDs of the documents that contain every term of ``query``.

    The query is tokenized into runs of word characters and lower-cased,
    mirroring how the index keys are produced when the index is built.

    Args:
        query: Free-text query string.
        index: Mapping of term -> collection of document IDs. When omitted,
            the notebook-level ``inverted_index`` is used, so existing
            ``search(query)`` calls behave as before.

    Returns:
        A new ``set`` of document IDs. It is empty when the query contains no
        terms or when at least one term is missing from the index.
    """
    if index is None:
        # Fall back to the index built earlier in the notebook.
        index = inverted_index

    query_words = [w.lower() for w in re.findall(r"\w+", query)]
    if not query_words:
        return set()

    # ``.get`` avoids inserting empty entries into a defaultdict-based index.
    postings = [index.get(word, set()) for word in query_words]

    # Copy the first posting so the result never aliases index data and so
    # postings of any iterable type (set, frozenset, list) are accepted.
    return set(postings[0]).intersection(*postings[1:])


In [12]:
# Example conjunctive query; an empty result set is reported as the text None.
query = "quick smart"
result = search(query)
matches = sorted(result) if result else 'None'
print(f"\nDocuments containing ALL words [{query}]: {matches}")


Documents containing ALL words [quick smart]: [3]
