In [18]:
import nltk
nltk.download ("stopwords")
nltk.download ("punkt")
nltk.download('punkt_tab')
nltk.download ("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
import os, string, logging, re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

In [20]:
def  load_text_files (folder_path ):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0

    print(f"Scanning folder: {folder_path}")
    for filename in os.listdir(folder_path):
        print(f"Found file: {filename}")  
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                data[doc_id] = content
                doc_id_to_filename[doc_id] = filename
                print(f"Loaded doc_id {doc_id} -> {filename}")
                doc_id += 1

    print(f"Total files loaded: {len(data)}")
    return data, doc_id_to_filename


In [21]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if len(word) > 1]
    return cleaned_tokens

In [22]:
def build_inverted_index(data):
    inverted_index = defaultdict(set)
    term_frequencies = Counter()
    
    for doc_id, content in data.items():
        tokens = clean_text(content)
        for token in tokens:
            inverted_index[token].add(doc_id)
            term_frequencies[token] += 1

    return inverted_index, term_frequencies


In [23]:
def boolean_query(query, inverted_index, doc_id_to_filename):
    query = query.lower()
    tokens = query.split()
    result_set = set()

    if 'and' in tokens:
        terms = [term for term in tokens if term not in ['', 'or', 'not', 'and']]
        if all(term in inverted_index for term in terms):
            result_set = inverted_index[terms[0]].copy()
            for term in terms[1:]:
                result_set &= inverted_index[term]
    elif 'or' in tokens:
        terms = [term for term in tokens if term not in ['', 'or', 'not', 'and']]
        for term in terms:
            if term in inverted_index:
                result_set |= inverted_index[term]
    elif 'not' in tokens:
        term = tokens[1]
        all_docs = set(doc_id_to_filename.keys())
        if term in inverted_index:
            result_set = all_docs - inverted_index[term]
        else:
            result_set = all_docs
    else:
        if query in inverted_index:
            result_set = inverted_index[query]

    result_filenames = [doc_id_to_filename[doc_id] for doc_id in result_set if doc_id in doc_id_to_filename]
    logging.info(f"Query '{query}' resulted in: {result_filenames}")
    return result_filenames


In [24]:
def generate_queries_file(term_frequencies, output_file="queries.txt", top_n=5):
    queries = ["apple AND iphone", "update OR ios", "NOT support"]
    with open("queries.txt", "w", encoding="utf-8") as f:
        for q in queries:
            f.write(q + "\n")
    print("Generated queries file: queries.txt")

In [25]:
def main():
    folder_path = r"C:\Users\shree\Documents\College\Information Retrieval\Lab\W2\documents"  
    data, doc_id_to_filename = load_text_files(folder_path)
    for doc_id, content in data.items():
        tokens = clean_text(content)
        print(f"Doc {doc_id} cleaned tokens:", tokens[:20])
    
    inverted_index, term_frequencies = build_inverted_index(data)
    print("Sample inverted index keys:", list(inverted_index.keys())[:20])

    generate_queries_file(term_frequencies)

    queries = ["apple AND iphone", "update OR ios", "NOT support"]

    with open("query_results.txt", 'w', encoding='utf-8') as result_file:
        for query in queries:
            result = boolean_query(query, inverted_index, doc_id_to_filename)
            result_str = f"Results for '{query}': {result}\n"
            print(result_str)
            result_file.write(result_str)

if __name__ == "__main__":
    main()

Scanning folder: C:\Users\shree\Documents\College\Information Retrieval\Lab\W2\documents
Found file: apple.txt
Loaded doc_id 0 -> apple.txt
Found file: appleNews.txt
Loaded doc_id 1 -> appleNews.txt
Found file: microsoft.txt
Loaded doc_id 2 -> microsoft.txt
Found file: netflix.txt
Loaded doc_id 3 -> netflix.txt
Found file: tesla.txt
Loaded doc_id 4 -> tesla.txt
Total files loaded: 5
Doc 0 cleaned tokens: ['apple', 'watch', 'receives', 'new', 'healthtracking', 'update', 'the', 'update', 'add', 'sleep', 'monitoring', 'and', 'stress', 'detection', 'apple', 'health', 'app', 'syncs', 'seamlessly', 'with']
Doc 1 cleaned tokens: ['apple', 'release', 'the', 'new', 'iphone', 'this', 'week', 'the', 'iphone', 'ha', 'improved', 'battery', 'life', 'and', 'better', 'camera', 'many', 'user', 'are', 'excited']
Doc 2 cleaned tokens: ['microsoft', 'unveils', 'new', 'surface', 'laptop', 'with', 'faster', 'processor', 'window', '12', 'introduces', 'enhanced', 'multitasking', 'and', 'better', 'memory', 'ma