# SUNNY SHABAN ALI - 22K4149 - 6B - ASSIGNMENT 1

##### GitHub Repository for the project: https://www.github.com/sunnyallana/boolean-retrieval-model-pipeline


## References:
- 1. NLTK PorterStemmer: https://www.nltk.org/api/nltk.stem.porter.html
- 2. Python Regular Expressions: https://docs.python.org/3/library/re.html
- 3. Pickle Module: https://docs.python.org/3/library/pickle.html

<br/>

##### <strong>Kindly ensure that you have the Abstracts directory and Stopword-List.txt in the same folder as this code.</strong>



In [None]:
import os
import re
from nltk.stem import PorterStemmer
from collections import defaultdict
import pickle

# Preprocessing function
def preprocess(text, stopwords, stemmer):
    """Turn raw text into a list of stemmed, stopword-free tokens.

    The text is lower-cased and split into maximal runs of word characters.
    Tokens found in ``stopwords`` are dropped; each remaining token is passed
    through ``stemmer.stem``.

    Args:
        text: The string to tokenize.
        stopwords: Container supporting ``in`` with the tokens to discard.
        stemmer: Object exposing a ``stem(token)`` method.

    Returns:
        The stemmed tokens, in their original order.
    """
    stemmed = []
    for word in re.findall(r'\w+', text.lower()):
        if word in stopwords:
            continue
        stemmed.append(stemmer.stem(word))
    return stemmed

# Read stopwords with Latin-1 encoding
def read_stopwords(stopword_file):
    """Load the stopword list from a Latin-1 encoded text file.

    Every line is stripped of surrounding whitespace and added to the result,
    so a blank line contributes the empty string.

    Args:
        stopword_file: Path of the file to read, one stopword per line.

    Returns:
        A set with the stripped content of each line.
    """
    stopwords = set()
    with open(stopword_file, 'r', encoding='latin-1') as handle:
        for raw_line in handle:
            stopwords.add(raw_line.strip())
    return stopwords

# Define a serializable defaultdict factory for positional index
def list_defaultdict():
    """Return a fresh ``defaultdict`` whose missing keys start as empty lists.

    Used as the default factory of the positional index, so that each new term
    automatically gets its own doc-id -> positions mapping.
    """
    positions_by_doc = defaultdict(list)
    return positions_by_doc

# Build indexes
def build_indexes(abstracts_dir, stopwords):
    """Build the inverted and positional indexes from a directory of abstracts.

    Only directory entries whose name ends in ``.txt`` are read (as Latin-1).
    The document id is the part of the file name before its first ``.``.
    Positions are offsets into the token list produced by ``preprocess``,
    i.e. they are counted after stopword removal and stemming.

    Args:
        abstracts_dir: Directory containing the abstract files.
        stopwords: Stopword collection forwarded to ``preprocess``.

    Returns:
        A tuple ``(inverted_index, positional_index, all_docs)`` where
        ``inverted_index`` maps term -> set of doc ids, ``positional_index``
        maps term -> {doc id -> list of positions}, and ``all_docs`` is the
        set of every doc id encountered. Both indexes are plain dicts.
    """
    stemmer = PorterStemmer()
    all_docs = set()
    postings = defaultdict(set)
    positions = defaultdict(list_defaultdict)

    for entry in os.listdir(abstracts_dir):
        if not entry.endswith('.txt'):
            continue

        doc_id = entry.partition('.')[0]
        all_docs.add(doc_id)

        path = os.path.join(abstracts_dir, entry)
        with open(path, 'r', encoding='latin-1') as handle:
            terms = preprocess(handle.read(), stopwords, stemmer)

        for offset, term in enumerate(terms):
            postings[term].add(doc_id)
            positions[term][doc_id].append(offset)

    # Strip the defaultdict wrappers so the result pickles as plain dicts.
    plain_positions = {}
    for term, per_doc in positions.items():
        plain_positions[term] = dict(per_doc)
    return dict(postings), plain_positions, all_docs

# Save indexes
def save_indexes(inverted_index, positional_index, filename='indexes.pkl'):
    """Pickle both indexes into a single file.

    The two indexes are stored together as one
    ``(inverted_index, positional_index)`` tuple.

    Args:
        inverted_index: Term -> doc-id collection mapping to store.
        positional_index: Term -> {doc id -> positions} mapping to store.
        filename: Destination path; it is overwritten if it already exists.
    """
    payload = (inverted_index, positional_index)
    with open(filename, 'wb') as out:
        pickle.dump(payload, out)

# Load indexes and reconstruct defaultdicts
def load_indexes(filename='indexes.pkl'):
    """Read the pickled indexes back and wrap them in defaultdicts.

    The file is expected to hold the ``(inverted_index, positional_index)``
    tuple written by ``save_indexes``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only load index
    files that this program produced itself.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        A tuple ``(inverted_index, positional_index)``: a ``defaultdict(set)``
        mapping term -> set of doc ids, and a defaultdict mapping
        term -> ``defaultdict(list)`` of doc id -> positions.
    """
    with open(filename, 'rb') as src:
        stored_inverted, stored_positional = pickle.load(src)

    # Posting collections are copied into sets, whatever type was pickled.
    inverted_index_dd = defaultdict(set)
    for term, postings in stored_inverted.items():
        inverted_index_dd[term] = set(postings)

    positional_index_dd = defaultdict(list_defaultdict)
    positional_index_dd.update(
        (term, defaultdict(list, docs))
        for term, docs in stored_positional.items()
    )
    return inverted_index_dd, positional_index_dd

# Process Boolean query
def process_boolean_query(query, inverted_index, all_docs, stopwords):
    """Evaluate a flat Boolean query strictly from left to right.

    The query is split on whitespace. The upper-case tokens ``AND``, ``OR``
    and ``NOT`` are operators; every other token is a search term that is
    normalised with ``preprocess`` before lookup. There is no operator
    precedence and no parenthesis support.

    Behaviour worth knowing:
      * A term that preprocesses to nothing (e.g. a stopword) or that is not
        in the index contributes an empty set.
      * ``NOT x`` contributes ``all_docs`` minus the documents of ``x``.
      * An operand that follows another operand with no ``AND``/``OR`` in
        between is ignored.
      * A trailing ``NOT`` with no operand makes the whole query return an
        empty set.

    Args:
        query: The raw query string.
        inverted_index: Mapping term -> set of doc ids; only ``.get`` is used.
        all_docs: Set of every doc id, used as the universe for ``NOT``.
        stopwords: Stopword collection forwarded to ``preprocess``.

    Returns:
        A new set of matching doc ids. It never shares identity with a set
        stored in ``inverted_index``, so callers may mutate it freely.
    """
    stemmer = PorterStemmer()

    def lookup(raw_term):
        # Always hand back a *copy* of the postings. The result is later
        # combined with in-place set operators; working on the set stored in
        # the index would silently corrupt the index for subsequent queries.
        stems = preprocess(raw_term, stopwords, stemmer)
        if not stems:
            return set()
        return set(inverted_index.get(stems[0], ()))

    tokens = query.split()
    current_result = None
    prev_op = None
    i = 0

    while i < len(tokens):
        token = tokens[i]

        if token in ("AND", "OR"):
            prev_op = token
            i += 1
            continue

        if token == "NOT":
            if i + 1 >= len(tokens):
                return set()
            docs = all_docs - lookup(tokens[i + 1])
            i += 2
        else:
            docs = lookup(token)
            i += 1

        if current_result is None:
            # First operand: adopt it as the running result (it is private).
            current_result = docs
        else:
            if prev_op == "AND":
                current_result &= docs
            elif prev_op == "OR":
                current_result |= docs
            # The pending operator is consumed by this operand.
            prev_op = None

    return current_result if current_result is not None else set()


# Process proximity query
def process_proximity_query(query, inverted_index, positional_index, stopwords):
    """Evaluate a proximity query of the form ``term1 term2 /k``.

    The query is split on whitespace and the first token containing ``/`` is
    taken as the distance marker; the two tokens immediately before it are the
    terms. A document matches when it contains both terms and some pair of
    their positions satisfies ``abs(pos1 - pos2) - 1 <= k``, i.e. at most
    ``k`` indexed tokens lie between the two occurrences, in either order.

    An empty set is returned when the query is malformed: no ``/`` token,
    fewer than two tokens before it, a distance that is not an integer, or a
    term that preprocesses to nothing (e.g. a stopword).

    Args:
        query: The raw query string.
        inverted_index: Mapping term -> set of doc ids.
        positional_index: Mapping term -> {doc id -> list of positions}.
        stopwords: Stopword collection forwarded to ``preprocess``.

    Returns:
        A new set with the ids of the matching documents.
    """
    stemmer = PorterStemmer()
    parts = query.split()

    for i, part in enumerate(parts):
        if '/' not in part:
            continue
        if i < 2:
            return set()

        # Treat a missing or non-numeric distance ("/", "/x") like any other
        # malformed query instead of letting ValueError escape to the caller.
        try:
            k = int(part.split('/')[1])
        except ValueError:
            return set()

        term1 = preprocess(parts[i - 2], stopwords, stemmer)
        term2 = preprocess(parts[i - 1], stopwords, stemmer)
        if not term1 or not term2:
            return set()
        term1 = term1[0]
        term2 = term2[0]

        # Only documents containing both terms can possibly match.
        common_docs = inverted_index.get(term1, set()) & inverted_index.get(term2, set())
        positions1 = positional_index.get(term1, {})
        positions2 = positional_index.get(term2, {})

        result = set()
        for doc_id in common_docs:
            pos1_list = positions1.get(doc_id, [])
            pos2_list = positions2.get(doc_id, [])
            # any() stops at the first position pair that is close enough.
            if any(abs(pos1 - pos2) - 1 <= k
                   for pos1 in pos1_list
                   for pos2 in pos2_list):
                result.add(doc_id)
        return result

    return set()


def main():
    """Run the interactive Boolean / proximity retrieval command line.

    Start-up:
      1. Read the stopword list once; it is reused for every query.
      2. If the index file exists, load it. A file that cannot be unpickled,
         or that yields no documents, is deleted and treated as missing.
      3. If no usable index was loaded, build the indexes from the abstracts
         directory and save them.

    NOTE: when the indexes are loaded from disk, the document universe used
    for ``NOT`` is rebuilt as the union of all posting sets, so a document
    that contributed no indexed term is absent from it.

    Query loop: reads queries until the user types ``exit``. A query that
    contains ``/`` is handled as a proximity query (``"/ "`` is first
    collapsed to ``"/"``); everything else is handled as a Boolean query.
    Results are printed as doc ids sorted numerically.

    Raises:
        FileNotFoundError: If the indexes have to be built and no ``.txt``
            document is found in the abstracts directory.
    """
    # Configuration
    abstracts_dir = 'Abstracts'
    stopword_file = 'Stopword-List.txt'
    index_file = 'indexes.pkl'

    # Read the stopword list a single time instead of once per query.
    stopwords = read_stopwords(stopword_file)

    inverted_index = positional_index = all_docs = None

    # Try the cached indexes first.
    if os.path.exists(index_file):
        try:
            inverted_index, positional_index = load_indexes(index_file)
            all_docs = set()
            for postings in inverted_index.values():
                all_docs.update(postings)
            # Check if indexes are empty (invalid)
            if not all_docs:
                raise ValueError("Indexes are empty")
        except (EOFError, ValueError, pickle.UnpicklingError):
            print("Corrupted or empty index file. Rebuilding indexes...")
            os.remove(index_file)  # Delete bad file
            inverted_index = None  # Fall through to the rebuild below

    # Build from scratch when there was no usable cache.
    if inverted_index is None:
        inverted_index, positional_index, all_docs = build_indexes(abstracts_dir, stopwords)
        if not all_docs:  # Check if no documents were processed
            raise FileNotFoundError(f"No .txt files found in {abstracts_dir} directory")
        save_indexes(inverted_index, positional_index, index_file)

    # Command line interface
    while True:
        query = input("Enter your query (or 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        # Accept "term1 term2 / k" as well as "term1 term2 /k".
        query = query.replace('/ ', '/')
        if '/' in query:
            result = process_proximity_query(query, inverted_index, positional_index, stopwords)
        else:
            result = process_boolean_query(query, inverted_index, all_docs, stopwords)
        print("Result-Set:", ', '.join(sorted(result, key=int)) if result else "No results found")

# Start the interactive query loop only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter your query (or 'exit' to quit): neural networks /2
Result-Set: 13, 21, 23, 24, 25, 26, 27, 28, 29, 30, 84, 88, 100, 114, 135, 145, 157, 162, 163, 164, 165, 174, 175, 176, 187, 194, 208, 210, 222, 223, 226, 245, 246, 247, 248, 267, 272, 273, 279, 280, 281, 284, 288, 291, 295, 303, 305, 308, 320, 322, 329, 337, 339, 344, 345, 348, 356, 371, 372, 373, 374, 375, 376, 381, 382, 395, 396, 397, 398, 401, 402, 406, 414, 415, 416, 417, 418, 420, 421, 429, 431, 440, 447
Enter your query (or 'exit' to quit): neural information /2
Result-Set: 26
Enter your query (or 'exit' to quit): pattern
Result-Set: 9, 10, 18, 21, 23, 26, 30, 34, 40, 50, 73, 118, 126, 127, 139, 145, 148, 155, 180, 186, 189, 194, 201, 209, 214, 216, 230, 231, 234, 238, 279, 280, 288, 326, 343, 350, 351, 368, 369, 383, 394, 406, 412, 413, 424, 425, 429, 446, 447
Enter your query (or 'exit' to quit): exit
