# Q1. Data Preprocessing

In [1]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

dataset_dir = 'text_files'
preprocessed_dir = 'preprocessed_files'

if not os.path.exists(preprocessed_dir):
    os.makedirs(preprocessed_dir)

def preprocess_files(file_path, save_path, pt5=False):
    
    # Reading the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    if pt5:
        print(f"Original text from {file_path}:\n{text[:500]}\n")  
    
    # Lowercase all the text of the file
    text_lower = text.lower()
    if pt5:
        print(f"After lowercase:\n{text_lower[:500]}\n")  
    
    # Tokenize the text of the file
    tokens = word_tokenize(text_lower)
    if pt5:
        print("After tokenization:")
        print(tokens[:50])  
    
    # Removeing stopwords from the tokens list
    stopwords_set = set(stopwords.words('english')) # importing set of English stopwords from NLTK library
    tokens_no_stopwords = [w for w in tokens if w not in stopwords_set] # Filtering out the stopwords from 'tokens' list.
    if pt5:
        print("After removing stopwords:")
        print(tokens_no_stopwords[:50])  
    
    # Removing punctuation from the tokens list
    table = str.maketrans('', '', string.punctuation) # Creating a translation table that maps all punctuation to None (deleting it)
    tokens_no_punct = [w.translate(table) for w in tokens_no_stopwords if w.translate(table)] # Using the translation table to remove punctuation from each token
    if pt5:
        print("After removing punctuation:")
        print(tokens_no_punct[:50])  
    
    # Removing blank space tokens
    final_words = [word for word in tokens_no_punct if word.strip()]
    if pt5:
        print("After removing blank spaces:")
        print(final_words[:50])  
    
    # Saving preprocessed text to a new file
    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(' '.join(final_words))

processed_5files = 0
# Iterating over each filename in the directory specified by 'dataset_dir'
for filename in os.listdir(dataset_dir):
    # Constructing full path to the file by joining the directory path with the filename
    file_path = os.path.join(dataset_dir, filename)
    # Setting the path where the preprocessed file will be saved
    preprocessed_path = os.path.join(preprocessed_dir, filename)
    # Checking if the current path is a file
    if os.path.isfile(file_path):
        preprocess_files(file_path, preprocessed_path, pt5=processed_5files < 5)
        processed_5files += 1
        if processed_5files <= 5:
            print(f'-----------------------------------\n')

[nltk_data] Downloading package punkt to /Users/vasanth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vasanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original text from text_files/file502.txt:
Kit is awesome. I play in my garage just for personal enjoyment not for performances or anything. Once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. With the expansion options and the relatively inexpensive parts expanding is easy and fun.

After a few weeks of daily use for at least an hour a day it still looks and plays beautifully. Overall one of the best purchases I could have made.

After lowercase:
kit is awesome. i play in my garage just for personal enjoyment not for performances or anything. once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. with the expansion options and the relatively inexpensive parts expanding is easy and fun.

after a few weeks of daily use for at least an hour a day it still looks and plays beautifully. overall one of the best purchases i could have made.

After tokenization:
['kit', 'is', 'awesome', '

Q2. Unigram Inverted Index and Boolean Queries

In [2]:
import os
import pickle

preprocessed_directory = 'preprocessed_files'

def create_unigram_inverted_index(directory):
    inv_index = {} # Initializing an empty dictionary for the inverted index
    for filename in os.listdir(directory): # Loop through each file in the directory

        file_path = os.path.join(directory, filename) # Creating the full path to the file

        if os.path.isfile(file_path): # Checking if the path is a file
            with open(file_path, 'r', encoding='utf-8') as file: # Open the file for reading
                
                for w in file.read().split(): # Iterating each word in the file
                    if w in inv_index: # Checking whether the word is already in the index
                        if filename not in inv_index[w]:  # Avoiding duplicate file entries
                            inv_index[w].append(filename) # Adding the filename to the word's list
                    else:
                        inv_index[w] = [filename] # Initializing a new list with the filename for new words
    return inv_index

# Create the inverted index
unigram_inverted_index = create_unigram_inverted_index(preprocessed_directory)

In [3]:
# Function to save the inverted index using pickle
def save_inverted_index(index, save_path):
    with open(save_path, 'wb') as file:
        pickle.dump(index, file)

# Save the unigram inverted index to a file
save_path = 'unigram_inverted_index.pkl'
save_inverted_index(unigram_inverted_index, save_path)


In [4]:
# Function to load the inverted index using pickle
def load_inverted_index(load_path):
    with open(load_path, 'rb') as file:
        index = pickle.load(file)
    return index

# Load the unigram inverted index from a file
loaded_unigram_inverted_index = load_inverted_index(save_path)


In [5]:
def preprocess_text(text):
    # a. Lowercase the text
    text_lower = text.lower()
    
    # b. Perform tokenization
    tokens = word_tokenize(text_lower)
    
    # c. Remove stopwords
    stopwords_set = set(stopwords.words('english'))
    tokens_no_stopwords = [w for w in tokens if w not in stopwords_set]
    
    # d. Remove punctuation from tokens
    table = str.maketrans('', '', string.punctuation)
    tokens_no_punctuation = [w.translate(table) for w in tokens_no_stopwords]
    
    # e. Remove blank space tokens
    words = [w for w in tokens_no_punctuation if w.strip() != '']
    
    return words

def and_operation(set1, set2):
    return set1.intersection(set2)

def or_operation(set1, set2):
    return set1.union(set2)

def and_not_operation(set1, set2):
    return set1 - set2

def or_not_operation(set1, set2, all_documents):
    return set1.union(all_documents - set2)

def execute_query(query_tokens, operations, inverted_index, all_documents):
    # Converting query tokens to sets of documents
    sets = [set(inverted_index.get(token, [])) for token in query_tokens]
    
    # Executing operations
    result_set = sets[0]
    for op, next_set in zip(operations, sets[1:]):
        if op == "AND":
            result_set = and_operation(result_set, next_set)
        elif op == "OR":
            result_set = or_operation(result_set, next_set)
        elif op == "AND NOT":
            result_set = and_not_operation(result_set, next_set)
        elif op == "OR NOT":
            result_set = or_not_operation(result_set, next_set, all_documents)
    
    return sorted(list(result_set))

def reconstruct_query_with_operations(original_query, operations):
    # Preprocessing the original query
    preprocessed_terms = preprocess_text(original_query)
    
    # Checkingif the number of operations is not one less than the number of preprocessed terms
    if len(operations) != len(preprocessed_terms) - 1:
        return "Issue: The number of operations does not match the number of terms minus one."
    
    # Reconstructing the query by interleaving operations b/w preprocessed terms
    reconstructed_query = ""
    for i, term in enumerate(preprocessed_terms):
        if i > 0 and i-1 < len(operations):  # Check to avoid index error
            reconstructed_query += f" {operations[i-1]} "
        reconstructed_query += term
    
    return reconstructed_query

def p_execute_queries(queries, inverted_index, all_documents):
    results = []
    for i, (original_query, ops) in enumerate(queries, start=1):
        query_tokens = preprocess_text(original_query)
        operations = ops.split(', ')
        # checking if the operations and terms correct in number
        if len(query_tokens) - 1 != len(operations):
            results.append((i, "Error you provided incorrect number of operations for the terms", 0, []))
            continue
        result_docs = execute_query(query_tokens, operations, inverted_index, all_documents)
        reconstructed_query = reconstruct_query_with_operations(original_query, operations)
        if "Issue" in reconstructed_query:
            results.append((i, reconstructed_query, 0, []))
            continue
        results.append((i, reconstructed_query, len(result_docs), result_docs))
    return results

N = int(input("Enter the number of queries: "))

queries = []
for i in range(N):
    input_sequence = input(f"Enter input sequence for query {i+1}: ")
    operations = input(f"Enter operations (separated by comma) for query {i+1}: ")
    queries.append((input_sequence, operations))

preprocessed_directory = 'preprocessed_files'

# Creating a set of all document names
all_documents = set(os.listdir(preprocessed_directory))

results = p_execute_queries(queries, loaded_unigram_inverted_index, all_documents)

# Printing the results with reconstructed query
for i, reconstructed_query, num_docs, docs in results:
    print(f"Query {i}: {reconstructed_query}")
    print(f"Number of documents retrieved for query {i}: {num_docs}")
    print(f"Names of the documents retrieved for query {i}: {', '.join(docs)}")

Query 1: perfect AND fit AND color
Number of documents retrieved for query 1: 1
Names of the documents retrieved for query 1: file26.txt


Q3. Positional Index and Phrase Queries

In [6]:
from collections import defaultdict
import os
import pickle

def default_dict():
    return defaultdict(list)

def create_positional_index(directory):
    positional_index = defaultdict(default_dict)
    
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                words = file.read().split()
                for position, word in enumerate(words):
                    positional_index[word][filename].append(position)
    return positional_index

# Saving the positional index
def save_positional_index(index, save_path):
    with open(save_path, 'wb') as file:
        pickle.dump(index, file)

# Loading the positional index
def load_positional_index(load_path):
    with open(load_path, 'rb') as file:
        index = pickle.load(file)
    return index

preprocessed_directory = 'preprocessed_files'
positional_index = create_positional_index(preprocessed_directory)

save_path = 'positional_index.pkl'
save_positional_index(positional_index, save_path)

loaded_positional_index = load_positional_index(save_path)


In [7]:
def exe_phrase_query(query_tokens, positional_index):
    if not query_tokens:
        return []

    # Starting with candidate documents for the first token
    candidate_docs = positional_index.get(query_tokens[0], {})
    
    # If single-word query, return its documents directly
    if len(query_tokens) == 1:
        return list(candidate_docs.keys())

    # For phrases, ensuring all the tokens appear in the exact sequence
    valid_docs = []
    for doc, positions in candidate_docs.items():
        for pos in positions:
            match = True
            for i, token in enumerate(query_tokens[1:], 1):

                next_positions = positional_index.get(token, {}).get(doc, [])

                if not any(pos + i == next_pos for next_pos in next_positions):
                    match = False
                    break  # if the position doesn't match the sequence; trying the next position
            if match:
                valid_docs.append(doc)
                break  # Found a matching sequence in this document, no need to check further
    return valid_docs

def user_input_and_execute_queries(positional_index):
    N = int(input("Enter the number of queries: "))

    for i in range(1, N + 1):
        
        query = input(f"Enter query {i}: ")
        preprocessed_query = preprocess_text(query) 
        docs_retrieved = exe_phrase_query(preprocessed_query, positional_index)

        print(f"Number of documents retrieved for query {i} using positional index: {len(docs_retrieved)}")
        if docs_retrieved:
            print(f"Names of documents retrieved for query {i} using positional index: {', '.join(docs_retrieved)}")
        else:
            print(f"Names of documents retrieved for query {i} using positional index: None")

user_input_and_execute_queries(loaded_positional_index)

Number of documents retrieved for query 1 using positional index: 4
Names of documents retrieved for query 1 using positional index: file160.txt, file907.txt, file16.txt, file526.txt
