Q1. Data Preprocessing

In [42]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK resources used below are available locally.
# nltk.download() skips the transfer when the package is already up to date.
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.8.2+ — confirm exact version) make
# word_tokenize load its model from 'punkt_tab' rather than 'punkt'. Requesting
# both keeps a fresh "Restart & Run All" working on old and new installs; on
# releases whose index lacks 'punkt_tab' the call is expected to report an
# error and return False rather than raise — verify on the pinned version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Input directory with the raw documents and output directory for the
# preprocessed copies (both relative to the notebook's working directory).
dataset_directory = 'text_files'
preprocessed_directory = 'preprocessed_files'

# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists() check, so there is no window between checking and creating.
os.makedirs(preprocessed_directory, exist_ok=True)

def preprocess_file(file_path, save_path, verbose=False):
    """Preprocess one text file and write the cleaned tokens to ``save_path``.

    Pipeline, in order: lowercase -> ``word_tokenize`` -> drop English
    stopwords -> strip ``string.punctuation`` characters from each token
    (tokens that become empty are discarded) -> drop whitespace-only tokens.
    The surviving tokens are written space-separated, UTF-8 encoded.

    Args:
        file_path: Path of the UTF-8 text file to read.
        save_path: Path of the output file; it is overwritten.
        verbose: When true, print the first 500 characters / first 50 tokens
            after every stage.

    Returns:
        None. The result is the file written at ``save_path``.
    """
    def report(heading, items):
        # Stage trace: heading line followed by the first 50 tokens.
        if verbose:
            print(heading)
            print(items[:50])

    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    if verbose:
        print(f"Original text from {file_path}:\n{raw_text[:500]}\n")

    lowered = raw_text.lower()
    if verbose:
        print(f"After lowercase:\n{lowered[:500]}\n")

    words = word_tokenize(lowered)
    report("After tokenization:", words)

    # Stopwords are removed before punctuation is stripped, so a token is only
    # dropped here if it matches a stopword exactly as tokenized.
    english_stopwords = set(stopwords.words('english'))
    kept = [token for token in words if token not in english_stopwords]
    report("After removing stopwords:", kept)

    punctuation_eraser = str.maketrans('', '', string.punctuation)
    depunctuated = []
    for token in kept:
        cleaned = token.translate(punctuation_eraser)
        if cleaned:  # pure-punctuation tokens collapse to '' and are skipped
            depunctuated.append(cleaned)
    report("After removing punctuation:", depunctuated)

    non_blank = [token for token in depunctuated if token.strip()]
    report("After removing blank spaces:", non_blank)

    with open(save_path, 'w', encoding='utf-8') as target:
        target.write(' '.join(non_blank))

# Preprocess every regular file in dataset_directory; the output keeps the
# same file name inside preprocessed_directory. Only the first five files are
# traced verbosely, each followed by a separator line.
files_processed = 0
for filename in os.listdir(dataset_directory):
    file_path = os.path.join(dataset_directory, filename)
    preprocessed_path = os.path.join(preprocessed_directory, filename)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories and other non-file entries
    show_steps = files_processed < 5
    preprocess_file(file_path, preprocessed_path, verbose=show_steps)
    files_processed += 1
    if show_steps:
        print('-----------------------------------\n')

[nltk_data] Downloading package punkt to /Users/vasanth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vasanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original text from text_files/file502.txt:
Kit is awesome. I play in my garage just for personal enjoyment not for performances or anything. Once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. With the expansion options and the relatively inexpensive parts expanding is easy and fun.

After a few weeks of daily use for at least an hour a day it still looks and plays beautifully. Overall one of the best purchases I could have made.

After lowercase:
kit is awesome. i play in my garage just for personal enjoyment not for performances or anything. once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. with the expansion options and the relatively inexpensive parts expanding is easy and fun.

after a few weeks of daily use for at least an hour a day it still looks and plays beautifully. overall one of the best purchases i could have made.

After tokenization:
['kit', 'is', 'awesome', '

Q2. Unigram Inverted Index and Boolean Queries

In [43]:
import os
import pickle

# Directory whose files are read to build the inverted index; expected to
# contain the output written by the preprocessing step.
preprocessed_directory = 'preprocessed_files'

# Function to create a unigram inverted index
def create_unigram_inverted_index(directory):
    """Build a unigram inverted index over the files in ``directory``.

    Every regular file is read as UTF-8 and split on whitespace; each distinct
    word is mapped to the list of file names that contain it.

    Args:
        directory: Path of the directory to index. Sub-directories and other
            non-file entries are skipped.

    Returns:
        dict mapping ``word -> list of file names``. Each file name appears at
        most once per word, and posting lists are in sorted file-name order.
    """
    inverted_index = {}
    # os.listdir returns entries in arbitrary order; sorting makes the posting
    # lists and key order reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            words = file.read().split()
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # file is appended once per word without scanning the posting list
        # for every occurrence.
        for word in dict.fromkeys(words):
            inverted_index.setdefault(word, []).append(filename)
    return inverted_index

# Build the word -> list-of-file-names index over every file in
# preprocessed_directory.
unigram_inverted_index = create_unigram_inverted_index(preprocessed_directory)

In [44]:
# Function to save the inverted index using pickle
def save_inverted_index(index, save_path):
    """Pickle ``index`` into the file at ``save_path``.

    Args:
        index: The object to serialize (the inverted index).
        save_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
    """
    with open(save_path, 'wb') as sink:
        pickle.Pickler(sink).dump(index)

# Persist the unigram inverted index as a pickle file; the relative path
# resolves against the notebook's current working directory.
save_path = 'unigram_inverted_index.pkl'
save_inverted_index(unigram_inverted_index, save_path)


In [45]:
# Function to load the inverted index using pickle
def load_inverted_index(load_path):
    """Read a pickled inverted index from ``load_path`` and return it.

    Args:
        load_path: Path of a pickle file, opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(review): unpickling can execute arbitrary code, so only load files
    # that this notebook (or another trusted source) wrote.
    with open(load_path, 'rb') as source:
        return pickle.load(source)

# Reload the index from save_path (assigned in the save cell), so the query
# code below works from the persisted copy rather than the in-memory one.
loaded_unigram_inverted_index = load_inverted_index(save_path)


In [48]:
def preprocess_text(text):
    """Turn a raw query string into a list of cleaned query terms.

    The text is lowercased and tokenized with ``word_tokenize``; every token
    has ``string.punctuation`` characters removed, and only purely alphabetic
    tokens that are not English stopwords are kept, in their original order.

    NOTE(review): ``preprocess_file`` removes stopwords *before* stripping
    punctuation and keeps non-alphabetic tokens, so documents and queries are
    not normalized identically — confirm this difference is intended.

    Args:
        text: The raw query string.

    Returns:
        list of str: the cleaned terms.
    """
    raw_tokens = word_tokenize(text.lower())
    punctuation_eraser = str.maketrans('', '', string.punctuation)
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in raw_tokens:
        bare = token.translate(punctuation_eraser)
        # isalpha() is False for '' and for anything containing whitespace or
        # digits, so blank tokens are rejected by this test as well.
        if bare.isalpha() and bare not in english_stopwords:
            kept.append(bare)
    return kept

def and_operation(set1, set2):
    """Return the documents that are in both ``set1`` and ``set2``."""
    return set1.intersection(set2)

def or_operation(set1, set2):
    """Return the documents that are in ``set1``, ``set2`` or both."""
    return set1.union(set2)

def and_not_operation(set1, set2):
    """Return the documents of ``set1`` that are not in ``set2``."""
    return set1 - set2

def or_not_operation(set1, set2, all_documents):
    """Return ``set1`` plus every document of ``all_documents`` not in ``set2``.

    ``all_documents`` must be a set containing every document name; it is the
    universe against which ``set2`` is complemented.
    """
    return set1.union(all_documents - set2)

def execute_query(query_tokens, operations, inverted_index, all_documents):
    """Evaluate a boolean query strictly left to right.

    The posting set of the first token is combined with the second token's
    set using ``operations[0]``, the result with the third token's set using
    ``operations[1]``, and so on. There is no operator precedence.

    Args:
        query_tokens: Sequence of query terms. Terms missing from the index
            contribute an empty document set.
        operations: Sequence of operator strings: ``"AND"``, ``"OR"``,
            ``"AND NOT"`` or ``"OR NOT"``. Matching ignores case and
            surrounding/repeated whitespace. Blank entries are skipped
            together with the term they are paired with. Operators and terms
            are paired with ``zip``, so surplus operators or surplus terms are
            ignored.
        inverted_index: Mapping ``term -> iterable of document names``.
        all_documents: Set of every document name (used by ``"OR NOT"``).

    Returns:
        Sorted list of matching document names; ``[]`` when ``query_tokens``
        is empty.

    Raises:
        ValueError: If a non-blank operator is not one of the four supported
            ones. Silently skipping it would return a wrong but
            plausible-looking result.
    """
    # A query made only of stopwords/punctuation preprocesses to no terms;
    # there is nothing to look up, so the answer is the empty result.
    if not query_tokens:
        return []

    # Convert query tokens to sets of documents
    sets = [set(inverted_index.get(token, [])) for token in query_tokens]

    result_set = sets[0]
    for op, next_set in zip(operations, sets[1:]):
        # Normalize so that " and  not " is treated like "AND NOT".
        normalized = ' '.join(op.split()).upper()
        if not normalized:
            continue
        if normalized == "AND":
            result_set = and_operation(result_set, next_set)
        elif normalized == "OR":
            result_set = or_operation(result_set, next_set)
        elif normalized == "AND NOT":
            result_set = and_not_operation(result_set, next_set)
        elif normalized == "OR NOT":
            result_set = or_not_operation(result_set, next_set, all_documents)
        else:
            raise ValueError(
                f"Unsupported operation {op!r}; expected one of "
                "'AND', 'OR', 'AND NOT', 'OR NOT'"
            )

    return sorted(result_set)

def reconstruct_query_with_operations(original_query, operations):
    """Rebuild a readable query string from preprocessed terms and operators.

    The query is preprocessed with ``preprocess_text`` and the resulting terms
    are interleaved with ``operations``: ``term0 op0 term1 op1 term2 ...``.

    Args:
        original_query: The raw query text.
        operations: Sequence of operator strings; ideally one fewer than the
            number of preprocessed terms.

    Returns:
        The reconstructed query. When there are fewer operators than gaps
        between terms, the remaining terms are separated by a single space
        (rather than being glued together into one word). Surplus operators
        are ignored. An empty string is returned when no terms survive
        preprocessing.
    """
    # Preprocess the original query to match the processed terms
    preprocessed_terms = preprocess_text(original_query)

    pieces = []
    for i, term in enumerate(preprocessed_terms):
        if i > 0:
            if i - 1 < len(operations):
                pieces.append(f" {operations[i-1]} ")
            else:
                # No operator left for this gap: keep the terms readable.
                pieces.append(" ")
        pieces.append(term)

    return "".join(pieces)

def parse_and_execute_queries(queries, inverted_index, all_documents):
    """Preprocess, evaluate and describe a batch of boolean queries.

    Args:
        queries: Iterable of ``(original_query, ops)`` pairs, where ``ops`` is
            a comma-separated string of operators such as ``"OR, AND NOT"``.
        inverted_index: Mapping ``term -> iterable of document names``.
        all_documents: Set of every document name (needed for ``"OR NOT"``).

    Returns:
        list of tuples ``(query_number, reconstructed_query, num_docs, docs)``
        with ``query_number`` starting at 1 and ``docs`` as returned by
        ``execute_query``.
    """
    results = []
    for i, (original_query, ops) in enumerate(queries, start=1):
        query_tokens = preprocess_text(original_query)
        # Split on bare commas and normalize each operator, so "OR,AND NOT",
        # "or , and  not" and "OR, AND NOT" all parse to the same list. An
        # empty operator string (single-term query) yields no operators.
        if ops.strip():
            operations = [' '.join(op.split()).upper() for op in ops.split(',')]
        else:
            operations = []
        if query_tokens:
            result_docs = execute_query(query_tokens, operations, inverted_index, all_documents)
        else:
            # Nothing survived preprocessing (e.g. only stopwords): report an
            # empty result instead of failing on the missing first term.
            result_docs = []
        reconstructed_query = reconstruct_query_with_operations(original_query, operations)
        results.append((i, reconstructed_query, len(result_docs), result_docs))
    return results

# Prompt for the number of queries
N = int(input("Enter the number of queries: "))

# Collect (query text, comma-separated operators) pairs interactively.
queries = []
for i in range(N):
    # For each query, gather the input sequence and the operations
    input_sequence = input(f"Enter input sequence for query {i+1}: ")
    operations = input(f"Enter operations (separated by comma) for query {i+1}: ")
    queries.append((input_sequence, operations))

# Directory that holds the preprocessed documents
preprocessed_directory = 'preprocessed_files'

# Universe of document names used to evaluate "OR NOT". Only regular files
# are included, mirroring the isfile() filter applied when the index is built;
# a raw os.listdir() would also count sub-directories as documents.
all_documents = {
    name
    for name in os.listdir(preprocessed_directory)
    if os.path.isfile(os.path.join(preprocessed_directory, name))
}

# Evaluate every query against the index reloaded from disk
results = parse_and_execute_queries(queries, loaded_unigram_inverted_index, all_documents)

# Print results with reconstructed query
for i, reconstructed_query, num_docs, docs in results:
    print(f"Query {i}: {reconstructed_query}")
    print(f"Number of documents retrieved for query {i}: {num_docs}")
    print(f"Names of the documents retrieved for query {i}: {', '.join(docs)}")

Query 1: car OR bag AND NOT canister
Number of documents retrieved for query 1: 31
Names of the documents retrieved for query 1: file118.txt, file166.txt, file174.txt, file264.txt, file3.txt, file313.txt, file363.txt, file404.txt, file459.txt, file466.txt, file542.txt, file573.txt, file665.txt, file682.txt, file686.txt, file698.txt, file699.txt, file73.txt, file738.txt, file746.txt, file780.txt, file797.txt, file860.txt, file863.txt, file864.txt, file886.txt, file892.txt, file930.txt, file942.txt, file956.txt, file981.txt
Query 2: coffee AND brewing OR NOT techniques OR cookbook
Number of documents retrieved for query 2: 999
Names of the documents retrieved for query 2: file1.txt, file10.txt, file100.txt, file101.txt, file102.txt, file103.txt, file104.txt, file105.txt, file106.txt, file107.txt, file108.txt, file109.txt, file11.txt, file110.txt, file111.txt, file112.txt, file113.txt, file114.txt, file115.txt, file116.txt, file117.txt, file118.txt, file119.txt, file12.txt, file120.txt, f