Q1. Data Preprocessing

In [67]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK tokenizer and stopword data are available locally
nltk.download('punkt')
nltk.download('stopwords')

# Input directory with the raw text files and output directory for the cleaned copies
dataset_directory = 'text_files'
preprocessed_directory = 'preprocessed_files'

# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists() check, so there is no window between checking and creating.
os.makedirs(preprocessed_directory, exist_ok=True)

def preprocess_file(file_path, save_path, verbose=False):
    """Clean one text file and write the result to ``save_path``.

    Steps, in order: lowercase the text, tokenize it with ``word_tokenize``,
    drop English stopwords, delete every ``string.punctuation`` character from
    each token (discarding tokens that become empty), drop whitespace-only
    tokens, and write the surviving tokens joined by single spaces.

    Args:
        file_path: Path of the UTF-8 text file to read.
        save_path: Path the cleaned text is written to (UTF-8).
        verbose: When true, print a preview after every step (first 500
            characters for text, first 50 items for token lists).
    """
    def report(label, tokens_so_far):
        # Preview helper for the token-list stages; silent unless verbose.
        if verbose:
            print(label)
            print(tokens_so_far[:50])

    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()
    if verbose:
        print(f"Original text from {file_path}:\n{text[:500]}\n")

    text_lower = text.lower()
    if verbose:
        print(f"After lowercase:\n{text_lower[:500]}\n")

    tokens = word_tokenize(text_lower)
    report("After tokenization:", tokens)

    english_stopwords = set(stopwords.words('english'))
    tokens_no_stopwords = [tok for tok in tokens if tok not in english_stopwords]
    report("After removing stopwords:", tokens_no_stopwords)

    # Strip punctuation characters; a token made only of punctuation vanishes.
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens_no_punctuation = []
    for tok in tokens_no_stopwords:
        cleaned = tok.translate(punctuation_table)
        if cleaned:
            tokens_no_punctuation.append(cleaned)
    report("After removing punctuation:", tokens_no_punctuation)

    final_words = [tok for tok in tokens_no_punctuation if tok.strip()]
    report("After removing blank spaces:", final_words)

    with open(save_path, 'w', encoding='utf-8') as target:
        target.write(' '.join(final_words))

# Preprocess every regular file in the dataset directory. The step-by-step
# preview is printed for the first five files only, each followed by a separator.
files_processed = 0
for filename in os.listdir(dataset_directory):
    file_path = os.path.join(dataset_directory, filename)
    preprocessed_path = os.path.join(preprocessed_directory, filename)
    if not os.path.isfile(file_path):
        continue
    show_steps = files_processed < 5
    preprocess_file(file_path, preprocessed_path, verbose=show_steps)
    files_processed += 1
    if show_steps:
        print('-----------------------------------\n')

[nltk_data] Downloading package punkt to /Users/vasanth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vasanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original text from text_files/file502.txt:
Kit is awesome. I play in my garage just for personal enjoyment not for performances or anything. Once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. With the expansion options and the relatively inexpensive parts expanding is easy and fun.

After a few weeks of daily use for at least an hour a day it still looks and plays beautifully. Overall one of the best purchases I could have made.

After lowercase:
kit is awesome. i play in my garage just for personal enjoyment not for performances or anything. once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. with the expansion options and the relatively inexpensive parts expanding is easy and fun.

after a few weeks of daily use for at least an hour a day it still looks and plays beautifully. overall one of the best purchases i could have made.

After tokenization:
['kit', 'is', 'awesome', '

Q2. Unigram Inverted Index and Boolean Queries

In [68]:
import os
import pickle

# Directory holding the cleaned text files written by the Q1 preprocessing loop
preprocessed_directory = 'preprocessed_files'

# Function to create a unigram inverted index
def create_unigram_inverted_index(directory):
    """Build a unigram inverted index over every regular file in ``directory``.

    Each file is read as UTF-8 and split on whitespace; sub-directories and
    other non-file entries are skipped.

    Args:
        directory: Path of the directory containing the preprocessed files.

    Returns:
        dict mapping each term to the list of file names that contain it.
        A file name appears at most once per term, and every postings list
        follows sorted file-name order.
    """
    inverted_index = {}
    # os.listdir returns entries in arbitrary order; sorting makes the index
    # (and anything pickled from it) identical from run to run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            words = file.read().split()
        # dict.fromkeys de-duplicates while keeping first-seen order, so each
        # term is posted once per file without scanning its postings list.
        for word in dict.fromkeys(words):
            inverted_index.setdefault(word, []).append(filename)
    return inverted_index

# Build the in-memory unigram inverted index from the preprocessed files
unigram_inverted_index = create_unigram_inverted_index(preprocessed_directory)

In [69]:
# Function to save the inverted index using pickle
def save_inverted_index(index, save_path):
    """Pickle ``index`` into the file at ``save_path`` (opened in binary write mode)."""
    with open(save_path, 'wb') as sink:
        pickle.dump(index, sink)

# Persist the unigram inverted index; save_path is reused by the cell that loads it back
save_path = 'unigram_inverted_index.pkl'
save_inverted_index(unigram_inverted_index, save_path)


In [70]:
# Function to load the inverted index using pickle
def load_inverted_index(load_path):
    """Read the pickle file at ``load_path`` and return the object stored in it."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # this notebook produced itself.
    with open(load_path, 'rb') as source:
        return pickle.load(source)

# Read the index back from the pickle at save_path; the boolean queries below run against this copy
loaded_unigram_inverted_index = load_inverted_index(save_path)


In [71]:
def preprocess_text(text):
    """Normalize raw text into a list of terms.

    The text is lowercased and tokenized with ``word_tokenize``; every
    ``string.punctuation`` character is then deleted from each token, and only
    purely alphabetic tokens that are not English stopwords are kept.

    Args:
        text: The raw string to normalize.

    Returns:
        List of surviving tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [
        token.translate(punctuation_table)
        for token in word_tokenize(text.lower())
    ]

    english_stopwords = set(stopwords.words('english'))
    words = []
    for token in stripped_tokens:
        if not token.isalpha():
            continue  # numbers, mixed tokens and emptied-out punctuation
        if token in english_stopwords:
            continue
        if token.strip() == '':
            continue
        words.append(token)
    return words

def and_operation(set1, set2):
    """Return the documents that appear in both ``set1`` and ``set2``."""
    shared = set1.intersection(set2)
    return shared

def or_operation(set1, set2):
    """Return the documents that appear in ``set1``, ``set2`` or both."""
    combined = set1.union(set2)
    return combined

def and_not_operation(set1, set2):
    """Return the documents of ``set1`` that do not appear in ``set2``."""
    remaining = set1 - set2
    return remaining

def or_not_operation(set1, set2, all_documents):
    """Return ``set1`` together with every document that is not in ``set2``.

    ``all_documents`` is the set of all document names; the complement of
    ``set2`` is taken relative to it.
    """
    outside_set2 = all_documents - set2
    return set1.union(outside_set2)

def execute_query(query_tokens, operations, inverted_index, all_documents):
    """Evaluate a boolean query strictly left to right against the inverted index.

    Args:
        query_tokens: Preprocessed query terms, in query order.
        operations: Operators joining consecutive terms: "AND", "OR",
            "AND NOT" or "OR NOT". Matching ignores letter case and
            surrounding or repeated whitespace.
        inverted_index: Mapping of term to an iterable of document names.
            Terms missing from the index match no documents.
        all_documents: Set of every document name; the universe for "OR NOT".

    Returns:
        Sorted list of matching document names. Empty when ``query_tokens``
        is empty.
    """
    # A query made only of stopwords or punctuation preprocesses to nothing;
    # return no hits instead of failing on sets[0].
    if not query_tokens:
        return []

    # Convert query tokens to sets of documents
    sets = [set(inverted_index.get(token, [])) for token in query_tokens]

    # Fold the operators over the postings sets, left to right
    result_set = sets[0]
    for op, next_set in zip(operations, sets[1:]):
        normalized_op = ' '.join(op.split()).upper()
        if normalized_op == "AND":
            result_set = and_operation(result_set, next_set)
        elif normalized_op == "OR":
            result_set = or_operation(result_set, next_set)
        elif normalized_op == "AND NOT":
            result_set = and_not_operation(result_set, next_set)
        elif normalized_op == "OR NOT":
            result_set = or_not_operation(result_set, next_set, all_documents)
        # Any other operator string is skipped and leaves the result unchanged.

    return sorted(result_set)

def reconstruct_query_with_operations(original_query, operations):
    """Render the preprocessed query with its operators for display.

    The query is run through ``preprocess_text`` and the k-th operator is
    placed, padded with one space on each side, before term k+1. Terms that
    have no operator left are appended directly, and surplus operators are
    ignored.

    Args:
        original_query: The raw query string as typed by the user.
        operations: Sequence of operator strings.

    Returns:
        The reconstructed query string.
    """
    preprocessed_terms = preprocess_text(original_query)

    pieces = []
    for index, term in enumerate(preprocessed_terms):
        operator_slot = index - 1
        # The first term has no preceding operator; later ones may run out.
        if 0 <= operator_slot < len(operations):
            pieces.append(f" {operations[operator_slot]} ")
        pieces.append(term)
    return "".join(pieces)

def parse_and_execute_queries(queries, inverted_index, all_documents):
    """Run every (query text, operator string) pair and collect the results.

    Args:
        queries: Iterable of ``(original_query, ops)`` tuples, where ``ops``
            is a comma-separated operator string such as "AND, OR NOT".
        inverted_index: Mapping of term to document names, passed through to
            ``execute_query``.
        all_documents: Set of all document names, passed through to
            ``execute_query``.

    Returns:
        List of ``(query_number, reconstructed_query, hit_count, documents)``
        tuples, numbered from 1. A query that preprocesses to no terms yields
        ``(query_number, "", 0, [])``.
    """
    results = []
    for i, (original_query, ops) in enumerate(queries, start=1):
        query_tokens = preprocess_text(original_query)
        # Split on bare commas and trim each piece, so "AND,OR", "AND, OR" and
        # "and , or not" all parse; blank pieces are dropped.
        operations = [
            ' '.join(op.split()).upper()
            for op in ops.split(',')
            if op.strip()
        ]
        if not query_tokens:
            # Nothing searchable is left after preprocessing (e.g. only stopwords).
            results.append((i, "", 0, []))
            continue
        result_docs = execute_query(query_tokens, operations, inverted_index, all_documents)
        reconstructed_query = reconstruct_query_with_operations(original_query, operations)
        results.append((i, reconstructed_query, len(result_docs), result_docs))
    return results

# Prompt for the number of queries
N = int(input("Enter the number of queries: "))

queries = []
for i in range(N):
    # For each query, gather the input sequence and the operations
    input_sequence = input(f"Enter input sequence for query {i+1}: ")
    operations = input(f"Enter operations (separated by comma) for query {i+1}: ")
    queries.append((input_sequence, operations))

# Directory of preprocessed files that the inverted index was built from
preprocessed_directory = 'preprocessed_files'

# Universe of documents for "OR NOT". Only regular files count, matching the
# isfile() filter used when the index was built; otherwise sub-directories
# (e.g. notebook checkpoint folders) would show up as retrieved "documents".
all_documents = {
    name
    for name in os.listdir(preprocessed_directory)
    if os.path.isfile(os.path.join(preprocessed_directory, name))
}

# Run every query against the index loaded from disk
results = parse_and_execute_queries(queries, loaded_unigram_inverted_index, all_documents)

# Print results with reconstructed query
for i, reconstructed_query, num_docs, docs in results:
    print(f"Query {i}: {reconstructed_query}")
    print(f"Number of documents retrieved for query {i}: {num_docs}")
    print(f"Names of the documents retrieved for query {i}: {', '.join(docs)}")

Query 1: perfect AND fit AND color
Number of documents retrieved for query 1: 1
Names of the documents retrieved for query 1: file26.txt


Q3. Positional Index and Phrase Queries

In [82]:
import os
import pickle
from collections import defaultdict
import string

# Inner factory for the positional index: each term maps to a per-file
# defaultdict whose missing entries start as empty position lists.
def default_factory():
    """Return a fresh ``defaultdict`` whose missing keys start as empty lists."""
    per_file_positions = defaultdict(list)
    return per_file_positions

def create_positional_index(directory):
    """Build a positional index over every regular file in ``directory``.

    Each file is read as UTF-8 and passed through ``preprocess_text``; the
    position of a term is its offset in the resulting token list.

    Args:
        directory: Path of the directory containing the text files.

    Returns:
        Nested ``defaultdict``: term -> file name -> list of positions.
    """
    positional_index = defaultdict(default_factory)
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories and other non-file entries
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        for position, word in enumerate(preprocess_text(contents)):
            positional_index[word][filename].append(position)
    return positional_index

# Save the positional index with pickle
def save_positional_index(index, filename):
    """Pickle ``index`` into the file named ``filename`` (binary write mode)."""
    with open(filename, 'wb') as sink:
        pickle.dump(index, sink)

# Load the positional index with pickle
def load_positional_index(filename):
    """Read the pickle file named ``filename`` and return the object stored in it."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # this notebook produced itself.
    with open(filename, 'rb') as source:
        loaded_index = pickle.load(source)
    return loaded_index

# Execute phrase queries using the positional index
def execute_phrase_query(query, positional_index):
    """Return the documents in which the query terms occur as a consecutive phrase.

    The query is normalized with ``preprocess_text``. A document matches when
    there is some position ``p`` of the first term such that the k-th
    following term occurs at position ``p + k``.

    Args:
        query: Raw phrase typed by the user.
        positional_index: Mapping term -> document name -> list of positions.

    Returns:
        Sorted list of matching document names; empty when the query has no
        terms left after preprocessing or any term is missing from the index.
    """
    words = preprocess_text(query)
    if not words:
        return []

    # .get() instead of [] so an unknown query term does not insert an empty
    # entry into a defaultdict-based index.
    postings = [positional_index.get(word, {}) for word in words]

    # Candidate documents must contain every term of the phrase.
    potential_files = set(postings[0])
    for word_postings in postings[1:]:
        potential_files &= set(word_postings)

    valid_docs = []
    for file in potential_files:
        first_positions = postings[0][file]
        later_positions = [set(word_postings[file]) for word_postings in postings[1:]]
        # Try every occurrence of the first term as a possible phrase start.
        # Pairing the k-th occurrence of each term with zip() would miss
        # phrases whenever the terms occur a different number of times.
        phrase_found = any(
            all(
                start + offset in positions
                for offset, positions in enumerate(later_positions, start=1)
            )
            for start in first_positions
        )
        if phrase_found:
            valid_docs.append(file)

    # Sorted so repeated runs list the documents in the same order.
    return sorted(valid_docs)

# Main execution flow
if __name__ == "__main__":
    # Where the preprocessed corpus lives and where the pickled index is stored
    preprocessed_directory = 'preprocessed_files'
    index_filename = 'positional_index.pkl'

    # Build the index, persist it, then read it back so the queries below run
    # against the stored copy
    positional_index = create_positional_index(preprocessed_directory)
    save_positional_index(positional_index, index_filename)
    loaded_positional_index = load_positional_index(index_filename)

    # Ask how many phrase queries to run, then answer each one in turn
    N = int(input("Enter the number of queries: "))
    for i in range(N):
        query_number = i + 1
        query = input(f"Enter phrase query {query_number}: ")
        valid_docs = execute_phrase_query(query, loaded_positional_index)
        doc_names = ', '.join(valid_docs)
        print(f"Number of documents retrieved for query {query_number} using positional index: {len(valid_docs)}")
        print(f"Names of documents retrieved for query {query_number} using positional index: {doc_names}")


Number of documents retrieved for query 1 using positional index: 4
Names of documents retrieved for query 1 using positional index: file1.txt, file254.txt, file723.txt, file391.txt


In [72]:
# from collections import defaultdict
# import os
# import pickle

# def default_dict():
#     return defaultdict(list)

# def create_positional_index(directory):
#     positional_index = defaultdict(default_dict)
    
#     for filename in os.listdir(directory):
#         file_path = os.path.join(directory, filename)
#         if os.path.isfile(file_path):
#             with open(file_path, 'r', encoding='utf-8') as file:
#                 words = file.read().split()
#                 for position, word in enumerate(words):
#                     positional_index[word][filename].append(position)
#     return positional_index

# # Saving the positional index
# def save_positional_index(index, save_path):
#     with open(save_path, 'wb') as file:
#         pickle.dump(index, file)

# # Loading the positional index
# def load_positional_index(load_path):
#     with open(load_path, 'rb') as file:
#         index = pickle.load(file)
#     return index

# preprocessed_directory = 'preprocessed_files'
# positional_index = create_positional_index(preprocessed_directory)

# save_path = 'positional_index.pkl'
# save_positional_index(positional_index, save_path)

# loaded_positional_index = load_positional_index(save_path)


In [75]:
# def execute_phrase_query(query_tokens, positional_index):
#     if not query_tokens:
#         return []

#     # Start with the candidate documents for the first token
#     candidate_docs = positional_index.get(query_tokens[0], {})
    
#     # If it's a single-word query, return its documents directly
#     if len(query_tokens) == 1:
#         return list(candidate_docs.keys())

#     # For phrases, ensure all tokens appear in the exact sequence
#     valid_docs = []
#     for doc, positions in candidate_docs.items():
#         for pos in positions:
#             # Assume a match until proven otherwise
#             match = True
#             for i, token in enumerate(query_tokens[1:], 1):
#                 next_positions = positional_index.get(token, {}).get(doc, [])
#                 if not any(pos + i == next_pos for next_pos in next_positions):
#                     match = False
#                     break  # This position doesn't match the sequence; try the next position
#             if match:
#                 valid_docs.append(doc)
#                 break  # Found a matching sequence in this document, no need to check further
#     return valid_docs

# def user_input_and_execute_queries(positional_index):
#     N = int(input("Enter the number of queries: "))

#     for i in range(1, N + 1):
#         query = input(f"Enter query {i}: ")
#         preprocessed_query = preprocess_text(query)  # Ensure this function is defined as per your preprocessing steps
#         docs_retrieved = execute_phrase_query(preprocessed_query, positional_index)

#         print(f"Number of documents retrieved for query {i} using positional index: {len(docs_retrieved)}")
#         if docs_retrieved:
#             print(f"Names of documents retrieved for query {i} using positional index: {', '.join(docs_retrieved)}")
#         else:
#             print(f"Names of documents retrieved for query {i} using positional index: None")

# # Assuming 'loaded_positional_index' is already defined and loaded
# user_input_and_execute_queries(loaded_positional_index)

Number of documents retrieved for query 1 using positional index: 4
Names of documents retrieved for query 1 using positional index: file1.txt, file391.txt, file723.txt, file254.txt
Number of documents retrieved for query 2 using positional index: 106
Names of documents retrieved for query 2 using positional index: file30.txt, file477.txt, file19.txt, file298.txt, file501.txt, file105.txt, file677.txt, file460.txt, file26.txt, file931.txt, file935.txt, file115.txt, file698.txt, file659.txt, file35.txt, file843.txt, file923.txt, file561.txt, file944.txt, file171.txt, file372.txt, file46.txt, file616.txt, file831.txt, file819.txt, file951.txt, file945.txt, file576.txt, file947.txt, file358.txt, file167.txt, file229.txt, file573.txt, file956.txt, file96.txt, file375.txt, file374.txt, file68.txt, file758.txt, file943.txt, file228.txt, file558.txt, file835.txt, file389.txt, file42.txt, file404.txt, file363.txt, file388.txt, file217.txt, file565.txt, file622.txt, file178.txt, file435.txt, fi