In [1]:
!pip install nltk
!pip install rarfile
!pip install chardet



In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import os
from collections import deque
import json
import rarfile
import chardet
import re

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Unaiza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Preprocessing

In [3]:
def load_stopwords(file):
    with open(file, "r") as f:
        stopwords = f.read().splitlines()
        return set(stopwords)

In [4]:
def casefolding(text):
    return text.lower()

In [5]:
def tokenize(text):
    """
    Normally, word_tokenize would be enough, but I noticed that it wasn’t handling 
    '-' and '/' properly in the abstracts. So, I had to manually replace them with spaces 
    before tokenizing to make sure the words are split correctly.
    """
    text = re.sub(r"[-/]", " ", text)  # replacing '-' and '/' with spaces
    tokens = word_tokenize(text)  # tokenizing the cleaned text
    return tokens

In [6]:
def remove_stopwords(tokens, stopwords):
    filtered_tokens = []
    for token in tokens:
        if token.isalnum():  # making sure the token is a proper word (alphanumeric) - (not punctuation, symbols, etc)
            if token not in stopwords: # keeping only words that are not in the stopwords list
                filtered_tokens.append(token)
    return filtered_tokens

In [7]:
def stem_tokens(tokens, stemmer):
    stemmed_tokens = []
    for token in tokens:
        stemmed_word = stemmer.stem(token) # converting the word to its root form
        stemmed_tokens.append(stemmed_word) # storing the stemmed word in stemmed_tokens list
    return stemmed_tokens

In [8]:
def preprocess(text, stopwords, stemmer):
    text = casefolding(text) # 1. converting text to lowercase
    tokens = tokenize(text) # 2. tokenising the text
    tokens = remove_stopwords(tokens, stopwords) # 3. removing stopwords from text
    tokens = stem_tokens(tokens, stemmer) # 4. finally, reducing the text to its root form
    return tokens

In [9]:
# def extract_abstracts(rar_path, extract_folder):
#     if not os.path.exists(extract_folder):
#         os.makedirs(extract_folder)

#     with rarfile.RarFile(rar_path, 'r') as rf:
#         rf.extractall(extract_folder)

In [10]:
def read_abstracts(folder):
    """
    Reading all abstract files from the given folder and storing their content in a dictionary.
    Since abstracts can have different encodings, I’m using `chardet` to detect the correct encoding for each file.
    First, I read the file in binary mode to detect its encoding.
    Then, I open it using the detected encoding and store the text in a dictionary.
    """
    abstracts = {}
    
    for filename in os.listdir(folder):  
        file = os.path.join(folder, filename)  

        if os.path.isfile(file):  
            with open(file, "rb") as f:  # reading as binary to detect encoding
                raw_data = f.read()
                result = chardet.detect(raw_data)
                encoding = result['encoding']  # detecting encoding

            try:
                # opening the file with the detected encoding
                with open(file, "r", encoding=encoding, errors="replace") as f:
                    abstracts[filename] = f.read()  # storing filename and file content in asbtracts disctionary
            except Exception as e:
                print(f"Error reading {filename}: {e}") 
    return abstracts

In [11]:
stemmer = PorterStemmer()

In [12]:
stopwords = load_stopwords("Stopword-List.txt")

In [13]:
#rar_path = "/content/Abstracts.rar"
abstracts_folder = r"C:\Users\Unaiza\IR ASSIGNMENT # 1\abstracts"
#extract_abstracts(rar_path, abstracts_folder)

In [14]:
abstracts = read_abstracts(abstracts_folder)

In [15]:
preprocessed_abstracts = {} # dictionary to store preprocessed abstracts
for doc, text in abstracts.items():
    preprocessed_abstracts[doc] = preprocess(text, stopwords, stemmer)

In [16]:
print("Preprocessed Abstract [", list(preprocessed_abstracts.keys())[420], "]: ", # displaying a sample preprocessed file
      preprocessed_abstracts[list(preprocessed_abstracts.keys())[420]])

Preprocessed Abstract [ 74.txt ]:  ['genet', 'algorithm', 'approach', 'partit', 'cluster', 'case', 'studi', 'applic', 'partit', 'cluster', 'cluster', 'divis', 'analysi', 'genet', 'algorithm', 'gower', 'measur', 'of', 'similar', 'master', 'degre', 'inform', 'technolog', 'acquir', 'master', 'degre', 'is', 'becom', 'common', 'practic', 'ensur', 'success', 'life', 'and', 'good', 'career', 'path', 'especi', 'develop', 'countri', 'master', 'degre', 'inform', 'technolog', 'is', 'one', 'of', 'most', 'popular', 'programm', 'with', 'prolif', 'number', 'of', 'applic', 'and', 'student', 'thi', 'work', 'two', 'main', 'object', 'first', 'is', 'discov', 'number', 'of', 'cluster', 'of', 'applic', 'and', 'characterist', 'of', 'each', 'cluster', 'anoth', 'is', 'develop', 'genet', 'algorithm', 'base', 'partit', 'cluster', 'program', 'thi', 'is', 'achiev', 'by', 'incorpor', 'distanc', 'matrix', 'and', 'it', 'applic', 'divis', 'analysi', 'and', 'gower', 'measur', 'of', 'similar', 'genet', 'algorithm', 'bas

### Building Inverted Index

In [17]:
def build_inverted_index(preprocessed_abstracts):
    inverted_index = {}
    for doc, tokens in preprocessed_abstracts.items(): # iterating through each document and its preprocessed tokens 
        unique_tokens = set(tokens) # making sure there no duplicated tokens
        for token in unique_tokens: # iterating through each unique token 
            if token not in inverted_index: 
                inverted_index[token] = [] # if the token is not in the index already, initialise an empty list for it
            inverted_index[token].append(doc) # adding doc in the tokens list
    return inverted_index

In [18]:
inverted_index = build_inverted_index(preprocessed_abstracts)

In [19]:
with open(r"C:\Users\Unaiza\IR ASSIGNMENT # 1\inverted_index.json", "w") as f:
    json.dump(inverted_index, f) # saving the inverted index

In [20]:
print("Inverted Index of [", list(inverted_index.keys())[366], "]: ", # displaying a sample inverted index 
      inverted_index[list(inverted_index.keys())[366]])

Inverted Index of [ acycl ]:  ['104.txt', '105.txt', '65.txt']


In [21]:
print("Loading the entire Inverted Index: ", json.dumps(inverted_index, indent=4))

Loading the entire Inverted Index:  {
    "new": [
        "1.txt",
        "10.txt",
        "100.txt",
        "106.txt",
        "108.txt",
        "116.txt",
        "118.txt",
        "12.txt",
        "123.txt",
        "128.txt",
        "132.txt",
        "139.txt",
        "141.txt",
        "142.txt",
        "145.txt",
        "15.txt",
        "152.txt",
        "153.txt",
        "160.txt",
        "179.txt",
        "182.txt",
        "183.txt",
        "191.txt",
        "193.txt",
        "198.txt",
        "20.txt",
        "200.txt",
        "201.txt",
        "205.txt",
        "206.txt",
        "208.txt",
        "211.txt",
        "212.txt",
        "22.txt",
        "224.txt",
        "229.txt",
        "234.txt",
        "235.txt",
        "236.txt",
        "240.txt",
        "245.txt",
        "25.txt",
        "253.txt",
        "255.txt",
        "256.txt",
        "267.txt",
        "274.txt",
        "283.txt",
        "29.txt",
        "292.txt",
        

### Building Positional Index

In [22]:
def build_positional_index(preprocessed_abstracts):
    positional_index = {}
    for doc, tokens in preprocessed_abstracts.items(): # iterating through each document and its preprocessed tokens
        for pos, token in enumerate(tokens): # iterating through each token and its position in the doc
            if token not in positional_index:
                positional_index[token] = {} # if the token is not already in the index, create an empty dictionary for it
            if doc not in positional_index[token]:  
                positional_index[token][doc] = [] # if the doc is not already listed under the token, create an empty list for it
            positional_index[token][doc].append(pos) # storing the position of the token in the document
    return positional_index

In [23]:
positional_index = build_positional_index(preprocessed_abstracts)

In [24]:
with open(r"C:\Users\Unaiza\IR ASSIGNMENT # 1\positional_index.json", "w") as f:
    json.dump(positional_index, f) # saving the positional index 

In [25]:
print("Positional Index of [", list(positional_index.keys())[250], "]: ", # displaying a sample positional index 
      positional_index[list(positional_index.keys())[250]])

Positional Index of [ multivari ]:  {'102.txt': [7, 20], '134.txt': [89], '138.txt': [84], '232.txt': [111], '319.txt': [27, 93], '353.txt': [116], '394.txt': [55], '405.txt': [2], '42.txt': [9, 140]}


In [26]:
print("Loading the entire Positional Index: ", json.dumps(positional_index, indent=4))

Loading the entire Positional Index:  {
    "ensembl": {
        "1.txt": [
            0,
            11,
            44,
            58
        ],
        "105.txt": [
            5,
            68
        ],
        "120.txt": [
            56,
            122,
            167
        ],
        "171.txt": [
            9,
            158
        ],
        "198.txt": [
            8,
            57,
            83
        ],
        "2.txt": [
            14,
            74,
            95
        ],
        "229.txt": [
            12,
            68,
            78
        ],
        "256.txt": [
            3,
            11,
            73,
            95,
            108,
            146
        ],
        "262.txt": [
            128
        ],
        "268.txt": [
            213
        ],
        "284.txt": [
            5,
            11,
            61,
            68,
            152
        ],
        "3.txt": [
            2,
            18,
            55,
          

### Query Processing

In [27]:
def boolean_query_processing(query, inverted_index, all_docs):  
    result = set() # for storing matching docs  
    boolean_oper = None  

    for token in query:  
        if token in {"AND", "OR", "NOT"}: # if a word in query is "AND", "OR", or "NOT", it is stored as a boolean operator  
            boolean_oper = token  
        else:  
            if token in inverted_index: # if it's a normal word, a list of documents where the word appears is retrieved  
                doc_set = set(inverted_index[token])  
            else:  
                doc_set = set() # if the word is not in the index, assume no document contains this word  

            if boolean_oper == "NOT":  
                doc_set = all_docs - doc_set # if NOT was the last operator used, retrieved docs are subtracted from all docs  
            if not result: # if it's a normal word then result is set to the docs retrieved from inverted index 
                result = doc_set  
            elif boolean_oper == "AND": # if AND was the last operator used, intersection of retrieved docs and result is taken  
                result = result & doc_set  
            elif boolean_oper == "OR": # if OR was the last operator used, union of retrieved docs and result is taken  
                result = result | doc_set  

    return result  

In [28]:
def proximity_query_processing(term1, term2, k, pos_index):
    if term1 not in pos_index or term2 not in pos_index: # checking if both words exist in the positional index
        return set() # # if any word is missing, no document can match, so return an empty set

    docs_containing_both_terms = set(pos_index[term1].keys()) & set(pos_index[term2].keys())
    result = set() # # storing the documents where both words appear

    for doc in docs_containing_both_terms: # iterating through each doc that contains both words 
        pos_term1 = pos_index[term1][doc] # position of term1 in doc 
        pos_term2 = pos_index[term2][doc] # position of term2 in doc 
        i, j = 0, 0 # pointers to move through both position lists

        while i < len(pos_term1) and j < len(pos_term2): # # checking all positions of both words in the doc
            if abs(pos_term1[i] - pos_term2[j]) <= k: # if both the words appear within k distance in a doc, add that doc and stop checking further 
                result.add(doc)
                break 

            if pos_term1[i] < pos_term2[j]: # else move the pointer that is behind to check the words next position 
                i += 1
            else:
                j += 1

    return result

In [29]:
def process_boolean_and_proximity_search_query(query, inverted_index, pos_index, all_docs, stemmer):
    if isinstance(query, list):
        query = " ".join(query) # if the query is a list, convert it into a single string

    processed_query = []
    for word in query.split():
        if word in {"and", "or", "not"}: # if the word in query is a boolean operator, convert to uppercase
            processed_query.append(word.upper())
        else:
            processed_query.append(word) # if its a normal word keep it as it is 

    words_to_stem = [] # extract words that need to be stemmed (excluding boolean operators)
    for word in processed_query:
        if word not in {"AND", "OR", "NOT"}:
            words_to_stem.append(word)

    stemmed_words = [] # apply stemming to the extracted words
    for word in words_to_stem:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)  

    stemmed_query = []
    stemmed_index = 0  
    for word in processed_query:  # rebuild the query with stemmed words
        if word not in {"AND", "OR", "NOT"}:
            stemmed_query.append(stemmed_words[stemmed_index])
            stemmed_index += 1
        else:
            stemmed_query.append(word)

    if len(stemmed_query) == 3 and "/" in stemmed_query[2]: # checking if it's a proximity search (query should have three parts and the third part contains '/')
        term1, term2 = stemmed_query[0], stemmed_query[1] # extract the two terms
        k = int(stemmed_query[2][1:]) # extract the k min distance 
        return proximity_query_processing(term1, term2, k, pos_index) # apply proximity search

    return boolean_query_processing(stemmed_query, inverted_index, all_docs) # if not a proximity search, process it as a boolean query

In [30]:
def load_gold_standard_queries(file):
    gold_standard_queries = {}
    with open(file, "r") as f:
        lines = f.readlines()
        query, result = None, None

        for line in lines:
            if line.startswith("Example Query:"):
                query = line.split(":")[1].strip()
            elif line.startswith("Result-Set:"):
                result = {doc.strip() for doc in line.split(":")[1].strip().split(", ")}
                gold_standard_queries[query] = result
    return gold_standard_queries

In [31]:
gold_standard_query_file = "Gold Query-Set Boolean Queries.txt"
gold_standard_queries = load_gold_standard_queries(gold_standard_query_file)

In [32]:
print("Gold Standard Queries: ")
for query in gold_standard_queries.keys():
    print("\t* ", query)

Gold Standard Queries: 
	*  image AND restoration
	*  deep AND learning
	*  autoencoders
	*  temporal AND deep AND learning
	*  time AND series
	*  time AND series AND classification
	*  time AND series OR classification
	*  pattern
	*  pattern AND clustering
	*  pattern AND clustering AND heart
	*  neural information /3
	*  feature track /5


In [33]:
all_documents = set(preprocessed_abstracts.keys()) # storing all document names/ids 

In [34]:
def convert_filenames_to_doc_ids(result_set): # removing .txt from doc names to get doc ids 
    return {doc.replace(".txt", "") for doc in result_set} 

def calculate_metrics(actual, expected): # to evaluate the performance and the effectiveness of the system built 
    actual = set(actual)
    expected = set(expected)

    tp = len(actual & expected)  
    fp = len(actual - expected)  
    fn = len(expected - actual)  
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn) 
    if (precision + recall) == 0:
        f1 = 0
    else:
        f1 = (2 * precision * recall) / (precision + recall) 

    return precision, recall, f1

In [35]:
overall_precision = []
overall_recall = []
overall_f1 = []

In [36]:
print("\n" + "=" * 124) # formatting for clean, structured look 
print(f"{'Query':<40} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Result':<35}")
print("=" * 124)

for i, (query, expected) in enumerate(gold_standard_queries.items()):
    
    actual = process_boolean_and_proximity_search_query( # process the query using boolean or proximity search  
            query, 
            inverted_index, 
            positional_index, 
            all_documents, 
            stemmer
        )

    actual = convert_filenames_to_doc_ids(actual) # convert filenames to document IDs  

    expected_sorted = sorted(expected)
    actual_sorted = sorted(actual)

    precision, recall, f1 = calculate_metrics(actual_sorted, expected_sorted)
    
    # storing performance metrics for overall evaluation  
    overall_precision.append(precision)
    overall_recall.append(recall)
    overall_f1.append(f1)

    if actual_sorted == expected_sorted:
        match_status = "All expected documents were retrieved successfully."
    else:
        match_status = "Some expected documents were missing or extra."

    print(f"{query:<40} {precision:.2f}        {recall:.2f}      {f1:.2f}      {match_status}")

print("=" * 124)


Query                                    Precision  Recall     F1-Score   Result                             
image AND restoration                    1.00        1.00      1.00      All expected documents were retrieved successfully.
deep AND learning                        0.67        1.00      0.80      Some expected documents were missing or extra.
autoencoders                             1.00        1.00      1.00      All expected documents were retrieved successfully.
temporal AND deep AND learning           1.00        1.00      1.00      All expected documents were retrieved successfully.
time AND series                          1.00        1.00      1.00      All expected documents were retrieved successfully.
time AND series AND classification       1.00        1.00      1.00      All expected documents were retrieved successfully.
time AND series OR classification        1.00        1.00      1.00      All expected documents were retrieved successfully.
pattern            

In [37]:
avg_precision = sum(overall_precision) / len(overall_precision)
avg_recall = sum(overall_recall) / len(overall_recall)
avg_f1 = sum(overall_f1) / len(overall_f1)

print("\n" + "=" * 60)
print(f"{'Overall System Performance':^60}")
print("=" * 60)
print(f"{'Overall Precision:':<25} {avg_precision:.2f}")
print(f"{'Overall Recall:':<25} {avg_recall:.2f}")
print(f"{'Overall F1-Score:':<25} {avg_f1:.2f}")
print("=" * 60 + "\n")


                 Overall System Performance                 
Overall Precision:        0.97
Overall Recall:           1.00
Overall F1-Score:         0.98



In [38]:
while True:
    print("\n" + "=" * 124)
    user_query = input(" ENTER QUERY (Boolean or Proximity) | Type 'exit' to quit: ").strip()
    print("=" * 124)

    if user_query.lower() == "exit":
        print("\n SYSTEM MESSAGE: Exiting search engine. Thank you for using the system! \n")
        print("=" * 124)
        break  

    if not user_query:
        print("\n ERROR: Query cannot be empty. Please enter a valid search query.")
        continue  

    try:
        user_result = process_boolean_and_proximity_search_query(
            user_query, inverted_index, positional_index, all_documents, stemmer
        )

        if user_result:
            print(f"\n QUERY: {user_query}")
            print("-" * 124)
            print(" MATCHING DOCUMENTS: ")
            print(", ".join(sorted(user_result)))
        else:
            print(f"\n QUERY: {user_query}")
            print("-" * 124)
            print(" NO MATCHING DOCUMENTS FOUND.")

    except ValueError as e:
        print("\n ERROR: Issue in query processing.")
        print(f" DETAILS: {e}")
        print(" TIP: Check query syntax and try again.")

    print("=" * 124)


 ENTER QUERY (Boolean or Proximity) | Type 'exit' to quit: deep AND learning

 QUERY: deep AND learning
----------------------------------------------------------------------------------------------------------------------------
 MATCHING DOCUMENTS: 
174.txt, 175.txt, 176.txt, 177.txt, 213.txt, 23.txt, 24.txt, 245.txt, 247.txt, 250.txt, 254.txt, 258.txt, 267.txt, 272.txt, 273.txt, 278.txt, 279.txt, 281.txt, 325.txt, 333.txt, 345.txt, 346.txt, 347.txt, 348.txt, 352.txt, 357.txt, 358.txt, 360.txt, 362.txt, 371.txt, 373.txt, 374.txt, 375.txt, 380.txt, 381.txt, 382.txt, 396.txt, 397.txt, 401.txt, 404.txt, 405.txt, 415.txt, 421.txt, 432.txt, 444.txt

 ENTER QUERY (Boolean or Proximity) | Type 'exit' to quit: neural network /1

 QUERY: neural network /1
----------------------------------------------------------------------------------------------------------------------------
 MATCHING DOCUMENTS: 
100.txt, 114.txt, 13.txt, 135.txt, 145.txt, 157.txt, 162.txt, 163.txt, 164.txt, 165.txt, 174.t

### GUI

In [40]:
import gradio as gr  

def get_document_link(doc_id): # get filepath for a given document ID 
    return rf"C:\Users\Unaiza\IR ASSIGNMENT # 1\abstract\{doc_id}"

with open("inverted_index.json", "r") as f: # loading inverted index 
    inverted_index = json.load(f)
    
with open("positional_index.json", "r") as f: # loading positional index
    positional_index = json.load(f)

def convert_filenames_to_doc_ids(result_set): # remove.txt extension from filenames 
    return {doc[0].replace(".txt", "") for doc in result_set}

def format_results(results): # format the output results, limiting display to 6 documents for readability
    sorted_results = sorted(results)
    return ", ".join(sorted_results[:6]) + "..." if len(sorted_results) > 6 else ", ".join(sorted_results)

def process_query(query, query_index=None): # process user query and return matching documents
    if not query.strip():  # check if query is empty
        return [["Error", "Please enter a valid query."]]

    # process the query using boolean or proximity search
    result_set = process_boolean_and_proximity_search_query(query, inverted_index, positional_index, all_documents, stemmer)

    if result_set:  # if matching documents are found, return them 
        return [[doc, get_document_link(doc)] for doc in sorted(result_set)]

    return [["No Results", "No matching documents found."]] 

def test_gold_standard():
    results = []
    mismatches = 0  
    
    for i, (query, expected_result) in enumerate(gold_standard_queries.items()):
        actual_result = process_query(query, i + 1) 
        actual_result = convert_filenames_to_doc_ids(actual_result)  
        expected_set = set(expected_result)

        match_status = "🟢" if actual_result == expected_set else "🔴"  
        if match_status == "🔴":
            mismatches += 1  

        formatted_expected = format_results(expected_result)
        formatted_actual = format_results(actual_result)

        results.append([ # storing query results, expected docs vs actual docs comparison, and match status (whether all relevant docs retrieved or not)
            query, 
            formatted_expected if formatted_expected else "None", 
            formatted_actual if formatted_actual else "None", 
            match_status
        ])

    return results  

with gr.Blocks(title="Boolean & Proximity Search Engine") as demo:
    gr.Markdown("# Boolean & Proximity Search Engine") 

    query_input = gr.Textbox(label="Enter Query", placeholder="e.g., temporal AND deep AND learning")
    search_btn = gr.Button("Search") 
    output_table = gr.DataFrame(headers=["Document", "Open"], interactive=False) 

    search_btn.click(process_query, inputs=query_input, outputs=output_table)  

    gr.Markdown("## Test Gold Standard Queries")  
    test_btn = gr.Button("Run Gold Standard Test")  
    test_results = gr.DataFrame(headers=["Query", "Expected Result", "Actual Result", "Status"], interactive=False)
    test_btn.click(test_gold_standard, outputs=test_results)  

demo.launch()  

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


