In [None]:
# Import Necessary Libraries
import json
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
import tkinter as tk
from tkinter import ttk
import webbrowser

# Download necessary NLTK resources
# 'punkt' serves older NLTK releases; recent releases load the 'punkt_tab'
# tokenizer tables in word_tokenize instead, so both are requested here to
# keep preprocess() working regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Initialize Porter Stemmer and stop words
ps = PorterStemmer()
# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

# Preprocessing Function
def preprocess(text):
    """Turn raw text into a list of normalized, stemmed terms.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphanumeric are discarded, the rest are lower-cased, English
    stop words are removed, and the survivors are Porter-stemmed.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stems in the order the tokens appeared (duplicates kept).
    """
    stems = []
    for raw_token in nltk.word_tokenize(text):
        # Punctuation and mixed tokens such as "state-of-the-art" are dropped.
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        stems.append(ps.stem(lowered))
    return stems

# Load data from JSON file
def load_data_from_json(filename):
    """Load a JSON document from the ``./scrapedData`` directory.

    The file is always decoded as UTF-8 so that non-ASCII text (for example
    accented author names) loads the same way on every platform instead of
    depending on the locale's default encoding.

    Args:
        filename: Name of the file inside ``./scrapedData``.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    filepath = os.path.join("./scrapedData", filename)
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"No file found at {filepath}")
        return []
    print(f"Data loaded from {filepath}, containing {len(data)} items.")
    return data

# Load authors and documents data
# Snapshot files produced by the scraper, looked up inside ./scrapedData.
AUTHORS_FILE = "authors-20240801-233645.json"
PAPERS_FILE = "papers-20240731-170433.json"

# Each call yields the decoded JSON, or [] when the file is missing.
authors = load_data_from_json(AUTHORS_FILE)
documents = load_data_from_json(PAPERS_FILE)

# Create Inverted Index for Documents
def create_document_inverted_index(documents):
    """Build an inverted index mapping each stemmed term to document positions.

    For every document the title, journal, abstract and author names are
    concatenated and run through ``preprocess``; each resulting term is
    mapped to the document's position in ``documents``.

    Args:
        documents: Sequence of dict-like records. The keys ``title``,
            ``journal``, ``abstract`` and ``authors`` are read; any of them
            may be missing or null.

    Returns:
        A ``defaultdict(list)`` from term to an ascending list of document
        positions, each position listed at most once per term.
    """
    inverted_index = defaultdict(list)
    for i, doc in enumerate(documents):
        # `or` covers both a missing key and an explicit null value.
        title = doc.get('title') or ''
        journal = doc.get('journal') or ''
        abstract = doc.get('abstract') or ''
        # A null `authors` value must be replaced *before* joining, and
        # individual null/empty entries are skipped so join() cannot fail.
        author_names = ' '.join(
            str(name) for name in (doc.get('authors') or []) if name
        )

        words = preprocess(f"{title} {journal} {abstract} {author_names}")

        # De-duplicate once per document (keeping first-seen order) instead
        # of scanning the posting list for every token.
        for word in dict.fromkeys(words):
            inverted_index[word].append(i)
    return inverted_index

# Create Inverted Index for Authors
def create_author_inverted_index(authors):
    """Build an inverted index mapping each stemmed term to author positions.

    The ``name`` and ``department`` of every author record are concatenated
    and run through ``preprocess``; each resulting term is mapped to the
    author's position in ``authors``.

    Args:
        authors: Sequence of dict-like records. The keys ``name`` and
            ``department`` are read; either may be missing or null.

    Returns:
        A ``defaultdict(list)`` from term to an ascending list of author
        positions, each position listed at most once per term.
    """
    inverted_index = defaultdict(list)
    for i, author in enumerate(authors):
        # `or` covers a key that is present but null, which would otherwise
        # break string concatenation.
        name = author.get('name') or ''
        department = author.get('department') or ''
        words = preprocess(f"{name} {department}")

        # De-duplicate once per author (keeping first-seen order) instead of
        # scanning the posting list for every token.
        for word in dict.fromkeys(words):
            inverted_index[word].append(i)
    return inverted_index

# Initialize Inverted Indices
# Start from empty indices; they are only built when data was actually loaded.
document_inverted_index = {}
if documents:
    document_inverted_index = create_document_inverted_index(documents)

author_inverted_index = {}
if authors:
    author_inverted_index = create_author_inverted_index(authors)

# Define the search_and_display_results function
def _format_document_authors(doc):
    """Return a document's authors as one comma-separated display string.

    Each author name is followed by its profile link in parentheses when
    ``coventryAuthors`` has an entry at the same position.
    """
    names = doc.get('authors') or []
    # NOTE(review): assumes `coventryAuthors` is positionally aligned with
    # `authors`, as the original indexing did -- confirm against the scraper.
    profile_entries = doc.get('coventryAuthors') or []
    parts = []
    for position, name in enumerate(names):
        profile = profile_entries[position] if position < len(profile_entries) else None
        if isinstance(profile, dict):
            # NOTE(review): entry shape is not visible here; 'profileLink' is
            # the key used for author records in this file -- confirm.
            profile = profile.get('profileLink') or profile.get('link')
        parts.append(f"{name} ({profile})" if profile else f"{name}")
    return ', '.join(parts)


def search_and_display_results():
    """Run the query from the search box and show matches in the result tree.

    The query text is read from ``search_entry`` and normalized with
    ``preprocess``. A document or author matches when it is indexed under at
    least one query token. Documents are listed first (title, link, authors),
    then authors (name, profile link, department); a placeholder row is shown
    when nothing matches. Diagnostic information is printed to stdout.
    """
    query = search_entry.get()
    query_tokens = preprocess(query)
    print(f"Query Tokens: {query_tokens}")

    matching_docs = set()
    matching_authors = set()

    for token in query_tokens:
        if token in document_inverted_index:
            matching_docs.update(document_inverted_index[token])
            print(f"Token '{token}' found in documents: {document_inverted_index[token]}")
        else:
            print(f"Token '{token}' not found in document index.")

    for token in query_tokens:
        if token in author_inverted_index:
            matching_authors.update(author_inverted_index[token])
            print(f"Token '{token}' found in authors: {author_inverted_index[token]}")
        else:
            print(f"Token '{token}' not found in author index.")

    # Clear previous results
    for item in result_tree.get_children():
        result_tree.delete(item)

    # Display document results; sorted so the row order is stable between runs.
    for doc_id in sorted(matching_docs):
        doc = documents[doc_id]
        title = doc.get('title', 'N/A')
        link = doc.get('link', 'N/A')
        # Deliberately NOT named `authors`: assigning that name here would
        # make it local to the whole function and hide the module-level
        # author list that the loop below indexes into.
        author_text = _format_document_authors(doc)
        result_tree.insert('', tk.END, values=(title, link, author_text))

    # Display author results
    for author_id in sorted(matching_authors):
        author = authors[author_id]
        name = author.get('name', 'N/A')
        link = author.get('profileLink', 'N/A')
        dept = author.get('department', 'N/A')
        result_tree.insert('', tk.END, values=(name, link, dept))

    if not matching_docs and not matching_authors:
        result_tree.insert('', tk.END, values=("No matches found", "", ""))

# Function to open links on double-click
def on_treeview_click(event):
    """Open the link of the double-clicked result row in the web browser.

    Args:
        event: The tkinter event delivered by the ``<Double-1>`` binding;
            its ``y`` coordinate identifies the row under the pointer.
    """
    # Resolve the row that was actually clicked. Using the whole selection
    # fails when several rows are selected, and would also fire for a
    # double-click on the heading while some unrelated row is selected.
    row_id = result_tree.identify_row(event.y)
    if not row_id:
        return
    values = result_tree.item(row_id, 'values')
    if len(values) < 2:
        return
    # The link lives in the second column for both document and author rows.
    link = str(values[1])
    # Only hand real web URLs to the browser.
    if link.startswith(("http://", "https://")):
        webbrowser.open(link)

# GUI Setup
# Main application window.
root = tk.Tk()
root.title("Softwarica Search Engine")

# Top row: prompt label, query entry and search button, placed with grid.
search_frame = ttk.Frame(root, padding="10")
search_frame.grid(row=0, column=0, sticky=(tk.W, tk.E))

search_label = ttk.Label(search_frame, text="Enter the query you want to search:")
search_entry = ttk.Entry(search_frame, width=50)
search_button = ttk.Button(search_frame, text="Search", command=search_and_display_results)

search_label.grid(row=0, column=0, sticky=tk.W)
search_entry.grid(row=0, column=1, sticky=(tk.W, tk.E))
search_button.grid(row=0, column=2, sticky=tk.W)

# Second row: the results table, one heading per column.
result_frame = ttk.Frame(root, padding="10")
result_frame.grid(row=1, column=0, sticky=(tk.W, tk.E))

columns = ("Title", "Link", "Authors")
result_tree = ttk.Treeview(result_frame, columns=columns, show='headings')
for heading_text, pixel_width in zip(columns, (200, 200, 150)):
    result_tree.heading(heading_text, text=heading_text)
    result_tree.column(heading_text, width=pixel_width)

result_tree.pack(expand=True, fill='both')

# Double-clicking a row opens its link.
result_tree.bind("<Double-1>", on_treeview_click)

# Blocks until the window is closed.
root.mainloop()


[nltk_data] Downloading package punkt to C:\Users\Yash
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Yash
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data loaded from ./scrapedData\authors-20240801-233645.json, containing 40959 items.
Data loaded from ./scrapedData\papers-20240731-170433.json, containing 12987 items.
