In [29]:
#Import Necessary Libraries
import json
import os
import re
import threading
import time
import tkinter as tk
import webbrowser
from collections import defaultdict
from tkinter import ttk
from urllib.parse import urljoin

import nltk
import requests
import schedule
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the 'punkt' and 'stopwords' NLTK packages; this has to happen before
# stopwords.words('english') is called below.
nltk.download('punkt')
nltk.download('stopwords')

# Shared, module-level helpers read by preprocess(): one Porter stemmer
# instance and the English stopword list as a set for fast membership tests.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Preprocessing Function
def preprocess(text):
    """Convert raw text into a list of normalised index terms.

    The text is tokenised with ``nltk.word_tokenize``. Tokens that are not
    purely alphanumeric are discarded, the survivors are lower-cased,
    stopwords (the module-level ``stop_words`` set) are removed, and what is
    left is stemmed with the module-level ``ps`` stemmer.

    Args:
        text: The string to tokenise.

    Returns:
        A list of stemmed, lower-cased tokens in their original order
        (duplicates are kept).
    """
    stems = []
    for raw_token in nltk.word_tokenize(text):
        # Punctuation and mixed tokens such as "state-of-the-art" are dropped.
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        stems.append(ps.stem(lowered))
    return stems  # Return a list of tokens for the inverted index



In [12]:
#Scrape Authors Function
def getProfileURLorNone(url):
    """Return the path portion of a site-relative image URL, or None.

    Args:
        url: The ``src`` attribute of a profile image, e.g.
            ``"/files-asset/1/photo.jpg?w=160"``. May be empty or None.

    Returns:
        The URL up to (not including) the first ``?``, provided it starts
        with ``/``. None when the URL is empty, contains ``"no-content"``
        (the marker this code treats as "no picture"), or is not a
        site-relative path.
    """
    if not url or "no-content" in url:
        return None
    # The previous pattern was r"^(\\/[^?]+)", which in a raw string demands a
    # literal backslash before the slash and therefore never matched a real
    # path such as "/files-asset/...". A plain "/" is what is intended.
    path_match = re.match(r"^(/[^?]+)", url)
    return path_match.group(1) if path_match else None

def scrapeAuthors(start_page=1, page_limit=1000):
    """Scrape the staff listing of the CMDS school and save it as JSON.

    Listing pages are fetched one after another until a page yields no result
    cards, a request fails, or ``page`` reaches ``page_limit`` (exclusive).
    The collected records are written to
    ``./scrapedData/authors-<timestamp>.json`` and also returned.

    Args:
        start_page: 1-based number of the first listing page to fetch.
        page_limit: Exclusive upper bound on the page number.

    Returns:
        A list of dicts with the keys ``picUrl`` (absolute URL or None),
        ``name``, ``profileLink``, ``department`` and ``deptLink``.
    """
    base_url = "https://pureportal.coventry.ac.uk"
    url = f"{base_url}/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds/persons/"
    authors = []

    page = start_page
    while page < page_limit:
        try:
            # Previously the same URL was requested on every iteration, so the
            # first page was scraped over and over until page_limit.
            # NOTE(review): assumes the portal's `page` query parameter is
            # zero-based (first page == 0) — confirm against the live site.
            response = requests.get(url, params={"page": page - 1}, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            authorList = soup.select("li.grid-result-item div.result-container")
            if len(authorList) == 0:
                break
            for author in authorList:
                try:
                    authorInfo = {}

                    # A card without an <img> keeps the author, minus picture.
                    img = author.select_one("img")
                    src = img.get('src') if img else None
                    picUrl = getProfileURLorNone(src) if src else None
                    if picUrl is not None:
                        picUrl = base_url + picUrl
                    authorInfo['picUrl'] = picUrl

                    # select_one() takes a CSS selector; an `attrs=` keyword is
                    # not applied, so the rel filter must live in the selector.
                    # Fall back to the first link, which is what the old call
                    # effectively returned.
                    name = (author.select_one('a[rel~="Person"]')
                            or author.select_one("a"))
                    authorInfo['name'] = name.text
                    # urljoin copes with both relative and absolute hrefs.
                    authorInfo['profileLink'] = urljoin(base_url, name['href'])

                    dept = (author.select_one('.relations.organisations a[rel~="Organisation"]')
                            or author.select_one(".relations.organisations a"))
                    authorInfo['department'] = dept.text
                    authorInfo['deptLink'] = urljoin(base_url, dept['href'])

                    authors.append(authorInfo)
                except Exception as e:
                    # Best effort: a malformed card is reported and skipped.
                    print(f"Error processing author: {e}")

            print(f"Finished page {page}")
            page += 1
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs("./scrapedData", exist_ok=True)
    with open(f"./scrapedData/authors-{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(authors, f, indent=4)

    return authors


In [14]:
#Scrape Papers Function
def getAuthorsAndOtherDocumentInformation(paperInfo):
    """Populate ``paperInfo['authors']`` from the paper's result element.

    Despite the name, only author information is extracted here.

    Args:
        paperInfo: Dict describing one paper. If it has a ``'paper'`` entry,
            that value must offer ``select()`` (as a BeautifulSoup element
            does); the function reads ``p.contributors span`` from it.

    Returns:
        None. ``paperInfo['authors']`` is always (re)set to a list of dicts
        with a ``name`` key and, when the span contains a link, a ``link``
        key. The list is empty when ``'paper'`` is missing or None.
    """
    paperInfo['authors'] = []
    paper = paperInfo.get('paper')
    if paper is None:
        # Without the source element there is nothing to parse; leave the
        # empty list instead of raising KeyError and losing the whole paper.
        return

    for author in paper.select('p.contributors span'):
        try:
            authorInfo = {'name': author.text.strip()}
            authorLinkElement = author.select_one('a')
            if authorLinkElement:
                authorInfo['link'] = authorLinkElement['href']
            paperInfo['authors'].append(authorInfo)
        except (AttributeError, KeyError, TypeError) as e:
            # Still best effort (the entry is skipped), but no longer silent.
            print(f"Error processing paper author: {e}")

def scrapPapers(start_page=1, page_limit=1000):
    """Scrape the publication listing of the CMDS school and save it as JSON.

    Listing pages are fetched one after another until a page yields no result
    cards, a request fails, or ``page`` reaches ``page_limit`` (exclusive).
    The collected records are written to
    ``./scrapedData/papers-<timestamp>.json`` and also returned.

    Args:
        start_page: 1-based number of the first listing page to fetch.
        page_limit: Exclusive upper bound on the page number.

    Returns:
        A list of dicts. Depending on what the card contains, a record may
        have ``link``, ``title``, ``journal``, ``journalLink``, ``date``,
        ``volume``, ``pages``, ``numberofpages``, ``type_classification``;
        it always has ``authors`` (a list).
    """
    base_url = "https://pureportal.coventry.ac.uk"
    url = f"{base_url}/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds/publications/"
    papers = []
    cols = ['date', 'volume', 'pages', 'numberofpages', 'type_classification']

    page = start_page
    while page < page_limit:
        try:
            # Previously the same URL was requested on every iteration, so the
            # first page was scraped over and over until page_limit.
            # NOTE(review): assumes the portal's `page` query parameter is
            # zero-based (first page == 0) — confirm against the live site.
            response = requests.get(url, params={"page": page - 1}, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            paperList = soup.select("li.grid-result-item div.result-container")
            if len(paperList) == 0:
                break
            for paper in paperList:
                try:
                    paperInfo = {}
                    link_element = paper.select_one('h3.title a')
                    if link_element:
                        # urljoin copes with both relative and absolute hrefs.
                        paperInfo['link'] = urljoin(base_url, link_element['href'])
                        paperInfo['title'] = link_element.text

                    # select_one() takes a CSS selector; an `attrs=` keyword is
                    # not applied, so the old call picked the first <a> on the
                    # card (normally the title link) as the "journal".
                    journal = paper.select_one('a[rel~="Journal"]')
                    if journal:
                        paperInfo['journal'] = journal.text
                        paperInfo['journalLink'] = urljoin(base_url, journal['href'])

                    for x in cols:
                        element = paper.select_one(f'span.{x}')
                        if not element:
                            continue
                        paperInfo[x] = element.text
                        try:
                            if x == 'numberofpages':
                                # Drop the two-character suffix before parsing.
                                paperInfo[x] = int(paperInfo[x][:-2])
                            elif x == 'pages':
                                # Drop the three-character prefix.
                                paperInfo[x] = paperInfo[x][3:]
                            elif x == 'volume':
                                paperInfo[x] = int(paperInfo[x])
                        except ValueError:
                            # Not numeric after all: keep the raw text.
                            pass

                    # The helper reads the card from paperInfo['paper']. That
                    # key was never set before, so every paper failed with
                    # KeyError and was dropped. Provide it for the call and
                    # remove it afterwards, because the element cannot be
                    # serialised to JSON.
                    paperInfo['paper'] = paper
                    try:
                        getAuthorsAndOtherDocumentInformation(paperInfo)
                    finally:
                        paperInfo.pop('paper', None)

                    papers.append(paperInfo)
                except Exception as e:
                    # Best effort: a malformed card is reported and skipped.
                    print(f"Error processing paper: {e}")

            print(f"Finished page {page}")
            page += 1
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs("./scrapedData", exist_ok=True)
    with open(f"./scrapedData/papers-{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(papers, f, indent=4)

    return papers


In [28]:
# Scrape Data
def _searchable_text(doc):
    """Build the text to index for a paper from its title, journal and author names."""
    parts = [doc.get('title'), doc.get('journal')]
    parts.extend(author.get('name') for author in doc.get('authors', []))
    return " ".join(str(part) for part in parts if part)


def scrape_data():
    """Run both scrapers and return the papers that have indexable text.

    ``scrapeAuthors()`` is run for its side effect (it writes the authors
    JSON file). The papers returned by ``scrapPapers()`` are given a
    ``'content'`` field when they lack one, and papers whose content is
    blank are dropped.

    Returns:
        A list of paper dicts, each with a non-blank ``'content'`` string.
    """
    print("Starting to scrape authors...")
    scrapeAuthors()
    print("Finished scraping authors.")

    print("Starting to scrape papers...")
    documents = scrapPapers()
    print("Finished scraping papers.")

    # The paper scraper does not produce a 'content' key, so filtering on it
    # directly threw every document away. Derive it from the fields that do
    # exist, without overwriting content supplied by other means.
    for doc in documents:
        if not str(doc.get('content') or '').strip():
            doc['content'] = _searchable_text(doc)

    # Filter out documents with empty content
    documents = [doc for doc in documents if doc['content'].strip()]

    # Debugging: Check contents of documents
    for i, doc in enumerate(documents[:5]):
        print(f"Document {i} content: '{doc.get('content', '')}'")  # Debugging statement

    return documents

# Executes the scrapers when this cell runs; the module-level `documents`
# list is what search_and_display_results() passes to search().
documents = scrape_data()  # Scrape documents

# Create Inverted Index
def create_inverted_index(documents):
    """Map every index term to the positions of the documents containing it.

    Args:
        documents: Sequence of dicts; the text under each dict's ``'content'``
            key (missing or None counts as empty) is run through
            ``preprocess``.

    Returns:
        A ``defaultdict(list)`` mapping term -> ascending list of document
        positions, each position listed once per term.
    """
    inverted_index = defaultdict(list)
    for doc_id, doc in enumerate(documents):
        words = preprocess(doc.get('content') or '')
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # which replaces the per-word `doc_id not in list` scan that made
        # indexing quadratic in the length of the posting lists.
        for word in dict.fromkeys(words):
            inverted_index[word].append(doc_id)
    return inverted_index

# Built once from the `documents` scraped above; search_and_display_results()
# reads this module-level index on every query.
inverted_index = create_inverted_index(documents)  # Create inverted index




Starting to scrape authors...
Finished page 1
Finished page 2
Finished page 3
Finished page 4
Finished page 5
Finished page 6
Finished page 7
Finished page 8
Finished page 9
Finished page 10
Finished page 11
Finished page 12
Finished page 13
Finished page 14
Finished page 15
Finished page 16
Finished page 17
Finished page 18
Finished page 19
Finished page 20
Finished page 21
Finished page 22
Finished page 23
Finished page 24
Finished page 25
Finished page 26
Finished page 27
Finished page 28
Finished page 29
Finished page 30
Finished page 31
Finished page 32
Finished page 33
Finished page 34
Finished page 35
Finished page 36
Finished page 37
Finished page 38
Finished page 39
Finished page 40
Finished page 41
Finished page 42
Finished page 43
Finished page 44
Finished page 45
Finished page 46
Finished page 47
Finished page 48
Finished page 49
Finished page 50
Finished page 51
Finished page 52
Finished page 53
Finished page 54
Finished page 55
Finished page 56
Finished page 57
Finished p

In [32]:
#Search Function
def search(query, inverted_index, documents):
    """Return the documents that contain at least one term of the query.

    Args:
        query: Free-text query; it is normalised with ``preprocess``.
        inverted_index: Mapping of term -> list of document positions.
        documents: Sequence the positions refer to.

    Returns:
        A list of matching documents, best match first: documents containing
        more distinct query terms come earlier, ties are broken by document
        position. Empty when nothing matches.
    """
    match_counts = {}
    # set(): a term repeated in the query should not count twice.
    for token in set(preprocess(query)):
        for doc_id in inverted_index.get(token, ()):
            # Ignore positions that no longer exist, e.g. when the index is
            # older than the document list.
            if 0 <= doc_id < len(documents):
                match_counts[doc_id] = match_counts.get(doc_id, 0) + 1

    # Iterating a set gave an arbitrary order; rank deterministically instead.
    ranked_ids = sorted(match_counts, key=lambda doc_id: (-match_counts[doc_id], doc_id))
    return [documents[doc_id] for doc_id in ranked_ids]



In [34]:
# GUI Functions
def search_and_display_results():
    """Run the query typed into ``search_entry`` and show the hits in ``result_tree``.

    Existing rows are removed first. Each hit becomes one row of
    (title, link, authors). Missing titles and links are shown as
    ``'No title'`` / ``'No link'`` (the latter is the sentinel
    ``on_treeview_click`` checks for), and authors are shown as a
    comma-separated list of names.
    """
    query = search_entry.get()
    results = search(query, inverted_index, documents)

    for item in result_tree.get_children():
        result_tree.delete(item)

    for result in results:
        # Scraped papers are not guaranteed to carry every field, so indexing
        # with [] could raise KeyError and abort the whole listing.
        author_names = ", ".join(
            author.get('name', '') for author in result.get('authors', [])
        )
        row = (
            result.get('title', 'No title'),
            result.get('link', 'No link'),
            author_names,
        )
        result_tree.insert("", "end", values=row)

def on_treeview_click(event):
    """Open the link of the result row under the mouse pointer in the browser.

    Args:
        event: The Tk event; its ``x`` / ``y`` coordinates identify the row.

    Nothing happens when the click is not on a row, or when the row's link
    is empty or the ``'No link'`` placeholder.
    """
    item = result_tree.identify('item', event.x, event.y)
    if not item:
        # Clicked on empty space or the heading: there is no row to read, and
        # indexing the empty value would raise IndexError.
        return

    values = result_tree.item(item, "values")
    if len(values) < 2:
        return

    link = values[1]
    if link and link != 'No link':
        webbrowser.open(link)




In [36]:
#Scheduler
def _refresh_search_data():
    """Re-scrape, rebuild the index and publish both to the module globals."""
    global documents, inverted_index
    try:
        fresh_documents = scrape_data()
        fresh_index = create_inverted_index(fresh_documents)
    except Exception as e:
        # An exception escaping a job would propagate out of run_pending()
        # and end the scheduler thread; report it and keep the old data.
        print(f"Scheduled scrape failed: {e}")
        return
    # NOTE(review): the two names are rebound one after the other, so a search
    # running at this exact moment may briefly see a mismatched pair.
    documents, inverted_index = fresh_documents, fresh_index


def start_scheduler():
    """Re-scrape every two weeks and poll the scheduler forever.

    This function never returns; it is meant to run on its own thread.
    """
    # Scheduling scrape_data directly discarded its return value, so the
    # documents and index used by the search were never updated.
    schedule.every(2).weeks.do(_refresh_search_data)

    while True:
        schedule.run_pending()
        time.sleep(1)

# Run the scheduler loop on a background daemon thread: it must not block the
# notebook, and a daemon thread does not keep the process alive on exit.
scheduler_thread = threading.Thread(target=start_scheduler, daemon=True)
scheduler_thread.start()


In [1]:

# GUI Setup
# Main window.
root = tk.Tk()
root.title("Softwarica Search Engine")

# Top row: prompt label, query entry and search button.
search_frame = ttk.Frame(root, padding="10")
search_frame.grid(row=0, column=0, sticky=(tk.W, tk.E))

search_label = ttk.Label(search_frame, text="Enter the query you want to search:")
search_entry = ttk.Entry(search_frame, width=50)
search_button = ttk.Button(search_frame, text="Search", command=search_and_display_results)

search_label.grid(row=0, column=0, sticky=tk.W)
search_entry.grid(row=0, column=1, sticky=(tk.W, tk.E))
search_button.grid(row=0, column=2, sticky=tk.W)

# Bottom row: results table with one heading per column.
result_frame = ttk.Frame(root, padding="10")
result_frame.grid(row=1, column=0, sticky=(tk.W, tk.E))

column_widths = {"Title": 200, "Link": 200, "Authors": 150}
columns = tuple(column_widths)
result_tree = ttk.Treeview(result_frame, columns=columns, show='headings')
for column_name, column_width in column_widths.items():
    result_tree.heading(column_name, text=column_name)
    result_tree.column(column_name, width=column_width)

result_tree.pack(expand=True, fill='both')

# Double-clicking a row opens its link.
result_tree.bind("<Double-1>", on_treeview_click)

# Blocks until the window is closed.
root.mainloop()


NameError: name 'tk' is not defined