In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import string
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tkinter import messagebox
# Fetch the NLTK data used further down. Besides the stop-word list and
# WordNet, nltk.word_tokenize needs the Punkt tokenizer models: recent NLTK
# releases look them up as 'punkt_tab', older ones as 'punkt'.
# nltk.download() skips packages that are already up to date.
for _resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# Crawler Component
# Listing of people at Coventry University on the Pure portal. Further result
# pages are requested from this address by appending '?page=<n>'.
URL = "https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/"
# Prefix of an individual profile page; a slug derived from the researcher's
# name is appended to it when that researcher's publications are fetched.
profile_url = "https://pureportal.coventry.ac.uk/en/persons/"

def get_maximum_page():
    """Return the page number encoded in the listing's pagination link.

    Downloads the first page of ``URL``, selects the anchor inside the 12th
    item of the pagination list and parses the number that follows the final
    ``=`` in its ``href``.

    Returns:
        int: The page number taken from that link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the pagination link is not present in the page.
        ValueError: If the link does not end in an integer page number.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    first = requests.get(URL, timeout=30)
    first.raise_for_status()
    soup = BeautifulSoup(first.text, 'html.parser')

    # NOTE(review): the selector hard-codes the position of what is presumably
    # the "last page" entry; revisit it if the portal changes its pagination.
    links = soup.select('#main-content > div > section > nav > ul > li:nth-child(12) > a')
    if not links:
        raise IndexError('pagination link not found on ' + URL)

    final_page = links[0]['href']
    return int(final_page.split('=')[-1])

def check_department(researcher, department='School of Computing, Electronics and Maths'):
    """Return the researcher's name if their card lists ``department``.

    Args:
        researcher: Parsed listing entry for one person (an object exposing
            BeautifulSoup's ``find`` interface).
        department: Exact text of the department to look for among the
            ``<span>`` elements of the card's ``rendering_person_short`` div.

    Returns:
        The text of the ``<span>`` inside the card's ``h3.title`` heading when
        the department matches; ``None`` when it does not match or when the
        expected elements are missing from the card.
    """
    summary = researcher.find('div', class_='rendering_person_short')
    if summary is None:
        # Card without the short summary block: nothing to compare against.
        return None

    if not any(span.text == department for span in summary.find_all('span')):
        return None

    heading = researcher.find('h3', class_='title')
    name_span = heading.find('span') if heading is not None else None
    return None if name_span is None else name_span.text

# def create_csv():
#     database = pd.DataFrame(columns=['Title', 'Author', 'Published', 'Link'])
#     database.to_csv('database.csv')

def update_csv(database):
    """Load the publications CSV stored at ``database`` into a DataFrame.

    The column named "Unnamed: 0" (the label pandas gives to a header-less
    first column, such as a previously saved index) becomes the index of the
    returned frame.
    """
    return pd.read_csv(database, index_col="Unnamed: 0")

def enter_each_researchers_publication(researcher, url, df, database='database.csv'):
    """Scrape one researcher's publication list and append it to the CSV.

    Args:
        researcher: Researcher name; spaces are replaced by hyphens and the
            result is lower-cased to form the profile slug.
        url: Profile URL prefix the slug is appended to.
        df: Unused; kept so existing callers keep working.
        database: Path of the CSV file to extend. It must already exist and
            carry its index in a column named "Unnamed: 0".

    Each stored row has the columns ``Title``, ``Author``, ``Published`` and
    ``Link``. Publications missing any of these elements are skipped instead
    of aborting the whole crawl.
    """
    new_url = url + str(researcher).replace(' ', '-').lower() + '/publications/'
    # A timeout keeps one stalled profile from blocking the crawl forever.
    page = requests.get(new_url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="main-content")
    if results is None:
        return

    rows = []
    for paper in results.find_all("li", class_="list-result-item"):
        heading = paper.find('h3', class_='title')
        title = heading.find('span') if heading is not None else None
        anchor = heading.find('a', href=True) if heading is not None else None
        person = paper.find('a', class_='link person')
        author = person.find('span') if person is not None else None
        date = paper.find('span', class_="date")

        if any(part is None for part in (title, anchor, author, date)):
            continue

        rows.append({'Title': title.text,
                     'Author': author.text,
                     'Published': date.text,
                     'Link': anchor['href']})

    if not rows:
        # Nothing scraped: leave the file untouched.
        return

    # Read and write the CSV once per researcher rather than once per paper.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    current = pd.read_csv(database, index_col="Unnamed: 0")
    new_rows = pd.DataFrame(rows, columns=['Title', 'Author', 'Published', 'Link'])
    pd.concat([current, new_rows], ignore_index=True).to_csv(database)

def scrape(mx):
    """Crawl listing pages 0..``mx`` and store matching researchers' papers.

    Page 0 is the bare ``URL``; every later page is ``URL?page=<n>``. For each
    person entry that contains a ``stacked-trend-widget`` div, the department
    is checked with ``check_department`` and, on a match, that researcher's
    publications are appended to the CSV.

    Args:
        mx: Highest page number to visit (inclusive).
    """
    # Loading the database first makes a missing or malformed file fail before
    # any network traffic; the frame is also passed on to the per-researcher step.
    df = update_csv('database.csv')

    for page_number in range(mx + 1):
        url = URL + '?page=' + str(page_number) if page_number > 0 else URL

        # A timeout keeps one stalled page from blocking the crawl forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find(id="main-content")
        if results is None:
            # Page without the expected container: skip it, keep crawling.
            continue

        for researcher in results.find_all("li", class_="grid-result-item"):
            if not researcher.find('div', class_='stacked-trend-widget'):
                continue
            name = check_department(researcher)
            if name is not None:
                enter_each_researchers_publication(name, profile_url, df)

# create_csv()
# NOTE: with the call above disabled, 'database.csv' has to exist already;
# the crawler reads it before appending to it.
# Run the crawl: find the last listing page, then scrape every page up to it.
mx = get_maximum_page()
scrape(mx)

# Indexing Component
# Reload everything the crawler stored; the first CSV column becomes the
# DataFrame index, and those index labels are the row ids kept in the search index.
scraped_db = pd.read_csv('database.csv', index_col=0)

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words, loaded once and reused across calls."""
    return frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Normalise free text into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip ASCII punctuation, tokenise with NLTK, drop
    English stop words, lemmatise each remaining token with WordNet, then
    join the tokens with single spaces.

    Args:
        text: The text to normalise. ``None`` and NaN (what pandas yields for
            an empty CSV cell) are treated as empty text; any other
            non-string value is converted with ``str``.

    Returns:
        str: The normalised text; ``""`` when nothing is left.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase and remove punctuation
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Stop-word removal and lemmatization; the stop-word set is cached so the
    # corpus file is not re-read on every call.
    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

def create_index():
    """Build the lookup table over ``scraped_db`` and save it as index.json.

    Every row contributes two keys: its normalised title and its normalised
    author (both via ``preprocess_text``). Each key maps to the list of row
    labels of ``scraped_db`` carrying that title or author. Cells that are
    not strings (e.g. NaN from an empty CSV field) are skipped.
    """
    index = {}
    for row_id, row in scraped_db.iterrows():
        for column in ('Title', 'Author'):
            value = row[column]
            if not isinstance(value, str):
                # Missing cell: nothing meaningful to index.
                continue

            postings = index.setdefault(preprocess_text(value), [])
            # Rows are visited one at a time, so a repeat can only be the
            # entry just added (title and author normalising to the same key).
            if not postings or postings[-1] != row_id:
                postings.append(row_id)

    with open('index.json', 'w') as f:
        json.dump(index, f)

# Write index.json from the scraped publications.
create_index()

import tkinter as tk
from tkinter import *



# NOTE(review): the two tkinter imports above are repeated here.
import tkinter as tk
from tkinter import *
import numpy as np
from sklearn.metrics import confusion_matrix

# Read the file create_index() just wrote back into memory as `index`,
# the lookup table consulted by search_publications().
with open('index.json', 'r') as f:
    index = json.load(f)

def search_publications():
    """Look up the text typed into ``entry`` and show the results.

    The query is normalised with ``preprocess_text`` and used as an exact key
    into ``index``. On a hit, two message boxes are shown: a confusion matrix
    and the matching rows of ``scraped_db``. Otherwise a "no match" message
    is shown.

    The confusion matrix has one label per row of ``scraped_db``. No
    independently labelled relevance data exists, so the index lookup is
    taken as ground truth: retrieved rows count as true positives, all other
    rows as true negatives, and false positives/negatives are zero by
    construction. The matrix layout is ``[[TN, FP], [FN, TP]]``.
    """
    # Text widget indices are "line.column" strings.
    query = entry.get("1.0", "end")
    processed_query = preprocess_text(query)

    matching_indices = index.get(processed_query)
    if not matching_indices:
        messagebox.showinfo("Search Results", "No matching publications found.")
        return

    # Drop repeated row ids (keeping order) so no publication is listed twice.
    matching_indices = list(dict.fromkeys(matching_indices))
    matching_publications = scraped_db.loc[matching_indices]

    # One label per publication: 1 = retrieved, 0 = not retrieved.
    retrieved = set(matching_indices)
    predicted_labels = [1 if row_id in retrieved else 0 for row_id in scraped_db.index]
    # NOTE(review): replace with real relevance judgements if they become
    # available; until then the "actual" labels mirror the predictions.
    actual_labels = list(predicted_labels)

    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs.
    cm = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1])

    # Display the confusion matrix
    messagebox.showinfo("Confusion Matrix", str(cm))

    # Display the matching publications
    messagebox.showinfo("Search Results", matching_publications.to_string())

# GUI: builds the search window and hands control to tkinter.

# Create the main window
# The star import supplies the unqualified names used below (Label, TOP).
from tkinter import *
import tkinter as tk
window = tk.Tk()
window.title("Publication Search")
window.geometry("600x650")
window.configure(bg="#111D88")
# Fixed-size window: resizing is switched off in both directions.
window.resizable(0,0)
# Load the logo, scale it to 200x200 and wrap it for use in a tkinter widget.
# The PhotoImage stays bound to a module-level name for as long as the window lives.
from PIL import ImageTk, Image
img=(Image.open("coventry-university-logo.png"))
imgg=img.resize((200,200))
imggg=ImageTk.PhotoImage(imgg)

# Logo goes at the top of the window; widgets packed later stack beneath it.
lbl=Label(window,image=imggg,bg="#111D88")
lbl.pack(side=TOP)
# Create and position the search label
label = tk.Label(window, text="Enter author name or title name:",fg="white",font=("Arial", 20,"italic"),bg="#111D88")
label.pack(pady=10)

# Create and position the search entry
# A multi-line Text widget; search_publications() reads its whole content.
entry = tk.Text(window,height=10,width=30,font=("Arial", 20))
entry.pack()

# Create and position the search button
# Clicking it runs search_publications().
button = tk.Button(window, text="Search",font=("Arial", 20),bg="#10164B",fg="white",command=search_publications)
button.pack(pady=10)


# Start the main event loop
window.mainloop()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
