# In [None]:
import requests
from bs4 import BeautifulSoup
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import sched

# Base address of the publications search page; crawl_publications appends
# the "?search=...&page=..." query string to it when building request URLs.
base_urLs = "https://pureportal.coventry.ac.uk/en/publications/"

import re

def clean_query(input_query):
    """Normalise a free-text search query for use in a URL query string.

    The query is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remaining words are joined
    with ``+``.

    Words are obtained by splitting on whitespace rather than by replacing
    each space, so leading, trailing or repeated whitespace (including tabs
    and newlines) never produces stray or doubled ``+`` separators.

    Args:
        input_query: Raw query text as typed by the user.

    Returns:
        The cleaned query, e.g. ``"machine+learning"``. An empty string is
        returned when nothing but punctuation and whitespace was supplied.
    """
    # Drop punctuation first: removing a symbol that sits between two spaces
    # leaves a double space, which the split below then collapses.
    without_punctuation = re.sub(r'[^\w\s]', '', input_query.lower())
    return "+".join(without_punctuation.split())


# Data-extraction approach adapted from https://machinelearningmastery.com/web-crawling-in-python/
def extract_publication_data(publication):
    """Extract the displayed fields from one search-result element.

    Every field is optional in the markup: when the element for a field is
    missing, that field is returned as an empty string instead of raising, so
    a single malformed result cannot abort a whole crawl.

    Args:
        publication: A parsed result element exposing ``select_one`` and
            ``select`` (for example a BeautifulSoup ``Tag``).

    Returns:
        A dict with the keys ``"Title"``, ``"AuthOrs"`` (all author names
        joined with ``", "``), ``"Publisher"``, ``"Publication status"``,
        ``"Publication page Link"`` and ``"CGL author Link"``.
    """
    def _text_of(selector):
        # Text of the first element matching the selector, or "" if absent.
        element = publication.select_one(selector)
        return element.text if element is not None else ""

    titles = _text_of(".title span")
    authors = [author.text for author in publication.select(".link.person span")]
    # Only the first listed author is used to build the profile link below.
    author = authors[0] if authors else ""
    publisher = _text_of(".link[rel='Publisher'] span").strip()
    publIcation_status = _text_of(".date")

    link_element = publication.select_one(".title a")
    publication_link = link_element.get("href", "") if link_element is not None else ""

    # The profile URL is derived from the author's name (lower-cased, spaces
    # replaced by hyphens); it is not read from the page.
    cgl_author_Link = f"https://pureportal.coventry.ac.uk/en/persons/{author.lower().replace(' ', '-')}"

    return {
        "Title": titles,
        "AuthOrs": ', '.join(authors),
        "Publisher": publisher,
        "Publication status": publIcation_status,
        "Publication page Link": publication_link,
        "CGL author Link": cgl_author_Link
    }

def preprocess_publication_data(publications_list):
    """Normalise the text fields of every crawled publication in place.

    For each record the title is title-cased, the comma-separated author
    names are trimmed and re-joined with ``", "``, and the publisher is
    lower-cased.

    Args:
        publications_list: List of publication dicts holding the keys
            ``"Title"``, ``"AuthOrs"`` and ``"Publisher"``.

    Returns:
        The same list object that was passed in, with its dicts updated.
    """
    for entry in publications_list:
        # Title: first letter of every word upper-cased.
        entry["Title"] = entry["Title"].title()

        # Authors: strip the padding around each name before re-joining.
        entry["AuthOrs"] = ", ".join(map(str.strip, entry["AuthOrs"].split(",")))

        # Publisher: lower-case throughout.
        entry["Publisher"] = entry["Publisher"].lower()

    return publications_list

def rank_by_vector_space_model(query_from_user, publications_list):
    """Order publications by the TF-IDF similarity of their title to a query.

    The titles and the query are vectorised together with
    ``TfidfVectorizer``; each title is then scored by the dot product of its
    vector with the query vector, and the publications are returned from the
    highest score to the lowest. Publications with equal scores keep their
    original relative order.

    Args:
        query_from_user: The search text to rank against.
        publications_list: List of publication dicts, each with a
            ``"Title"`` key.

    Returns:
        A new list containing the same publication dicts in ranked order.
        An empty input yields an empty list.
    """
    # With no titles there is nothing to compare the query against, and the
    # similarity computation below cannot work on a zero-row matrix.
    if not publications_list:
        return []

    # The query is appended last so it can be addressed as row -1.
    titles = [publication["Title"] for publication in publications_list]
    documents = titles + [query_from_user]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Score every title row (all but the last) against the query row.
    cosine_similarities = linear_kernel(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    # Sort on the score only, so the publication dicts are never compared.
    ranked_publications = sorted(
        zip(publications_list, cosine_similarities),
        key=lambda scored: scored[1],
        reverse=True,
    )

    return [publication for publication, _ in ranked_publications]

def crawl_publications(query_from_user, max_count=1, *, timeout=10, request_delay=2):
    """Crawl the search-result pages for a query and collect publications.

    Pages are requested one after another, starting at page 1, until
    ``max_count`` pages have been fetched, a page contains no results, or a
    request fails.

    Args:
        query_from_user: Raw search text; it is normalised with
            ``clean_query`` before being placed in the URL.
        max_count: Highest page number to request.
        timeout: Seconds to wait for each HTTP response before giving up.
        request_delay: Seconds to pause between consecutive page requests.

    Returns:
        A ``(publications, staff_publication_count)`` tuple. ``publications``
        is the list of dicts produced by ``extract_publication_data``;
        ``staff_publication_count`` maps the first listed author of each
        publication to the number of publications collected for them. The
        tuple shape is returned on failure too (holding whatever was
        collected so far), so callers can always unpack two values.
    """
    query_from_user = clean_query(query_from_user)
    all_publications = []
    staff_publication_count = {}  # first author -> number of publications

    crawled_page = 1
    while crawled_page <= max_count:
        search_urLs = f"{base_urLs}?search={query_from_user}&page={crawled_page}"

        # A network error is treated exactly like a non-200 answer.
        try:
            response = requests.get(search_urLs, timeout=timeout)
        except requests.RequestException:
            response = None

        if response is None or response.status_code != 200:
            print("Failed to retrieve data. Please try again later.")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        publications = soup.select(".result-container")

        # An empty page means the results are exhausted.
        if not publications:
            break

        for publication in publications:
            publication_data = extract_publication_data(publication)
            all_publications.append(publication_data)

            # Publications are attributed to their first listed author only.
            author = publication_data['AuthOrs'].split(', ')[0]
            staff_publication_count[author] = staff_publication_count.get(author, 0) + 1

        crawled_page += 1
        # Politeness delay, needed only when another request will follow.
        if crawled_page <= max_count:
            time.sleep(request_delay)

    return all_publications, staff_publication_count

# One-off interactive run: ask for a query, crawl, then show the best matches.
query_from_user = input(" Enter your search query: ")

publications_list, staff_publication_count = crawl_publications(query_from_user)

if not publications_list:
    print("No Publications Found.")
else:
    # Apply preprocessing tasks to the crawled data
    publications_list = preprocess_publication_data(publications_list)
    
    print("Number of Staff Whose Publications are Crawled:", len(staff_publication_count))
    print("Maximum Number Of Publications Per Staff:", max(staff_publication_count.values()))
    print("=" * 40)
    
    # Rank publications using the Vector Space Model
    ranked_publications = rank_by_vector_space_model(query_from_user, publications_list)
    
    # The slice alone limits the listing to the ten best-ranked publications;
    # no extra early exit is needed (the previous one cut the list to six).
    for publication in ranked_publications[:10]:
        print(f"Title: {publication['Title']}")
        print(f"AuthOrs: {publication['AuthOrs']}")
        print(f"Publisher: {publication['Publisher']}")
        print(f"PubLication status: {publication['Publication status']}")
        print(f"PubLication page Link: {publication['Publication page Link']}")
        print(f"CGL author Link: {publication['CGL author Link']}")
        print("=" * 40)
        
    print("Number Of Staff Whose Publications Are Crawled:", len(staff_publication_count))
    print("Maximum Number Of Publications Per Staff:", max(staff_publication_count.values()))
       

def crawl_and_display(query_from_user):
    """Crawl publications for a query and print the ten best-ranked results.

    The staff summary (number of staff and the highest per-staff publication
    count) is printed both before and after the result listing. When the
    crawl yields nothing, only a "not found" message is printed.

    Args:
        query_from_user: The search text to crawl and rank against.
    """
    publications_list, staff_publication_count = crawl_publications(query_from_user)

    # Nothing crawled: report it and stop.
    if not publications_list:
        print("No publications found.")
        return

    def report_staff_summary(maximum_label):
        # Two-line summary; only the wording of the second line varies.
        print("Number of staff whose publications are crawled:", len(staff_publication_count))
        print(maximum_label, max(staff_publication_count.values()))

    # Each printed label is identical to the dict key it displays.
    shown_fields = (
        "Title",
        "AuthOrs",
        "Publisher",
        "Publication status",
        "Publication page Link",
        "CGL author Link",
    )

    publications_list = preprocess_publication_data(publications_list)
    report_staff_summary("Maximum number of publications per staff:")

    # Rank with the Vector Space Model and list at most the first ten.
    top_ranked = rank_by_vector_space_model(query_from_user, publications_list)[:10]
    for record in top_ranked:
        for field in shown_fields:
            print(f"{field}: {record[field]}")
        print("-" * 40)

    report_staff_summary("Maximum Number Of Publications Per Staff:")

def schedle_crawl(query_from_user=None):
    """Run one crawl-and-display pass, then queue the next one a week later.

    Args:
        query_from_user: Search text to crawl. When given, the same text is
            reused for every following weekly run, so those runs need no
            keyboard input. When omitted, the user is prompted on this run
            and on every following run.
    """
    # Remember how this run obtained its query so the next run does the same.
    prompt_each_run = query_from_user is None
    if prompt_each_run:
        query_from_user = input("Enter your search query: ")

    crawl_and_display(query_from_user)

    interval = 604800  # Re-crawl every week (604,800 seconds)
    next_arguments = () if prompt_each_run else (query_from_user,)
    schedler.enter(interval, 1, schedle_crawl, argument=next_arguments)

# Get user input for the search query
query_from_user = input("Enter Your Search Query: ")

# Initialize the scheduler; the first run is due immediately and receives the
# query entered above, so the user is not asked for it a second time.
schedler = sched.scheduler(time.time, time.sleep)
schedler.enter(0, 1, schedle_crawl, argument=(query_from_user,))

# Run the scheduler (blocks while events remain queued)
schedler.run()