In [17]:
!pip install numpy




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import tkinter as tk
from tkinter import ttk
import webbrowser
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK
# releases load it from the 'punkt_tab' package while older ones use 'punkt',
# so fetch both to keep preprocess() working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists; the English list is used to filter tokens in preprocess().
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
import time
import datetime

In [6]:
!pip install scrapy




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [7]:
# English stop-word set; preprocess() uses it to filter tokens.
stop_words = set(stopwords.words('english'))
# Shared Porter stemmer instance; preprocess() uses it to stem each token.
ps = PorterStemmer()

In [8]:
def preprocess(text):
    """Turn raw text into a list of normalized terms.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphanumeric are dropped, the rest are lower-cased, tokens found
    in ``stop_words`` are removed and each survivor is stemmed with ``ps``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stemmed tokens in their order of appearance.
    """
    terms = []
    for raw_token in nltk.word_tokenize(text):
        # Punctuation and mixed tokens such as "don't" are not alphanumeric.
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        terms.append(ps.stem(lowered))
    return terms

In [9]:
def crawl_and_parse(url, timeout=10):
    """Download a publication listing page and extract one record per result.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``authors``,
        ``publication_date`` and ``content`` (title, authors and date joined
        by single spaces). Fields missing from the page are replaced by the
        placeholders ``'No title'``, ``'No link'``, ``'No authors'`` and
        ``'No date'``. An empty list is returned when the response status
        is not 200.

    Raises:
        requests.RequestException: If the request fails or times out
            (propagated from ``requests.get``).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    results = []

    for pub in soup.find_all('div', class_='result-container'):
        title_tag = pub.find('h3', class_='title')
        title = title_tag.text.strip() if title_tag else 'No title'

        # Look the anchor up once and read href with .get() so that an <a>
        # without an href attribute yields the placeholder instead of KeyError.
        anchor = title_tag.find('a') if title_tag else None
        link = (anchor.get('href') if anchor else None) or 'No link'

        author_names = [tag.text.strip() for tag in pub.find_all('a', class_='link person')]
        authors = ', '.join(author_names) if author_names else 'No authors'

        date_tag = pub.find('span', class_='date')
        publication_date = date_tag.text.strip() if date_tag else 'No date'

        results.append({
            'title': title,
            'link': link,
            'authors': authors,
            'publication_date': publication_date,
            'content': f"{title} {authors} {publication_date}"
        })

    return results

In [10]:
def create_inverted_index(documents):
    """Build an inverted index mapping each term to the documents containing it.

    Args:
        documents: Sequence of dicts, each with a ``'content'`` string.

    Returns:
        A ``defaultdict(list)`` mapping every term produced by
        ``preprocess`` to the list of document positions (indices into
        ``documents``) whose content contains it, in ascending order and
        without duplicates.
    """
    inverted_index = defaultdict(list)

    for doc_id, doc in enumerate(documents):
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # each doc_id is appended at most once per term without rescanning the
        # posting list for every token.
        for term in dict.fromkeys(preprocess(doc['content'])):
            inverted_index[term].append(doc_id)

    return inverted_index

In [11]:
def calculate_tf_idf(documents):
    """Fit a TF-IDF model over the ``'content'`` field of every document.

    Args:
        documents: Sequence of dicts, each with a ``'content'`` string.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the result of
        ``fit_transform`` on the contents, and the fitted
        ``TfidfVectorizer`` (needed later to transform queries).
    """
    corpus = [doc['content'] for doc in documents]
    # token_pattern=None: the custom tokenizer replaces the default regex, and
    # leaving the default in place makes scikit-learn warn that it is ignored.
    vectorizer = TfidfVectorizer(tokenizer=preprocess, token_pattern=None)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

In [12]:
def search(query, inverted_index, documents, tfidf_matrix, vectorizer, top_k=10):
    """Rank documents against a query by TF-IDF cosine similarity.

    Args:
        query: The search string.
        inverted_index: Not used by the ranking; kept so existing callers
            that pass it keep working.
        documents: The documents, in the same order as the rows of
            ``tfidf_matrix``.
        tfidf_matrix: Matrix returned by ``vectorizer.fit_transform``.
        vectorizer: Fitted vectorizer used to transform the query.
        top_k: Maximum number of results to return (default 10).

    Returns:
        A list of ``(document, score)`` tuples sorted by descending score.
        Documents with a score of 0 are omitted, so the list may be shorter
        than ``top_k`` or empty.
    """
    if top_k <= 0:
        # A slice of [-0:] would select every document, so handle this up front.
        return []

    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_k indices and reverse them.
    ranked_indices = scores.argsort()[-top_k:][::-1]
    return [(documents[idx], scores[idx]) for idx in ranked_indices if scores[idx] > 0]

In [13]:
def search_and_display_results():
    """Run the query typed in the entry box and show the hits in the table.

    Reads the query from ``search_entry``, ranks the indexed documents with
    ``search``, clears ``result_tree`` and inserts one row per hit holding
    the title, link, authors, publication date and the score formatted to
    two decimals.
    """
    hits = search(search_entry.get(), inverted_index, documents, tfidf_matrix, vectorizer)

    # Drop the rows left over from the previous search.
    result_tree.delete(*result_tree.get_children())

    for doc, score in hits:
        row_values = (
            doc['title'],
            doc['link'],
            doc['authors'],
            doc['publication_date'],
            f"{score:.2f}",
        )
        result_tree.insert("", "end", values=row_values)
        

In [14]:
def on_treeview_click(event):
    """Open the link of the result row under the mouse pointer in a browser.

    Args:
        event: Tk mouse event; its ``x``/``y`` coordinates locate the row
            inside ``result_tree``.

    Nothing happens when the pointer is not over a row, or when the row's
    link is empty or the ``'No link'`` placeholder.
    """
    item = result_tree.identify('item', event.x, event.y)
    if not item:
        # The click landed on a heading or on empty space: identify() returns
        # an empty id, and indexing that item's values would raise IndexError.
        return

    values = result_tree.item(item, "values")
    if len(values) < 2:
        return

    # The link is the second column of each row.
    link = values[1]
    if link and link != 'No link':
        webbrowser.open(link)

In [15]:
# Page whose publication listing is scraped and indexed.
url = 'https://pureportal.coventry.ac.uk/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds'
documents = crawl_and_parse(url)
if not documents:
    # crawl_and_parse returns [] for a non-200 response or a page without
    # result containers. Stop here with a clear message instead of letting the
    # TF-IDF fit below fail on an empty corpus.
    raise ValueError(f"No publications could be scraped from {url}")
inverted_index = create_inverted_index(documents)
tfidf_matrix, vectorizer = calculate_tf_idf(documents)



In [16]:
# --- Main window -----------------------------------------------------------
root = tk.Tk()
root.title("Search Engine Basics")

# --- Query row: label, entry box and search button --------------------------
search_frame = ttk.Frame(root, padding="10")
search_frame.grid(row=0, column=0, sticky=(tk.W, tk.E))

search_label = ttk.Label(search_frame, text="Enter your search query:")
search_label.grid(row=0, column=0, sticky=tk.W)

search_entry = ttk.Entry(search_frame, width=50)
search_entry.grid(row=0, column=1, sticky=(tk.W, tk.E))

search_button = ttk.Button(search_frame, text="Search", command=search_and_display_results)
search_button.grid(row=0, column=2, sticky=tk.W)

# --- Results table ----------------------------------------------------------
result_frame = ttk.Frame(root, padding="10")
result_frame.grid(row=1, column=0, sticky=(tk.W, tk.E))

# Column name -> pixel width; the name doubles as the heading text.
column_widths = {
    "Title": 200,
    "Link": 200,
    "Authors": 150,
    "Publication Date": 100,
    "Relevance Score": 100,
}
columns = tuple(column_widths)
result_tree = ttk.Treeview(result_frame, columns=columns, show='headings')
for column_name, column_width in column_widths.items():
    result_tree.heading(column_name, text=column_name)
    result_tree.column(column_name, width=column_width)

result_tree.pack(expand=True, fill='both')

# Double-clicking a row opens its link.
result_tree.bind("<Double-1>", on_treeview_click)

# Blocks until the window is closed.
root.mainloop()