# Ανάκτηση Πληροφορίας - Δημιουργία Μηχανής Αναζήτησης
- Μπηλιώνη Παρασκευή <br> Α.Μ. ice20390286
  
- Πλάγου Αικατερίνη  <br> Α.Μ. ice20390191

# 1. Συλλογή δεδομένων

Εισαγωγή και αρχικοποίηση των απαραίτητων βιβλιοθηκών.

In [None]:
# Import libraries
from bs4 import BeautifulSoup  # Web scraping and parsing 
import requests                # Makes HTTP request
import json                    # Handles JSON data
import nltk                    # Text processing
import string                  # Handles strings
import sys                     # System-specific parameters and functions
from nltk.corpus import stopwords       # Handles stopwords in text processing
from nltk.stem import WordNetLemmatizer # Word lemmatization
from collections import defaultdict     # Creates dictionaries 
import ipywidgets as widgets            # Creates widgets
import numpy as np                      # Does computations
from IPython.display import display, Markdown                # Displays widgets and text
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorization
from sklearn.metrics.pairwise import cosine_similarity       # Calculates vector similarity (used in VSM)
from rank_bm25 import BM25Okapi                              # Rankins documents (used in okapi BM25)
from sklearn.metrics import precision_score, recall_score, f1_score  # Calculates engine evaluation metrics
from IPython.display import display, Markdown               
import pandas as pd   

# Fetch the NLTK resources used below: tokenizer models ('punkt'),
# the stopword lists and the WordNet data needed for lemmatization
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer that reduces words to their base forms
lemmatizer = WordNetLemmatizer()
# English stopwords kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))


Επιλογή πηγής από την οποία θα ξεκινήσει η συλλογή εγγράφων. 

In [None]:
# Starting wikipedia link to scrape
wiki_url = 'https://en.wikipedia.org/wiki/Data_analysis'

try:
    # Send request to starting link; the timeout keeps the cell from
    # blocking indefinitely on a stalled connection
    response = requests.get(wiki_url, timeout = 10)
    # Raise an exception for HTTP error status codes
    response.raise_for_status()

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

# Handle any exceptions during the request
except requests.RequestException as e:
    print(f"____Request failed____\n{e}\n")
    # Fall back to an empty document so `soup` is always defined and the
    # following cells simply collect nothing instead of raising NameError
    soup = BeautifulSoup('', 'html.parser')

display(Markdown(f"Data will be scraped from this starting page: [{wiki_url}]({wiki_url})<br><br>"))


Αποθήκευση δεδομένων σε JSON αρχείο.
Σε περίπτωση σφάλματος κατά την διάρκεια της διαδικασίας, εμφανίζεται κατάλληλο μήνυμα λάθους.

In [None]:
def store_things(things, filename):
    """
    Serialize `things` as indented JSON into `filename`.

    The file is opened in write mode, so existing content is replaced.
    Any exception raised while opening or writing is caught and reported
    with a printed message; nothing is returned in either case.
    """
    try:
        with open(filename, 'w') as out_file:
            # Pretty-print with a four space indent
            json.dump(things, out_file, indent = 4)

    except Exception as err:
        # Report the failure instead of propagating it
        print(f"____Error saving____\n{err}")
        

Απόκτηση εγγράφων/συνδέσμων από την κύρια σελίδα.

In [None]:
def get_links(soup, max_links = 70):
    """
    Collect unique Wikipedia article links from a parsed page.

    Parameters:
    - soup: Parsed page; must support find_all('a', href = True).
    - max_links: Maximum number of links to collect (default 70).

    Returns:
    - A list of absolute article URLs in the order they were first seen.

    Each stored link is also printed.
    """
    # Base wikipedia url
    https = 'https://en.wikipedia.org'

    # Non-article namespaces. The trailing ':' matters: without it, regular
    # articles such as '/wiki/Category_theory' would be filtered out too.
    excluded_namespaces = ('Wikipedia', 'Help', 'Special', 'Portal', 'Talk',
                           'Category', 'File', 'Template', 'Template_talk')
    excluded_prefixes = tuple(f'/wiki/{namespace}:' for namespace in excluded_namespaces)

    # Valid links in discovery order, plus a set for fast duplicate checks
    links = []
    seen = set()

    display(Markdown("Links saved: <br>"))
    # Find all anchor tags with 'href' attribute
    for link in soup.find_all('a', href = True):
        # Stop once enough links have been collected
        if len(links) >= max_links:
            break

        # Drop the '#section' fragment so that links to different sections
        # of the same article are not stored as separate pages
        url = link.get('href').split('#', 1)[0]

        # Keep only article links
        if not url.startswith('/wiki/'):
            continue
        if url == '/wiki/Main_Page' or url.startswith(excluded_prefixes):
            continue

        # Construct full wikipedia url
        full_url = f"{https}{url}"

        # Skip links that are already saved
        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)
            print(f"{full_url}\n")

    return links


# Gather the article links found on the starting page and persist them
links = get_links(soup)
store_things(links, 'wikipedia_collected_urls.json')

# Set of pages that were already fetched; it starts out containing the
# starting link
visited_links = {wiki_url}


Συλλογή παραγράφων από κάθε σελίδα. 

In [None]:
def get_paragraphs(soup, visited_paragraphs, link, min_words = 50, max_words = 100):
    """
    Extract medium-sized, previously unseen paragraphs from a parsed page.

    Parameters:
    - soup: Parsed page. NOTE: it is modified in place, the superscript and
      reference-list elements are removed from it.
    - visited_paragraphs: Set of paragraph texts collected so far; every
      newly accepted text is added to it.
    - link: URL stored alongside each accepted paragraph.
    - min_words / max_words: Inclusive word-count bounds (default 50-100).

    Returns:
    - A list of {'text': ..., 'link': ...} dictionaries.
    """
    # Stores paragraphs
    paragraphs = []

    # Remove superscripts and reference lists. 'reflist' is matched as a
    # CSS class; passing it as a tag name never matches any element.
    for element in soup.find_all('sup') + soup.find_all(class_ = 'reflist'):
        element.decompose()

    # Find all paragraph tags
    for p in soup.find_all('p'):
        # Extract text from each paragraph
        text = p.get_text()

        # Skip empty paragraphs and those containing 'displaystyle', which
        # marks rendered mathematical formulas
        if not text or 'displaystyle' in text.lower():
            continue

        # Keep paragraphs within the word-count bounds and avoid duplicates
        word_count = len(text.split())
        if min_words <= word_count <= max_words and text not in visited_paragraphs:
            # Store paragraph with source link
            paragraphs.append({'text': text, 'link': link})
            # Mark the paragraph as visited to avoid repetition
            visited_paragraphs.add(text)

    # Return filtered paragraphs
    return paragraphs


# Texts of all paragraphs stored so far, shared across pages so the same
# paragraph is never collected twice
visited_paragraphs = set()

# Harvest the paragraphs of the starting page first
original_paragraphs = get_paragraphs(soup, visited_paragraphs, wiki_url)

display(Markdown("Paragraphs saved from starting page: <br>"))
for entry in original_paragraphs:
    print(entry['text'])


# 2. Προεπεξεργασία κειμένου (Text Preprocessing)

Διαμόρφωση κειμένου μετά από αφαίρεση λέξεων διακοπής (stopwords) και ειδικών χαρακτήρων, και λημματοποίηση όρων. 

In [None]:
def clean_text(text):
    """
    Turn raw text into a list of normalized tokens.

    The text is lowercased and tokenized with NLTK; tokens that are
    punctuation or not purely alphanumeric are dropped, stopwords are
    removed, and the remaining tokens are lemmatized.

    Returns:
    - The list of lemmatized tokens, in their original order.
    """
    # Lowercasing first makes stopword matching case-insensitive
    words = nltk.word_tokenize(text.lower())

    return [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in string.punctuation
        and word.isalnum()
        and word not in stop_words
    ]

# Preprocess every paragraph of the starting page, keeping the source link
# next to the resulting tokens
cleaned_paragraphs = [
    {'tokens': clean_text(entry['text']), 'link': entry['link']}
    for entry in original_paragraphs
]

display(Markdown("Processed paragraphs from starting page: <br>"))
for entry in cleaned_paragraphs:
    print(f"{entry['tokens']}\n")
    

Λήψη και εξαγωγή παραγράφων από κάθε σύνδεσμο. 
Λαμβάνεται το περιεχόμενο από έναν σύνδεσμο και εξάγονται οι παράγραφοι, σε περίπτωση που δεν έχουν ήδη καταχωρηθεί.

In [None]:
def paragraphs_within_links(link, visited_links, visited_paragraphs, timeout = 10):
    """
    Download a page and extract its paragraphs via get_paragraphs.

    Parameters:
    - link: URL of the page to fetch.
    - visited_links: Set of URLs already processed; `link` is added to it
      after a successful download.
    - visited_paragraphs: Set of paragraph texts already collected.
    - timeout: Seconds to wait for the server before giving up.

    Returns:
    - The list of paragraphs from get_paragraphs, or an empty list when the
      link was already visited or the request failed.
    """
    # Skip the link and return an empty list if it has already been processed
    if link in visited_links:
        return []

    try:
        # The timeout prevents one stalled connection from blocking the
        # whole crawl
        response = requests.get(link, timeout = timeout)
        response.raise_for_status()

    # Exception for request errors
    except requests.RequestException as e:
        print(f"Error retrieving {link}: {e}")
        # Return an empty list if there is an error
        return []

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Mark current link as visited
    visited_links.add(link)

    # Extract and return paragraphs from the page
    return get_paragraphs(soup, visited_paragraphs, link)

display(Markdown("Scraping paragraphs from each link <br>"))

# Progress bar with one step per collected link
progress = widgets.IntProgress(value = 0, min = 0, max = len(links))
display(progress)

# Visit every collected link once; already visited links yield no paragraphs
for position, link in enumerate(links, start = 1):
    link_paragraphs = paragraphs_within_links(link, visited_links, visited_paragraphs)

    # Keep the raw paragraphs ...
    original_paragraphs.extend(link_paragraphs)

    # ... and their preprocessed counterparts, in the same order
    cleaned_paragraphs.extend(
        {'tokens': clean_text(entry['text']), 'link': entry['link']}
        for entry in link_paragraphs
    )

    # Advance the progress bar
    progress.value = position


Αποθήκευση επιλεγμένων παραγράφων στην αρχική και επεξεργασμένη μορφή τους.

In [None]:
# Persist both the raw and the preprocessed paragraphs
store_things(original_paragraphs, 'wikipedia_paragraphs.json')
store_things(cleaned_paragraphs, 'wikipedia_paragraphs_cleaned.json')

# Show the first ten raw paragraphs with their source links
display(Markdown("How some of the paragraphs are saved: <br>"))
for entry in original_paragraphs[:10]:
    print(f"text: {entry['text']}")
    print(f"link: {entry['link']}\n\n")
    

In [None]:
# Show the first ten preprocessed paragraphs with their source links
display(Markdown("How some of the processed and tokenised paragraphs are saved: <br>"))
for entry in cleaned_paragraphs[:10]:
    print(f"tokens: {entry['tokens']}\n")
    print(f"link: {entry['link']}\n\n")
    

# 3. Ευρετήριο (Indexing)

Δημιουργία ανεστραμμένου ευρετήριου από τις επιλεγμένες και επεξεργασμένες παραγράφους και αποθήκευσή του σε αρχείο JSON.

In [None]:
# Build an inverted index from the preprocessed paragraphs
def build_inverted_index(cleaned_paragraphs):
    """
    Map every token to the paragraphs it occurs in.

    Parameters:
    - cleaned_paragraphs: A list of dictionaries, each holding the token list
      of one paragraph under the 'tokens' key.

    Returns:
    - A defaultdict(set) mapping each token to the set of paragraph IDs that
      contain it. A paragraph ID is the paragraph's position in the list.
    """
    index = defaultdict(set)

    # The enumeration position doubles as the paragraph ID
    for position, entry in enumerate(cleaned_paragraphs):
        for term in entry['tokens']:
            index[term].add(position)

    return index


# Function to store an inverted index in a JSON file
def store_inverted_index(inverted_index, filename = 'inverted_index.json'):
    """
    Stores an inverted index into a JSON file.

    Parameters:
    - inverted_index: A dictionary where each key is a term and the value is
      a set of paragraph IDs.
    - filename: Destination file (defaults to 'inverted_index.json').

    Data Conversion:
    - Sets are not JSON serializable, so every set is converted to a sorted
      list. Sorting keeps the stored file identical between runs; plain
      set iteration order is not guaranteed.

    Exception Handling:
    - Catches any exception raised during conversion or file writing and
      prints an error message.
    """
    try:
        # Convert sets to sorted lists to save in JSON file
        serializable_index = {
            term: sorted(paragraph_ids)
            for term, paragraph_ids in inverted_index.items()
        }

        # Save the converted index to the file
        with open(filename, 'w') as file:
            json.dump(serializable_index, file, indent = 4)

    except Exception as e:
        print(f"____Error saving inverted index____\n{e}")

# Create the inverted index for the collected paragraphs and save it
inverted_index = build_inverted_index(cleaned_paragraphs)
store_inverted_index(inverted_index)

# Do not truncate the paragraph ID sets when the table is rendered
pd.set_option('display.max_colwidth', None)

# One table row per token: the token and the IDs of its paragraphs
index = list(inverted_index.items())
inverted_df = pd.DataFrame(index, columns = ['Token', 'Paragraph ID'])

display(Markdown("Inverted Index <br>"))
display(inverted_df)


# 4. Μηχανή αναζήτησης (Search Engine) 

α) Επεξεργασία ερωτήματος (Query Processing)

Μετατροπή ερωτήματος από infix σε postfix μορφή και ό,τι δεν συμπεριλαμβάνεται στην άλγεβρα Boole, γίνεται lemmatized. 

In [None]:
def infix_to_postfix(query):
    """
    Convert a Boolean query from infix to postfix notation.

    Operators are the uppercase words NOT, AND and OR (in decreasing
    precedence); parentheses group sub-expressions and may be written with
    or without surrounding spaces. Every other token is lowercased and
    lemmatized, and kept only if it is purely alphanumeric.

    Unbalanced parentheses are tolerated: a stray ')' is ignored and an
    unclosed '(' is discarded instead of leaking into the output.

    Returns:
    - The list of terms and operators in postfix order.
    """
    # Define operator precedence (higher value means higher precedence)
    precedence = {'NOT': 3, 'AND': 2, 'OR': 1}

    # Output and operator stack
    output = []
    operators = []

    # Pad parentheses with spaces so that "(data AND analysis)" tokenizes the
    # same way as "( data AND analysis )"
    tokens = query.replace('(', ' ( ').replace(')', ' ) ').split()

    # Process each token
    for token in tokens:
        if token in precedence:
            # NOT is a unary prefix operator, so it must never pop operators
            # that precede it (otherwise "NOT NOT x" becomes invalid postfix).
            # Binary operators pop everything of higher or equal precedence.
            if token != 'NOT':
                while operators and precedence.get(operators[-1], 0) >= precedence[token]:
                    output.append(operators.pop())
            # Push the current operator to the stack
            operators.append(token)

        # If the token is left parenthesis push it onto the stack
        elif token == '(':
            operators.append(token)

        # If the token is right parenthesis pop until the matching left parenthesis
        elif token == ')':
            while operators and operators[-1] != '(':
                output.append(operators.pop())
            # Remove the matching left parenthesis, if there is one
            if operators:
                operators.pop()

        # If the token is a word or other characters
        else:
            # Lemmatize and turn into lowercase
            token = lemmatizer.lemmatize(token.lower())
            # Keep only purely alphanumeric terms
            if token.isalnum():
                output.append(token)

    # Pop any remaining operators from the stack and append them to output,
    # skipping left parentheses that were never closed
    while operators:
        operator = operators.pop()
        if operator != '(':
            output.append(operator)

    # Return the query in postfix notation
    return output


display(Markdown("Example of how queries are processed <br>"))
test_query = "( data AND analysis ) OR NOT science"
# Convert the query from infix to postfix notation
test_postfix_query = infix_to_postfix(test_query)

# Print the original query and its postfix conversion
print(f"Original query (Infix):    {test_query}")
print(f"Converted query (Postfix): {test_postfix_query}\n")


Συλλογή παραγράφων με βάση το ήδη επεξεργασμένο ερώτημα. 

In [None]:
def evaluate_postfix(postfix_tokens, inverted_index, num_paragraphs):
    """
    Evaluate a postfix Boolean query against an inverted index.

    Parameters:
    - postfix_tokens: Terms and the operators 'AND', 'OR', 'NOT' in postfix
      order.
    - inverted_index: Mapping from term to the set of matching paragraph IDs.
    - num_paragraphs: Size of the collection; NOT is computed against the IDs
      0 .. num_paragraphs - 1.

    Returns:
    - A new set with the IDs of the matching paragraphs (empty if nothing
      matches). The sets stored in the index are never modified or returned.

    Lenient handling of imperfect queries:
    - An operator that lacks its operand(s) is ignored instead of raising.
    - Operands left without a connecting operator (e.g. the query
      "data analysis") are combined with OR, so every term contributes
      instead of only the last one.
    """
    # Operand stack of paragraph ID sets
    stack = []

    # Universe of paragraph IDs, needed for NOT
    all_paragraphs = set(range(num_paragraphs))

    for token in postfix_tokens:
        if token == 'AND' or token == 'OR':
            # A binary operator needs two operands; skip it otherwise
            if len(stack) < 2:
                continue
            right = stack.pop()
            left = stack.pop()
            # AND is set intersection, OR is set union
            stack.append(left & right if token == 'AND' else left | right)

        elif token == 'NOT':
            # A unary operator needs one operand; skip it otherwise
            if not stack:
                continue
            # Push the paragraphs that are not in the operand
            stack.append(all_paragraphs - stack.pop())

        # If the token is a search term retrieve the matching paragraph IDs
        else:
            stack.append(inverted_index.get(token, set()))

    # Combine whatever is left with OR; for a well-formed query this is the
    # single result set
    results = set()
    for operand in stack:
        results |= operand
    return results


# Evaluate the example query against the index of collected paragraphs
test_matching_paragraphs = evaluate_postfix(
    test_postfix_query,
    inverted_index,
    len(cleaned_paragraphs)
)
display(Markdown("Matching paragraph IDs for example query:"))
print(test_matching_paragraphs)


<br>β) Κατάταξη αποτελεσμάτων (Ranking)

Δημιουργία πίνακα TF-IDF με βάση τα αποτελέσματα του ερωτήματος. 

In [None]:
def tf_idf(results, cleaned_paragraphs):
    """
    Build a TF-IDF matrix for the paragraphs matched by a query.

    Parameters:
    - results: Collection of matching paragraph IDs.
    - cleaned_paragraphs: Container indexed by paragraph ID whose entries
      hold the paragraph tokens under the 'tokens' key.

    Returns:
    - (tfidf_matrix, filtered_paragraphs, filtered_ids, vectorizer), where
      filtered_paragraphs are the space-joined token strings and filtered_ids
      the corresponding IDs in ascending order.
    - (None, [], [], None) when there are no results.
    """
    # Return nothing if there are no results
    if not results:
        return None, [], [], None

    # Sort the IDs: iterating a set directly gives an unspecified order,
    # which would make result order and tie-breaking differ between runs
    filtered_ids = sorted(results)

    # Join the tokens of each paragraph into a single string for processing
    filtered_paragraphs = [
        ' '.join(cleaned_paragraphs[paragraph_id]['tokens'])
        for paragraph_id in filtered_ids
    ]

    # Initialize and compute the TF-IDF matrix of the paragraphs
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(filtered_paragraphs)

    # Return the TF-IDF matrix, paragraphs, their IDs and TF-IDF vectorizer
    return tfidf_matrix, filtered_paragraphs, filtered_ids, vectorizer
    

Κατάταξη αποτελεσμάτων με τον αλγόριθμο Vector Space Model και επιστροφή της παραγράφου με τη μεγαλύτερη βαθμολόγηση από κάθε σχετικό σύνδεσμο.

In [None]:
def vector_space_model(cleaned_query, tfidf_matrix, original_paragraphs, filtered_ids, vectorizer):
    """
    Rank the matched paragraphs by cosine similarity to the query.

    The query tokens are joined into one string, projected with the given
    fitted vectorizer and compared against every row of `tfidf_matrix`.
    Only the best scoring paragraph of each link is kept.

    Returns:
    - A list of dictionaries with the keys 'paragraph_id', 'link', 'text'
      and 'score', sorted by score from highest to lowest.
    """
    # Project the query into the TF-IDF space of the matched paragraphs
    query_vector = vectorizer.transform([' '.join(cleaned_query)])

    # One similarity value per row of the matrix
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Best result seen so far for every link
    best_per_link = {}

    for paragraph_id, score in zip(filtered_ids, similarities):
        # The original paragraph carries the text and link to display
        source = original_paragraphs[paragraph_id]
        link = source['link']

        # Keep the current entry unless this paragraph scores strictly higher
        current = best_per_link.get(link)
        if current is not None and not score > current['score']:
            continue

        best_per_link[link] = {
            'paragraph_id': paragraph_id,
            'link': link,
            'text': source['text'],
            'score': score
        }

    # Highest score first
    return sorted(
        best_per_link.values(),
        key = lambda entry: entry['score'],
        reverse = True
    )


Κατάταξη αποτελεσμάτων με τον αλγόριθμο Okapi BM25 και επιστροφή της παραγράφου με τη μεγαλύτερη βαθμολόγηση από κάθε σχετικό σύνδεσμο.

In [None]:
def okapi_bm25(cleaned_query, filtered_paragraphs, filtered_ids, original_paragraphs):
    """
    Rank the matched paragraphs with Okapi BM25.

    Each string in `filtered_paragraphs` is split on single spaces to recover
    its tokens, a BM25Okapi model is built on that corpus and scored against
    the query tokens. Only the best scoring paragraph of each link is kept.

    Returns:
    - A list of dictionaries with the keys 'paragraph_id', 'link', 'text'
      and 'score', sorted by score from highest to lowest.
    """
    # Recover the token lists from the space-joined paragraph strings
    corpus = [paragraph.split(' ') for paragraph in filtered_paragraphs]

    # Build the model on the matched paragraphs and score the query
    scores = BM25Okapi(corpus).get_scores(cleaned_query)

    # Best result seen so far for every link
    best_per_link = {}

    for paragraph_id, score in zip(filtered_ids, scores):
        # The original paragraph carries the text and link to display
        source = original_paragraphs[paragraph_id]
        link = source['link']

        # Keep the current entry unless this paragraph scores strictly higher
        current = best_per_link.get(link)
        if current is not None and not score > current['score']:
            continue

        best_per_link[link] = {
            'paragraph_id': paragraph_id,
            'link': link,
            'text': source['text'],
            'score': score
        }

    # Highest score first
    return sorted(
        best_per_link.values(),
        key = lambda entry: entry['score'],
        reverse = True
    )


Δημιουργία μηχανής αναζήτησης (Search Engine) με δυνατότητα επιλογής αλγόριθμου αναζήτησης και εισαγωγής ερωτήματος από τον χρήστη. 

In [None]:
def search_engine(inverted_index, original_paragraphs, cleaned_paragraphs):
    """
    Display an interactive search interface.

    The interface consists of a ranking algorithm selector, a query text box,
    a search button and an output area. Clicking the button evaluates the
    Boolean query against `inverted_index` and prints one paragraph per link,
    either unranked (Boolean retrieval) or ranked (Vector Space Model or
    Okapi BM25).

    Parameters:
    - inverted_index: Mapping from token to the set of matching paragraph IDs.
    - original_paragraphs: Paragraph dictionaries with 'text' and 'link',
      indexed by paragraph ID.
    - cleaned_paragraphs: Preprocessed paragraphs with 'tokens', indexed by
      paragraph ID.
    """
    # Select the ranking algorithm
    toggle_buttons = widgets.ToggleButtons(
        options = ['Boolean retrieval', 'Vector Space Model', 'Okapi BM25'],
        description = 'Select ranking algorithm'
    )

    space = widgets.HTML(value = '<br>')

    # Enter search queries
    input_text = widgets.Text(
        placeholder = 'Input search query here',
        layout = widgets.Layout(width = '70%')
    )

    # Button to activate searching
    search_button = widgets.Button(
        description = 'Search',
        button_style = 'primary'
    )

    # Display search results
    output = widgets.Output()

    # Prints ranked results (shared by both ranking algorithms)
    def show_ranked(ranked_results):
        displayed_links = set()
        for result in ranked_results:
            displayed_links.add(result['link'])
            print(f"Link: {result['link']}      (Score: {result['score']:.3f})\n{result['text']}")
        print(f"Total links shown: {len(displayed_links)}\n")

    # Handles the search when the button is clicked
    def search(b):
        # Clear previous results
        output.clear_output()
        # Get the selected ranking algorithm
        algorithm = toggle_buttons.value
        # Get query that user entered
        query = input_text.value

        # Everything printed from here on goes to the output widget
        with output:
            try:
                # Convert the query to postfix and evaluate using the inverted index
                postfix_query = infix_to_postfix(query)
                results = evaluate_postfix(postfix_query, inverted_index, len(cleaned_paragraphs))

                # Apply TF-IDF transformation on the resulting paragraphs
                tfidf_matrix, filtered_paragraphs, filtered_ids, vectorizer = tf_idf(results, cleaned_paragraphs)

            # IndexError: an operator without operands can empty the operand
            # stack while the query is evaluated.
            # ValueError: presumably raised by the vectorizer when no usable
            # terms remain - TODO confirm against scikit-learn.
            except (IndexError, ValueError) as e:
                print(f"Could not process query '{query}': {e}")
                return

            # If no results match the query
            if not filtered_paragraphs:
                print(f"No results found for '{query}' using {algorithm}.")
                return

            # Process query for ranking
            cleaned_query = clean_text(query)

            print(f"Total matching paragraphs: {len(filtered_ids)}\n")

            # Display results with Boolean retrieval: the first matching
            # paragraph of every link, unranked
            if algorithm == 'Boolean retrieval':
                displayed_links = set()
                for paragraph_id in filtered_ids:
                    original = original_paragraphs[paragraph_id]
                    if original['link'] not in displayed_links:
                        displayed_links.add(original['link'])
                        print(f"Link: {original['link']}\n{original['text']}")
                print(f"Total links shown: {len(displayed_links)}\n")

            # Display ranked results with VSM
            elif algorithm == 'Vector Space Model':
                show_ranked(vector_space_model(cleaned_query, tfidf_matrix, original_paragraphs, filtered_ids, vectorizer))

            # Display ranked results with okapi BM25
            elif algorithm == 'Okapi BM25':
                show_ranked(okapi_bm25(cleaned_query, filtered_paragraphs, filtered_ids, original_paragraphs))

    # Connect the search button to the search function
    search_button.on_click(search)

    # Display widgets
    display(widgets.VBox([toggle_buttons, space, input_text, search_button, output]))

# Show the search interface for the collected Wikipedia paragraphs
search_engine(inverted_index, original_paragraphs, cleaned_paragraphs)


# 5. Αξιολόγηση συστήματος

Εισαγωγή και επεξεργασία δεδομένων από το CISI dataset. 

In [None]:
# Read and process CISI documents
def read_documents(fp = '/Users/vivh/ergasia/cisi/CISI.ALL/CISI.ALL'):
    """
    Parse the CISI document file into a list of documents.

    Parameters:
    - fp: Path to the CISI.ALL file. The default is the location used by the
      original author; pass your own path when running elsewhere.

    Returns:
    - A list of {'id': ..., 'text': ...} dictionaries. The text is the
      concatenation of all fields between the '.I' line and the '.X' field.

    The parsed documents are also saved to 'cisi_documents.json'.
    """
    with open(fp, 'r') as f:
        # Every line starting with '.' (a field tag such as .I or .X) begins
        # a new merged line; all other lines continue the current one
        pieces = []
        for a_line in f:
            separator = '\n' if a_line.startswith('.') else ' '
            pieces.append(separator + a_line.strip())

    # Join once at the end instead of growing a string inside the loop
    merged = ''.join(pieces)

    # Store processed documents
    documents = []
    # Temporary storage for document text
    content = ''
    # Temporary storage for document ID
    doc_id = ''

    # Process content line by line
    for a_line in merged.split('\n'):
        # New document identifier
        if a_line.startswith('.I'):
            # Save previous document
            if doc_id and content:
                documents.append({'id': doc_id, 'text': content.strip()})

            # Extract document ID
            doc_id = a_line.split(' ')[1].strip()
            # Reset for the new document
            content = ''

        # End of document identifier
        elif a_line.startswith('.X'):
            if doc_id and content:
                documents.append({'id': doc_id, 'text': content.strip()})

            doc_id = ''
            content = ''

        else:
            # Add the content without its first three characters (the tag
            # and the space that follows it)
            content += a_line[3:].strip() + ' '

    # Last document in the file
    if doc_id and content:
        documents.append({'id': doc_id, 'text': content.strip()})

    # Save processed documents to a JSON file
    store_things(documents, 'cisi_documents.json')
    return documents


# Read and process CISI queries
def read_queries(fp = '/Users/vivh/ergasia/cisi/CISI.QRY'):
    """
    Parse the CISI query file into a list of queries.

    Parameters:
    - fp: Path to the CISI.QRY file. The default is the location used by the
      original author; pass your own path when running elsewhere.

    Returns:
    - A list of {'id': ..., 'text': ...} dictionaries. Only the '.W' and '.T'
      fields contribute to the text; other fields are ignored.

    The parsed queries are also saved to 'cisi_queries.json'.
    """
    with open(fp, 'r') as f:
        # Every line starting with '.' (a field tag) begins a new merged
        # line; all other lines continue the current one
        pieces = []
        for a_line in f:
            separator = '\n' if a_line.startswith('.') else ' '
            pieces.append(separator + a_line.strip())

    # Join once at the end instead of growing a string inside the loop
    merged = ''.join(pieces)

    # Store processed queries
    queries = []
    # Temporary storage for query text and query ID
    content = ''
    qry_id = ''

    for a_line in merged.split('\n'):
        # New query identifier
        if a_line.startswith('.I'):
            # Save previous query
            if qry_id and content:
                queries.append({'id': qry_id, 'text': content.strip()})

            qry_id = a_line.split(' ')[1].strip()
            content = ''

        # Keep the '.W' and '.T' fields, without the tag itself
        elif a_line.startswith('.W') or a_line.startswith('.T'):
            content += a_line.strip()[3:] + ' '

    # Last query in the file
    if qry_id and content:
        queries.append({'id': qry_id, 'text': content.strip()})

    store_things(queries, 'cisi_queries.json')
    return queries


# Function to read and process CISI relevance mappings
def read_mappings(fp = '/Users/vivh/ergasia/cisi/CISI.REL'):
    """
    Parse the CISI relevance file into query/document pairs.

    Parameters:
    - fp: Path to the CISI.REL file. The default is the location used by the
      original author; pass your own path when running elsewhere.

    Returns:
    - A list of {'query_id': ..., 'doc_id': ...} dictionaries built from the
      first two whitespace-separated columns of every line.

    The mappings are also saved to 'cisi_mappings.json'.
    """
    # Store processed mappings
    mappings = []

    with open(fp, 'r') as f:
        # Read file line by line
        for a_line in f:
            # Split the line into components
            voc = a_line.split()

            # Skip blank or incomplete lines instead of failing on them
            if len(voc) < 2:
                continue

            # First column is the query ID, second the document ID
            mappings.append({'query_id': voc[0], 'doc_id': voc[1]})

    store_things(mappings, 'cisi_mappings.json')
    return mappings


In [None]:
# Load the CISI documents and preview the first ten
documents = read_documents()

display(Markdown("How some of the CISI documents are saved: <br>"))
for document in documents[:10]:
    print(f"id: {document['id']}")
    print(f"text: {document['text']}\n")

In [None]:
# Load the CISI queries and preview the first ten
queries = read_queries()

display(Markdown("How some of the CISI queries are saved: <br>"))
for cisi_query in queries[:10]:
    print(f"id: {cisi_query['id']}")
    print(f"text: {cisi_query['text']}\n")

In [None]:
# Load the CISI relevance mappings
mappings = read_mappings()

display(Markdown("How CISI mappings are saved: <br>"))
# Do not truncate cell contents when the table is rendered
pd.set_option('display.max_colwidth', None)
# Show the mappings as a table with one row per query/document pair
mappings_df = pd.DataFrame(mappings)
display(mappings_df)


Υπολογισμός ακρίβειας, ανάκλησης, F1-score και μέσης ακρίβειας με αλγόριθμο αναζήτησης Okapi BM25 για τα έγγραφα και ερωτήματα του CISI dataset. 

In [None]:
def testing(documents, queries, mappings):
    """
    Evaluate the search pipeline with Okapi BM25 on a test collection.

    Parameters:
    - documents: List of {'id', 'text'} dictionaries.
    - queries: List of {'id', 'text'} dictionaries.
    - mappings: List of {'query_id', 'doc_id'} relevance judgments.

    For every query the documents are retrieved through the inverted index,
    ranked with BM25, and precision, recall and F1-score are displayed. The
    Mean Average Precision over all queries is displayed at the end. Queries
    without matches or without relevance judgments count with an average
    precision of 0. Nothing is returned.
    """
    # Clean and tokenize every document, keyed by document ID
    cleaned_paragraphs = {
        doc['id']: {'tokens': clean_text(doc['text'])}
        for doc in documents
    }

    # Build the inverted index
    inverted_index = defaultdict(set)
    for doc_id, doc in cleaned_paragraphs.items():
        for term in doc['tokens']:
            inverted_index[term].add(doc_id)

    # Group the relevance judgments by query once, instead of scanning the
    # whole mapping list again for every query
    relevant_by_query = defaultdict(set)
    for mapping in mappings:
        relevant_by_query[mapping['query_id']].add(mapping['doc_id'])

    # Store average precision for each query
    average_precisions = []

    for query in queries:
        query_id = query['id']
        query_text = query['text']

        # Preprocess the query text
        cleaned_query = ' '.join(clean_text(query_text))

        # Convert the query to postfix and evaluate using the inverted index
        postfix_query = infix_to_postfix(cleaned_query)
        results = evaluate_postfix(postfix_query, inverted_index, len(cleaned_paragraphs))

        # Apply TF-IDF transformation on the resulting paragraphs
        tfidf_matrix, filtered_paragraphs, filtered_ids, vectorizer = tf_idf(results, cleaned_paragraphs)

        if not filtered_paragraphs:
            print(f"\nQuery ID: {query_id}")
            print(f"Query: {query_text}")
            print("No matches found.\n")
            average_precisions.append(0)
            continue

        # Score the matched documents with BM25
        tokenized_docs = [cleaned_paragraphs[doc_id]['tokens'] for doc_id in filtered_ids]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = bm25.get_scores(postfix_query)

        # Rank documents based on BM25 scores
        ranked_results = sorted(
            zip(filtered_ids, bm25_scores),
            key = lambda x: x[1],
            reverse = True
        )

        # Extract retrieved document IDs with scores > 0
        retrieved_docs = [doc_id for doc_id, score in ranked_results if score > 0]

        # Relevant documents for this query (empty set if none are known)
        relevant_docs = relevant_by_query.get(query_id, set())

        # Calculate precision at each relevant document's position
        true_positives = 0
        precisions = []

        for i, doc_id in enumerate(retrieved_docs, start = 1):
            if doc_id in relevant_docs:
                true_positives += 1
                precisions.append(true_positives / i)

        if retrieved_docs:
            precision = true_positives / len(retrieved_docs)
        else:
            precision = 0

        if relevant_docs:
            ap = sum(precisions) / len(relevant_docs)
            recall = true_positives / len(relevant_docs)
        else:
            ap = 0
            # Without this assignment recall would be undefined for the first
            # such query, or silently reuse the previous query's value
            recall = 0

        average_precisions.append(ap)

        # Named `f1` so the imported sklearn `f1_score` is not shadowed
        if precision + recall > 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0

        # Print metrics for the current query
        display(Markdown("Query"))
        print(f"{query_text}")
        display(Markdown("Matching document IDs"))
        print(f"{retrieved_docs}")
        display(Markdown(f"Precision: {precision:.3f}"))
        display(Markdown(f"Recall: {recall:.3f}"))
        display(Markdown(f"F1-Score: {f1:.3f}<br><br>"))

    # Calculate Mean Average Precision
    map_score = sum(average_precisions) / len(average_precisions) if average_precisions else 0
    display(Markdown(f"Mean Average Precision: {map_score:.3f}"))
    

# Run the evaluation on the CISI documents, queries and relevance mappings
# loaded in the cells above
display(Markdown("Using the CISI dataset to evaluate search engine<br>"))
testing(documents, queries, mappings)
