Load Data

In [None]:
import pickle

def load_artifacts(artifacts_dir="./artifacts"):
    """Load the pickled document vectors and IDF scores from disk.

    Args:
        artifacts_dir: Directory that holds ``cranfield_vectors.pkl`` and
            ``cranfield_idf.pkl``. Defaults to ``"./artifacts"``, the
            location this function has always read from.

    Returns:
        A ``(doc_vectors, idf_scores)`` tuple containing whatever objects
        were pickled into the two files, or ``(None, None)`` when either
        file cannot be found. Callers must check for ``None`` before use.
    """
    # Local import keeps this block self-contained within its notebook cell.
    from pathlib import Path

    print("--- Loading system artifacts... ---")
    base_dir = Path(artifacts_dir)

    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load artifacts produced by the trusted index builder.
        with open(base_dir / "cranfield_vectors.pkl", 'rb') as f:
            doc_vectors = pickle.load(f)

        with open(base_dir / "cranfield_idf.pkl", 'rb') as f:
            idf_scores = pickle.load(f)
    except FileNotFoundError:
        # Any other error (corrupt pickle, permissions) is left to propagate
        # so that it is not mistaken for "index not built yet".
        print("Error: Could not find artifact files. Please run the '1_Index_Builder.ipynb' first.")
        return None, None
    else:
        print("Artifacts loaded successfully.")
        return doc_vectors, idf_scores

In [None]:
# Both names are None if the artifact files could not be found.
document_vectors, idf_scores = load_artifacts()

Load Queries

In [None]:
def load_queries(filepath):
    """
    Read a Cranfield-style query file (.qry) into a ``{query_id: text}`` dict.

    A line starting with '.I' opens a new query; the integer that follows it
    becomes the dictionary key. Lines starting with '.W' are markers and are
    skipped. Every other line is collected as text of the query currently
    open. The stored text is stripped at both ends and has its newlines
    replaced by single spaces.

    NOTE(review): keys are the literal numbers found on the '.I' lines.
    Verify that they match the query IDs used by the relevance-judgment file.
    """
    # Sentinel meaning "no '.I' line has been seen yet".
    not_started = -1

    parsed = {}
    active_id = not_started
    buffered_lines = []

    with open(filepath, 'r') as handle:
        for raw_line in handle:
            # '.W' only announces the text section; nothing to keep.
            if raw_line.startswith('.W'):
                continue

            # Ordinary line: part of the text of the query being read.
            if not raw_line.startswith('.I'):
                buffered_lines.append(raw_line)
                continue

            # '.I' line: store the query that just ended (if any), then
            # switch to the new ID with an empty buffer.
            if active_id != not_started:
                parsed[active_id] = "".join(buffered_lines).strip().replace('\n', ' ')
            active_id = int(raw_line.split()[1])
            buffered_lines = []

    # The last query has no following '.I' line to trigger its save.
    if active_id != not_started:
        parsed[active_id] = "".join(buffered_lines).strip().replace('\n', ' ')

    return parsed

In [None]:
queries = load_queries("./dataset/cran.qry")

# Sanity check: show the text of query IDs 1 through 4, skipping any that
# are absent from the file.
for i in range(1, 5):
    if i not in queries:
        continue
    print(f"Query ID {i}: {queries[i]}")

Load Relevance Judgments

In [None]:
from collections import defaultdict

def load_relevance_judgments(filepath):
    """
    Parse a cranqrel-style file into a mapping of query ID to relevant doc IDs.

    Each non-blank line must hold exactly three whitespace-separated
    integers: a query ID, a document ID and a relevance score
    (e.g. "1 184 2"). Blank or whitespace-only lines are skipped.

    NOTE(review): the relevance score is read but discarded, so every listed
    document counts as relevant whatever its grade. Confirm this is intended.

    Args:
        filepath: Path to the relevance-judgment file.

    Returns:
        A ``defaultdict(list)`` mapping each query ID to the list of document
        IDs judged for it, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            integer fields.
    """
    qrels = defaultdict(list)

    with open(filepath, 'r') as f:
        for line in f:
            fields = line.split()

            # A trailing newline or spacer line carries no judgment; without
            # this guard the three-way unpack below would raise ValueError.
            if not fields:
                continue

            # The third field (relevance score) is intentionally unused.
            query_id, doc_id, _ = map(int, fields)
            qrels[query_id].append(doc_id)

    return qrels

In [None]:
# Ground truth: maps each query ID to the document IDs judged for it.
relevance_data = load_relevance_judgments("./dataset/cranqrel")

# Report how many queries were parsed and how many of them have judgments.
print(
    f"\nLoaded {len(queries)} queries and relevance judgments for {len(relevance_data)} queries."
)

Performance Metrics

In [None]:
def calculate_metrics(retrieved_docs, relevant_docs):
    """
    Calculate Precision, Recall and F1-Score for a single query.

    Both inputs are de-duplicated first, and every count (numerator and
    denominators) is taken from the resulting sets. A document ID listed
    twice therefore cannot deflate precision or recall.

    Args:
        retrieved_docs: Iterable of hashable document IDs returned by the
            system for the query.
        relevant_docs: Iterable of hashable document IDs judged relevant
            for the query.

    Returns:
        A dict with float values under the keys "precision", "recall" and
        "f1_score". A metric whose denominator would be zero is 0.0.
    """
    retrieved_set = set(retrieved_docs)
    relevant_set = set(relevant_docs)

    # True positives: documents that were both retrieved and relevant.
    true_positives = len(retrieved_set & relevant_set)

    # Precision = relevant retrieved / distinct retrieved.
    precision = true_positives / len(retrieved_set) if retrieved_set else 0.0

    # Recall = relevant retrieved / distinct relevant.
    recall = true_positives / len(relevant_set) if relevant_set else 0.0

    # F1 is the harmonic mean of precision and recall; it is left at 0.0
    # when both are zero to avoid dividing by zero.
    f1_score = 0.0
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

In [None]:
from retrieval import rank_documents
from retrieval import preprocess_text

# Per-query metric dictionaries, one entry for every query evaluated.
all_metrics = []

# Rank cutoff: only the top K ranked documents are scored (e.g. Precision@10).
K = 10 

print("--- Starting full evaluation... ---")

# Every input must exist AND hold real data. load_artifacts() returns
# (None, None) when the index is missing, so a name-exists check alone would
# let the loop run against None.
required_inputs = ("document_vectors", "idf_scores", "queries", "relevance_data")
missing_inputs = [name for name in required_inputs if globals().get(name) is None]

if missing_inputs:
    print(f"Evaluation skipped. Missing or unloaded inputs: {', '.join(missing_inputs)}")
else:
    for query_id, query_text in queries.items():

        # 1. Preprocess the query; preprocess_text takes a list of texts, so
        #    wrap the single query and take the first result.
        query_tokens = preprocess_text([query_text])[0]

        # 2. Rank the collection for this query.
        #    NOTE(review): presumably returns (doc_id, score) pairs ordered
        #    best-first; confirm against retrieval.rank_documents.
        ranked_results = rank_documents(query_tokens, document_vectors, idf_scores)

        # 3. Keep only the document IDs of the first K results.
        retrieved_top_k_ids = [doc_id for doc_id, _score in ranked_results[:K]]

        # 4. Ground truth for this query; .get() avoids inserting an empty
        #    entry when the query has no relevance judgments.
        ground_truth_ids = relevance_data.get(query_id, [])

        # 5. Score this query and record the result.
        all_metrics.append(calculate_metrics(retrieved_top_k_ids, ground_truth_ids))

    print(f"Evaluation complete. Processed {len(all_metrics)} queries.")

In [None]:
# Summarise the run, or explain that there is nothing to summarise.
if not all_metrics:
    print("No metrics were calculated. Please ensure the evaluation loop ran correctly.")
else:
    # Macro-average: each evaluated query contributes equally to the mean.
    mean_precision = sum(entry['precision'] for entry in all_metrics) / len(all_metrics)
    mean_recall = sum(entry['recall'] for entry in all_metrics) / len(all_metrics)
    mean_f1_score = sum(entry['f1_score'] for entry in all_metrics) / len(all_metrics)

    # Fixed-width report, rounded to four decimal places.
    print("\n--- System Performance Summary ---")
    print(f"Metrics calculated at K={K}")
    print("-" * 30)
    print(f"Mean Precision: {mean_precision:.4f}")
    print(f"Mean Recall:    {mean_recall:.4f}")
    print(f"Mean F1-Score:  {mean_f1_score:.4f}")
    print("-" * 30)