Load Data

In [None]:
import os
import pickle

def load_artifacts(artifacts_dir="./artifacts"):
    """Load the pickled document vectors and IDF scores from disk.

    Args:
        artifacts_dir: Directory containing ``cranfield_vectors.pkl`` and
            ``cranfield_idf.pkl``. Defaults to ``./artifacts``, so calling
            the function with no arguments reads the same files as before.

    Returns:
        A ``(doc_vectors, idf_scores)`` tuple holding whatever objects were
        pickled into the two files, or ``(None, None)`` when either file
        does not exist (an error message is printed in that case).
    """
    print("--- Loading system artifacts... ---")
    vectors_path = os.path.join(artifacts_dir, "cranfield_vectors.pkl")
    idf_path = os.path.join(artifacts_dir, "cranfield_idf.pkl")

    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load artifacts that were produced locally / are trusted.
        with open(vectors_path, 'rb') as f:
            doc_vectors = pickle.load(f)

        with open(idf_path, 'rb') as f:
            idf_scores = pickle.load(f)
    except FileNotFoundError:
        # Both artifacts are needed together, so a single missing file is
        # reported as a failure to load either of them.
        print("Error: Could not find artifact files. Please run the '1_Index_Builder.ipynb' first.")
        return None, None

    print("Artifacts loaded successfully.")
    return doc_vectors, idf_scores

In [None]:
# Bind the loaded artifacts to names so they can be used afterwards. A bare
# call would throw both objects away and echo their full repr as the cell
# output. Both names are None if the artifact files were not found.
doc_vectors, idf_scores = load_artifacts()

Load queries

In [None]:
def load_queries(filepath):
    """
    Read a Cranfield-style query file into a ``{query_id: query_text}`` dict.

    A line starting with '.I' opens a new query; its second whitespace-separated
    token is converted with int() and used as the key. Lines starting with '.W'
    are skipped. Every other line is collected as text of the query currently
    open. When a query is closed (by the next '.I' or by end of file) its text
    is stripped and its remaining newlines are replaced by single spaces.

    Text that appears before the first '.I' line is discarded. If the same ID
    occurs more than once, the later entry overwrites the earlier one.
    """
    # Sentinel meaning "no '.I' line has been seen yet".
    NO_QUERY = -1

    def _flatten(fragments):
        # Join the raw lines, trim the ends, and fold line breaks into spaces.
        return "".join(fragments).strip().replace('\n', ' ')

    parsed = {}
    active_id = NO_QUERY
    fragments = []

    with open(filepath, 'r') as handle:
        for raw_line in handle:
            # '.W' only announces that text follows; it carries no content.
            if raw_line.startswith('.W'):
                continue

            # Anything that is not an '.I' header is body text.
            if not raw_line.startswith('.I'):
                fragments.append(raw_line)
                continue

            # New '.I' header: store the query we were collecting, if any,
            # then start over with the new ID and an empty buffer.
            if active_id != NO_QUERY:
                parsed[active_id] = _flatten(fragments)
            active_id = int(raw_line.split()[1])
            fragments = []

    # The last query has no following '.I' line to trigger its save.
    if active_id != NO_QUERY:
        parsed[active_id] = _flatten(fragments)

    return parsed

In [None]:
queries = load_queries("./dataset/cran.qry")

# Preview the first 3 loaded queries as a quick sanity check. Slicing the
# parsed items (dicts keep insertion order, i.e. the order in the file) avoids
# having to guess which ID numbers are actually present.
for query_id, query_text in list(queries.items())[:3]:
    print(f"Query ID {query_id}: {query_text}")

Load relevance judgments

In [19]:
from collections import defaultdict

def load_relevance_judgments(filepath):
    """
    Parse a cranqrel-style file into a mapping of query ID -> judged doc IDs.

    Each non-blank line must hold exactly three whitespace-separated integers:
    a query ID, a document ID, and a relevance score (e.g. "1 184 2"). The
    score is parsed but not used, so every listed document is recorded
    regardless of its score. Blank or whitespace-only lines are skipped.

    Returns:
        A ``defaultdict(list)`` keyed by query ID whose values are the
        document IDs in the order they appear in the file. Because it is a
        defaultdict, indexing a missing query ID inserts an empty list; use
        ``in`` or ``.get()`` to test for membership without adding keys.

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            integer fields.
    """
    qrels = defaultdict(list)

    with open(filepath, 'r') as f:
        for line in f:
            fields = line.split()

            # A blank line (commonly a trailing one at the end of the file)
            # yields no fields; skip it instead of failing on the unpacking.
            if not fields:
                continue

            # The third field (relevance score) is validated as an int but
            # intentionally unused, hence the conventional '_' name.
            query_id, doc_id, _ = map(int, fields)
            qrels[query_id].append(doc_id)

    return qrels

In [20]:
# Load the ground-truth judgments: query ID -> list of judged document IDs.
relevance_data = load_relevance_judgments("./dataset/cranqrel")

# Summary of how many entries each loader produced. Relies on `queries` from
# the earlier "Load queries" cell having been run first.
# NOTE(review): equal counts do not prove the two sets of IDs match. Confirm
# that the '.I' numbers in cran.qry and the query column in cranqrel use the
# same numbering before joining `queries` with `relevance_data`.
print(f"\nLoaded {len(queries)} queries and relevance judgments for {len(relevance_data)} queries.")


Loaded 225 queries and relevance judgments for 225 queries.
