Chunk Evaluation

The first block loads the data. The second block defines the information-retrieval function.

Run all cells in order; the last cell opens an input prompt asking for the Chunk ID you want to examine.

In [7]:
# Load the management plans (MPs); the methods used here are explained in the extraction notebooks.

import pymupdf, glob
import pandas as pd

# Every management plan (MP) PDF is expected to sit together in this one folder.
folder_path = r"C:\Tudelft\WG_MPs"             # You edit the folder here

# Open each PDF matched by the pattern, then collapse every document into a
# single string with a form feed ("\f", i.e. chr(12)) between consecutive pages.
# NOTE(review): glob.glob() does not promise any particular ordering, while the
# position of a document in these lists is what mp_mapping keys on further
# down -- confirm the order matches the one used when the CSV was produced.
WG_MPs = list(map(pymupdf.open, glob.glob(folder_path + "/*.pdf")))
MP_texts = ["\f".join(page.get_text() for page in MP) for MP in WG_MPs]

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split every MP text into overlapping chunks (1000 characters, 200 overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
MP_Chunks = list(map(text_splitter.split_text, MP_texts))

# Flatten the per-MP chunk lists into one list; mp_indices records, for each
# flattened chunk, the position of the MP it came from.
all_chunks, mp_indices = [], []
for mp_index, chunks in enumerate(MP_Chunks):
    all_chunks += chunks
    mp_indices += [mp_index for _ in chunks]

# Site name of each management plan, keyed by the plan's index (0-based,
# assigned in the order the names are listed here).
mp_mapping = dict(enumerate([
    "Colonies of Benevolence",
    "Frontiers of the Roman Empire - The Lower German Limes",
    "Eise Eisinga Planetarium in Franeker",
    "Dutch Water Defence Lines",
    "The Wadden Sea",
    "The seventeenth-century canal ring area of Amsterdam inside the Singelgracht",
    "Van Nellefabriek",
    "Schokland and surroundings",
    "Mill Network at Kinderdijk-Elshout",
    "Droogmakerij de Beemster",
    "Rietveld Schröder House",
]))

# Read the CSV file holding the chunk/concept similarity matches.
filename = "normalized_similarity_results.csv"  #Make sure the CSV file is in the same folder as your notebook
df_results = pd.read_csv(filename)

# Fail early, with a readable message, when the columns that the chunk lookup
# filters and sorts on are absent -- otherwise the problem only surfaces later
# as a bare KeyError in the middle of a report.
missing_columns = {"Chunk ID", "MP Index", "Z_Score"} - set(df_results.columns)
if missing_columns:
    raise KeyError(f"{filename} is missing required column(s): {sorted(missing_columns)}")

In [8]:
def get_chunk_details(chunk_id):
    """Print a report for one chunk: management plan, top-5 concepts, chunk text.

    Relies on three notebook-level globals: ``df_results`` (the similarity
    results table), ``mp_mapping`` (MP index -> site name) and ``all_chunks``
    (flat list of chunk texts, indexed by chunk ID).

    Parameters
    ----------
    chunk_id : int
        Value matched against the 'Chunk ID' column of ``df_results`` and used
        as the position of the chunk text in ``all_chunks``.

    Returns
    -------
    str or None
        The message "no matches for Chunk ID <id>" when no row matches (the
        message is also printed, so it is visible even if the caller discards
        the return value); otherwise None once the report has been printed.
    """
    # All result rows that belong to the requested chunk
    matches = df_results[df_results['Chunk ID'] == chunk_id]

    # Report header
    banner = '#' * 60
    print(f"\n{banner}")
    print(f"ANALYSIS FOR CHUNK ID: {chunk_id}")
    print(banner)

    # Guard clause: nothing to report when the chunk has no rows in the results
    if matches.empty:
        message = f"no matches for Chunk ID {chunk_id}"
        print(message)
        return message

    # Every matching row belongs to the same chunk, so the first row's MP index is used
    mp_index = matches.iloc[0]['MP Index']
    # Fall back to a visible placeholder instead of a KeyError for an unmapped index
    mp_name = mp_mapping.get(mp_index, f"Unknown management plan (index {mp_index})")

    print("\n[MANAGEMENT PLAN]")
    print(f"Name:  {mp_name}")
    print(f"Index: {mp_index}")

    # Top 5 matched concepts, highest Z_Score first
    print(f"\n[TOP 5 MATCHED CONCEPTS] (Total Found: {len(matches)})")
    top_concepts = matches.sort_values(by='Z_Score', ascending=False).head(5)

    # Table header followed by one line of stats per match
    header = f"{'Concept':<40} | {'Model':<10} | {'Z_Score':<10} | {'Cosine':<10}"
    print(header)
    print("-" * len(header))

    for _, row in top_concepts.iterrows():
        concept = row.get('Concept', 'N/A')
        model = row.get('Model', 'N/A')
        z_score = row.get('Z_Score', 0.0)
        cosine = row.get('Cosine_Score', 0.0)
        print(f"{concept:<40} | {model:<10} | {z_score:<10.4f} | {cosine:<10.4f}")

    # Chunk text, looked up by position in the flattened chunk list
    print("\n[Chunk Text]")
    print("-" * 60)
    # Bounds check: an ID beyond the loaded chunks would raise IndexError and a
    # negative one would silently wrap around to the end of the list.
    if 0 <= chunk_id < len(all_chunks):
        print(all_chunks[chunk_id])
    else:
        print(f"(chunk text unavailable: Chunk ID {chunk_id} is outside the {len(all_chunks)} loaded chunks)")

    print("-" * 60)
    print(f"{'='*60}\n")

In [9]:
# Ask for a chunk ID and print its report.
user_input = input("Enter a CHUNK ID to analyze: ")
if not user_input.strip():
    print("No input provided.")
else:
    # Only the int() conversion is guarded: a ValueError raised inside
    # get_chunk_details must surface as itself, not be reported as bad input.
    try:
        c_id_input = int(user_input)
    except ValueError:
        print("Invalid input. Please enter a numeric Chunk ID.")
    else:
        get_chunk_details(c_id_input)


############################################################
ANALYSIS FOR CHUNK ID: 1707
############################################################

[MANAGEMENT PLAN]
Name:  The Wadden Sea
Index: 4

[TOP 5 MATCHED CONCEPTS] (Total Found: 702)
Concept                                  | Model      | Z_Score    | Cosine    
-------------------------------------------------------------------------------
feasibility                              | MPNet      | 5.9723     | 0.6167    
adaptation needs                         | MPNet      | 5.9153     | 0.6119    
risk framework                           | MPNet      | 5.6611     | 0.5904    
climate governance                       | MPNet      | 5.5975     | 0.5850    
solution space                           | MPNet      | 5.4902     | 0.5759    

[Chunk Text]
------------------------------------------------------------
collaboration with relevant institutions and stakeholders to, inter 
alia, facilitate trilateral exchange and enhance a