In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

Embedding the retrieved concepts

In [None]:
# Read the concept glossary from its JSON file.
with open('ipcc_concepts.json', 'r', encoding='utf-8') as f:
    concepts = json.load(f)

# Each entry is indexed by the keys 'concept' (the name) and 'definition'.
# The fixed query text used by the retrieval pipeline is "<name>: <definition>",
# so every concept is embedded together with its description.
concept_names = [entry['concept'] for entry in concepts]
concept_texts = [
    "{}: {}".format(entry['concept'], entry['definition'])
    for entry in concepts
]

In [25]:
# Checkpoint identifiers of the three sentence-embedding models
# (the same three models that were used for the MPs).
model_name1 = 'sentence-transformers/LaBSE'
model_name2 = 'sentence-transformers/all-MiniLM-L6-v2'
model_name3 = 'sentence-transformers/all-mpnet-base-v2'

# Instantiate the encoders one after another, in the order listed above.
model1, model2, model3 = [
    SentenceTransformer(checkpoint)
    for checkpoint in (model_name1, model_name2, model_name3)
]

In [26]:
# Encode the same concept texts with each of the three models, in order
# (model 1 -> LaBSE, model 2 -> MiniLM, model 3 -> MPNet).
# convert_to_tensor=True makes encode() return a tensor, which is easier to
# work with for the similarity search further down.
concept_embeddings_LaBSE, concept_embeddings_MiniLM, concept_embeddings_MPNet = [
    encoder.encode(concept_texts, convert_to_tensor=True)
    for encoder in (model1, model2, model3)
]

The Retrieval

In [27]:
import torch #for creating tensors

# Load the retrieval targets: one parquet file of MP chunk embeddings per model.
df_LaBSE, df_MiniLM, df_MPNet = [
    pd.read_parquet(path)
    for path in (
        "WG_MPs_LaBSE_Embeddings.parquet",
        "WG_MPs_MiniLM_Embeddings.parquet",
        "WG_MPs_mpnet_Embeddings.parquet",
    )
]

# The goal is to compute the cosine similarity between concept embeddings and
# MP embeddings. For that, the per-row 'embedding' values of each frame are
# stacked into one numpy array, which is then wrapped as a torch tensor.
mp_embeddings_LaBSE, mp_embeddings_MiniLM, mp_embeddings_MPNet = [
    torch.from_numpy(np.vstack(frame['embedding'].values))
    for frame in (df_LaBSE, df_MiniLM, df_MPNet)
]


# The retrieval function defined next computes the cosine similarities and
# keeps the top k results per concept.

# Configuration
TOP_K = 5  # Number of matches to retrieve per concept

def Retrieval_Results_CCA_Concepts(model, model_name, concept_embeddings, mp_embeddings, concept_names, df_mps, top_k=None):
    """Retrieve the best-matching MP chunks for every concept.

    The similarity between every concept and every chunk is computed in one
    call to ``model.similarity``. For each concept the ``top_k`` highest
    scoring chunks are taken, and only those whose score reaches a dynamic
    threshold (mean + 2 * standard deviation of *all* concept/chunk scores)
    are kept.

    Parameters
    ----------
    model : object exposing ``.device`` and ``.similarity(a, b)``
        In this notebook a ``SentenceTransformer`` instance.
    model_name : str
        Label written into the ``'Model'`` field of every result.
    concept_embeddings : torch.Tensor
        One row per concept, in the same order as ``concept_names``.
    mp_embeddings : torch.Tensor
        One row per chunk, in the same row order as ``df_mps``.
    concept_names : sequence of str
        Concept labels.
    df_mps : pandas.DataFrame
        Chunk metadata; the columns ``'mp_index'`` and ``'text_chunk'`` are read.
    top_k : int, optional
        Number of candidates per concept. Defaults to the notebook-level
        ``TOP_K`` setting. It is clamped to the number of available chunks,
        because ``torch.topk`` raises when ``k`` exceeds the dimension size.

    Returns
    -------
    list of dict
        One dict per retained match with the keys ``'Model'``, ``'Concept'``,
        ``'MP Index'``, ``'Chunk'``, ``'Chunk ID'`` (positional row in
        ``df_mps``) and ``'Score'``.
    """
    print(f"--- Running Retrieval for {model_name} ---")
    results = []

    # Fall back to the notebook-wide setting when no explicit value is passed.
    requested_k = TOP_K if top_k is None else top_k

    mp_embeddings = mp_embeddings.to(model.device)  # otherwise we get a device mismatch error
    # reference for this function: https://sbert.net/docs/sentence_transformer/usage/semantic_textual_similarity.html
    # https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.similarity
    # It calculates the similarity between every concept and every chunk simultaneously.
    cosine_scores = model.similarity(concept_embeddings, mp_embeddings)

    # Dynamic threshold, because different models have different score
    # distributions. mean + 2*std corresponds to roughly the top ~2.3% of
    # pairs only if the scores are approximately normally distributed.
    all_scores = cosine_scores.flatten()
    threshold = (torch.mean(all_scores) + (2 * torch.std(all_scores))).item()
    print(f"Dynamic THRESHOLD set to: {threshold:.4f}")

    # Never ask topk for more candidates than there are chunks.
    k = min(requested_k, cosine_scores.shape[-1])

    for i, concept in enumerate(concept_names):
        # Reference: https://pytorch.org/docs/stable/generated/torch.topk.html
        top_scores, top_indices = torch.topk(cosine_scores[i], k=k)

        for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
            # Keep only scores at or above the threshold so that low-relevance
            # matches are dropped (a NaN threshold keeps nothing).
            if score >= threshold:
                row = df_mps.iloc[idx]  # single positional lookup per hit
                results.append({
                    'Model': model_name,
                    'Concept': concept,
                    'MP Index': row['mp_index'],
                    'Chunk': row['text_chunk'],
                    'Chunk ID': idx,
                    'Score': score
                })
    return results

In [28]:
# Run the retrieval once per model; each call pairs a model with its own
# concept embeddings, MP embeddings and MP metadata frame.

# LaBSE
results_LaBSE = Retrieval_Results_CCA_Concepts(
    model=model1,
    model_name="LaBSE",
    concept_embeddings=concept_embeddings_LaBSE,
    mp_embeddings=mp_embeddings_LaBSE,
    concept_names=concept_names,
    df_mps=df_LaBSE,
)

# MiniLM
results_MiniLM = Retrieval_Results_CCA_Concepts(
    model=model2,
    model_name="MiniLM",
    concept_embeddings=concept_embeddings_MiniLM,
    mp_embeddings=mp_embeddings_MiniLM,
    concept_names=concept_names,
    df_mps=df_MiniLM,
)

# MPNet
results_MPNet = Retrieval_Results_CCA_Concepts(
    model=model3,
    model_name="MPNet",
    concept_embeddings=concept_embeddings_MPNet,
    mp_embeddings=mp_embeddings_MPNet,
    concept_names=concept_names,
    df_mps=df_MPNet,
)

--- Running Retrieval for LaBSE ---
Dynamic THRESHOLD set to: 0.4979
--- Running Retrieval for MiniLM ---
Dynamic THRESHOLD set to: 0.2868
--- Running Retrieval for MPNet ---
Dynamic THRESHOLD set to: 0.2807


Similarity Results

In [29]:
# Report how many matches each model produced.
for label, hits in (("LaBSE", results_LaBSE), ("MiniLM", results_MiniLM), ("MPNet", results_MPNet)):
    print(f"{label} matches: {len(hits)}")

# Wrap the result lists in DataFrames so they are easy to inspect.
df_results_labse = pd.DataFrame(results_LaBSE)
df_results_minilm = pd.DataFrame(results_MiniLM)
df_results_mpnet = pd.DataFrame(results_MPNet)

# Preview the first rows of each result table.
for frame in (df_results_labse, df_results_minilm, df_results_mpnet):
    display(frame.head())

# Highest raw scores across all three models combined.
print("Top 10 Highest Similarity Scores")
all_results = pd.concat([df_results_labse, df_results_minilm, df_results_mpnet], ignore_index=True)
top_10 = all_results.sort_values(by="Score", ascending=False).head(10)
display(top_10[['Model', 'Concept', 'Score', 'Chunk']])

# Highest raw scores per model.
for label, frame in (("LaBSE", df_results_labse), ("MiniLM", df_results_minilm), ("MPNet", df_results_mpnet)):
    print(f"Top results {label}")
    display(frame.sort_values(by="Score", ascending=False).head(10))

LaBSE matches: 2151
MiniLM matches: 3592
MPNet matches: 3620


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,LaBSE,ablation,3,land. It has not yet been determined where and...,1664,0.547772
1,LaBSE,ablation,0,activities in the area. The spatial protection...,452,0.547607
2,LaBSE,ablation,2,are protected as national monuments. The buffe...,1103,0.525452
3,LaBSE,ablation,1,covering over.\nReversible is applied to actio...,963,0.520423
4,LaBSE,ablation,10,monitor the interior temperature and humidity ...,2589,0.516905


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,MiniLM,abrupt change,5,Theme\nTheme\nNr\nNr Type of crisis/ Type of i...,2090,0.288714
1,MiniLM,abrupt climate change,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.406125
2,MiniLM,abrupt climate change,0,traffic regulations and rules for the use of \...,590,0.369203
3,MiniLM,abrupt climate change,4,"already in 2010. Thereby, the \nTrilateral Wad...",1773,0.363278
4,MiniLM,abrupt climate change,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.358789


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,MPNet,abrupt change,5,"(deliberate) disruption, failure or misuse of ...",2091,0.29535
1,MPNet,abrupt climate change,4,while applying the precau-\ntionary principle....,1813,0.403358
2,MPNet,abrupt climate change,0,and the component part in respect of which the...,243,0.387551
3,MPNet,abrupt climate change,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.378409
4,MPNet,abrupt climate change,3,does not individually impair the Outstanding U...,1659,0.365188


Top 10 Highest Similarity Scores


Unnamed: 0,Model,Concept,Score,Chunk
811,LaBSE,extreme sea level,0.742055,impacting the Wadden Sea Out-\nstanding Univer...
1186,LaBSE,just transitions,0.732577,and the expansion of the natural area. \n \nIm...
1187,LaBSE,just transitions,0.732453,(restoration and re-use) and methods to antici...
1738,LaBSE,sea level change,0.72983,impacting the Wadden Sea Out-\nstanding Univer...
1188,LaBSE,just transitions,0.727778,4.3 Proposals for \nManagement Activities\n• ...
1744,LaBSE,sea level rise,0.726161,impacting the Wadden Sea Out-\nstanding Univer...
1189,LaBSE,just transitions,0.724584,"already in 2010. Thereby, the \nTrilateral Wad..."
514,LaBSE,cultural impacts,0.724137,values as a resource for innovative solutions ...
1628,LaBSE,Reasons for Concern,0.722277,impacting the Wadden Sea Out-\nstanding Univer...
1190,LaBSE,just transitions,0.718149,can result in identification of \npriorities t...


Top results LaBSE


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
811,LaBSE,extreme sea level,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.742055
1186,LaBSE,just transitions,8,and the expansion of the natural area. \n \nIm...,2327,0.732577
1187,LaBSE,just transitions,0,(restoration and re-use) and methods to antici...,31,0.732453
1738,LaBSE,sea level change,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.72983
1188,LaBSE,just transitions,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.727778
1744,LaBSE,sea level rise,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.726161
1189,LaBSE,just transitions,4,"already in 2010. Thereby, the \nTrilateral Wad...",1773,0.724584
514,LaBSE,cultural impacts,0,values as a resource for innovative solutions ...,273,0.724137
1628,LaBSE,Reasons for Concern,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.722277
1190,LaBSE,just transitions,4,can result in identification of \npriorities t...,1811,0.718149


Top results MiniLM


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
2743,MiniLM,restoration,1,technically necessary.\nRestoration is the ret...,962,0.668433
1894,MiniLM,land rehabilitation,0,parts. The main challenge for the Colonies of ...,30,0.661591
1895,MiniLM,land rehabilitation,0,the HIA will be taken into account in decision...,432,0.650467
1896,MiniLM,land rehabilitation,0,of Benevolence is to preserve the quality of l...,234,0.649336
1776,MiniLM,integrated assessment,3,constitute the assessment framework for applic...,1440,0.645556
52,MiniLM,adaptation gap,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.636361
1897,MiniLM,land rehabilitation,0,"be safeguarded, as well as the harmonisation w...",691,0.626824
1898,MiniLM,land rehabilitation,0,"Besides, the nature values are additionally pr...",567,0.626585
42,MiniLM,adaptation behaviour,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.61932
87,MiniLM,adaptive governance,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.609193


Top results MPNet


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
62,MPNet,adaptation needs,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.663822
1108,MPNet,ecosystem-based adaptation,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.651587
2806,MPNet,restoration,1,technically necessary.\nRestoration is the ret...,962,0.650502
1014,MPNet,drainage,9,Integral water management Water management is ...,2475,0.647912
63,MPNet,adaptation needs,4,while applying the precau-\ntionary principle....,1813,0.636402
2906,MPNet,Sendai Framework for Disaster Risk Reduction,2,• \nproviding the control room function.\nIn a...,1173,0.632125
1360,MPNet,feasibility,4,while applying the precau-\ntionary principle....,1813,0.630085
568,MPNet,climate governance,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.629383
1109,MPNet,ecosystem-based adaptation,4,while applying the precau-\ntionary principle....,1813,0.622197
3496,MPNet,urbanisation,0,and the component part in respect of which the...,243,0.62205


In [30]:
# Export the results to CSV files for further analysis.
# Only the columns listed here are written (the chunk text itself is left out).
columns_to_export = ['Model', 'Concept', 'MP Index', 'Chunk ID', 'Score']

# Combined table, best scores first; per-model tables keep their original order.
df_clean = all_results[columns_to_export].sort_values(by="Score", ascending=False)
df_clean_labse = df_results_labse[columns_to_export]
df_clean_mpnet = df_results_mpnet[columns_to_export]
df_clean_minilm = df_results_minilm[columns_to_export]

# Output file names.
filename = "all_similarity_results.csv"
filename_labse = "similarity_results_labse.csv"
filename_minilm = "similarity_results_minilm.csv"
filename_mpnet = "similarity_results_mpnet.csv"

# Write the combined table first, then one file per model.
df_clean.to_csv(filename, index=False)
df_clean_labse.to_csv(filename_labse, index=False)
df_clean_minilm.to_csv(filename_minilm, index=False)
df_clean_mpnet.to_csv(filename_mpnet, index=False)
print("\nSuccess! all Saved'")




Success! all Saved'


Normalizing the similarity Scores

In [33]:
# we can not compare the scores directly between models as they have different score distributions
# Comparing raw cosine similarity scores between different models (like LaBSE vs. MPNet) is misleading because each model operates in a different vector space with its own "density." 
# A score of 0.40 might be weak for LaBSE but exceptionally strong for MPNet.

# To fix this, we can use Z-Score Normalization (Standard Score).
# reference: https://www.geeksforgeeks.org/data-analysis/z-score-normalization-definition-and-examples/

def normalize_and_merge_scores(df_list, model_names):
    """Normalise each model's scores separately and merge the results.

    For every DataFrame two columns are added, computed from that frame's own
    ``'Score'`` column only:

    * ``'Z_Score'``      -- ``(Score - mean) / (std + 1e-9)``
    * ``'MinMax_Score'`` -- ``(Score - min) / (max - min)``

    The statistics are taken over exactly the rows that are passed in. If a
    frame holds only pre-filtered matches, the z-scores are relative to those
    matches, not to the model's full score distribution.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        One result frame per model; each non-empty frame needs a ``'Score'``
        column.
    model_names : sequence of str
        One label per frame, in the same order. The label fills the
        ``'Model'`` column only when a frame does not already have one.

    Returns
    -------
    pandas.DataFrame
        All frames concatenated (index reset) and sorted by ``'Z_Score'`` in
        descending order. When there is nothing to merge, an empty frame with
        the columns ``'Score'``, ``'Z_Score'`` and ``'MinMax_Score'`` is
        returned.

    Raises
    ------
    ValueError
        If ``df_list`` and ``model_names`` differ in length (``zip`` would
        otherwise silently drop the surplus entries).
    """
    df_list = list(df_list)
    model_names = list(model_names)
    if len(df_list) != len(model_names):
        raise ValueError(
            f"df_list has {len(df_list)} entries but model_names has {len(model_names)}"
        )

    normalized_dfs = []

    for df, name in zip(df_list, model_names):
        # A model without any matches has nothing to normalise (and an empty
        # frame may not even have a 'Score' column).
        if df.empty:
            continue

        # Work on a copy so the caller's frame is left untouched.
        df_norm = df.copy()
        if 'Model' not in df_norm.columns:
            df_norm['Model'] = name

        scores = df_norm['Score']

        # Z-score: (Score - Mean) / Std_Dev. The small epsilon avoids a
        # division by zero when all scores are identical. The sample std of a
        # single row is NaN; treat it as "no spread" so the z-score becomes 0.
        mean_score = scores.mean()
        std_score = scores.std()
        if pd.isna(std_score):
            std_score = 0.0
        df_norm['Z_Score'] = (scores - mean_score) / (std_score + 1e-9)

        # Min-max scaling to [0, 1]. With a zero range (all scores equal) the
        # plain formula would yield 0/0 = NaN, so fall back to 0.0.
        min_score = scores.min()
        score_range = scores.max() - min_score
        if score_range > 0:
            df_norm['MinMax_Score'] = (scores - min_score) / score_range
        else:
            df_norm['MinMax_Score'] = 0.0

        normalized_dfs.append(df_norm)

    # pd.concat raises on an empty list, so handle "nothing to merge" explicitly.
    if not normalized_dfs:
        return pd.DataFrame(columns=['Score', 'Z_Score', 'MinMax_Score'])

    merged_df = pd.concat(normalized_dfs, ignore_index=True)

    # Statistically "best" matches first.
    return merged_df.sort_values(by="Z_Score", ascending=False)


# Per-model result tables and their labels, in matching order.
dfs = [df_results_labse, df_results_minilm, df_results_mpnet]
names = ["LaBSE", "MiniLM", "MPNet"]

# Normalise per model and merge into one table sorted by z-score.
final_df = normalize_and_merge_scores(dfs, names)

# The new top 10 (not just LaBSE now!)
print("Top 10 Results by Normalized Z-Score:")
display(final_df[['Model', 'Concept', 'Score', 'Z_Score', 'Chunk']].head(10))

# Export to CSV with z-scores.
# Rename by reassignment rather than inplace=True.
final_df = final_df.rename(columns={'Score': 'Cosine_Score'})
columns_to_export = ['Model', 'Concept', 'MP Index', 'Chunk ID', 'Cosine_Score', 'Z_Score']
final_export_df = final_df[columns_to_export]
csv_filename = "normalized_similarity_results.csv"
# Write the column-selected frame. Previously the full final_df was written,
# so the selection above had no effect and the CSV also contained the chunk
# text and the MinMax_Score column.
final_export_df.to_csv(csv_filename, index=False)

Top 10 Results by Normalized Z-Score:


Unnamed: 0,Model,Concept,Score,Z_Score,Chunk
4894,MiniLM,restoration,0.668433,3.902671,technically necessary.\nRestoration is the ret...
4045,MiniLM,land rehabilitation,0.661591,3.806248,parts. The main challenge for the Colonies of ...
5805,MPNet,adaptation needs,0.663822,3.712237,4.3 Proposals for \nManagement Activities\n• ...
4046,MiniLM,land rehabilitation,0.650467,3.649466,the HIA will be taken into account in decision...
4047,MiniLM,land rehabilitation,0.649336,3.633536,of Benevolence is to preserve the quality of l...
811,LaBSE,extreme sea level,0.742055,3.602539,impacting the Wadden Sea Out-\nstanding Univer...
3927,MiniLM,integrated assessment,0.645556,3.580263,constitute the assessment framework for applic...
6851,MPNet,ecosystem-based adaptation,0.651587,3.548392,4.3 Proposals for \nManagement Activities\n• ...
8549,MPNet,restoration,0.650502,3.533859,technically necessary.\nRestoration is the ret...
6757,MPNet,drainage,0.647912,3.499183,Integral water management Water management is ...
