In [37]:
import json

import numpy as np
import pandas as pd
import torch  # tensors and top-k selection for the retrieval step
from sentence_transformers import SentenceTransformer

Embedding the retrieved concepts

In [38]:
# Read the concepts from the JSON file
with open('ipcc_concepts.json', mode='r', encoding='utf-8') as f:
    concepts = json.load(f)

# The JSON file stores the concepts and their descriptions as a list of dictionaries.
# Each fixed query for the retrieval pipeline is built as "<concept name>: <definition>",
# so the embedding sees both the term and its description.
concept_names = [entry['concept'] for entry in concepts]
concept_texts = [
    "{}: {}".format(name, entry['definition'])
    for name, entry in zip(concept_names, concepts)
]

In [39]:
# Load the same three embedding models that were used for the MPs
model_name1, model_name2, model_name3 = (
    'sentence-transformers/LaBSE',
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/all-mpnet-base-v2',
)

model1 = SentenceTransformer(model_name1)  # LaBSE
model2 = SentenceTransformer(model_name2)  # MiniLM
model3 = SentenceTransformer(model_name3)  # MPNet

In [40]:
# Encode the concept texts once per model, in the order model 1 (LaBSE), model 2 (MiniLM), model 3 (MPNet).
# convert_to_tensor=True makes encode() return a tensor, which is easier to work with
# for the similarity search further down.
concept_embeddings_LaBSE, concept_embeddings_MiniLM, concept_embeddings_MPNet = [
    encoder.encode(concept_texts, convert_to_tensor=True)
    for encoder in (model1, model2, model3)
]

The Retrieval

In [41]:
# Loading the targets
def _load_mp_embeddings(parquet_path):
    """Read one MP-chunk parquet file and return ``(dataframe, embedding_tensor)``.

    The 'embedding' column is stacked with ``np.vstack`` into a single 2-D
    array (one row per dataframe row) and wrapped as a torch tensor so it can
    be compared against the concept embeddings.

    Parameters
    ----------
    parquet_path : str
        Path of the parquet file; it must contain an 'embedding' column.

    Returns
    -------
    tuple
        The dataframe as read from disk, and the stacked embeddings as a
        float32 tensor whose row order matches the dataframe's row order.
    """
    df = pd.read_parquet(parquet_path)
    stacked = np.vstack(df['embedding'].values)
    # NOTE(review): the concept embeddings are presumably float32 (the usual
    # encode() output); casting here avoids a float32/float64 dtype mismatch
    # in the similarity matmul if the parquet column was stored as float64.
    # copy=False keeps this a no-op when the data is already float32.
    return df, torch.from_numpy(stacked.astype(np.float32, copy=False))


# The objective is to calculate the cosine similarity between concept embeddings and MP embeddings,
# so each model's stored MP embeddings are loaded alongside the dataframe they came from.
df_LaBSE, mp_embeddings_LaBSE = _load_mp_embeddings("WG_MPs_LaBSE_Embeddings.parquet")
df_MiniLM, mp_embeddings_MiniLM = _load_mp_embeddings("WG_MPs_MiniLM_Embeddings.parquet")
df_MPNet, mp_embeddings_MPNet = _load_mp_embeddings("WG_MPs_mpnet_Embeddings.parquet")


# Let's create a function to compute cosine similarity and get the top-k results

# Configuration (read by Retrieval_Results_CCA_Concepts)
TOP_K = 5  # Number of best-scoring chunks to retrieve per concept
THRESHOLD = 0.4 # Minimum similarity score for a match to be kept (score >= THRESHOLD)

def Retrieval_Results_CCA_Concepts(model, model_name, concept_embeddings, mp_embeddings, concept_names, df_mps,
                                   top_k=None, threshold=None):
    """Retrieve the best-matching MP chunks for every concept.

    Scores every concept against every MP chunk with ``model.similarity``
    (cosine similarity by default, see
    https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.similarity),
    keeps the ``top_k`` highest-scoring chunks per concept and drops those
    scoring below ``threshold``.

    Parameters
    ----------
    model : object
        Provides ``.device`` and ``.similarity(a, b)``; in this notebook a
        SentenceTransformer.
    model_name : str
        Label written to the 'Model' field of every result and to the log line.
    concept_embeddings : torch.Tensor
        Concept vectors, one row per entry of ``concept_names``.
    mp_embeddings : torch.Tensor
        Chunk vectors; row ``i`` must correspond to row position ``i`` of
        ``df_mps``.
    concept_names : sequence of str
        Concept labels, in the same order as ``concept_embeddings``.
    df_mps : pandas.DataFrame
        Must contain the columns 'mp_index' and 'text_chunk'.
    top_k : int, optional
        Matches to retrieve per concept. Defaults to the module-level ``TOP_K``.
    threshold : float, optional
        Minimum score for a match to be kept. Defaults to the module-level
        ``THRESHOLD``.

    Returns
    -------
    list of dict
        One dict per retained match with the keys 'Model', 'Concept',
        'MP Index', 'Chunk', 'Chunk ID' (row position in ``df_mps``) and
        'Score'. Matches are grouped by concept, best score first.
    """
    # Fall back to the notebook-level configuration when no override is given.
    top_k = TOP_K if top_k is None else top_k
    threshold = THRESHOLD if threshold is None else threshold

    print(f"--- Running Retrieval for {model_name} ---")
    results = []
    mp_embeddings = mp_embeddings.to(model.device) #otherwise we get device mismatch error
    # Similarity between every concept and every chunk, computed in one call.
    cosine_scores = model.similarity(concept_embeddings, mp_embeddings)

    # torch.topk raises if k exceeds the number of candidates, so clamp k to
    # the number of available chunks (and bail out if there is nothing to rank).
    k = min(top_k, cosine_scores.shape[-1])
    if k <= 0:
        return results

    # One batched top-k over all concepts; tolist() moves the values to plain
    # Python numbers in a single transfer instead of one .item() per element.
    # Reference: https://pytorch.org/docs/stable/generated/torch.topk.html
    best_scores, best_indices = torch.topk(cosine_scores, k=k, dim=-1)
    best_scores = best_scores.tolist()
    best_indices = best_indices.tolist()

    # Pull the two needed columns once rather than building a full row Series
    # twice per match; positions line up with the row order of mp_embeddings.
    mp_index_by_pos = df_mps['mp_index'].tolist()
    chunk_by_pos = df_mps['text_chunk'].tolist()

    for i, concept in enumerate(concept_names):
        for score, idx in zip(best_scores[i], best_indices[i]):
            if score >= threshold: # only keep scores above the threshold so we don't get low-relevance results
                results.append({
                    'Model': model_name,
                    'Concept': concept,
                    'MP Index': mp_index_by_pos[idx],
                    'Chunk': chunk_by_pos[idx],
                    'Chunk ID': idx,
                    'Score': score
                })
    return results

In [42]:
# Run the retrieval once per model, pairing each model with its own
# concept embeddings, MP embeddings and MP dataframe.

# LaBSE
results_LaBSE = Retrieval_Results_CCA_Concepts(
    model=model1,
    model_name="LaBSE",
    concept_embeddings=concept_embeddings_LaBSE,
    mp_embeddings=mp_embeddings_LaBSE,
    concept_names=concept_names,
    df_mps=df_LaBSE,
)

# MiniLM
results_MiniLM = Retrieval_Results_CCA_Concepts(
    model=model2,
    model_name="MiniLM",
    concept_embeddings=concept_embeddings_MiniLM,
    mp_embeddings=mp_embeddings_MiniLM,
    concept_names=concept_names,
    df_mps=df_MiniLM,
)

# MPNet
results_MPNet = Retrieval_Results_CCA_Concepts(
    model=model3,
    model_name="MPNet",
    concept_embeddings=concept_embeddings_MPNet,
    mp_embeddings=mp_embeddings_MPNet,
    concept_names=concept_names,
    df_mps=df_MPNet,
)

--- Running Retrieval for LaBSE ---
--- Running Retrieval for MiniLM ---
--- Running Retrieval for MPNet ---


Similarity Results

In [43]:
# Checking how many results were found for each model
print(f"LaBSE matches: {len(results_LaBSE)}")
print(f"MiniLM matches: {len(results_MiniLM)}")
print(f"MPNet matches: {len(results_MPNet)}")

# Turning the results into DataFrames to view them easily.
# The columns are given explicitly (same order as the result dict keys) so that a
# model with zero matches still yields a frame with a 'Score' column; without
# this, the sort_values(by="Score") calls below raise KeyError on an empty result list.
RESULT_COLUMNS = ['Model', 'Concept', 'MP Index', 'Chunk', 'Chunk ID', 'Score']
df_results_labse = pd.DataFrame(results_LaBSE, columns=RESULT_COLUMNS)
df_results_minilm = pd.DataFrame(results_MiniLM, columns=RESULT_COLUMNS)
df_results_mpnet = pd.DataFrame(results_MPNet, columns=RESULT_COLUMNS)

# Displaying a preview of the results
display(df_results_labse.head())
display(df_results_minilm.head())
display(df_results_mpnet.head())

# Displaying the top results across all models
print("Top 10 Highest Similarity Scores")
# Skip empty frames when concatenating; if every model came back empty,
# fall back to an empty frame that still has the expected columns.
non_empty_results = [
    frame
    for frame in (df_results_labse, df_results_minilm, df_results_mpnet)
    if not frame.empty
]
if non_empty_results:
    all_results = pd.concat(non_empty_results, ignore_index=True)
else:
    all_results = pd.DataFrame(columns=RESULT_COLUMNS)
top_10 = all_results.sort_values(by="Score", ascending=False).head(10)
display(top_10[['Model', 'Concept', 'Score', 'Chunk']])

print("Top results LaBSE")
display(df_results_labse.sort_values(by="Score", ascending=False).head(10))

print("Top results MiniLM")
display(df_results_minilm.sort_values(by="Score", ascending=False).head(10))

print("Top results MPNet")
display(df_results_mpnet.sort_values(by="Score", ascending=False).head(10))

LaBSE matches: 3717
MiniLM matches: 1419
MPNet matches: 1320


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,LaBSE,ablation,3,land. It has not yet been determined where and...,1664,0.547772
1,LaBSE,ablation,0,activities in the area. The spatial protection...,452,0.547607
2,LaBSE,ablation,2,are protected as national monuments. The buffe...,1103,0.525452
3,LaBSE,ablation,1,covering over.\nReversible is applied to actio...,963,0.520423
4,LaBSE,ablation,10,monitor the interior temperature and humidity ...,2589,0.516905


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,MiniLM,abrupt climate change,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.406125
1,MiniLM,access to modern energy services,5,located in the canal ring area must be taken i...,2081,0.485579
2,MiniLM,access to modern energy services,5,will still be required. When pursuing such mea...,2083,0.481535
3,MiniLM,access to modern energy services,5,attention is the risk that far-reaching insula...,2076,0.435108
4,MiniLM,access to modern energy services,5,therefore states: In the old inner city (…) bo...,2082,0.43417


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
0,MPNet,abrupt climate change,4,while applying the precau-\ntionary principle....,1813,0.403358
1,MPNet,access to modern energy services,5,will still be required. When pursuing such mea...,2083,0.487928
2,MPNet,access to modern energy services,5,attention is the risk that far-reaching insula...,2076,0.470736
3,MPNet,access to modern energy services,5,suitable location for solar panels or wind tur...,2077,0.465861
4,MPNet,access to modern energy services,3,achieved emphatically without causing any unac...,1646,0.443958


Top 10 Highest Similarity Scores


Unnamed: 0,Model,Concept,Score,Chunk
1344,LaBSE,extreme sea level,0.742055,impacting the Wadden Sea Out-\nstanding Univer...
1991,LaBSE,just transitions,0.732577,and the expansion of the natural area. \n \nIm...
1992,LaBSE,just transitions,0.732453,(restoration and re-use) and methods to antici...
2978,LaBSE,sea level change,0.72983,impacting the Wadden Sea Out-\nstanding Univer...
1993,LaBSE,just transitions,0.727778,4.3 Proposals for \nManagement Activities\n• ...
2988,LaBSE,sea level rise,0.726161,impacting the Wadden Sea Out-\nstanding Univer...
1994,LaBSE,just transitions,0.724584,"already in 2010. Thereby, the \nTrilateral Wad..."
842,LaBSE,cultural impacts,0.724137,values as a resource for innovative solutions ...
2784,LaBSE,Reasons for Concern,0.722277,impacting the Wadden Sea Out-\nstanding Univer...
1995,LaBSE,just transitions,0.718149,can result in identification of \npriorities t...


Top results LaBSE


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
1344,LaBSE,extreme sea level,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.742055
1991,LaBSE,just transitions,8,and the expansion of the natural area. \n \nIm...,2327,0.732577
1992,LaBSE,just transitions,0,(restoration and re-use) and methods to antici...,31,0.732453
2978,LaBSE,sea level change,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.72983
1993,LaBSE,just transitions,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.727778
2988,LaBSE,sea level rise,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.726161
1994,LaBSE,just transitions,4,"already in 2010. Thereby, the \nTrilateral Wad...",1773,0.724584
842,LaBSE,cultural impacts,0,values as a resource for innovative solutions ...,273,0.724137
2784,LaBSE,Reasons for Concern,4,impacting the Wadden Sea Out-\nstanding Univer...,1768,0.722277
1995,LaBSE,just transitions,4,can result in identification of \npriorities t...,1811,0.718149


Top results MiniLM


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
1102,MiniLM,restoration,1,technically necessary.\nRestoration is the ret...,962,0.668433
773,MiniLM,land rehabilitation,0,parts. The main challenge for the Colonies of ...,30,0.661591
774,MiniLM,land rehabilitation,0,the HIA will be taken into account in decision...,432,0.650467
775,MiniLM,land rehabilitation,0,of Benevolence is to preserve the quality of l...,234,0.649336
707,MiniLM,integrated assessment,3,constitute the assessment framework for applic...,1440,0.645556
25,MiniLM,adaptation gap,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.636361
776,MiniLM,land rehabilitation,0,"be safeguarded, as well as the harmonisation w...",691,0.626824
777,MiniLM,land rehabilitation,0,"Besides, the nature values are additionally pr...",567,0.626585
15,MiniLM,adaptation behaviour,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.61932
41,MiniLM,adaptive governance,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.609193


Top results MPNet


Unnamed: 0,Model,Concept,MP Index,Chunk,Chunk ID,Score
30,MPNet,adaptation needs,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.663822
418,MPNet,ecosystem-based adaptation,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.651587
1034,MPNet,restoration,1,technically necessary.\nRestoration is the ret...,962,0.650502
31,MPNet,adaptation needs,4,while applying the precau-\ntionary principle....,1813,0.636402
1069,MPNet,Sendai Framework for Disaster Risk Reduction,2,• \nproviding the control room function.\nIn a...,1173,0.632125
487,MPNet,feasibility,4,while applying the precau-\ntionary principle....,1813,0.630086
198,MPNet,climate governance,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.629383
419,MPNet,ecosystem-based adaptation,4,while applying the precau-\ntionary principle....,1813,0.622197
1270,MPNet,urbanisation,0,and the component part in respect of which the...,243,0.62205
1224,MPNet,transformational adaptation,4,4.3 Proposals for \nManagement Activities\n• ...,1774,0.621967


In [46]:
# Let's export the results to CSV files for further analysis.
# Only the relevant columns are exported (the chunk text itself is left out).

columns_to_export = ['Model', 'Concept', 'MP Index', 'Chunk ID', 'Score']
# The combined file is sorted by score, best first; the per-model files keep retrieval order.
df_clean = all_results[columns_to_export].sort_values(by="Score", ascending=False)
df_clean_labse = df_results_labse[columns_to_export]
df_clean_mpnet = df_results_mpnet[columns_to_export]
df_clean_minilm = df_results_minilm[columns_to_export]


filename = "all_similarity_results.csv"
filename_labse = "similarity_results_labse.csv"
filename_minilm = "similarity_results_minilm.csv"
filename_mpnet = "similarity_results_mpnet.csv"

# Write every frame with the same settings (no index column in the CSV).
for export_path, export_frame in (
    (filename, df_clean),
    (filename_labse, df_clean_labse),
    (filename_minilm, df_clean_minilm),
    (filename_mpnet, df_clean_mpnet),
):
    export_frame.to_csv(export_path, index=False)

# Plain string: the original f-string had no placeholders and a stray trailing apostrophe.
print("\nSuccess! all Saved")




Success! all Saved'
