In [None]:
import pandas as pd
from langchain_community.embeddings import OllamaEmbeddings
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SemanticSimilarity
from ragas.embeddings import LangchainEmbeddingsWrapper

In [None]:
# Load the pre-filtered pages together with their cleaned text variants.
df_processed_html = pd.read_csv(filepath_or_buffer='filtered_data.csv')

In [None]:
# Base URL of the Ollama server that serves the embedding model.
OLLAMA_BASE_URL = "https://...."

# Embedding client for the mxbai-embed-large model hosted on that server.
embeddings_ollama = OllamaEmbeddings(
    base_url=OLLAMA_BASE_URL,
    model="mxbai-embed-large:latest",
)

# Semantic-similarity metric driven by the wrapped Ollama embeddings;
# this is the scorer awaited by the similarity cells.
scorer = SemanticSimilarity(
    embeddings=LangchainEmbeddingsWrapper(embeddings_ollama),
)


In [None]:
similarity_cb_128 = []

for i in range(len(df_processed_html['html_page'])):
    try:
        score = SingleTurnSample(
            response=df_processed_html['clean_bert_128'][i],
            reference=df_processed_html['html_page'][i]
        )
        
        value = await scorer.single_turn_ascore(score)
        similarity_cb_128.append(value)
    except: #handling for NaN values
        similarity_cb_128.append(0)

In [None]:
similarity_cb_64 = []

for i in range(len(df_processed_html['html_page'])):
    try:
        score = SingleTurnSample(
            response=df_processed_html['clean_bert_64'][i],
            reference=df_processed_html['html_page'][i]
        )
        
        value = await scorer.single_turn_ascore(score)
        similarity_cb_64.append(value)
    except: #handling for NaN values
        similarity_cb_64.append(0)

In [None]:
similarity_meta_128 = []

for i in range(len(df_processed_html['html_page'])):
    try:
        score = SingleTurnSample(
            response=df_processed_html['clean_meta_128'][i],
            reference=df_processed_html['html_page'][i]
        )
        
        value = await scorer.single_turn_ascore(score)
        similarity_meta_128.append(value)
    except: #handling for NaN values
        similarity_meta_128.append(0)

In [None]:
similarity_meta_64 = []

for i in range(len(df_processed_html['html_page'])):
    try:
        score = SingleTurnSample(
            response=df_processed_html['clean_meta_64'][i],
            reference=df_processed_html['html_page'][i]
        )
        
        value = await scorer.single_turn_ascore(score)
        similarity_meta_64.append(value)
    except: #handling for NaN values
        similarity_meta_64.append(0)
        

In [37]:
# Attach each list of similarity scores to the frame as its own column,
# in the order cb_128, cb_64, meta_128, meta_64.
for column_name, column_scores in {
    "cb_128": similarity_cb_128,
    "cb_64": similarity_cb_64,
    "meta_128": similarity_meta_128,
    "meta_64": similarity_meta_64,
}.items():
    df_processed_html[column_name] = column_scores


In [38]:
# Persist the frame, now including the four similarity columns, for later use.
# NOTE(review): to_csv runs with its defaults here, so the row index is presumably
# written as an unnamed first column — confirm that downstream readers expect it.
df_processed_html.to_csv(path_or_buf="filtered_data_with_similarity.csv")

In [39]:
# Summary statistics of the similarity score columns.
# Rows whose scoring failed were recorded as 0 by the similarity cells, so a
# min of 0 marks failed rows and the means are pulled down accordingly.
df_processed_html.describe()

Unnamed: 0,cb_128,cb_64,meta_128,meta_64
count,500.0,500.0,500.0,500.0
mean,0.511267,0.614304,0.883122,0.919228
std,0.233188,0.136959,0.130482,0.074946
min,0.0,0.0,0.0,0.475715
25%,0.480039,0.532409,0.835707,0.87613
50%,0.578425,0.61569,0.914622,0.944871
75%,0.65264,0.709296,0.979901,0.979821
max,0.898504,0.961838,0.994891,0.995325
