In [None]:

!pip install ragas sentence-transformers scikit-learn bert-score rouge-score sacrebleu


In [3]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu

# Sentence-embedding checkpoint, loaded once at module level and used by embed().
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def embed(texts):
    """Encode ``texts`` with the module-level ``model`` and return NumPy embeddings.

    The encoder's progress bar is switched off so repeated calls do not
    flood the cell output.
    """
    vectors = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return vectors

def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _mean_cosine(vector, matrix):
    """Mean cosine similarity between ``vector`` and every row of ``matrix``.

    Returns 0.0 when ``matrix`` has no rows (nothing was retrieved).
    """
    if len(matrix) == 0:
        return 0.0
    return float(np.mean([float(util.cos_sim(vector, row)) for row in matrix]))


def _average(values):
    """Arithmetic mean as a float; NaN for empty input, without NumPy's warning."""
    return float(np.mean(values)) if len(values) > 0 else float("nan")


def compute_all_metrics(gt_file, result_file):
    """Score RAG answers against ground-truth reports and return averaged metrics.

    Args:
        gt_file: Path to a JSON file containing a list of objects, each with a
            ``"query"`` and a ``"report"`` key (the reference answer).
        result_file: Path to a JSON file containing a list of objects, each with
            ``"original_query"``, ``"final_answer"`` and ``"retrieved_cases"``
            keys; every retrieved case must have a ``"report"`` key.

    Returns:
        dict mapping metric name to its average over all results whose
        ``original_query`` appears in the ground truth:
        ``Retrieval_similarity_avg`` (query vs. retrieved reports),
        ``Semantic_similarity_avg`` (reference vs. answer),
        ``Faithfulness_avg`` (answer vs. retrieved reports),
        ``Answer_relevancy_avg`` (query vs. answer), ``Bert_f1_avg``,
        ``RougeL_avg`` and ``Bleu_avg`` (sacreBLEU score divided by 100).
        Every value is NaN when no result matches a ground-truth query.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
        OSError / json.JSONDecodeError: If a file cannot be read or parsed.
    """
    gt_data = _load_json(gt_file)
    result_data = _load_json(result_file)

    gt_map = {item["query"]: item["report"] for item in gt_data}

    # Loop-invariant: build the ROUGE scorer once instead of once per item.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    retrieval_sims = []
    semantic_sims  = []
    faithfulness   = []
    relevancy      = []
    rouge_scores   = []
    bleu_scores    = []

    # Answer/reference pairs are collected here and scored with BERTScore in a
    # single batched call after the loop; calling bert_score per item reloads
    # the underlying model on every iteration.
    answers    = []
    references = []

    for item in result_data:
        original_query = item["original_query"]
        answer = item["final_answer"]
        retrieved_cases = item["retrieved_cases"]

        # Results without a ground-truth reference cannot be scored.
        if original_query not in gt_map:
            continue

        gt_answer = gt_map[original_query]
        retrieved_docs = [case["report"] for case in retrieved_cases]

        # One encode call per item: query, answer, reference, then retrieved docs.
        embeddings = embed([original_query, answer, gt_answer] + retrieved_docs)
        q_emb, ans_emb, gt_emb = embeddings[0], embeddings[1], embeddings[2]
        retr_embs = embeddings[3:]

        retrieval_sims.append(_mean_cosine(q_emb, retr_embs))
        semantic_sims.append(float(util.cos_sim(gt_emb, ans_emb)))
        faithfulness.append(_mean_cosine(ans_emb, retr_embs))
        relevancy.append(float(util.cos_sim(q_emb, ans_emb)))

        rouge_scores.append(scorer.score(gt_answer, answer)["rougeL"].fmeasure)

        # sacreBLEU reports 0-100; rescale to 0-1 like the other metrics.
        bleu_scores.append(corpus_bleu([answer], [[gt_answer]]).score / 100.0)

        answers.append(answer)
        references.append(gt_answer)

    if answers:
        _, _, f1 = bert_score(answers, references, lang="en", verbose=False)
        bert_f1_scores = f1.tolist()
    else:
        bert_f1_scores = []

    return {
        "Retrieval_similarity_avg": _average(retrieval_sims),
        "Semantic_similarity_avg": _average(semantic_sims),
        "Faithfulness_avg": _average(faithfulness),
        "Answer_relevancy_avg": _average(relevancy),
        "Bert_f1_avg": _average(bert_f1_scores),
        "RougeL_avg": _average(rouge_scores),
        "Bleu_avg": _average(bleu_scores)
    }

# Inputs for the evaluation run: ground-truth reports and the RAG pipeline's answers.
GROUND_TRUTH_PATH = "/content/drive/MyDrive/unseen-data-cal/artifacts/unseen-rag-test.json"
RAG_RESULTS_PATH = "/content/drive/MyDrive/unseen-data-cal/encrypted_rag/user_query_results.json"

scores = compute_all_metrics(GROUND_TRUTH_PATH, RAG_RESULTS_PATH)

# Report each averaged metric on its own line.
print("\nRAG Metrics (Average)\n")
for metric_name, metric_value in scores.items():
    print(f"{metric_name}: {metric_value}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


RAG Metrics (Average)

Retrieval_similarity_avg: 0.3241628714162727
Semantic_similarity_avg: 0.596695714148468
Faithfulness_avg: 0.3716007979993505
Answer_relevancy_avg: 0.5076429891673013
Bert_f1_avg: 0.8619098550286787
RougeL_avg: 0.29714103922149687
Bleu_avg: 0.11366154259983038
