In [None]:
!pip install ragas sentence-transformers scikit-learn bert-score rouge-score sacrebleu


In [5]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu


# Shared sentence-embedding model: created once at module level and reused by embed().
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)

def embed(texts):
    """Encode ``texts`` with the module-level ``model`` and return NumPy embeddings.

    The progress bar is disabled so repeated calls do not clutter the output.
    """
    vectors = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return vectors


# FULL METRICS PIPELINE

def _load_json(path):
    """Parse one UTF-8 JSON file, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _mean_cos_sim(anchor, candidates):
    """Return the mean cosine similarity of ``anchor`` to each candidate (0.0 if none)."""
    if len(candidates) == 0:
        return 0.0
    sims = [float(util.cos_sim(anchor, cand)) for cand in candidates]
    return float(np.mean(sims))


def _mean_or_nan(values):
    """Average ``values``; return NaN (without a NumPy warning) when empty."""
    return float(np.mean(values)) if len(values) > 0 else float("nan")


def compute_all_metrics(gt_file, result_file):
    """Compute averaged RAG evaluation metrics for a result file.

    Args:
        gt_file: Path to a JSON list of objects with ``"query"`` and
            ``"report"`` keys; ``"report"`` is used as the ground-truth answer.
        result_file: Path to a JSON list of objects with ``"query"``,
            ``"final_answer"`` and ``"retrieved_docs"`` keys, where every
            retrieved doc is an object with a ``"report"`` key.

    Returns:
        A dict with the average of each metric over all result items whose
        query is present in the ground truth:

        * ``Retrieval_similarity_avg`` - cosine(query, retrieved docs)
        * ``Semantic_similarity_avg`` - cosine(ground truth, answer)
        * ``Faithfulness_avg`` - cosine(answer, retrieved docs)
        * ``Answer_relevancy_avg`` - cosine(query, answer)
        * ``Bert_f1_avg`` - BERTScore F1 of answer vs ground truth
        * ``RougeL_avg`` - ROUGE-L F-measure of answer vs ground truth
        * ``Bleu_avg`` - BLEU of answer vs ground truth, scaled to 0-1

        Every value is NaN when no result item matches a ground-truth query.

    Raises:
        KeyError: If a matched result item or a ground-truth item lacks one
            of the keys listed above.
    """
    gt_data = _load_json(gt_file)
    result_data = _load_json(result_file)

    # Build GT lookup map (a later duplicate query overrides an earlier one).
    gt_map = {item["query"]: item["report"] for item in gt_data}

    # The scorer holds no per-item state, so build it once instead of per item.
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    retrieval_sims = []
    semantic_sims = []
    faithfulness = []
    relevancy = []
    rouge_scores = []
    bleu_scores = []

    # Answer/reference pairs are collected and scored by BERTScore in a single
    # batched call after the loop; calling it per item reloads its model
    # every time.
    answers = []
    references = []

    for item in result_data:
        query = item["query"]

        # Skip items without ground truth before touching their other fields.
        if query not in gt_map:
            continue

        gt_answer = gt_map[query]
        answer = item["final_answer"]
        retrieved_docs = [doc["report"] for doc in item["retrieved_docs"]]

        # Encode everything for this item in one model call.
        embeddings = embed([query, answer, gt_answer] + retrieved_docs)
        q_emb, ans_emb, gt_emb = embeddings[0], embeddings[1], embeddings[2]
        retr_embs = embeddings[3:]

        # Retrieval similarity (query vs each retrieved doc).
        retrieval_sims.append(_mean_cos_sim(q_emb, retr_embs))

        # Semantic similarity (ground truth vs model answer).
        semantic_sims.append(float(util.cos_sim(gt_emb, ans_emb)))

        # Faithfulness proxy (answer vs each retrieved doc).
        faithfulness.append(_mean_cos_sim(ans_emb, retr_embs))

        # Answer relevancy proxy (answer vs query).
        relevancy.append(float(util.cos_sim(q_emb, ans_emb)))

        # ROUGE-L F-measure (reference first, prediction second).
        rouge_scores.append(rouge.score(gt_answer, answer)["rougeL"].fmeasure)

        # BLEU (sacrebleu reports 0-100; normalized to 0-1).
        bleu_scores.append(corpus_bleu([answer], [[gt_answer]]).score / 100.0)

        answers.append(answer)
        references.append(gt_answer)

    # BERTScore F1 for all pairs at once (skipped when nothing matched).
    if answers:
        _, _, f1 = bert_score(answers, references, lang="en", verbose=False)
        bert_f1_scores = f1.tolist()
    else:
        bert_f1_scores = []

    # Return final averages only.
    return {
        "Retrieval_similarity_avg": _mean_or_nan(retrieval_sims),
        "Semantic_similarity_avg": _mean_or_nan(semantic_sims),
        "Faithfulness_avg": _mean_or_nan(faithfulness),
        "Answer_relevancy_avg": _mean_or_nan(relevancy),
        "Bert_f1_avg": _mean_or_nan(bert_f1_scores),
        "RougeL_avg": _mean_or_nan(rouge_scores),
        "Bleu_avg": _mean_or_nan(bleu_scores),
    }



# Evaluate the saved results against the ground truth and print each average.

scores = compute_all_metrics(
    gt_file="user-query.json",
    result_file="user_query_results.json",
)
print("\nRAG Metrics (Average)\n")
for metric_name in scores:
    print(f"{metric_name}: {scores[metric_name]}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


RAG Metrics (Average)

Retrieval_similarity_avg: 0.46856627720235683
Semantic_similarity_avg: 0.5482415168856581
Faithfulness_avg: 0.5538401207369235
Answer_relevancy_avg: 0.5089084484303991
Bert_f1_avg: 0.8718520580894418
RougeL_avg: 0.25296003693268887
Bleu_avg: 0.09781840069904699
