In [1]:
import pandas as pd
import json
import torch
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
import re 

class CausalRAGEvaluator:
    def __init__(self, json_file_path):
        # Note: Ensure this file exists in your local directory
        with open(json_file_path, 'r') as f:
            self.knowledge_graph = json.load(f)
        
        # mpnet-base-v2 is excellent for semantic similarity in technical domains
        self.eval_model = SentenceTransformer('all-mpnet-base-v2')
        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    import re # Add this import at the top

    def _calculate_diversity(self, retrieved_context):
        # Use regex to split by ASCII arrow, Unicode arrow, OR commas
        # This ensures that even if arrows aren't found, we compare segments of the text
        items = [p.strip().lower() for p in re.split(r'->|→|,', retrieved_context) if len(p.strip()) > 5]
    
        if len(items) <= 1:
            return 0.0
        
        embeddings = self.eval_model.encode(items, convert_to_tensor=True)
        cos_sim_matrix = util.cos_sim(embeddings, embeddings)
        dist_matrix = 1 - cos_sim_matrix
        
        triu_indices = torch.triu_indices(len(items), len(items), offset=1)
        distances = dist_matrix[triu_indices[0], triu_indices[1]]
        
        return distances.mean().item()
        

    def calculate_metrics(self, retrieved_context, truth):
        # Guard clause for empty or failed retrievals
        if "No direct causal paths found" in retrieved_context or not retrieved_context:
            return 0.0, 0.0, 0.0, 0.0, 0.0

        # 1. Diversity (Calculated on the raw string context)
        diversity = self._calculate_diversity(retrieved_context)

        # 2. Tokenize context for structural metrics (Recall/Precision)
        retrieved_items = [p.strip().lower() for p in retrieved_context.split('->')]
        
        # 3. Formal Context Recall
        references = [truth.lower()] 
        hits = sum(1 for item in retrieved_items if any(ref in item for ref in references))
        context_recall = hits / len(references) if references else 0.0

        # 4. Formal Context Precision
        context_precision = hits / len(retrieved_items) if retrieved_items else 0.0

        # 5. Semantic Similarity (Global Context vs Ground Truth)
        embeddings = self.eval_model.encode([retrieved_context, truth], convert_to_tensor=True)
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()

        # 6. ROUGE-L (Overlap-based similarity)
        rouge_scores = self.scorer.score(truth, retrieved_context)
        rouge_l = rouge_scores['rougeL'].fmeasure

        return similarity, context_recall, context_precision, rouge_l, diversity

    def run_evaluation(self, results_data):
        evaluation_results = []
        for item in results_data:
            sim, recall, prec, rouge_l, div = self.calculate_metrics(item['context'], item['truth'])
            
            evaluation_results.append({
                "Query": item['query'],
                "Similarity": round(sim, 4),
                "Diversity": round(div, 4),
                "Context Recall": round(recall, 4),
                "Context Precision": round(prec, 4),
                "ROUGE_L": round(rouge_l, 4),
            })
            
        return pd.DataFrame(evaluation_results)

In [2]:
evaluator = CausalRAGEvaluator('causal_math_graph_llm.json')

## Causal Rag ##

In [5]:
# --- Execute with your data ---
# Assuming 'causal_math_graph_llm.json' is in your directory

data_to_evaluate = [
    {
        "query": "What happens when the circumcenter is on the side of the triangle?",
        "context": "bent or extended or broken sides → a triangle changes shape, broken joints → a triangle changes shape, one pair of corresponding sides of two triangles are in the same proportion as another pair of corresponding sides, and their included angles have the same measure → triangles are similar",
        "truth": "Thales' theorem implies that if the circumcenter is located on the side of the triangle, then the angle opposite that side is a right angle."
    },
    {
        "query": "how many side of squares",
        "context": "formula for the area of a square as the second power of its side length → use of the term squaring to mean raising any number to the second power, a square is a regular polygon → it has the least perimeter for a given area, squares tilted at 45° to the coordinate axes → the 'circles' in taxicab geometry",
        "truth": "A square is a regular quadrilateral that has four straight sides of equal length."
    },
    {
        "query": "what is the circumference of circle",
        "context": "ratio of s to radius r or circumference C → angle's size θ, length of the arc that subtends the angle at the centre of a unit circle → direct physical measurement of angles, length s → angle's size θ",
        "truth": "The size of an angle θ can be measured by taking the ratio of arc length s to the circumference C of the circle."
    },
    {
        "query": "How is the area of a square related to its side length?",
        "context": "formula for the area of a square as the second power of its side length → use of the term squaring to mean raising any number to the second power, a square is a regular polygon → it has the least perimeter for a given area, parallelogram being a rhombus → area can be expressed using sides B and C and angle",
        "truth": "The area of a square is the side length multiplied by itself (the second power of its side length)."
    },
    {
        "query": "What conditions make two triangles similar?",
        "context": "bent or extended or broken sides → a triangle changes shape, one pair of corresponding sides of two triangles are in the same proportion as another pair of corresponding sides, and their included angles have the same measure → triangles are similar, triangles are similar → violation of snark definition",
        "truth": "Triangles are similar if one pair of corresponding sides are in the same proportion as another pair and their included angles have the same measure."
    },
    {
        "query": "What characterizes a tangential quadrilateral?",
        "context": "concyclic intersection points of adjacent angle bisectors → the quadrilateral is a tangential quadrilateral, internal angle bisectors of a convex quadrilateral are concurrent → the quadrilateral is a tangential quadrilateral, orthodiagonal quadrilateral → largest area",
        "truth": "A tangential quadrilateral is a convex quadrilateral where the four sides are tangents to an inscribed circle."
    },
    {
        "query": "What happens to a graph when a cut-set is removed?",
        "context": "removal of a cut-set → disconnection of graph, vertex removal → disconnection of graph, cut whose cut-set has minimum total weight → max-flow min-cut theorem",
        "truth": "The removal of a cut-set from a connected graph disconnects it."
    },
    {
        "query": "How is Brownian motion related to the diffusion constant?",
        "context": "physically measurable quantities → diffusion constant, gravitational forces from surrounding stars → Brownian motion in a massive body, diffusion constant → mean squared displacement of a particle",
        "truth": "The diffusion constant relates to the mean squared displacement of a Brownian particle over time."
    },
    {
        "query": "Why do soap films form minimal area surfaces?",
        "context": "gravity → the liquid tends to drain in a vertical soap film, hole in soap film → rapid opening, liquid drainage → the soap film to thin at the top",
        "truth": "Surface tension leads to surface minimization, causing soap films to act as minimal surfaces that locally minimize their area."
    },
    {
        "query": "What is the relationship between snarks and the four-color theorem?",
        "context": "triangles are similar → violation of snark definition, four-color theorem proven → all snarks shown to be non-planar",
        "truth": "The four-color theorem is true if and only if every snark is a non-planar graph."
    }
]

df = evaluator.run_evaluation(data_to_evaluate)

print("### Formalized RAG Evaluation Results ###")
df

### Formalized RAG Evaluation Results ###


Unnamed: 0,Query,Similarity,Diversity,Context Recall,Context Precision,ROUGE_L
0,What happens when the circumcenter is on the s...,0.3944,0.6193,0.0,0.0,0.169
1,how many side of squares,0.6723,0.7396,0.0,0.0,0.169
2,what is the circumference of circle,0.8098,0.618,0.0,0.0,0.3
3,How is the area of a square related to its sid...,0.8152,0.7307,0.0,0.0,0.3077
4,What conditions make two triangles similar?,0.6527,0.6318,0.0,0.0,0.6111
5,What characterizes a tangential quadrilateral?,0.6357,0.5944,0.0,0.0,0.2857
6,What happens to a graph when a cut-set is remo...,0.7147,0.5752,0.0,0.0,0.3684
7,How is Brownian motion related to the diffusio...,0.7138,0.7205,0.0,0.0,0.4103
8,Why do soap films form minimal area surfaces?,0.6758,0.6765,0.0,0.0,0.1739
9,What is the relationship between snarks and th...,0.7586,0.7784,0.0,0.0,0.3429


In [6]:
df.to_csv('causal rag evalution.csv', index = False)

## Bunny Rag##

In [7]:
data_to_evaluate_2 = [
    {
        "query": "What happens when the circumcenter is on the side of the triangle?",
        "context": "a triangle changes shape, triangles are similar, a triangle will not change shape, tessellated triangles, the 'circles' in taxicab geometry, squared integer, minor-closed property, convincing evidence of Brownian motion, inhomogeneous surface concentrations",
        "truth": "When the circumcenter is located on a side of the triangle, it indicates the triangle is a right triangle, with the circumcenter serving as the midpoint of the hypotenuse."
    },
    {
        "query": "how many side of squares",
        "context": "construction of a square with a given side, squares tilted at 45° to the coordinate axes, a square is a regular polygon, formula for the area of a square as the second power of its side length, specifying the lengths of all three sides, minor-closed property, squared integer, convincing evidence of Brownian motion, Marangoni forces",
        "truth": "A square is a regular polygon that has four equal sides."
    },
    {
        "query": "what is the circumference of circle",
        "context": "size of the circle, ratio of s to radius r or circumference C, length of the arc that subtends the angle at the centre of a unit circle, shapes that would be Euclidean squares become the 'circles, angle's size θ, squared integer, minor-closed property, convincing evidence of Brownian motion, Marangoni forces",
        "truth": "The circumference is the total linear distance around the edge of a circle, calculated using the formula $C = 2\pi r$ or $C = \pi d$."
    },
    {
        "query": "How is the area of a square related to its side length?",
        "context": "formula for the area of a square as the second power of its side length, a square is a regular polygon, area can be expressed using sides B and C and angle, construction of a square with a given side, side length a, squared integer, minor-closed property, convincing evidence of Brownian motion, atoms and molecules exist",
        "truth": "The area of a square is directly proportional to the square of its side length ($A = s^2$)."
    },
    {
        "query": "What conditions make two triangles similar?",
        "context": "triangles are similar, a triangle changes shape, one pair of corresponding sides of two triangles are in the same proportion as another pair of corresponding sides, and their included angles have the same measure, tessellated triangles, a triangle will not change shape, squared integer, minor-closed property, convincing evidence of Brownian motion, inhomogeneous surface concentrations",
        "truth": "Two triangles are similar if their corresponding angles are equal and their corresponding sides are in the same proportion."
    },
    {
        "query": "What characterizes a tangential quadrilateral?",
        "context": "the quadrilateral is a tangential quadrilateral, structural quadrilaterals, orthodiagonal quadrilateral, internal angle bisectors of a convex quadrilateral are concurrent, splitting the quadrilateral into two rigid triangles, minor-closed property, squared integer, convincing evidence of Brownian motion, Marangoni forces",
        "truth": "A tangential quadrilateral is defined by the fact that all four of its sides are tangent to a single inscribed circle (the incircle)."
    },
    {
        "query": "What happens to a graph when a cut-set is removed?",
        "context": "removal of a cut-set, disconnection of graph, cut whose cut-set has minimum total weight, max-flow min-cut theorem, vertex removal, squared integer, minor-closed property, convincing evidence of Brownian motion, inhomogeneous surface concentrations",
        "truth": "Removing a cut-set from a graph results in the graph becoming disconnected, splitting it into two or more separate components."
    },
    {
        "query": "How is Brownian motion related to the diffusion constant?",
        "context": "diffusion constant, analog of Brownian motion, Brownian motion in a massive body, true Brownian dynamics, convincing evidence of Brownian motion, squared integer, minor-closed property, inhomogeneous surface concentrations",
        "truth": "Brownian motion describes the random movement of particles, and its rate of spread is quantified by the diffusion constant through the Einstein relation."
    },
    {
        "query": "Why do soap films form minimal area surfaces?",
        "context": "the soap film to thin at the top, the liquid tends to drain in a vertical soap film, unstable soap film, hole in soap film, soap film does not last forever, squared integer, minor-closed property, convincing evidence of Brownian motion",
        "truth": "Soap films form minimal area surfaces because surface tension acts to minimize the surface energy by pulling the film into the smallest possible area for a given boundary."
    },
    {
        "query": "What is the relationship between snarks and the four-color theorem?",
        "context": "four-color theorem proven, all snarks shown to be non-planar, violation of snark definition, required for snarks, prevents 3-edge-coloring, squared integer, minor-closed property, convincing evidence of Brownian motion, Marangoni forces",
        "truth": "Snarks are non-planar, cubic graphs that are not 3-edge-colorable; the four-color theorem implies that no planar graph can be a snark."
    }
]

df = evaluator.run_evaluation(data_to_evaluate_2)

print("### Formalized RAG Evaluation Results ###")
df

  "truth": "The circumference is the total linear distance around the edge of a circle, calculated using the formula $C = 2\pi r$ or $C = \pi d$."


### Formalized RAG Evaluation Results ###


Unnamed: 0,Query,Similarity,Diversity,Context Recall,Context Precision,ROUGE_L
0,What happens when the circumcenter is on the s...,0.2442,0.8408,0.0,0.0,0.2258
1,how many side of squares,0.5717,0.8411,0.0,0.0,0.209
2,what is the circumference of circle,0.3455,0.8629,0.0,0.0,0.2078
3,How is the area of a square related to its sid...,0.5906,0.8358,0.0,0.0,0.2933
4,What conditions make two triangles similar?,0.5167,0.8002,0.0,0.0,0.2703
5,What characterizes a tangential quadrilateral?,0.5914,0.8149,0.0,0.0,0.1967
6,What happens to a graph when a cut-set is remo...,0.543,0.8711,0.0,0.0,0.1754
7,How is Brownian motion related to the diffusio...,0.6341,0.7829,0.0,0.0,0.1176
8,Why do soap films form minimal area surfaces?,0.5322,0.7307,0.0,0.0,0.1739
9,What is the relationship between snarks and th...,0.639,0.8844,0.0,0.0,0.2034


In [8]:
df.to_csv('bunny rag evalution.csv', index = False)