
add faithfulness and critic #1

Merged
Commits
38 commits
8b8d1fe
add langchain loaders to docs
shahules786 Oct 19, 2023
cd7f411
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Oct 20, 2023
5b18325
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Oct 26, 2023
bb8d984
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Oct 26, 2023
9cbb57d
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Oct 29, 2023
479e636
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 7, 2023
3eeb7ea
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 12, 2023
b09003f
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 17, 2023
0d28d62
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 20, 2023
8e7c0c4
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 24, 2023
dd218e1
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 26, 2023
eab12df
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 27, 2023
a0f1b9b
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Nov 29, 2023
2487430
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 1, 2023
fb5064e
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 10, 2023
fd8f458
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 10, 2023
c2f4be8
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 13, 2023
3247925
hindi prompts for faithfulness
shahules786 Dec 17, 2023
a5fcded
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 18, 2023
c63ae06
Added note to answer_relevance (#399)
shahules786 Dec 21, 2023
d8c47d0
Import evaluate function before patching RagasLLM, fixes #301 (#377)
HySoaKa Dec 21, 2023
d090df5
noqa
shahules786 Dec 25, 2023
180ed99
Merge branch 'main' of https://github.com/explodinggradients/ragas in…
shahules786 Dec 25, 2023
f39a9ca
Merge branch 'main' of https://github.com/explodinggradients/ragas
shahules786 Dec 25, 2023
c9b1e6c
Merge branch 'main' into add-metric-prompts
shahules786 Dec 25, 2023
3deb30e
prompt
shahules786 Dec 25, 2023
92261d9
convert json loader
shahules786 Dec 25, 2023
0b5f190
rmv unused imports
shahules786 Dec 25, 2023
c57c1d7
added json loader
shahules786 Dec 25, 2023
6d9ac0d
moved json loader
shahules786 Dec 25, 2023
b8f9ca1
change json loader path
shahules786 Dec 25, 2023
b4087b8
linting change
shahules786 Dec 25, 2023
5de9c35
Fix: Added `ground_truths` arg type-hint info in the eval function (#…
MANISH007700 Dec 25, 2023
0fa6a78
Typo and phrasing correction (#408)
TruscaPetre Dec 25, 2023
c47c550
removed json_loader
shahules786 Dec 26, 2023
7fe653a
added json loader
shahules786 Dec 26, 2023
997e91e
added json loader
shahules786 Dec 26, 2023
26c2837
Merge branch 'main' of https://github.com/explodinggradients/ragas in…
shahules786 Dec 26, 2023
4 changes: 4 additions & 0 deletions docs/concepts/metrics/answer_relevance.md
@@ -2,6 +2,10 @@

The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information. This metric is computed using the `question` and the `answer`, with values ranging between 0 and 1, where higher scores indicate better relevancy.

:::{note}
This is a reference-free metric. If you're looking to compare the ground truth answer with the generated answer, refer to [answer_correctness](./answer_correctness.md)
:::

An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question.

```{hint}
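To make the scoring procedure described above concrete, here is a minimal sketch of the final aggregation step, assuming the LLM-drafted questions and their embeddings are produced elsewhere (the function and variable names below are illustrative, not ragas APIs):

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Standard cosine similarity between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def answer_relevancy_score(question_emb: np.ndarray, generated_question_embs: list[np.ndarray]) -> float:
    # Mean cosine similarity between the original question and the questions
    # the LLM generated back from the answer.
    sims = [cosine_similarity(question_emb, g) for g in generated_question_embs]
    return float(np.mean(sims))

# Toy usage with made-up 3-d "embeddings":
original = np.array([0.9, 0.1, 0.0])
drafts = [np.array([0.8, 0.2, 0.0]), np.array([0.7, 0.3, 0.1])]
print(round(answer_relevancy_score(original, drafts), 3))
```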
8 changes: 4 additions & 4 deletions docs/getstarted/evaluation.md
@@ -64,7 +64,7 @@ from ragas.metrics import (
context_precision,
)
```
here you can see that we are using 4 metrics, but what do the represent?
here you can see that we are using 4 metrics, but what do they represent?

1. faithfulness - the factual consistency of the answer to the context based on the question.
2. context_precision - a measure of how relevant the retrieved context is to the question. Conveys quality of the retrieval pipeline.
@@ -96,9 +96,9 @@ result = evaluate(

result
```
and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.
and there you have it, all the scores you need. `ragas_score` gives you a single metric that you can use while 4 metrics individually would measure the different parts of your pipeline.

now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!
Now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!

```{code-block} python
:caption: export results
@@ -111,4 +111,4 @@ df.head()

And that's it!

if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁
If you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁
5 changes: 3 additions & 2 deletions docs/howtos/customisations/azure-openai.ipynb
@@ -158,6 +158,9 @@
"from langchain.embeddings import AzureOpenAIEmbeddings\n",
"from ragas.llms import LangchainLLM\n",
"\n",
"# Import evaluate before patching the RagasLLM instance\n",
"from ragas import evaluate\n",
"\n",
"azure_model = AzureChatOpenAI(\n",
" deployment_name=\"your-deployment-name\",\n",
" model=\"your-model-name\",\n",
@@ -242,8 +245,6 @@
}
],
"source": [
"from ragas import evaluate\n",
"\n",
"result = evaluate(\n",
" fiqa_eval[\"baseline\"],\n",
" metrics=metrics,\n",
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -26,7 +26,7 @@ def evaluate(

Parameters
----------
dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str], ground_truths: list[list[str]]]
The dataset in the format of ragas which the metrics will use to score the RAG
pipeline with
metrics : list[Metric] , optional
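For reference, a dataset matching the column layout named in this docstring could be assembled like so (the row values below are purely illustrative):

```python
from datasets import Dataset

eval_dataset = Dataset.from_dict(
    {
        # question: list[str]
        "question": ["When was the first transatlantic telegraph cable completed?"],
        # contexts: list[list[str]]
        "contexts": [["The first transatlantic telegraph cable was completed in 1858."]],
        # answer: list[str]
        "answer": ["It was completed in 1858."],
        # ground_truths: list[list[str]]
        "ground_truths": [["The first transatlantic telegraph cable was completed in 1858."]],
    }
)
print(eval_dataset)
```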
57 changes: 29 additions & 28 deletions src/ragas/llms/prompt.py
@@ -13,34 +13,29 @@ class Prompt(PromptValue):
"""
RagasPrompt is a class that represents a prompt for the ragas metrics.
"""

instruction: str
examples: t.List[t.Dict[str, t.Any]] = []
input_keys: t.List[str]
output_key: str
output_type: str = 'json'
output_type: str = "json"

@root_validator
def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
"""
Validate the template string to ensure that it is in desired format.
"""
if values.get("instruction") is None or values.get("instruction") == "":
raise ValueError(
"instruction cannot be empty"
)
raise ValueError("instruction cannot be empty")
if values.get("input_keys") is None or values.get("instruction") == []:
raise ValueError(
"input_keys cannot be empty"
)
raise ValueError("input_keys cannot be empty")
if values.get("output_key") is None or values.get("output_key") == "":
raise ValueError(
"output_key cannot be empty"
)

raise ValueError("output_key cannot be empty")

if values.get("examples"):
output_key = values["output_key"]
for no, example in enumerate(values['examples']):
for inp_key in values['input_keys']:
for no, example in enumerate(values["examples"]):
for inp_key in values["input_keys"]:
if inp_key not in example:
raise ValueError(
f"example {no+1} does not have the variable {inp_key} in the definition"
@@ -49,7 +44,7 @@ def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
raise ValueError(
f"example {no+1} does not have the variable {output_key} in the definition"
)
if values["output_type"] == 'json':
if values["output_type"] == "json":
try:
if output_key in example:
json.loads(example[output_key])
@@ -64,37 +59,43 @@ def to_string(self) -> str:
"""
Generate the prompt string from the variables.
"""
prompt_str = self.instruction + '\n'
prompt_str = self.instruction + "\n"

# Format the examples to match the Langchain prompt template
for example in self.examples:
for key, value in example.items():
value = value.replace('{','{{').replace('}','}}') if self.output_type == 'json' else value
prompt_str += f'\n{key}: {value}'
prompt_str += '\n'

prompt_str += ''.join(f'\n{key}: {{{key}}}' for key in self.input_keys)
prompt_str += f'\n{self.output_key}: \n'
value = (
value.replace("{", "{{").replace("}", "}}")
if self.output_type.lower() == "json"
else value
)
prompt_str += f"\n{key}: {value}"
prompt_str += "\n"

prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys)
prompt_str += f"\n{self.output_key}: \n"

return prompt_str

def to_messages(self) -> t.List[BaseMessage]:
"""Return prompt as a list of Messages."""
return [HumanMessage(content=self.to_string())]

def get_example_str(self, example_no: int) -> str:
"""
Get the example string from the example number.
"""
if example_no >= len(self.examples):
raise ValueError(
f"example number {example_no} is out of range"
)
raise ValueError(f"example number {example_no} is out of range")
example = self.examples[example_no]
example_str = ''
example_str = ""
for key, value in example.items():
value = value.replace('{','{{').replace('}','}}') if self.output_type == 'json' else value
example_str += f'\n{key}: {value}'
value = (
value.replace("{", "{{").replace("}", "}}")
if self.output_type.lower() == "json"
else value
)
example_str += f"\n{key}: {value}"
return example_str

def format(self, **kwargs: t.Any) -> ChatPromptTemplate:
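Going by the fields and methods visible in this diff, a `Prompt` instance might be exercised roughly like this (the instruction and example values are invented for illustration):

```python
from ragas.llms.prompt import Prompt

toy_prompt = Prompt(
    instruction="Classify the sentiment of the given text.",
    examples=[
        # Each example must contain every input key plus the output key;
        # with output_type="json" the output value must be valid JSON.
        {"text": "I loved the movie.", "sentiment": '{"label": "positive"}'}
    ],
    input_keys=["text"],
    output_key="sentiment",
    output_type="json",
)

# Renders the instruction, the brace-escaped example, and the input/output slots.
print(toy_prompt.to_string())
# Renders only the requested few-shot example.
print(toy_prompt.get_example_str(0))
```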
21 changes: 9 additions & 12 deletions src/ragas/metrics/_answer_correctness.py
@@ -6,12 +6,11 @@
import numpy as np
from datasets import Dataset
from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

from ragas.utils import json_loader
from ragas.llms.prompt import Prompt
from ragas.metrics._answer_similarity import AnswerSimilarity
from ragas.metrics.base import EvaluationMode, MetricWithLLM
from ragas.utils import json_loader

if t.TYPE_CHECKING:
from langchain.callbacks.base import Callbacks
@@ -29,7 +28,7 @@
"statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"],
"relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"]
}]
"""
""",
},
{
"question": """What is the boiling point of water?""",
@@ -41,12 +40,12 @@
"statements present in the answer but not found in the ground truth": [],
"relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"]
}]
"""
}
""",
},
],
input_keys=["question", "answer", "ground_truth"],
output_key="Extracted statements",
output_type="json"
output_type="json",
)


@@ -101,9 +100,7 @@ def _score_batch(
) as batch_group:
for q, a, g in zip(question, answer, ground_truths):
prompts.append(
CORRECTNESS_PROMPT.format(
question=q, ground_truth=g[0], answer=a
)
CORRECTNESS_PROMPT.format(question=q, ground_truth=g[0], answer=a)
)

result = self.llm.generate(prompts, callbacks=batch_group)
@@ -113,7 +110,7 @@ def _score_batch(
"FP": "statements present in the answer but not found in the ground truth",
"FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501
}

f1_score = []
for prediction in outputs:
prediction = json_loader.safe_load(prediction[0].text, self.llm)
@@ -131,9 +128,9 @@ def _score_batch(
score = tp / (tp + 0.5 * (fp + fn))
else:
score = np.nan

f1_score.append(score)

similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore
scores_stacked = np.vstack([f1_score, similarity_scores])
scores = np.average(
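As a quick numeric illustration of the F1-style factuality score computed in `_score_batch` above (the completion text is taken from the prompt's second few-shot example; the blend with answer similarity is elided here):

```python
import json

# Completion shaped like the CORRECTNESS_PROMPT few-shot examples above.
raw = """[{
  "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"],
  "statements present in the answer but not found in the ground truth": [],
  "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"]
}]"""

prediction = json.loads(raw)[0]
tp = len(prediction["statements that are present in both the answer and the ground truth"])
fp = len(prediction["statements present in the answer but not found in the ground truth"])
fn = len(prediction["relevant statements found in the ground truth but omitted in the answer"])

f1 = tp / (tp + 0.5 * (fp + fn)) if (tp + fp + fn) > 0 else float("nan")
print(round(f1, 3))  # 1 / (1 + 0.5 * 2) = 0.5
```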
15 changes: 7 additions & 8 deletions src/ragas/metrics/_answer_relevance.py
@@ -7,13 +7,12 @@
from datasets import Dataset
from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

from ragas.embeddings.base import embedding_factory
from ragas.exceptions import OpenAIKeyNotFound
from ragas.utils import json_loader
from ragas.llms.prompt import Prompt
from ragas.metrics.base import EvaluationMode, MetricWithLLM
from ragas.utils import json_loader

if t.TYPE_CHECKING:
from langchain.callbacks.base import Callbacks
@@ -26,27 +25,27 @@
{
"answer": """Albert Einstein was born in Germany.""",
"context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time""",
"output": """{"question":"Where was Albert Einstein born?","noncommittal":false}"""
"output": """{"question":"Where was Albert Einstein born?","noncommittal":false}""",
},
{
"answer": """It can change its skin color based on the temperature of its environment.""",
"context": """A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.""",
"output": """{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}"""
"output": """{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}""",
},
{
"answer": """Everest""",
"context": """The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.""",
"output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}"""
"output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}""",
},
{
"answer": """I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyond 2022. """,
"context": """In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.""",
"output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}"""
}
"output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}""",
},
],
input_keys=["answer", "context"],
output_key="output",
output_type="json"
output_type="json",
)


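The few-shot examples above pin down the JSON shape the question-generation prompt is expected to return; here is a sketch of consuming one such completion (how the `noncommittal` flag is used downstream is not shown in this diff and is only hinted at here):

```python
import json

raw_completion = '{"question":"Where was Albert Einstein born?","noncommittal":false}'
parsed = json.loads(raw_completion)

generated_question = parsed["question"]
noncommittal = parsed["noncommittal"]  # evasive answers are flagged so they can be penalised

print(generated_question, noncommittal)
```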
17 changes: 7 additions & 10 deletions src/ragas/metrics/_context_precision.py
@@ -7,11 +7,10 @@
import numpy as np
from datasets import Dataset
from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

from ragas.utils import json_loader
from ragas.llms.prompt import Prompt
from ragas.metrics.base import EvaluationMode, MetricWithLLM
from ragas.utils import json_loader

if t.TYPE_CHECKING:
from langchain.callbacks.base import Callbacks
@@ -27,7 +26,7 @@
"reason": "The provided context was indeed useful in arriving at the given answer. The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.",
"Verdict": "1"
}
"""
""",
},
{
"question": """who won 2020 icc world cup?""",
@@ -37,7 +36,7 @@
"reason": "the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.",
"verdict": "1"
}
"""
""",
},
{
"question": """What is the tallest mountain in the world?""",
@@ -47,12 +46,12 @@
"reason":"the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.",
"verdict":"0"
}
"""
}
""",
},
],
input_keys=["question", "context", "answer"],
output_key="verification",
output_type="json"
output_type="json",
)


@@ -98,9 +97,7 @@ def _score_batch(
) as batch_group:
for qstn, ctx, answer in zip(questions, contexts, answers):
human_prompts = [
CONTEXT_PRECISION.format(
question=qstn, context=c, answer=answer
)
CONTEXT_PRECISION.format(question=qstn, context=c, answer=answer)
for c in ctx
]

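Each retrieved context chunk gets its own CONTEXT_PRECISION prompt (see the `for c in ctx` loop above) and the model returns a JSON verdict per chunk. One simple way to turn those verdicts into a score is a plain average, shown below; the actual aggregation used by ragas is collapsed out of this diff, so treat this as an assumption:

```python
import json

# One completion per retrieved context chunk, shaped like the few-shot examples above.
completions = [
    '{"reason": "context contains the answer", "verdict": "1"}',
    '{"reason": "context is unrelated to the question", "verdict": "0"}',
]

verdicts = [int(json.loads(c)["verdict"]) for c in completions]
score = sum(verdicts) / len(verdicts) if verdicts else float("nan")
print(score)  # 0.5
```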