## How's the performance of our Relevance Feedback Function?

In [1]:
import os
# Placeholder value: replace "..." with a real OpenAI API key (ideally
# supplied from outside the notebook rather than hard-coded here).
os.environ.update({"OPENAI_API_KEY": "..."})

In [2]:
# Imports main tools:
import openai

In [3]:
def relevance(template: str, question: str, statement: str) -> str:
    """
    Ask OpenAI's Chat Completion model to judge relevance via a prompt
    template.

    The template is filled in with `question` and `statement`, then sent
    as a single system message to `gpt-3.5-turbo` at temperature 0.0.

    Parameters:
        template (str): A format string that may reference the
            `{question}` and `{statement}` placeholders.
        question (str): Text substituted for `{question}`.
        statement (str): Text substituted for `{statement}`.

    Returns:
        str: The text content of the first choice's message, returned
        as-is. No parsing or normalisation is done here, so the form of
        the answer (a number, YES/NO, a sentence, ...) depends entirely
        on what the template asks the model for.
    """
    prompt = template.format(question=question, statement=statement)
    completion = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        temperature=0.0,
        messages=[{"role": "system", "content": prompt}],
    )
    # Only the first choice is inspected.
    return completion["choices"][0]["message"]["content"]

In [4]:
# Prompt template asking the model to rate, as a bare number from 1 to 10,
# how relevant STATEMENT is to QUESTION. `{question}` and `{statement}` are
# format placeholders to be substituted before the prompt is sent.
QS_RELEVANCE = """You are a RELEVANCE classifier; providing the relevance of the given STATEMENT to the given QUESTION.
Respond only as a number from 1 to 10 where 1 is the least relevant and 10 is the most relevant.
Never elaborate.

QUESTION: {question}

STATEMENT: {statement}

RELEVANCE: """

In [5]:
# Try the 1-10 scoring prompt on a single question/statement pair.
relevance(
    template=QS_RELEVANCE,
    question="What is the purpose of a porpoise?",
    statement="Porpoises are marine mammals that have two flippers.",
)

'5'

## Llama-Index Eval

In [6]:
# Few-shot YES/NO prompt template: the model is asked whether a piece of
# "Information" is supported by a "Context", with two worked apple-pie
# examples (one YES, one NO) ahead of the real query.
# Note the placeholder mapping: `{question}` fills the "Information" slot
# and `{statement}` fills the "Context" slot.
llamaindex_DEFAULT_EVAL_PROMPT = (
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies tastes bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {question}\n"
    "Context: {statement}\n"
    "Answer: "
)

In [7]:
# Same question/statement pair, this time through the YES/NO prompt.
relevance(
    template=llamaindex_DEFAULT_EVAL_PROMPT,
    question="What is the purpose of a porpoise?",
    statement="Porpoises are marine mammals that have two flippers.",
)

'NO'

## LangChain auto-evaluator

from https://github.com/rlancemartin/auto-evaluator/blob/main/text_utils.py

In [8]:
# Prompt template asking for a True/False relevance verdict on a retrieved
# context (`{statement}`) for a given question (`{question}`).
# This is a non-raw triple-quoted string, so each `\n` below is an escape
# that becomes a real newline in addition to the literal line break after
# it; the leading indentation and trailing spaces are part of the prompt.
autoeval_prompt = """ 
    Given the question: \n
    {question}
    Decide if the following retrieved context is relevant: \n
    {statement}
    Answer in the following format: \n
    "Context is relevant: True or False." \n 
    """

In [9]:
# Same question/statement pair, this time through the True/False prompt.
relevance(
    template=autoeval_prompt,
    question="What is the purpose of a porpoise?",
    statement="Porpoises are marine mammals that have two flippers.",
)

'Context is relevant: True'