# https://docs.arize.com/phoenix/evaluation/evals

In [1]:
import pandas as pd

df = pd.DataFrame(
    [
        {
            "reference": "The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",
            "query": "Where is the Eiffel Tower located?",
            "response": "The Eiffel Tower is located in Paris, France.",
        },
        {
            "reference": "The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",
            "query": "How long is the Great Wall of China?",
            "response": "The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.",
        },
        {
            "reference": "The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",
            "query": "What is the largest tropical rainforest?",
            "response": "The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.",
        },
        {
            "reference": "Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",
            "query": "Which is the highest mountain on Earth?",
            "response": "Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth.",
        },
        {
            "reference": "The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",
            "query": "What is the longest river in the world?",
            "response": "The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.",
        },
        {
            "reference": "The Mona Lisa was painted by Leonardo da Vinci. It is considered an archetypal masterpiece of the Italian Renaissance and has been described as 'the best known, the most visited, the most written about, the most sung about, the most parodied work of art in the world'.",
            "query": "Who painted the Mona Lisa?",
            "response": "The Mona Lisa was painted by the Italian Renaissance artist Leonardo da Vinci.",
        },
        {
            "reference": "The human body has 206 bones. These bones provide structure, protect organs, anchor muscles, and store calcium.",
            "query": "How many bones are in the human body?",
            "response": "The adult human body typically has 256 bones.",
        },
        {
            "reference": "Jupiter is the largest planet in our solar system. It is a gas giant with a mass more than two and a half times that of all the other planets in the solar system combined.",
            "query": "Which planet is the largest in our solar system?",
            "response": "Jupiter is the largest planet in our solar system.",
        },
        {
            "reference": "William Shakespeare wrote 'Romeo and Juliet'. It is a tragedy about two young star-crossed lovers whose deaths ultimately reconcile their feuding families.",
            "query": "Who wrote 'Romeo and Juliet'?",
            "response": "The play 'Romeo and Juliet' was written by William Shakespeare.",
        },
        {
            "reference": "The first moon landing occurred in 1969. On July 20, 1969, American astronauts Neil Armstrong and Edwin 'Buzz' Aldrin became the first humans to land on the moon as part of the Apollo 11 mission.",
            "query": "When did the first moon landing occur?",
            "response": "The first moon landing took place on July 20, 1969.",
        },
    ]
)
df.head()

Unnamed: 0,reference,query,response
0,"The Eiffel Tower is located in Paris, France. ...",Where is the Eiffel Tower located?,"The Eiffel Tower is located in Paris, France."
1,"The Great Wall of China is over 13,000 miles l...",How long is the Great Wall of China?,"The Great Wall of China is approximately 13,17..."
2,The Amazon rainforest is the largest tropical ...,What is the largest tropical rainforest?,The Amazon rainforest is the largest tropical ...
3,Mount Everest is the highest mountain on Earth...,Which is the highest mountain on Earth?,"Mount Everest, standing at 29,029 feet (8,848 ..."
4,The Nile is the longest river in the world. It...,What is the longest river in the world?,"The Nile River, at 6,650 kilometers (4,132 mil..."


In [6]:
import nest_asyncio
from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

nest_asyncio.apply()  # This is needed for concurrency in notebook environments

# Set your OpenAI API key
eval_model = OpenAIModel(model="gpt-4o")

# Define your evaluators
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_evaluator = QAEvaluator(eval_model)

# We have to make some minor changes to our dataframe to use the column names expected by our evaluators
# for `hallucination_evaluator` the input df needs to have columns 'output', 'input', 'context'
# for `qa_evaluator` the input df needs to have columns 'output', 'input', 'reference'
df["context"] = df["reference"]
df.rename(columns={"query": "input", "response": "output"}, inplace=True)
assert all(column in df.columns for column in ["output", "input", "context", "reference"])

# Run the evaluators, each evaluator will return a dataframe with evaluation results
# We upload the evaluation results to Phoenix in the next step
hallucination_eval_df, qa_eval_df = run_evals(
    dataframe=df, evaluators=[hallucination_evaluator, qa_evaluator], provide_explanation=True
)

run_evals |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s

In [7]:
pd.set_option('display.max_colwidth', None)
results_df = df.copy()
results_df["hallucination_eval"] = hallucination_eval_df["label"]
results_df["hallucination_explanation"] = hallucination_eval_df["explanation"]
results_df["qa_eval"] = qa_eval_df["label"]
results_df["qa_explanation"] = qa_eval_df["explanation"]
results_df.head()

Unnamed: 0,reference,input,output,context,hallucination_eval,hallucination_explanation,qa_eval,qa_explanation
0,"The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",Where is the Eiffel Tower located?,"The Eiffel Tower is located in Paris, France.","The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",factual,"To determine if the answer is factual or hallucinated, we first look at the query, which asks about the location of the Eiffel Tower. The reference text provides the information that the Eiffel Tower is located in Paris, France. The answer given is ""The Eiffel Tower is located in Paris, France."" This answer directly matches the information provided in the reference text. There is no additional information or deviation from the reference text in the answer. Therefore, the answer is consistent with the reference text and does not contain any hallucinated or made-up information.",correct,"To determine if the answer is correct, we need to compare the information provided in the answer with the information in the reference text. \n\n1. The question asks for the location of the Eiffel Tower.\n2. The reference text states that ""The Eiffel Tower is located in Paris, France.""\n3. The answer provided is ""The Eiffel Tower is located in Paris, France.""\n\nThe answer matches exactly with the information given in the reference text regarding the location of the Eiffel Tower. Therefore, the answer correctly and fully answers the question based on the reference text."
1,"The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",How long is the Great Wall of China?,"The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.","The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",hallucinated,"To determine if the answer is factual or hallucinated, we need to compare the information provided in the answer with the reference text. The reference text states that the Great Wall of China is over 13,000 miles long. The answer specifies that the Great Wall is approximately 13,171 miles (21,196 kilometers) long. The reference text does not provide an exact figure, only stating that it is over 13,000 miles. The answer provides a specific length that is not mentioned in the reference text. Since the reference text does not confirm the exact length given in the answer, the answer is considered to be a hallucination of facts, as it assumes information not available in the reference text.",correct,"To determine if the answer is correct, we need to compare the information provided in the answer with the information in the reference text. \n\n1. The question asks about the length of the Great Wall of China.\n2. The reference text states that the Great Wall of China is over 13,000 miles long.\n3. The answer states that the Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.\n4. The answer provides a more precise figure (13,171 miles) which is consistent with the reference text's statement that the wall is over 13,000 miles long.\n5. The reference text does not contradict the answer, and the answer provides additional detail that is plausible and aligns with the reference.\n\nTherefore, the answer correctly answers the question based on the reference text."
2,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",What is the largest tropical rainforest?,The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",hallucinated,"To determine if the answer is factual or hallucinated, we need to compare the information in the answer with the reference text. The query asks about the largest tropical rainforest. The reference text states that the Amazon rainforest is the largest tropical rainforest in the world, which matches the first part of the answer. However, the answer also claims that the Amazon rainforest is home to the largest number of plant and animal species in the world. This information is not mentioned in the reference text. The reference text only provides information about the size and location of the Amazon rainforest, not about the biodiversity or the number of species it contains. Therefore, the claim about the largest number of plant and animal species is not supported by the reference text and is considered a hallucination.",correct,"To determine if the answer is correct, we need to compare the information provided in the answer with the information in the reference text. The question asks for the largest tropical rainforest. The reference text states that the Amazon rainforest is the largest tropical rainforest in the world. The answer also states that the Amazon rainforest is the largest tropical rainforest in the world, which directly matches the information in the reference text. Additionally, the answer provides extra information about the Amazon rainforest being home to the largest number of plant and animal species, but this does not contradict the reference text or the question. Therefore, the answer correctly and fully answers the question based on the reference text."
3,"Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",Which is the highest mountain on Earth?,"Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth.","Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",hallucinated,"First, we identify the query, which asks for the highest mountain on Earth. The reference text clearly states that Mount Everest is the highest mountain on Earth, located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet. The answer provided states that Mount Everest is the highest mountain on Earth and adds specific details about its height, 29,029 feet (8,848 meters). The reference text does not provide the height of Mount Everest, so the specific height mentioned in the answer is not supported by the reference text. Therefore, the answer includes information that is not present in the reference text, making it a hallucination.",correct,"To determine if the answer is correct, we need to compare the information provided in the answer with the information in the reference text. The question asks for the highest mountain on Earth. The reference text states that Mount Everest is the highest mountain on Earth. The answer also states that Mount Everest is the highest mountain on Earth and provides additional information about its height, which is consistent with known data. The additional detail about the height does not contradict the reference text and is accurate. Therefore, the answer correctly identifies Mount Everest as the highest mountain on Earth, which is consistent with the reference text."
4,"The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",What is the longest river in the world?,"The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.","The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",factual,"To determine if the answer is factual or hallucinated, we need to compare the information in the answer with the reference text. The query asks for the longest river in the world. The reference text states that the Nile is the longest river in the world, flowing for approximately 6,650 km (4,132 miles). The answer also states that the Nile River is the longest river in the world and provides the same length of 6,650 kilometers (4,132 miles). Since the answer matches the information provided in the reference text, it does not contain any false information or assumptions not supported by the reference text. Therefore, the answer is factual.",correct,"To determine if the answer is correct, we need to compare the information provided in the answer with the information in the reference text. \n\n1. The question asks for the longest river in the world.\n2. The reference text states that the Nile is the longest river in the world, with a length of approximately 6,650 km (4,132 miles).\n3. The answer states that the Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.\n\nThe answer correctly identifies the Nile River as the longest river in the world and provides the same length as mentioned in the reference text. Therefore, the answer is consistent with the reference text and fully answers the question."


# Evals with Explanations

In [8]:
from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
)

model = OpenAIModel(
    model_name="gpt-4",
    temperature=0.0,
)

#The rails is used to hold the output to specific values based on the template
#It will remove text such as ",,," or "..."
#Will ensure the binary value expected from the template is returned
rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
relevance_classifications = llm_classify(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    model=model,
    rails=rails,
    provide_explanation=True
)
#relevance_classifications is a Dataframe with columns 'label' and 'explanation'

The `model_name` field is deprecated. Use `model` instead.                 This will be removed in a future release.


  relevance_classifications = llm_classify(


llm_classify |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s

In [9]:
rails

['relevant', 'unrelated']

In [10]:
relevance_classifications

Unnamed: 0,label,explanation,exceptions,execution_status,execution_seconds
0,relevant,"The question asks for the location of the Eiffel Tower. The reference text provides this information by stating that the Eiffel Tower is located in Paris, France. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.168204
1,relevant,"The question asks for the length of the Great Wall of China. The reference text provides this information directly by stating that the Great Wall of China is over 13,000 miles long. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.480344
2,relevant,"The question asks for the largest tropical rainforest. The reference text directly provides this information by stating that the Amazon rainforest is the largest tropical rainforest in the world. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.74218
3,relevant,"The question asks for the highest mountain on Earth. The reference text directly provides this information, stating that Mount Everest is the highest mountain on Earth. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.639431
4,relevant,"The question asks for the longest river in the world. The reference text directly provides this information, stating that the Nile is the longest river in the world. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.672552
5,relevant,"The question asks who painted the Mona Lisa. The reference text directly answers this question by stating that the Mona Lisa was painted by Leonardo da Vinci. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.761424
6,relevant,"The question asks for the number of bones in the human body. The reference text provides this information directly by stating that the human body has 206 bones. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.77314
7,relevant,"The question asks for the largest planet in our solar system. The reference text directly provides this information by stating that Jupiter is the largest planet in our solar system. Therefore, the reference text is relevant to the question.",[],COMPLETED,2.853509
8,relevant,"The question asks for the author of 'Romeo and Juliet'. The reference text clearly states that William Shakespeare wrote 'Romeo and Juliet'. Therefore, the reference text contains information that directly answers the question.",[],COMPLETED,3.366698
9,relevant,"The question asks for the date of the first moon landing. The reference text provides this information, stating that the first moon landing occurred on July 20, 1969. Therefore, the reference text is relevant to the question.",[],COMPLETED,3.474249


# Custom Task Evaluation

In [11]:

df = download_benchmark_dataset(
    task="binary-hallucination-classification", dataset_name="halueval_qa_data"
)
df.head()

Unnamed: 0,reference,query,response,is_hallucination
0,"() is a prefecture-level city in northwestern Anhui province, China.Gaozhou is a county-level city in southwestern Guangdong Province, China.",Can Fuyang and Gaozhou be found in the same province?,no,False
1,"() is a prefecture-level city in northwestern Anhui province, China.Gaozhou is a county-level city in southwestern Guangdong Province, China.",Can Fuyang and Gaozhou be found in the same province?,"Yes, Fuyang and Gaozhou are in the same province.",True
2,"""808"" was a success in the United States becoming the group's first top ten hit peaking at number eight on the ""Billboard"" Hot 100 and top five peaking at number four on the Hot R&B/Hip-Hop Singles.The ""Billboard"" Hot 100 is the music industry standard record chart in the United States for singles, published weekly by ""Billboard"" magazine.",808 peaked at number eight on what?,"Billboard"" Hot 100",False
3,"""808"" was a success in the United States becoming the group's first top ten hit peaking at number eight on the ""Billboard"" Hot 100 and top five peaking at number four on the Hot R&B/Hip-Hop Singles.The ""Billboard"" Hot 100 is the music industry standard record chart in the United States for singles, published weekly by ""Billboard"" magazine.",808 peaked at number eight on what?,"""808"" peaked at number nine on ""Billboard"" Hot 100.",True
4,"""Arms"" then made a comeback in 2017 reaching #36 on the iTunes chart passing Auli'i Cravalho's ""How Far I'll Go"" from the Disney movie ""Moana"" (2017).Moana ( ) is a 2016 American 3D computer-animated musical fantasy-adventure film produced by Walt Disney Animation Studios and released by Walt Disney Pictures.","Arms is a song by American singer-songwriter Christina Perri, in 2017, it passed Auli'i Cravalho's, ""How Far I'll Go"" from which 2016, American 3D computer-animated Disney movie?",Moana,False


In [12]:
MY_CUSTOM_TEMPLATE = '''
    You are evaluating the positivity or negativity of the responses to questions.
    [BEGIN DATA]
    ************
    [Question]: {query}
    ************
    [Response]: {response}
    [END DATA]


    Please focus on the tone of the response.
    Your answer must be single word, either "positive" or "negative"
    '''

In [1]:
model = OpenAIModel(model_name="gpt-4",temperature=0.6)

positive_eval = llm_classify(
    dataframe=df,
    template= MY_CUSTOM_TEMPLATE,
    model=model,
    rails=rails
)

NameError: name 'OpenAIModel' is not defined