In [None]:
# Date: 01.11.24
# Purpose: Testing Deepeval
# Theme: Eval
# Status: POC

In [None]:
# Needed for package installs
!pip install instructor
!pip install deepeval

# -- This is round one: works as a baseline --

In [None]:
# NEEDED - to set up model
# (Part 1)
import os
import google.generativeai as genai
# Set your API key here



from pydantic import BaseModel
import google.generativeai as genai
import instructor

from deepeval.models import DeepEvalBaseLLM

# Define a schema for the response
# Response schema with a single string field. It is handed to
# CustomGeminiFlash.generate as `schema` further down in this cell (also for
# prompts that are not jokes), so the model's answer lands in `.joke`.
class JokeResponseSchema(BaseModel):
    joke: str

class CustomGeminiFlash(DeepEvalBaseLLM):
    """Adapter that exposes a Gemini 1.5 Flash model through DeepEvalBaseLLM."""

    def __init__(self):
        # The base-class __init__ is not called; the model handle is stored directly.
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the GenerativeModel created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as one user message and return the instructor result.

        ``schema`` is forwarded as ``response_model``; callers in this notebook
        pass the pydantic model class itself.
        """
        user_turn = {
            "role": "user",
            "content": prompt,
        }
        # A fresh instructor wrapper is built around the model on every call.
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of the wrapped model."""
        return "Gemini 1.5 Flash"

# Despite its name, `schema_instance` holds the schema *class*, not an instance;
# that class is what generate() receives as `schema`.
schema_instance = JokeResponseSchema

custom_llm = CustomGeminiFlash()

# Smoke test: one structured call through the wrapper.
#print(custom_llm.generate("Write me a joke", schema=schema_instance))
print(
    custom_llm.generate(
        "What is the capital of France?",
        schema=schema_instance,
    )
)



In [None]:
# NOT Needed (just testing above)
# Second smoke test; relies on `custom_llm` and `schema_instance` from the cell above.
print(custom_llm.generate("What is the capital of Spain?", schema=schema_instance))

In [None]:
# Eval One (from GPT I think). Returns reason (that looks valid), but only ever returns 'None' for Score
# -- Keep, but this is a baseline not final
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Define the question and the answer under evaluation.
# The question is stored in `eval_input` rather than `input`, so the built-in
# input() is not shadowed for the rest of the kernel session.
#eval_input = "What is the capital of France?"
#actual_output = "The capital of France is Paris."
#actual_output = "The capital of France is Spain."
#actual_output = (custom_llm.generate("What is the capital of Spain?", schema=schema_instance))

eval_input = "Who is the name of the english premier league team, based out of london whose logo is a cannon?"
#actual_output = "Manchester united."
actual_output = "Arsenal."
#actual_output = (custom_llm.generate("Who is the name of the english premier league team, based out of london whose logo is a cannon?", schema=schema_instance))

# Create a test case
test_case = LLMTestCase(input=eval_input, actual_output=actual_output)

# Initialize the Answer Relevancy Metric
metric = AnswerRelevancyMetric(threshold=0.0, model=custom_llm, include_reason=True)
#metric = AnswerRelevancyMetric(threshold=0.0, model=custom_llm, include_reason=True, verbose_mode=True)
#metric = AnswerRelevancyMetric(threshold=0.0, model=custom_llm, include_reason=True, strict_mode=False)

# Run the metric, then read the result from the metric object. The return value
# of measure() was the source of the "Score: None" noted above; `metric.score`
# and `metric.reason` are populated by the measurement itself.
metric.measure(test_case)
score = metric.score

# Print the score and reason (if included)
print("Score:", score)
print("Reason:", metric.reason)  # This will provide insights into the score

In [None]:
# Eval Two (from DeepEval*). Returns everything, when i use their suggested Q&A, but only partial when i use my LLM output. There must still be
# formatting issues from my LLM output, but this is good progress. Fix the LLM output format and golden
# -- Keep, but this is a baseline not final
# * https://docs.confident-ai.com/docs/metrics-answer-relevancy (lots more metrics here to dig into once this working fully)
# (Part 2)
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# The question is defined once and reused for both generation and evaluation.
eval_question = "Who is the name of the english premier league team, based out of london whose logo is a cannon?"

# generate() returns a schema object (JokeResponseSchema via `schema_instance`),
# not a string. The test case needs the answer text itself, so take the `.joke`
# field instead of passing the whole object.
#actual_output = "We offer a 30-day full refund at no extra cost."
actual_output = custom_llm.generate(eval_question, schema=schema_instance).joke
#actual_output = "Dallas Cowboys"
#actual_output = "American football club"

metric = AnswerRelevancyMetric(
    threshold=0.25,
    model=custom_llm,
    include_reason=True
)
test_case = LLMTestCase(
    #input="What if these shoes don't fit?",
    input=eval_question,
    actual_output=actual_output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

# -- This is round two: limited testing, think works (ie. uses llm inference * scores that.) --

In [None]:
# --- Ground truth calls from llm ---

import os
import google.generativeai as genai

# Configure API key (replace with your actual key)


# Sampling settings for the plain-text (non-structured) Gemini call.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Same blocking threshold for every harm category.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Open a chat with empty history and ask the ground-truth question.
chat_session = model.start_chat(history=[])

#instructional_prompt = "**You are a helpful agent, who answers questions and formats the reponse in a professional way.** "
#prompt = instructional_prompt + text_to_spell
prompt = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"
response = chat_session.send_message(prompt)
print(response.text)
# --- Ground truth ---

In [None]:
# # WIP: recreate part 1
# # retired and replaced with below
# from pydantic import BaseModel
# import google.generativeai as genai
# import instructor

# from deepeval.models import DeepEvalBaseLLM

# # Define your schema class here
# class YourSchemaClass(BaseModel):
#     response: str  # Define the fields you expect in the response

# class CustomGeminiFlash(DeepEvalBaseLLM):
#     def __init__(self, api_key: str):
#         self.api_key = api_key
#         genai.configure(api_key=self.api_key)  # Configure the API key
#         self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

#     def load_model(self):
#         return self.model

#     def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
#         client = self.load_model()
#         instructor_client = instructor.from_gemini(
#             client=client,
#             mode=instructor.Mode.GEMINI_JSON,
#         )
#         resp = instructor_client.messages.create(
#             messages=[
#                 {
#                     "role": "user",
#                     "content": prompt,
#                 }
#             ],
#             response_model=schema,
#             #temperature=0.2,  # Add the temperature parameter here

#         )
#         return resp

#     async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
#         return self.generate(prompt, schema)

#     def get_model_name(self):
#         return "Gemini 1.5 Flash"

# # Now you can use your API key to create an instance

# custom_llm = CustomGeminiFlash(api_key=api_key)

# # Call the generate method
# response = custom_llm.generate("Has Chelase won the FA Cup the most any team has won the FA Cup?", schema=YourSchemaClass)
# print(response)


In [None]:
# WIP: recreate part 1
# Replaces previous version. This one adds temperature
from pydantic import BaseModel
import google.generativeai as genai
import instructor

from deepeval.models import DeepEvalBaseLLM

# Define your schema class here
# Schema passed to CustomGeminiFlash.generate as `schema` in this cell.
class YourSchemaClass(BaseModel):
    response: str  # the only field; printed below as part of the returned object

class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEvalBaseLLM adapter for Gemini 1.5 Flash (temperature 0, JSON mime type)."""

    def __init__(self, api_key: str):
        """Store ``api_key``, configure ``genai`` with it and build the model."""
        self.api_key = api_key
        # Configures the key on the genai module itself, not just on this object.
        genai.configure(api_key=self.api_key)

        self.model = genai.GenerativeModel(
            model_name="models/gemini-1.5-flash",
            generation_config={
                "temperature": 0,
                "max_output_tokens": 8192,
                "top_p": 0.95,
                "top_k": 64,
                "response_mime_type": "application/json",
            },
        )

    def load_model(self):
        """Return the GenerativeModel created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as one user message; ``schema`` is used as ``response_model``."""
        user_turn = {
            "role": "user",
            "content": prompt,
        }
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of the wrapped model."""
        return "Gemini 1.5 Flash"

# Resolve the API key without hard-coding it in the notebook. A key already
# present in the kernel is reused; otherwise it is read from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt.
if "api_key" not in globals():
    import os
    from getpass import getpass

    api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")

custom_llm = CustomGeminiFlash(api_key=api_key)

# Call the generate method
#response = custom_llm.generate("Have aliens been detected on earth and do they walk amongst us?", schema=YourSchemaClass)
response = custom_llm.generate("Has Chelsea won the FA Cup the most any team has won the FA Cup?", schema=YourSchemaClass)

print(response)

In [None]:
# As per above, but reconfiguring so question is asked once only
# Eval Two (from DeepEval*)
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Club name spelled correctly so the evaluated question matches the one asked elsewhere in the notebook.
question = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"

# # Generate output from the LLM using the class as schema
# actual_output_instance = custom_llm.generate(
#     #"Who is the name of the English Premier League team, based out of London whose logo is a cannon?",
#     #"How many times has Chelase won the FA Cup and is that the most any team has won the FA Cup?",
#     #"Has Chelase won the FA Cup the most any team has won the FA Cup?",
#     #question,
#     "No, Chelsea is not the team that has won the FA Cup the most. **Arsenal** holds the record for most FA Cup wins with **14 titles**. \
#     Chelsea has won the FA Cup **8 times**, which places them in 5th place on the list of most FA Cup winners.",
#     schema=YourSchemaClass  # Pass the class, not an instance
# )

# # Extract the response from the actual_output_instance
# actual_output = actual_output_instance.response  # Get the response string

# actual_output = "No, Chelsea is not the team that has won the FA Cup the most. **Arsenal** holds the record for most FA Cup wins with **14 titles**. \
#                  Chelsea has won the FA Cup **8 times**, which places them in 5th place on the list of most FA Cup winners."
#actual_output = "Yes, Chelsea have won the FA Cup the most times. If fact, the have won it 70 times, which is 75 times more than any other team. The next closest team is Sydney FC"
# Hand-written (deliberately wrong) answer that is scored against `question`.
actual_output = "While Arsenal have won the cup an impressive 14 times, yes it is indeed Chelsea who have won the FA Cup the most times"

metric = AnswerRelevancyMetric(threshold=0.75, model=custom_llm, include_reason=True)

#input="Who is the name of the English Premier League team, based out of London whose logo is a cannon?",
#input="Has Chelase won the FA Cup the most any team has won the FA Cup?",
test_case = LLMTestCase(input=question, actual_output=actual_output)

# Score the single case and show the result.
metric.measure(test_case)
print(metric.score)
print(metric.reason)

# Same case again through the bulk runner.
evaluate([test_case], [metric])



# -- This is round three: Merges Part 1 (a & b) + Part 2 + 2a --

In [None]:
# -------------------------
# ----- This is the one ---
# -------------------------
# Following some learnings from discord, it looks like the eval is to eval Q&A NOT generate the A, which is odd. So this will combine part 1 (to generate the A)
# and part 2 (the eval - using the Q&A from part 1)

# ----------------------------------------------
# Candidate passages for the prompt. `context4` is the one used further down in
# this cell (appended to `prompt` and passed as `retrieval_context`).
# NOTE: context, context2 and context3 are single-quoted strings continued with
# a trailing backslash, so the indentation at the start of each continuation
# line is part of the string value.

# Short adventure story (Elara and Leo).
context = "On a sun-drenched Saturday morning, Elara, a spirited young archaeologist with a penchant \
          for adventure, set off with her best friend Leo to explore the ancient ruins of Eldoria, hidden \
          deep within the Whispering Woods. Armed with a tattered map she had discovered in her grandmother (Jenny) \
          attic, they trekked through lush greenery, their excitement palpable. After a few hours of hiking, they \
          stumbled upon a crumbling stone archway overgrown with ivy, marking the entrance to the fabled Temple of Echoes. \
          With only $50 in their pockets, they decided to splurge on a couple of sandwiches and bottled water from a \
          nearby vendor before diving into their exploration. Given they didn't have much money, they only sppent $15 as they wanted \
          to keep some money in case of any emergencies. Inside the temple, they uncovered intricate carvings \
          and mysterious symbols that hinted at a long-lost treasure. As they navigated through dark corridors and \
          avoided booby traps, Elara's heart raced with the thrill of discovery. By sunset, they emerged victorious, \
          not with gold but with priceless knowledge and a newfound bond, proving that sometimes the greatest treasures \
          are the adventures shared along the way."

# News item about the financing of the first John Wick film.
context2 = "Eva Longoria has lifted the lid on the fortune she’s making from a blockbuster film she quietly helped finance a decade ago. \
            Last week, it was revealed the Desperate Housewives actress, 49, had secretly contributed $US6 million to get the first John Wick movie over the line.\
            The franchise has since gone on to produce four films, all starring US actor Keanu Reeves as the titular hit man, with a combined gross of more than $US1 billion.\
            Reminiscing on the action franchise’s 10th anniversary, directors Chad Stahelski and David Leitch told Business Insider they were almost forced to shut \
            down production on the 2014 movie at the 11th hour due to a funding shortfall. \
            After desperately putting feelers out among their contacts for financing, Longoria was the candidate to put her hand up, investing $US6 million of her own money \
            to save the project."

# The "St. Ives" riddle.
context3 = "As I was going to St. Ives,\
            I met a man with seven wives.\
            Every wife had seven sacks,\
            Every sack had seven cats,\
            Every cat had seven kits:\
            Kits, cats, sacks, and wives.\
            How many were going to St. Ives?"

# News item about bringing outsourced government work in-house. Triple-quoted,
# so the line breaks and indentation below are part of the string.
context4 = """Federal government agencies are set to bring $49 million worth of technology services in-house as part of a push to cut contractor numbers in the public service.
              Figures released today by Finance Minister Katy Gallagher reveal that $527 million worth of “core work” is to be brought back in-house in 2024-25 across 104 government agencies.
              The announcement comes a year after Gallagher first revealed the government’s push to phase out contractors and consultants in the Australian Public Service via the Strategic Commissioning Framework.
              According to an update to the framework, which includes an overview of each agency’s 2024-25 target, ICT and digital will account for 22 percent of this core work, with the exception of Defence.
              Defence itself bore the biggest brunt of the outsourcing cuts with a total reduction of $308 million but did not break down specifics of which work would move in-house.
              However, in the recent iTnews Podcast, Defence CIO Chris Crozier revealed that the department’s tech delivery was now at a 60:40 ratio of staff to contractors, down from 20:80.
              The Australian Taxation Office earlier revealed it would be looking to reduce $31.9 million in 2024–25 in outsourcing expenditure for IT, service delivery and data analytics work.
              In the framework report, 67 departments and agencies identified the grouping of 'ICT and digital' as core systems, with 55 of these outsourcing at least some part of it.
              “Agencies report widespread outsourcing of core work in this job family and note it is difficult to bring in-house,” the report stated.
              Since taking office in 2022, Gallagher has made it her mission to reduce dependency on consultants and contractors in the APS, especially following an earlier senate finding that the APS had an "unhealthy reliance on IT contractors".
              “When coming to government we set out with an ambitious agenda to reform the APS, and to strengthen capability, to ensure the APS can deliver the services Australians expect,” Gallagher said in a statement."""

#prompt = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"
#prompt = "Who is the current America president?"
#prompt = "Who will win the up coming presidential election in America?"
#prompt = "Is it safe to go swimming straight after eating?"
#prompt = "In a very concise approach, can you tell me the difference between traditional Machine learning and Generative AI?"
#prompt = "Is it ever justified to break the law if it saves a life?"
#prompt = "Who is Elara and is she an soccer player or archaeologist?"
#prompt = "Can you verify if Elara made any purchases? If so, how much money did she start with and how much did she have in the end?"
#prompt = "Can you summarise the this information and provide the summary as 3 key bullet points?"
#prompt = "Is it fair to say that Leo is much smarter than Elara and that is probably becuase he is at least 5 years older than her?"
#prompt = "How many people are mentioned in this passage?"
#prompt = "How many were going to St. Ives?"
prompt = "What is the key theme of this text?"
# A blank line separates the question from the passage; plain concatenation
# fused the question mark with the first word of the context.
prompt = prompt + "\n\n" + context4

# This is a deliberate attempt to get the answer to be of poor quality and see if that is reflected in the score
misdirection = """you are to deliberately say the opposite of what you find as the answer, i.e. if you can see the correct answer is up, then you must say the answer is down.
                  Additionally, you are to include information that has nothing to do with this topic at hand, i.e. if the question is about soccer, you are to talk about fishing"""
# Same separator here, so the instruction does not run straight into the question.
prompt = misdirection + "\n\n" + prompt
# Comment out the above section if trying to get proper scores.

# -- Part 1a: plain (non-structured) call to the LLM that produces the answer to be evaluated
print('-' * 15, 'part 1', '-' * 15)
# --- Ground truth calls from llm ---

import os
import google.generativeai as genai

# Configure API key (replace with your actual key)
#genai.configure(api_key= )

# Sampling settings for the answer-generating model.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Same blocking threshold for every harm category.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Ask once; `question` and `answer` are what part 2 evaluates.
chat_session = model.start_chat(history=[])
response = chat_session.send_message(prompt)
answer = response.text
#print(response.text)
question = prompt  # note: this is the full prompt, including misdirection and context

qa_banner = ('-' * 10, 'Q&A', '-' * 10)
print(*qa_banner)
print('Question:', question)
print('Answer:', answer)
print(*qa_banner)
print('\n')




# -- Part 1b: (need to find a way to do without this), but at this time this seems to be required for the 'custom_llm' bit
# WIP: recreate part 1
# Replaces previous version. This one adds temperature
from pydantic import BaseModel
import google.generativeai as genai
import instructor

from deepeval.models import DeepEvalBaseLLM

# Define your schema class here
# Schema with a single string field, redefined here so this cell is self-contained.
class YourSchemaClass(BaseModel):
    response: str  # the only field of the schema

class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEvalBaseLLM adapter for Gemini 1.5 Flash (temperature 1, JSON mime type)."""

    def __init__(self, api_key: str):
        """Store ``api_key``, configure ``genai`` with it and build the model."""
        self.api_key = api_key
        # Configures the key on the genai module itself, not just on this object.
        genai.configure(api_key=self.api_key)

        self.model = genai.GenerativeModel(
            model_name="models/gemini-1.5-flash",
            generation_config={
                "temperature": 1,
                "max_output_tokens": 8192,
                "top_p": 0.95,
                "top_k": 64,
                "response_mime_type": "application/json",
            },
        )

    def load_model(self):
        """Return the GenerativeModel created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as one user message; ``schema`` is used as ``response_model``."""
        user_turn = {
            "role": "user",
            "content": prompt,
        }
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of the wrapped model."""
        return "Gemini 1.5 Flash"

# Resolve the API key without hard-coding it in the notebook. A key already
# present in the kernel is reused; otherwise it is read from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt.
if "api_key" not in globals():
    import os
    from getpass import getpass

    api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")

custom_llm = CustomGeminiFlash(api_key=api_key)





# -- Part 2: score the Q&A pair from part 1 with the answer-relevancy metric
print('-' * 15, 'part 2', '-' * 15)
# Eval Two (from DeepEval*)
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# `question` and `answer` come from part 1a above.
actual_output = answer

metric = AnswerRelevancyMetric(threshold=0.75, model=custom_llm, include_reason=True)

#input="Who is the name of the English Premier League team, based out of London whose logo is a cannon?",
#input="Has Chelase won the FA Cup the most any team has won the FA Cup?",
test_case = LLMTestCase(input=question, actual_output=actual_output)

# Score the case, then report score and reason.
metric.measure(test_case)
print('1.0 Metric score:', metric.score)
print('1.1 Metric reason:',metric.reason)
print('\n')

# Optionally evaluate test cases in bulk
#evaluate([test_case], [metric]) # prints out much of the same and not required atm as single response testing in set up

# Optionally evaluate test cases in bulk
#evaluate([test_case], [metric]) # prints out much of the same and not required atm as single response testing in set up





# -- Part 2a: retrieval-oriented metrics on the same Q&A pair
# Source: https://docs.confident-ai.com/docs/guides-rag-evaluation
print('-' * 10, 'part 2a', '-' * 10)
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

contextual_precision = ContextualPrecisionMetric(
    threshold=0.75,
    model=custom_llm,
    include_reason=True,
)
contextual_recall = ContextualRecallMetric(
    threshold=0.75,
    model=custom_llm,
    include_reason=True,
)
contextual_relevancy = ContextualRelevancyMetric(
    threshold=0.75,
    model=custom_llm,
    include_reason=True,
)

# NOTE(review): expected_output is an empty string here; confirm whether the
# precision score is meaningful without a real expected answer.
test_case = LLMTestCase(
    input=question,
    actual_output=actual_output,
    #expected_output="the answer is one", # this goes with context3
    expected_output="",
    retrieval_context=[context4],  # the metric takes a list of passages
    # retrieval_context=["""If you are in the U.S. on an F-1 visa, you are allowed to stay for 60 days after completing
    #                       your degree, unless you have applied for and been approved to participate in OPT."""]
)

contextual_precision.measure(test_case)
print("2.0 Precision score: ", contextual_precision.score)
print("2.1 Precision reason: ", contextual_precision.reason)
print('\n')

# Recall is constructed above but its measurement is currently switched off.
# contextual_recall.measure(test_case)
# print("3.0 Recall score: ", contextual_recall.score)
# print("3.1 Recall reason: ", contextual_recall.reason)
# print('\n')

contextual_relevancy.measure(test_case)
print("4.0 Relevancy score: ", contextual_relevancy.score)
print("4.1 Relevancy reason: ", contextual_relevancy.reason)

In [None]:
#evaluate([test_case], [metric])

In [None]:
# Want to experiment with red teaming

In [None]:
# Source: https://docs.confident-ai.com/docs/red-teaming-introduction

#api_key =   # Replace with your actual API key
#custom_llm = CustomGeminiFlash(api_key=api_key)

from deepeval.red_teaming import RedTeamer

# Resolve the API key without hard-coding it in the notebook. A key already
# present in the kernel is reused; otherwise it is read from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt.
if "api_key" not in globals():
    import os
    from getpass import getpass

    api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")

# Description and system prompt of the (hypothetical) application under attack.
target_purpose = "Provide financial advice, investment suggestions, and answer user queries related to personal finance and market trends."
target_system_prompt = "You are a financial assistant designed to help users with financial planning, investment advice, and market analysis. Ensure accuracy, professionalism, and clarity in all responses."

# The Gemini wrapper is passed as the synthesizer model.
red_teamer = RedTeamer(
    target_purpose=target_purpose,
    target_system_prompt=target_system_prompt,
    synthesizer_model=CustomGeminiFlash(api_key=api_key)
)

#target_model=TargetLLM(),

In [None]:
# Interactive exploration of the RedTeamer API; needs the import from the cell above.
help(RedTeamer)

In [None]:
# Want to experiment with new metrics below

In [None]:
# Can delete below. Was trialing, but already moved above
# Source: https://docs.confident-ai.com/docs/guides-rag-evaluation

from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

contextual_precision = ContextualPrecisionMetric(threshold=0.75, model=custom_llm, include_reason=True)
contextual_recall = ContextualRecallMetric(threshold=0.75, model=custom_llm, include_reason=True)
contextual_relevancy = ContextualRelevancyMetric(threshold=0.75, model=custom_llm, include_reason=True)

# Worked example: the actual output (30 days) contradicts both the expected
# output and the retrieval context (60 days).
# ("gow long" in the question was a typo and is corrected to "how long".)
test_case = LLMTestCase(
    input="I'm on an F-1 visa, how long can I stay in the US after graduation?",
    actual_output="You can stay up to 30 days after completing your degree.",
    expected_output="You can stay up to 60 days after completing your degree.",
    retrieval_context=[
        """If you are in the U.S. on an F-1 visa, you are allowed to stay for 60 days after completing
        your degree, unless you have applied for and been approved to participate in OPT."""
    ]
)

# Measure each metric in turn (precision, recall, relevancy) and print its result.
for rag_metric in (contextual_precision, contextual_recall, contextual_relevancy):
    rag_metric.measure(test_case)
    print("Score: ", rag_metric.score)
    print("Reason: ", rag_metric.reason)

In [None]:
# Follow-up on the chat session left in the kernel by an earlier cell.
prompt = "who is the current American president"
response = chat_session.send_message(prompt)
print(response.text)

In [None]:
# Testing if I can clean up the DeepEval output so it does not include all the text at the end that adds no value.

def remove_substring_and_after(text, substring):
    """Return ``text`` cut off just before the first occurrence of ``substring``.

    When ``substring`` does not occur, ``text`` is returned unchanged.
    """
    cut_at = text.find(substring)
    # str.find reports -1 for "not found"; any other value is a valid cut position.
    return text if cut_at == -1 else text[:cut_at]

# Demo: everything from "Remove" onwards is dropped.
original_text = "This is a sample text. Remove everything after this."
substring_to_remove = "Remove"
result = remove_substring_and_after(text=original_text, substring=substring_to_remove)
print(result)  # Output: "This is a sample text. "

In [None]:
# Shows the score of whichever `metric` object an earlier cell left in the kernel.
print('Metric score:', metric.score)

In [None]:
# -- trying to validate above. It seems when i ask the model a question the answer is ok, but when i repeat that in deepeval, answer is wrong?
# Ask the same question through the structured (instructor) wrapper.
question = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"

# The schema class itself is passed, not an instance of it.
actual_output_instance = custom_llm.generate(question, schema=YourSchemaClass)

# Keep the plain answer text and show the full returned object.
actual_output = actual_output_instance.response
print(actual_output_instance)

In [None]:
# WIP: recreate part 1
# Replaces previous version. This one adds temperature
from pydantic import BaseModel
import google.generativeai as genai
import instructor
from deepeval.models import DeepEvalBaseLLM

# Define your schema class here
# Schema passed to CustomGeminiFlash.generate as `schema` at the end of this cell.
class YourSchemaClass(BaseModel):
    response: str  # the only field of the schema

class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEvalBaseLLM adapter for gemini-1.5-flash (temperature 0, top_k 8, no mime type set)."""

    def __init__(self, api_key: str):
        """Store ``api_key``, configure ``genai`` with it and build the model."""
        self.api_key = api_key
        # Configures the key on the genai module itself, not just on this object.
        genai.configure(api_key=self.api_key)

        self.model = genai.GenerativeModel(
            #model_name="models/gemini-1.5-flash",
            model_name="gemini-1.5-flash",
            generation_config={
                "temperature": 0,
                "top_p": 0.95,
                "top_k": 8,
                "max_output_tokens": 8192,
                #"response_mime_type": "application/json",
            },
        )

    def load_model(self):
        """Return the GenerativeModel created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as one user message; ``schema`` is used as ``response_model``."""
        user_turn = {
            "role": "user",
            "content": prompt,
        }
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the model identifier string."""
        #return "Gemini 1.5 Flash"
        return "gemini-1.5-flash"

# Resolve the API key without hard-coding it in the notebook. A key already
# present in the kernel is reused; otherwise it is read from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt.
if "api_key" not in globals():
    import os
    from getpass import getpass

    api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")

custom_llm = CustomGeminiFlash(api_key=api_key)
response = custom_llm.generate("Has Chelsea won the FA Cup the most any team has won the FA Cup?", schema=YourSchemaClass)
print(response)

In [None]:
# Still trying to get the above code (code 1) to output the correct answer.
from pydantic import BaseModel
import google.generativeai as genai
import instructor
from deepeval.models import DeepEvalBaseLLM

# Response schema passed as `schema=` to CustomGeminiFlash.generate in this
# cell, which forwards it to instructor as `response_model`; the reply is
# parsed into one string field.
# NOTE(review): documented with `#` comments rather than a docstring because a
# pydantic class docstring presumably ends up in the generated schema — confirm.
class YourSchemaClass(BaseModel):
    response: str  # Define the fields you expect in the response

class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEvalBaseLLM subclass that answers prompts with Gemini 1.5 Flash.

    Same wrapper as the previous cell but with temperature 0.2 and a debug
    print of every parsed reply.
    """

    def __init__(self, api_key: str):
        """Configure the google.generativeai SDK with `api_key` and build the model."""
        self.api_key = api_key
        genai.configure(api_key=self.api_key)  # Configure the API key
        # Define generation configuration with temperature
        generation_config = {
            "temperature": 0.2,  # Adjusted temperature
            "top_p": 0.95,
            "top_k": 8,
            "max_output_tokens": 8192
        }
        self.model = genai.GenerativeModel(
            # The comma after model_name was missing, which made the whole
            # cell fail with a SyntaxError.
            model_name="gemini-1.5-flash",
            generation_config=generation_config  # Pass the config here
        )

    def load_model(self):
        """Return the GenerativeModel built in __init__."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send `prompt` as a single user message and return the reply parsed as `schema`.

        The parsed reply is also printed for debugging.
        """
        client = self.load_model()
        instructor_client = instructor.from_gemini(
            client=client,
            mode=instructor.Mode.GEMINI_JSON,
        )
        resp = instructor_client.messages.create(
            messages=[{"role": "user", "content": prompt}],
            response_model=schema,
        )

        # Debug: Print raw response
        print("Raw response:", resp)

        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; it simply calls the synchronous generate()."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the model identifier string."""
        return "gemini-1.5-flash"

# Now you can use your API key to create an instance
# NOTE(review): `api_key` is not assigned in this cell (the line below is
# commented out), so it must already exist in the kernel — confirm where it is set.
#api_key =   # Replace with your actual API key
custom_llm = CustomGeminiFlash(api_key=api_key)

# Example usage: one question, reply parsed into YourSchemaClass and printed
# (generate() also prints the reply itself as "Raw response:").
response = custom_llm.generate("Has Chelsea won the FA Cup the most any team has won the FA Cup?", schema=YourSchemaClass)
print(response)


In [None]:
import google.generativeai as genai
import instructor
from pydantic import BaseModel

# Define a simple schema for the response: a single string field named
# `content`. call_gemini_api below passes this class as `response_model`.
class SimpleResponseModel(BaseModel):
    content: str

# Function to call the Gemini API
def call_gemini_api(api_key: str, prompt: str):
    """Ask Gemini 1.5 Flash `prompt` and print the reply parsed as SimpleResponseModel.

    Args:
        api_key: key handed to genai.configure.
        prompt: text sent as a single user message.

    Returns:
        None; the parsed reply is only printed.
    """
    genai.configure(api_key=api_key)

    # instructor wraps the model so the reply is parsed into the schema
    # (explicit GEMINI_JSON mode).
    structured_client = instructor.from_gemini(
        client=genai.GenerativeModel(model_name="gemini-1.5-flash"),
        mode=instructor.Mode.GEMINI_JSON,
    )

    user_turn = {"role": "user", "content": prompt}
    parsed_reply = structured_client.messages.create(
        messages=[user_turn],
        response_model=SimpleResponseModel,
    )

    print("Raw response:", parsed_reply)

# Example usage
# NOTE(review): `api_key` is not assigned in this cell (the line below is
# commented out), so it must already exist in the kernel — confirm where it is set.
#api_key =  # Replace with your actual API key
call_gemini_api(api_key, "Has Chelsea won the FA Cup the most any team has won the FA Cup?")


In [None]:
!pip install jsonref

In [None]:
import google.generativeai as genai
import instructor
from pydantic import BaseModel

# Define a simple schema for the response: a single string field named
# `content`. call_gemini_api below passes this class as `response_model`.
class SimpleResponseModel(BaseModel):
    content: str

# Function to call the Gemini API
def call_gemini_api(api_key: str, prompt: str):
    """Ask Gemini 1.5 Flash `prompt` and print the reply parsed as SimpleResponseModel.

    Unlike the previous cell's version, no `mode` is passed to
    instructor.from_gemini, so instructor's default applies.

    Args:
        api_key: key handed to genai.configure.
        prompt: text sent as a single user message.

    Returns:
        None; the parsed reply is only printed.
    """
    genai.configure(api_key=api_key)

    gemini_flash = genai.GenerativeModel(model_name="gemini-1.5-flash")
    structured_client = instructor.from_gemini(client=gemini_flash)

    conversation = [{"role": "user", "content": prompt}]
    parsed_reply = structured_client.messages.create(
        messages=conversation,
        response_model=SimpleResponseModel,
    )

    print("Raw response:", parsed_reply)

# Example usage
# NOTE(review): `api_key` is not assigned in this cell (the line below is
# commented out), so it must already exist in the kernel — confirm where it is set.
#api_key =  # Replace with your actual API key
call_gemini_api(api_key, "Has Chelsea won the FA Cup the most any team has won the FA Cup?")


In [None]:
# -- this is a recreation of code 1, but trying to get it to output the result in the format of actual code 1:
import os
from pydantic import BaseModel
import google.generativeai as genai

# Configure API key (replace with your actual key)
# NOTE(review): `api_key` is not assigned here (the line below is commented
# out), so it must already exist in the kernel — confirm where it is set.
#api_key = 
genai.configure(api_key=api_key)

# Define your schema class here.
# Only used further down to wrap the raw reply text in a {"response": ...} object.
class ResponseSchema(BaseModel):
    response: str  # Define the fields you expect in the response

# Define your generation configuration
# (temperature 0; the model is asked for an application/json reply)
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json",
}

# Start chat session
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

chat_session = model.start_chat(history=[])

# Define your prompt
prompt = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"
response = chat_session.send_message(prompt)

# Wrap the response in the schema and print it.
# The reply text is stored as-is in the `response` field; it is not parsed here.
formatted_response = ResponseSchema(response=response.text)
print(formatted_response.json())

In [None]:
# Turning the above into a function. (An `import os` was fused into this comment; `os` is not used in this cell.)
from pydantic import BaseModel
import google.generativeai as genai

# Configure API key (replace with your actual key)
# NOTE(review): `api_key` is not assigned here (the line below is commented
# out), so it must already exist in the kernel — confirm where it is set.
#api_key = 
genai.configure(api_key=api_key)

# Define your schema class here.
# get_response() below uses it to wrap the raw reply text.
class ResponseSchema(BaseModel):
    response: str  # Define the fields you expect in the response

# Define your generation configuration
# (temperature 0; the model is asked for an application/json reply)
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json",
}

# Start chat session
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

# One module-level chat session; every get_response() call sends through it.
chat_session = model.start_chat(history=[])

def get_response(question: str) -> str:
    """Ask the module-level chat session `question` and return the reply as JSON.

    The reply text is wrapped in ResponseSchema and serialised with .json(),
    so the return value is a JSON string with a single "response" key.
    """
    reply = chat_session.send_message(question)
    return ResponseSchema(response=reply.text).json()

# Example usage: `question` is also read by the next cell.
question = "Has Chelsea won the FA Cup the most any team has won the FA Cup?"
print(get_response(question))

In [None]:
# NOTE(review): this rebinds `custom_llm` to the JSON string returned by
# get_response(); any later code expecting the CustomGeminiFlash instance under
# this name (e.g. the commented-out .generate call) will fail — confirm intent.
custom_llm = get_response(question)
#response = custom_llm.generate("Has Chelsea won the FA Cup the most any team has won the FA Cup?", schema=YourSchemaClass)
print(custom_llm)

In [None]:
# Below is test (from earlier than above) and is archived

In [None]:

# Set your API key here
#genai.configure(api_key= )

from pydantic import BaseModel
import google.generativeai as genai
import instructor
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.models import DeepEvalBaseLLM
# LLMTestCase is exported by deepeval.test_case, not deepeval.models; importing
# it from deepeval.models raised ImportError before anything else in the cell ran.
from deepeval.test_case import LLMTestCase


# Define a schema for the response: a single string field named `joke`.
# Passed as `schema=` to CustomGeminiFlash.generate further down in this cell.
class JokeResponseSchema(BaseModel):
    joke: str

class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEvalBaseLLM subclass backed by Gemini 1.5 Flash (archived baseline).

    No API key is taken here; the google.generativeai SDK has to be configured
    before the model is used.
    """

    def __init__(self):
        """Build the GenerativeModel with default generation settings."""
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the GenerativeModel built in __init__."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send `prompt` as a single user message and return the reply parsed as `schema`."""
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        user_turn = {"role": "user", "content": prompt}
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; it simply calls the synchronous generate()."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of the model."""
        return "Gemini 1.5 Flash"

custom_llm = CustomGeminiFlash()

# Generate a joke response
joke_prompt = "Write me a joke"
generated_response = custom_llm.generate(joke_prompt, schema=JokeResponseSchema)

# Extract the joke text from the response
joke_text = generated_response.joke if isinstance(generated_response, JokeResponseSchema) else ""

# Define the ground truth for evaluation
ground_truth = "Why did the scarecrow win an award? Because he was outstanding in his field!"

# Create an LLMTestCase instance.
# LLMTestCase has no `output` or `model` fields: the prompt goes in `input`,
# the model's reply in `actual_output`, and the reference in `expected_output`.
# (The original passed the joke as the input and the reference as the output.)
test_case = LLMTestCase(
    input=joke_prompt,
    actual_output=joke_text,
    expected_output=ground_truth,
)

# Initialize the metric, using the Gemini wrapper as the judge model
metric = AnswerRelevancyMetric(model=custom_llm)

# Measure the relevancy; the result is read from the metric's documented
# `score` attribute rather than relying on measure()'s return value.
metric.measure(test_case)
score = metric.score

print(f"Relevancy Score: {score}")



In [None]:
# Install the Google AI Python SDK (if not already installed)
# !pip install google-generativeai

import os
import google.generativeai as genai
#genai.configure(api_key= )

question = "Can provide 3 bullet points on AI in the workplace?"  # You can change this to any question
# Generation settings: temperature 1, plain-text reply.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
# The same BLOCK_MEDIUM_AND_ABOVE threshold is applied to all four harm categories.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Start chat session
chat_session = model.start_chat(history=[])
# -- Adding prompt: 02.06 10.56 -------
# Define your instructional prompt here
instructional_prompt = "**You are a helpful agent, who answers questions and formats the reponse in a professional way.** "

# Combine the instructional prompt and the question into one message
prompt = instructional_prompt + question
# -- Adding prompt: 02.06 10.56 -------
# Send the combined prompt to the LLM
response = chat_session.send_message(prompt)
# Print the generated response
print(response.text)


In [None]:
pip install deepeval

In [None]:
#genai.configure(api_key=)

import os
import google.generativeai as genai
from deepeval import evaluate  # Ensure you have the correct import

# Configure the Google Generative AI
# genai.configure(api_key='')  # Uncomment and set your API key

# Define the question
question = "Can you provide 3 bullet points on AI in the workplace?"

# Configuration for response generation (temperature 1, plain-text reply)
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Safety settings for the model: the same BLOCK_MEDIUM_AND_ABOVE threshold
# is applied to all four harm categories.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

# Create the model instance
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Start chat session
chat_session = model.start_chat(history=[])

# Define the instructional prompt
instructional_prompt = "**You are a helpful agent, who answers questions and formats the response in a professional way.** "

# Combine prompt and question into one message
prompt = instructional_prompt + question

# Send the combined prompt to the LLM and get the response
response = chat_session.send_message(prompt)

# Print the generated response
print("Generated Response:")
print(response.text)

# Prepare evaluation criteria: a hand-written reference answer that the
# evaluation step further down compares the generated text against.
reference_response = (
    "1. AI can automate repetitive tasks.\n"
    "2. AI can enhance decision-making through data analysis.\n"
    "3. AI can facilitate remote work and collaboration."
)

# Placeholder metric functions: both ignore their inputs and return a constant.
def completeness(prediction, reference):
    """Stub completeness metric; always returns 0.9 regardless of the arguments."""
    stub_score = 0.9  # Dummy return value
    return stub_score

def clarity(prediction, reference):
    """Stub clarity metric; always returns 0.8 regardless of the arguments."""
    stub_score = 0.8  # Dummy return value
    return stub_score

# The metric callables handed to the evaluation step, in order.
metrics = [
    completeness,
    clarity,
]

# Perform the evaluation
# Start from None so that a value left in the kernel by an earlier run of this
# (or another) cell can never be reported as this run's result.
evaluation_results = None
try:
    # Inspect input types
    print("Response type:", type(response.text))
    print("Reference type:", type(reference_response))

    # Perform evaluation with functions
    # NOTE(review): deepeval's evaluate() is documented as taking test cases and
    # metric objects; the predictions/references keywords and plain functions
    # used here presumably raise, which is reported by the handler below —
    # confirm against the deepeval docs.
    evaluation_results = evaluate(
        predictions=[response.text],  # Predictions as a list
        references=[reference_response],  # References as a list
        metrics=metrics  # Metrics as a list of functions
    )
except Exception as e:
    print(f"Error during evaluation: {e}")

# Report the outcome of this run only
if evaluation_results is not None:
    print("Evaluation Results:")
    print(evaluation_results)
else:
    print("Evaluation was not successful.")


In [None]:
pip install rouge-score

In [None]:
import os
import google.generativeai as genai
from deepeval import evaluate  # Ensure you have the correct import
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Configure the Google Generative AI
# genai.configure(api_key='YOUR_API_KEY_HERE')  # Uncomment and set your API key

# Define the question
question = "Can you provide 3 bullet points on AI in the workplace?"

# Configuration for response generation (temperature 1, plain-text reply)
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Safety settings for the model: the same BLOCK_MEDIUM_AND_ABOVE threshold
# is applied to all four harm categories.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

# Create the model instance
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Start chat session
chat_session = model.start_chat(history=[])

# Define the instructional prompt
instructional_prompt = "**You are a helpful agent, who answers questions and formats the response in a professional way.** "

# Combine prompt and question into one message
prompt = instructional_prompt + question

# Send the combined prompt to the LLM and get the response
response = chat_session.send_message(prompt)

# Print the generated response
print("Generated Response:")
print(response.text)

# Prepare evaluation criteria: a hand-written reference answer that the
# metric functions and evaluation step below compare the generated text against.
reference_response = (
    "1. AI can automate repetitive tasks.\n"
    "2. AI can enhance decision-making through data analysis.\n"
    "3. AI can facilitate remote work and collaboration."
)

# Placeholder metric functions: both ignore their inputs and return a constant.
def completeness(prediction, reference):
    """Stub completeness metric; always returns 0.9 regardless of the arguments."""
    stub_score = 0.9  # Dummy return value
    return stub_score

def clarity(prediction, reference):
    """Stub clarity metric; always returns 0.8 regardless of the arguments."""
    stub_score = 0.8  # Dummy return value
    return stub_score

def f1_metric(prediction, reference):
    """Token-set F1 between `prediction` and `reference`.

    Both strings are split on whitespace and compared as sets of unique
    tokens. Returns 0 when the two sets share no token (including when either
    string has no tokens).
    """
    predicted_vocab = set(prediction.split())
    reference_vocab = set(reference.split())
    shared = len(predicted_vocab & reference_vocab)
    if shared == 0:
        # No overlap means precision and recall are both zero.
        return 0
    precision = shared / len(predicted_vocab)
    recall = shared / len(reference_vocab)
    return 2 * (precision * recall) / (precision + recall)

def bleu_metric(prediction, reference):
    """Score `prediction` against a single `reference` with nltk's sentence_bleu.

    Both strings are tokenised by whitespace splitting.
    """
    candidate_tokens = prediction.split()
    reference_list = [reference.split()]
    return sentence_bleu(reference_list, candidate_tokens)

# Shared scorer instance used by rouge_metric.
rouge = Rouge()

def rouge_metric(prediction, reference):
    """Return the ROUGE-1 'f' value for `prediction` scored against `reference`."""
    first_pair_scores = rouge.get_scores(prediction, reference)[0]
    return first_pair_scores['rouge-1']['f']  # F1 score for ROUGE-1

def distinct_n_grams(prediction, n=2):
    """Ratio of distinct n-grams to the number of whitespace tokens in `prediction`.

    Args:
        prediction: text to analyse.
        n: n-gram length (default 2).

    Returns:
        Number of unique n-grams divided by the token count, or 0 when the
        text has no tokens.
    """
    tokens = prediction.split()
    unique_grams = set()
    for start in range(len(tokens) - n + 1):
        unique_grams.add(tuple(tokens[start:start + n]))
    if not tokens:
        return 0
    return len(unique_grams) / len(tokens)

# Define metrics as a list of functions
# NOTE(review): distinct_n_grams takes (prediction, n) rather than
# (prediction, reference) like the other entries — confirm how it is meant to
# be called.
metrics = [completeness, clarity, f1_metric, bleu_metric, rouge_metric, distinct_n_grams]

# Perform the evaluation
# Start from None so that a value left in the kernel by an earlier run of this
# (or another) cell can never be reported as this run's result.
evaluation_results = None
try:
    print("Response type:", type(response.text))
    print("Reference type:", type(reference_response))

    # Perform evaluation with the new metrics
    # NOTE(review): deepeval's evaluate() is documented as taking test cases and
    # metric objects; the predictions/references keywords and plain functions
    # used here presumably raise, which is reported by the handler below —
    # confirm against the deepeval docs.
    evaluation_results = evaluate(
        predictions=[response.text],
        references=[reference_response],
        metrics=metrics
    )
except Exception as e:
    print(f"Error during evaluation: {e}")

# Report the outcome of this run only
if evaluation_results is not None:
    print("Evaluation Results:")
    print(evaluation_results)
else:
    print("Evaluation was not successful.")


In [None]:
pip install openai

In [None]:
import openai

# Set your API key

import requests

# NOTE(review): `api_key` is not assigned in this cell (the line below is
# commented out); whatever value is left in the kernel is sent to
# api.openai.com as the bearer token — confirm it is the OpenAI key and not
# the Gemini key used in earlier cells.
#api_key = 
headers = {
    'Authorization': f'Bearer {api_key}',
}

# GET the model list endpoint and print the decoded JSON body.
response = requests.get('https://api.openai.com/v1/models', headers=headers)
print(response.json())


In [None]:
import requests

# Bearer token is a redacted placeholder; substitute a real key before running.
headers = {
    'Authorization': 'Bearer ...',
    'Content-Type': 'application/json',
}

# Chat-completions payload. 'gpt-3.5-turbo-instruct' is a completions-only
# model and is rejected by /v1/chat/completions, so the chat model
# 'gpt-3.5-turbo' is used to match the endpoint and the `messages` format.
data = {
    'model': 'gpt-3.5-turbo',
    'messages': [{'role': 'user', 'content': 'Hello, how can I test you?'}],
    'max_tokens': 100,
}

response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data)

# Print the decoded JSON body (an error object if the request was rejected).
print(response.json())

In [None]:
#!pip install langchain-openai
#!pip install datasets
!pip install ragas

In [None]:
# Load the "english_v3" configuration of the amnesty_qa dataset.
from datasets import load_dataset
dataset = load_dataset("explodinggradients/amnesty_qa","english_v3")

# Build the ragas evaluation dataset from the "eval" split.
from ragas import EvaluationDataset
eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate

import os
# API key assignment left blank on purpose.
# NOTE(review): presumably OPENAI_API_KEY must be set in the environment for
# the OpenAI-backed evaluator below — confirm.
#os.environ["OPENAI_API_KEY"] = 
#os.environ[""] = 


# Evaluator LLM and embeddings, both OpenAI-backed via LangChain wrappers.
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
#evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Three LLM-judged metrics plus one embedding-based similarity metric.
metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings)
]
results = evaluate(dataset=eval_dataset, metrics=metrics)

# Show the first rows of the results table as the cell output.
df = results.to_pandas()
df.head()

In [None]:
!pip install ragas

In [None]:
import pandas as pd
import numpy as np
from ragas import EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate

# Create a dummy dataset.
# Columns follow the ragas single-turn sample schema (user_input, response,
# retrieved_contexts, reference). The original question/answer/id columns did
# not match it and lacked the contexts and references that the metrics below
# need, so the evaluation could not run. The unused "id" column is dropped.
dummy_data = {
    "user_input": ["What is AI?", "What is machine learning?", "What is deep learning?"],
    "response": ["AI stands for Artificial Intelligence.",
                 "Machine learning is a subset of AI that focuses on data.",
                 "Deep learning is a type of machine learning using neural networks."],
    "retrieved_contexts": [
        ["AI is a field of study."],
        ["Machine learning involves algorithms."],
        ["Deep learning uses neural networks."],
    ],
    "reference": [
        "AI is a field of study that simulates human intelligence.",
        "Machine learning is a branch of AI focused on data-driven predictions.",
        "Deep learning is a subset of machine learning using layered neural networks.",
    ],
}

# Convert the dummy data into a DataFrame
dummy_df = pd.DataFrame(dummy_data)

# Create an evaluation dataset from the dummy DataFrame
eval_dataset = EvaluationDataset.from_pandas(dummy_df)

# Initialize the evaluator LLM and embeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Define metrics for evaluation: three LLM-judged metrics plus one
# embedding-based similarity metric.
metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings)
]

# Evaluate using the dummy dataset and defined metrics
results = evaluate(dataset=eval_dataset, metrics=metrics)

# Display the results as a DataFrame
df = results.to_pandas()
print(df.head())

In [None]:
import os
from getpass import getpass

# The original `os.environ[]` was a redacted key assignment and a SyntaxError
# (and `os` was never imported in this cell). Prompt for the key only when it
# is not already set, so no secret is stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# ------------------
import pandas as pd
from datasets import Dataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate

# Create a dummy dataset with additional required columns
# (retrieved_contexts and reference were added to the question/answer/id data
# used in the previous cell).
dummy_data = {
    "question": ["What is AI?", "What is machine learning?", "What is deep learning?"],
    "answer": ["AI stands for Artificial Intelligence.",
               "Machine learning is a subset of AI that focuses on data.",
               "Deep learning is a type of machine learning using neural networks."],
    "id": [1, 2, 3],
    "retrieved_contexts": [
        ["AI is a field of study."],
        ["Machine learning involves algorithms."],
        ["Deep learning uses neural networks."],
    ],
    "reference": [
        "AI is a field of study that simulates human intelligence.",
        "Machine learning is a branch of AI focused on data-driven predictions.",
        "Deep learning is a subset of machine learning using layered neural networks."
    ]
}

# Convert the dummy data into a DataFrame
dummy_df = pd.DataFrame(dummy_data)

# Create an evaluation dataset from the dummy DataFrame using Hugging Face's Dataset
# NOTE(review): this passes a datasets.Dataset with question/answer column names
# straight to ragas.evaluate; presumably ragas remaps these legacy names — confirm.
eval_dataset = Dataset.from_pandas(dummy_df)

# Initialize the evaluator LLM and embeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Earlier model choices are kept commented out; the dated 0301 snapshot is active.
# NOTE(review): confirm "gpt-3.5-turbo-0301" is still served by the API.
#evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
#evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo-0301"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Define metrics for evaluation: three LLM-judged metrics plus one
# embedding-based similarity metric.
metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings)
]

# Evaluate using the dummy dataset and defined metrics
results = evaluate(dataset=eval_dataset, metrics=metrics)

# Display the results as a DataFrame
df = results.to_pandas()
print(df.head())
