Commit 44fcd88

* Added class EvalPromptfoo
* Added eval_model to EvalPromptfoo

Co-authored-by: Dhruv Chawla <dhruv@uptrain.ai>

1 parent: 1b713a8
Showing 6 changed files with 335 additions and 233 deletions.
examples/integrations/observation_tools/promptfoo/promptfoo.ipynb
313 changes: 81 additions & 232 deletions (large diff not rendered by default)
pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
     "openai>=1.6.1",
     "fsspec",
     "litellm",
+    "pyyaml"
 ]

 [project.urls]
uptrain/integrations/__init__.py
@@ -1,3 +1,4 @@
-__all__ = ["EvalLlamaIndex"]
+__all__ = ["EvalLlamaIndex", "EvalPromptfoo"]

 from .llama_index import EvalLlamaIndex
+from .promptfoo import EvalPromptfoo
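With the export in place, the new evaluator can be imported straight from the integrations package; a minimal sketch (assuming an installed uptrain build that includes this commit):

    from uptrain.integrations import EvalPromptfoo

    evaluator = EvalPromptfoo()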
uptrain/integrations/promptfoo.py
@@ -0,0 +1,115 @@
from __future__ import annotations
from loguru import logger
import polars as pl
import pandas as pd
import typing as t
import subprocess
import time

from uptrain.integrations import promptfoo_utils as pr_u

timestr = time.strftime("%m_%d_%Y-%H:%M:%S")  # used for the default results filename

__all__ = ["EvalPromptfoo"]


class EvalPromptfoo:
    """Runs UpTrain evaluations through the promptfoo CLI."""

    def __init__(self) -> None:
        return None

    def evaluate(
        self,
        evals_list: list,
        evals_weight: list,
        input_data: t.Union[list[dict], pl.DataFrame, pd.DataFrame],
        threshold: float,
        prompts: str,
        providers: list,
        redirect_webview: bool = False,
        output_file: str = "results_" + timestr + ".csv",
        port: t.Optional[int] = None,
        eval_model: str = "gpt-3.5-turbo-1106",
    ):
        # Resolve the port at call time rather than in a default argument,
        # so the probe socket is not opened at import time.
        if port is None:
            port = pr_u.generate_open_port()

        evals_compiled = pr_u.compile_evals(evals_list, evals_weight)

        if isinstance(input_data, pl.DataFrame):
            input_data = input_data.to_dicts()
        elif isinstance(input_data, pd.DataFrame):
            input_data = input_data.to_dict(orient="records")

        # One promptfoo test per input row; each test carries one "python"
        # assert per configured eval.
        yaml_data = {
            "prompts": prompts,
            "providers": providers,
            "tests": [
                {
                    "description": "Data " + str(i + 1),
                    "vars": {
                        "question": input_data[i]["question"],
                        "context": input_data[i]["context"],
                    },
                    "threshold": threshold,
                    "assert": [
                        {
                            "type": "python",
                            "value": pr_u.format_uptrain_template(
                                input_data[i], evals_compiled[j], threshold, eval_model
                            ),
                        }
                        for j in range(len(evals_compiled))
                    ],
                }
                for i in range(len(input_data))
            ],
        }

        pr_u.generate_promptfoo_yaml_file(yaml_data)
        try:
            subprocess.run(
                ["npx", "--yes", "promptfoo@latest", "eval", "-o", output_file],
                check=True,
            )
            logger.success("Evaluations successfully generated")
            logger.success("Saved results to file: " + output_file)
            if redirect_webview:
                try:
                    subprocess.run(
                        ["npx", "promptfoo@latest", "view", "-y", "-p", str(port)]
                    )
                    logger.success(
                        "Successfully rerouted to promptfoo dashboards @ http://localhost:"
                        + str(port)
                        + "/"
                    )
                except Exception as e:
                    logger.error(f"Failed to generate a view: {e}")
            else:
                try:
                    subprocess.run(["npx", "promptfoo@latest", "view", "-p", str(port)])
                    logger.success(
                        "Open http://localhost:"
                        + str(port)
                        + "/ in a web browser to view the dashboards"
                    )
                except Exception as e:
                    logger.error(f"Failed to generate a view: {e}")
        except Exception as e:
            logger.error(f"Evaluation failed with error: {e}")
            raise e
        return None
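For reference, a minimal invocation sketch (not part of the commit; the sample row, evals, prompt, and provider are illustrative). It writes promptfooconfig.yaml, shells out to the promptfoo CLI, and saves a CSV of results:

    from uptrain import Evals
    from uptrain.integrations import EvalPromptfoo

    data = [
        {
            "question": "What is promptfoo?",
            "context": "promptfoo is a CLI and library for evaluating LLM outputs.",
        }
    ]

    EvalPromptfoo().evaluate(
        evals_list=[Evals.CONTEXT_RELEVANCE, Evals.RESPONSE_RELEVANCE],
        evals_weight=[0.5, 0.5],
        input_data=data,
        threshold=0.5,
        prompts="Answer using the context.\nQuestion: {{question}}\nContext: {{context}}",
        providers=["openai:gpt-3.5-turbo"],
    )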
uptrain/integrations/promptfoo_utils.py
@@ -0,0 +1,134 @@
from loguru import logger
import inspect
import yaml
from uptrain import Evals
import socket

DEFAULT_PORT_PROMPTFOO = 15500

"""
compile_evals(evals, weight):
    evals: [Evals.Metric1, Evals.Metric2]
    weight: [Weight1, Weight2]
The weights are used to take a weighted average of the metrics. The function
returns a list of dicts of the form:
    [
        {
            'eval_type': Evals.Metric1,
            'eval_weight': Weight1,
            'score_type': score_metric1,
            'explanation_type': explanation_metric1
        }
    ]
"""


def compile_evals(evals, weight):
    if len(evals) != len(weight):
        logger.error("Length of evals != Length of weight")
        raise ValueError("evals and weight must have the same length")
    if not all(isinstance(_, Evals) for _ in evals):
        logger.error("Please check the list of evals")
        raise TypeError("every entry in evals must be an uptrain Evals member")
    res_dict = [
        {
            "eval_type": str(evals[i]),
            "eval_weight": weight[i],
            "score_type": "score_" + evals[i].value,
            "explanation_type": "explanation_" + evals[i].value,
        }
        for i in range(len(evals))
    ]
    return res_dict
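# Hypothetical illustration (not part of the commit): with the uptrain enum,
#   compile_evals([Evals.CONTEXT_RELEVANCE], [1.0])
# would return a list like
#   [{'eval_type': 'Evals.CONTEXT_RELEVANCE', 'eval_weight': 1.0,
#     'score_type': 'score_context_relevance',
#     'explanation_type': 'explanation_context_relevance'}]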
"""
template_uptrain_eval():
    This is not called directly; it is a template. Its source is extracted
    with inspect and, after placeholder substitution, becomes the "python"
    assert that promptfoo executes. The free names (input_question,
    input_context, eval_type, score_var, explanation_var, threshold_var,
    eval_model) are placeholders filled in by format_uptrain_template;
    `output` is supplied by promptfoo at assertion time.
"""


def template_uptrain_eval():
    from uptrain import EvalLLM, Settings, Evals
    import os

    data = [{"question": input_question, "context": input_context, "response": output}]
    settings = Settings(openai_api_key=os.environ["OPENAI_API_KEY"], model=eval_model)
    eval_llm = EvalLLM(settings=settings)
    results = eval_llm.evaluate(data=data, checks=eval_type)
    if results[0][score_var] > threshold_var:
        return {
            "pass": True,
            "score": results[0][score_var],
            "reason": results[0][explanation_var],
        }
    else:
        return {
            "pass": False,
            "score": results[0][score_var],
            "reason": results[0][explanation_var],
        }


"""
format_uptrain_template(data, evals_compiled, threshold, eval_model):
    Substitutes the per-row values into the template above.
"""


def format_uptrain_template(data, evals_compiled, threshold, eval_model):
    # Drop the `def` line; the substituted body is what promptfoo executes.
    uptrain_template_lines = inspect.getsourcelines(template_uptrain_eval)[0]
    uptrain_template = "".join(uptrain_template_lines[1:])

    # repr() keeps embedded quotes in the data from breaking the generated code.
    uptrain_template = uptrain_template.replace(
        "input_question", repr(data["question"])
    )
    uptrain_template = uptrain_template.replace(
        "input_context", repr(data["context"])
    )
    uptrain_template = uptrain_template.replace(
        "eval_type", "[{}]".format(evals_compiled["eval_type"])
    )
    uptrain_template = uptrain_template.replace(
        "score_var", "'{}'".format(evals_compiled["score_type"])
    )
    uptrain_template = uptrain_template.replace(
        "explanation_var", "'{}'".format(evals_compiled["explanation_type"])
    )
    uptrain_template = uptrain_template.replace("threshold_var", "{}".format(threshold))
    uptrain_template = uptrain_template.replace("eval_model", "'{}'".format(eval_model))
    return uptrain_template
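# Hypothetical illustration (not part of the commit): for a row
# {'question': 'Q?', 'context': 'C.'} and the compiled CONTEXT_RELEVANCE eval,
# the returned string is the template body with, e.g.,
#   data = [{"question": 'Q?', "context": 'C.', "response": output}]
#   results = eval_llm.evaluate(data=data, checks=[Evals.CONTEXT_RELEVANCE])
# ready to be embedded as a promptfoo "python" assert.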
"""
generate_promptfoo_yaml_file(py_obj):
    Generates the yaml config file for promptfoo.
"""


def generate_promptfoo_yaml_file(py_obj):
    try:
        with open("promptfooconfig.yaml", "w") as f:
            yaml.dump(py_obj, f, sort_keys=False)
        logger.success("Successfully generated file: promptfooconfig.yaml")
    except Exception as e:
        logger.error(f"Unable to generate file promptfooconfig.yaml: {e}")


"""
generate_open_port():
    Checks whether the default promptfoo port is free and, if not, asks the
    OS for an open one.
"""


def generate_open_port():
    # connect_ex returns 0 when something is already listening on the port,
    # i.e. the default DEFAULT_PORT_PROMPTFOO is in use.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        port_res = sock.connect_ex(("localhost", DEFAULT_PORT_PROMPTFOO))
    if port_res == 0:
        logger.info("Default port " + str(DEFAULT_PORT_PROMPTFOO) + " already in use")
        # Bind to port 0 to let the OS pick a free port, then release it.
        with socket.socket() as sock:
            sock.bind(("", 0))
            open_port = sock.getsockname()[1]
        logger.info("Using port: " + str(open_port))
    else:
        open_port = DEFAULT_PORT_PROMPTFOO
    return open_port