diff --git a/pyproject.toml b/pyproject.toml
index 10caa953..f7dc4ed8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "uptrain"
-version = "0.6.12"
+version = "0.6.13"
 description = "UpTrain - tool to evaluate LLM applications on aspects like factual accuracy, response quality, retrieval quality, tonality, etc."
 readme = "README.md"
 maintainers = [{ name = "UpTrain AI Team", email = "oss@uptrain.ai" }]
diff --git a/tests/test_builtins.py b/tests/test_builtins.py
index 97642f86..8fa16188 100644
--- a/tests/test_builtins.py
+++ b/tests/test_builtins.py
@@ -199,13 +199,13 @@ def test_check_response_matching():
     output = check.setup(settings).run(response_matching_dataset)
     assert isinstance(output, pl.DataFrame)
     assert (
-        "score_response_matching" in output.columns
+        "score_response_match" in output.columns
         and "explanation_response_matching" in output.columns
     )
     assert (
-        output["score_response_matching"].dtype == pl.Float64
-        and len(output["score_response_matching"])
-        - output["score_response_matching"].null_count()
+        output["score_response_match"].dtype == pl.Float64
+        and len(output["score_response_match"])
+        - output["score_response_match"].null_count()
         > 0
     )
     assert (
@@ -563,8 +563,8 @@ def test_check_guideline_adherence():
 # check = CheckResponseMatching()
 # output = check.setup(settings).run(dataset)
 # assert isinstance(output, pl.DataFrame)
-# assert "score_response_matching" in output.columns and "explanation_response_matching" in output.columns
-# assert output["score_response_matching"].dtype == pl.Float64 and len(output["score_response_matching"]) - output["score_response_matching"].null_count() > 0
+# assert "score_response_match" in output.columns and "explanation_response_matching" in output.columns
+# assert output["score_response_match"].dtype == pl.Float64 and len(output["score_response_match"]) - output["score_response_match"].null_count() > 0
 # assert output["explanation_response_matching"].dtype == pl.Utf8 and len(output["explanation_response_matching"]) - output["explanation_response_matching"].null_count() > 0
 
 
diff --git a/uptrain/framework/builtins.py b/uptrain/framework/builtins.py
index 2b82363b..39d2356a 100644
--- a/uptrain/framework/builtins.py
+++ b/uptrain/framework/builtins.py
@@ -76,7 +76,7 @@ def CheckResponseMatching(method="llm"):
     return Check(
         name="response_matching_score",
         operators=[ResponseMatchingScore(method=method)],
-        plots=[Histogram(x="score_response_matching")],
+        plots=[Histogram(x="score_response_match")],
     )
 
 
diff --git a/uptrain/framework/evalllm.py b/uptrain/framework/evalllm.py
index a073a651..cf5f00c8 100644
--- a/uptrain/framework/evalllm.py
+++ b/uptrain/framework/evalllm.py
@@ -98,7 +98,7 @@ def __init__(self, settings: Settings = None, openai_api_key: str = None) -> None:
        if self.settings.openai_api_key is not None and len(self.settings.openai_api_key):
            response = check_openai_api_key(self.settings.openai_api_key)
            if not response:
-                raise Exception("OpenAI API Key is invalid")
+                raise ValueError("OpenAI API Key is invalid")
 
        self.executor = APIClientWithoutAuth(self.settings)
 
diff --git a/uptrain/operators/language/prompts/few_shots.py b/uptrain/operators/language/prompts/few_shots.py
index 2e1967d4..643cf72a 100644
--- a/uptrain/operators/language/prompts/few_shots.py
+++ b/uptrain/operators/language/prompts/few_shots.py
@@ -438,7 +438,7 @@
 """
 
 
-LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__CLASSIFY = """
+LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__COT = """
 [Response]: Exercise is beneficial for both physical and mental health. It strengthens the body and uplifts the mind.
 [Output]:
 {
diff --git a/uptrain/operators/language/response_quality.py b/uptrain/operators/language/response_quality.py
index db9328ba..dacb03f7 100644
--- a/uptrain/operators/language/response_quality.py
+++ b/uptrain/operators/language/response_quality.py
@@ -903,7 +903,7 @@ def evaluate_local(self, data):
            precision = combined_row[0]["score_factual_accuracy"]
            recall = combined_row[1]["score_factual_accuracy"]
            output = {
-                "score_response_matching": None,
+                "score_response_match": None,
                "explanation_response_matching": None,
                "score_response_match_recall": None,
                "score_response_match_precision": None,
@@ -921,11 +921,11 @@ def evaluate_local(self, data):
            output["explanation_response_matching"] = explanation
 
            if precision != 0 and recall != 0:
-                output["score_response_matching"] = 4 * (
+                output["score_response_match"] = 4 * (
                    (precision * recall) / (precision * 3 + recall)
                )
            else:
-                output["score_response_matching"] = 0.0
+                output["score_response_match"] = 0.0
            output["score_response_match_recall"] = recall
            output["score_response_match_precision"] = precision
            results.append(output)
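
Note on the renamed score, for reviewers: score_response_match is computed in evaluate_local as 4 * ((precision * recall) / (precision * 3 + recall)), which is an F-beta-style combination of the two factual-accuracy scores with beta^2 = 3, i.e. recall is weighted more heavily than precision. A minimal standalone sketch of that arithmetic follows; the helper name response_match_score is hypothetical, introduced here only for illustration, and is not part of the UpTrain API.

def response_match_score(precision: float, recall: float) -> float:
    """Recall-weighted F-beta blend with beta^2 = 3: 4*P*R / (3*P + R).

    Returns 0.0 when either input is 0, mirroring the else-branch
    of evaluate_local in the diff above.
    """
    if precision != 0 and recall != 0:
        return 4 * ((precision * recall) / (precision * 3 + recall))
    return 0.0

# Example: perfect recall with 0.5 precision gives 0.8, reflecting the
# heavier weight on recall: 4 * 0.5 * 1.0 / (3 * 0.5 + 1.0) = 2.0 / 2.5.
assert abs(response_match_score(0.5, 1.0) - 0.8) < 1e-9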