From af1b997098e7e39bc5010f2cf918b8da07eea67b Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Thu, 6 Mar 2025 11:25:00 +0100 Subject: [PATCH 1/2] Better visualization of the evaluation --- llm-complete-guide/ZENML_VERSION.txt | 2 +- llm-complete-guide/pipelines/llm_eval.py | 6 +- llm-complete-guide/steps/create_prompt.py | 7 + .../steps/eval_visualisation.py | 538 +++++++++++++----- llm-complete-guide/steps/url_scraper.py | 18 +- 5 files changed, 428 insertions(+), 143 deletions(-) diff --git a/llm-complete-guide/ZENML_VERSION.txt b/llm-complete-guide/ZENML_VERSION.txt index b7c0622b4..c52842c46 100644 --- a/llm-complete-guide/ZENML_VERSION.txt +++ b/llm-complete-guide/ZENML_VERSION.txt @@ -1 +1 @@ -0.74.0 +0.75.0 diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py index 4c644344b..9112a8ccf 100644 --- a/llm-complete-guide/pipelines/llm_eval.py +++ b/llm-complete-guide/pipelines/llm_eval.py @@ -17,7 +17,7 @@ from typing import Optional import click -from steps.create_prompt import create_prompt +from steps.create_prompt import PROMPT, create_prompt from steps.eval_e2e import e2e_evaluation, e2e_evaluation_llm_judged from steps.eval_retrieval import ( retrieval_evaluation_full, @@ -26,14 +26,14 @@ retrieval_evaluation_small_with_reranking, ) from steps.eval_visualisation import visualize_evaluation_results -from zenml import pipeline +from zenml import pipeline, save_artifact @pipeline(enable_cache=True) def llm_eval(after: Optional[str] = None) -> None: """Executes the pipeline to evaluate a RAG pipeline.""" # define prompt - prompt = create_prompt() + prompt = save_artifact(PROMPT, "prompt") # Retrieval evals failure_rate_retrieval = retrieval_evaluation_small(after=after) diff --git a/llm-complete-guide/steps/create_prompt.py b/llm-complete-guide/steps/create_prompt.py index e2450012d..61465592a 100644 --- a/llm-complete-guide/steps/create_prompt.py +++ b/llm-complete-guide/steps/create_prompt.py @@ -16,6 +16,13 @@ from zenml import log_metadata, step +PROMPT = """ +You are a friendly chatbot. \ +You can answer questions about ZenML, its features and its use cases. \ +You respond in a concise, technically credible tone. \ +You ONLY use the context from the ZenML documentation to provide relevant +answers. \ +""" @step def create_prompt() -> str: diff --git a/llm-complete-guide/steps/eval_visualisation.py b/llm-complete-guide/steps/eval_visualisation.py index 1a5824905..65b26fd02 100644 --- a/llm-complete-guide/steps/eval_visualisation.py +++ b/llm-complete-guide/steps/eval_visualisation.py @@ -12,97 +12,384 @@ # or implied. See the License for the specific language governing # permissions and limitations under the License. -import io -from typing import Annotated, Tuple +from typing import Annotated, Dict, List, Tuple -import matplotlib.pyplot as plt -import numpy as np -from PIL import Image +import plotly.graph_objects as go +from plotly.subplots import make_subplots from zenml import ArtifactConfig, get_step_context, log_metadata, step +from zenml.types import HTMLString - -def create_image( - labels: list, - scores: list, +def create_plotly_bar_chart( + labels: List[str], + scores: List[float], title: str, - alternate_colours: bool = False, + alternate_colors: bool = False, percentage_scale: bool = False, -) -> Image.Image: + descriptions: Dict[str, str] = None, +) -> go.Figure: """ - Create a horizontal bar chart image from the given labels, scores, and title. + Create a horizontal bar chart using Plotly. 
Args: - labels (list): List of labels for the y-axis. - scores (list): List of scores corresponding to each label. + labels (List[str]): List of labels for the y-axis. + scores (List[float]): List of scores corresponding to each label. title (str): Title of the chart. - alternate_colours (bool): Whether to alternate colours for the bars. + alternate_colors (bool): Whether to alternate colors for the bars. percentage_scale (bool): Whether to use a percentage scale (0-100) for the x-axis. + descriptions (Dict[str, str]): Optional descriptions for hover text. Returns: - Image.Image: The generated chart image. + go.Figure: Plotly figure object """ - # Create a new figure and axis with a smaller left margin - fig, ax = plt.subplots(figsize=(10, 6)) - fig.subplots_adjust(left=0.2) # Adjust left margin - - # Plot the horizontal bar chart - y_pos = np.arange(len(labels)) - if alternate_colours: - colors = ["blue" if i % 2 == 0 else "red" for i in range(len(labels))] - ax.barh(y_pos, scores, align="center", color=colors) + # Generate colors for bars + if alternate_colors: + colors = ["rgba(66, 133, 244, 0.8)" if i % 2 == 0 else "rgba(219, 68, 55, 0.8)" for i in range(len(labels))] else: - ax.barh(y_pos, scores, align="center") - - # Display the actual value to the left of each bar, or to the right if value is 0 - for i, v in enumerate(scores): - if v == 0: - ax.text( - 0.3, # Position the text label slightly to the right of 0 - i, - f"{v:.1f}", - color="black", - va="center", - fontweight="bold", - ) - else: - colors[i] if alternate_colours else "blue" - text_color = "white" - ax.text( - v - - 0.1, # Adjust the x-position of the text labels to the left - i, - f"{v:.1f}", - color=text_color, - va="center", - fontweight="bold", - ha="right", # Align the text to the right - ) - - ax.set_yticks(y_pos) - ax.set_yticklabels(labels) - ax.invert_yaxis() # Labels read top-to-bottom - ax.set_xlabel("Score") - if percentage_scale: - ax.set_xlim(0, 100) # Set x-axis limits to 0-100 for percentage scale - ax.set_xlabel("Percentage") + colors = ["rgba(66, 133, 244, 0.8)" for _ in range(len(labels))] + + # Prepare hover text + if descriptions: + hover_text = [f"{label}
<br>Value: {score:.2f}<br>{descriptions.get(label, '')}"
+                      for label, score in zip(labels, scores)]
     else:
-        ax.set_xlim(0, 5)  # Set x-axis limits based on maximum score
-        ax.set_xlabel("Score")
+        hover_text = [f"{label}<br>
Value: {score:.2f}" for label, score in zip(labels, scores)] + + # Create figure + fig = go.Figure() + + fig.add_trace( + go.Bar( + y=labels, + x=scores, + orientation='h', + marker_color=colors, + text=[f"{score:.2f}" for score in scores], + textposition='auto', + hovertext=hover_text, + hoverinfo='text', + ) + ) + + # Set layout + max_value = max(scores) if scores else 5 + xaxis_range = [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)] + xaxis_title = "Percentage (%)" if percentage_scale else "Score" + + fig.update_layout( + title=title, + xaxis=dict( + title=xaxis_title, + range=xaxis_range, + showgrid=True, + gridcolor='rgba(230, 230, 230, 0.8)', + ), + yaxis=dict( + autorange="reversed", # Make labels read top-to-bottom + ), + margin=dict(l=20, r=20, t=60, b=20), + height=max(300, 70 * len(labels)), + plot_bgcolor='rgba(255, 255, 255, 1)', + ) + + return fig + + +def generate_evaluation_html( + pipeline_run_name: str, + retrieval_labels: List[str], + retrieval_scores: List[float], + generation_basic_labels: List[str], + generation_basic_scores: List[float], + generation_quality_labels: List[str], + generation_quality_scores: List[float], + metrics_metadata: Dict[str, float], +) -> str: + """ + Generate a comprehensive HTML report with all evaluation visualizations. + + Args: + pipeline_run_name (str): Name of the pipeline run + retrieval_labels (List[str]): Labels for retrieval metrics + retrieval_scores (List[float]): Scores for retrieval metrics + generation_basic_labels (List[str]): Labels for basic generation metrics + generation_basic_scores (List[float]): Scores for basic generation metrics + generation_quality_labels (List[str]): Labels for generation quality metrics + generation_quality_scores (List[float]): Scores for generation quality metrics + metrics_metadata (Dict[str, float]): All metrics for displaying in the summary + + Returns: + str: HTML string containing the interactive dashboard + """ + # Metric descriptions for hovering + metric_descriptions = { + "Small Retrieval Eval Failure Rate": + "Percentage of small test cases where retrieval failed to find relevant documents.", + "Small Retrieval Eval Failure Rate Reranking": + "Percentage of small test cases where retrieval with reranking failed to find relevant documents.", + "Full Retrieval Eval Failure Rate": + "Percentage of all test cases where retrieval failed to find relevant documents.", + "Full Retrieval Eval Failure Rate Reranking": + "Percentage of all test cases where retrieval with reranking failed to find relevant documents.", + "Failure Rate Bad Answers": + "Percentage of responses that were factually incorrect or misleading.", + "Failure Rate Bad Immediate Responses": + "Percentage of immediate responses that did not adequately address the query.", + "Failure Rate Good Responses": + "Percentage of responses rated as good by evaluators.", + "Average Toxicity Score": + "Average score measuring harmful, offensive, or inappropriate content (lower is better).", + "Average Faithfulness Score": + "Average score measuring how accurately the response represents the source material (higher is better).", + "Average Helpfulness Score": + "Average score measuring the practical utility of responses to users (higher is better).", + "Average Relevance Score": + "Average score measuring how well responses address the specific query intent (higher is better).", + } - ax.set_title(title) + # Create individual charts + retrieval_fig = create_plotly_bar_chart( + retrieval_labels, + retrieval_scores, + 
f"Retrieval Evaluation Metrics", + alternate_colors=True, + descriptions=metric_descriptions + ) + + generation_basic_fig = create_plotly_bar_chart( + generation_basic_labels, + generation_basic_scores, + f"Basic Generation Metrics", + percentage_scale=True, + descriptions=metric_descriptions + ) + + generation_quality_fig = create_plotly_bar_chart( + generation_quality_labels, + generation_quality_scores, + f"Generation Quality Metrics", + descriptions=metric_descriptions + ) - # Adjust the subplot parameters - plt.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1) + # Create summary metrics cards + composite_quality = metrics_metadata.get("composite.overall_quality", 0) + retrieval_effectiveness = metrics_metadata.get("composite.retrieval_effectiveness", 0) + + # Combine into complete HTML report + html = f""" + + + + Evaluation Results: {pipeline_run_name} + + + + +
+

LLM Evaluation Results: {pipeline_run_name}

+ +
+
+

Overall Quality Score

+
{composite_quality:.2f}
+

Average of faithfulness, helpfulness, and relevance

+
+
+

Retrieval Effectiveness

+
{retrieval_effectiveness:.2f}
+

Average success rate across retrieval tests

+
+
+

Toxicity

+
{metrics_metadata.get("quality.toxicity", 0):.2f}
+

Average toxicity score (lower is better)

+
+
- # Save the plot to a BytesIO object - buf = io.BytesIO() - plt.savefig(buf, format="png", bbox_inches="tight") - buf.seek(0) +
+
All Metrics
+
Retrieval
+
Generation
+
Quality
+
- # Create a PIL Image object from the BytesIO object - image = Image.open(buf) +
+
+
+
+ +

All Metrics

+ + + + + + {"".join(f'' for k, v in metrics_metadata.items())} +
MetricValue
{k}{v:.4f}
+
+ +
+
+
+

About Retrieval Metrics

+

+ These metrics measure how effectively the system retrieves relevant documents for answering queries. + Lower failure rates indicate better retrieval performance. Reranking shows the impact of the reranking + algorithm on improving retrieval quality. +

+
+
+ +
+
+
+

About Generation Failure Metrics

+

+ These metrics measure different types of failures in response generation: +

    +
  • Bad Answers: Responses that are factually incorrect or misleading
  • +
  • Bad Immediate Responses: Initial responses that don't address the query adequately
  • +
  • Good Responses: The percentage of responses rated as good (higher is better)
  • +
+

+
+
+ +
+
+
+

About Quality Metrics

+

+ These metrics evaluate the quality of generated responses across different dimensions: +

    +
  • Toxicity: Measures harmful or inappropriate content (lower is better)
  • +
  • Faithfulness: Measures accuracy to source material (higher is better)
  • +
  • Helpfulness: Measures practical utility to users (higher is better)
  • +
  • Relevance: Measures alignment with query intent (higher is better)
  • +
+

+
+
+
- return image + + + + """ + + return HTMLString(html) @step(enable_cache=False) @@ -118,13 +405,9 @@ def visualize_evaluation_results( average_faithfulness_score: float, average_helpfulness_score: float, average_relevance_score: float, -) -> Tuple[ - Annotated[Image.Image, ArtifactConfig(name="retrieval_eval_metrics")], - Annotated[Image.Image, ArtifactConfig(name="generation_eval_basic")], - Annotated[Image.Image, ArtifactConfig(name="generation_eval_full")], -]: +) -> Annotated[HTMLString, "evaluation_dashboard"]: """ - Visualize the evaluation results by creating three separate images and logging metrics. + Visualize the evaluation results by creating an interactive HTML dashboard. Args: small_retrieval_eval_failure_rate (float): Small retrieval evaluation failure rate. @@ -140,60 +423,61 @@ def visualize_evaluation_results( average_relevance_score (float): Average relevance score. Returns: - Tuple[Image.Image, Image.Image, Image.Image]: A tuple of three images visualizing the evaluation results. + str: HTML content for the interactive evaluation dashboard. """ step_context = get_step_context() pipeline_run_name = step_context.pipeline_run.name - # Log all metrics as metadata for dashboard visualization - log_metadata( - metadata={ - # Retrieval metrics - "retrieval.small_failure_rate": small_retrieval_eval_failure_rate, - "retrieval.small_failure_rate_reranking": small_retrieval_eval_failure_rate_reranking, - "retrieval.full_failure_rate": full_retrieval_eval_failure_rate, - "retrieval.full_failure_rate_reranking": full_retrieval_eval_failure_rate_reranking, - # Generation failure metrics - "generation.failure_rate_bad_answers": failure_rate_bad_answers, - "generation.failure_rate_bad_immediate": failure_rate_bad_immediate_responses, - "generation.failure_rate_good": failure_rate_good_responses, - # Quality metrics - "quality.toxicity": average_toxicity_score, - "quality.faithfulness": average_faithfulness_score, - "quality.helpfulness": average_helpfulness_score, - "quality.relevance": average_relevance_score, - # Composite scores - "composite.overall_quality": ( - average_faithfulness_score - + average_helpfulness_score - + average_relevance_score - ) - / 3, - "composite.retrieval_effectiveness": ( - (1 - small_retrieval_eval_failure_rate) - + (1 - full_retrieval_eval_failure_rate) - ) - / 2, - } - ) + # Calculate composite metrics + composite_overall_quality = ( + average_faithfulness_score + + average_helpfulness_score + + average_relevance_score + ) / 3 + + composite_retrieval_effectiveness = ( + (1 - small_retrieval_eval_failure_rate/100) + + (1 - full_retrieval_eval_failure_rate/100) + ) / 2 - normalized_scores = [ - score / 20 - for score in [ - small_retrieval_eval_failure_rate, - small_retrieval_eval_failure_rate_reranking, - full_retrieval_eval_failure_rate, - full_retrieval_eval_failure_rate_reranking, - ] - ] + # Collect all metrics for dashboard and logging + metrics_metadata = { + # Retrieval metrics + "retrieval.small_failure_rate": small_retrieval_eval_failure_rate, + "retrieval.small_failure_rate_reranking": small_retrieval_eval_failure_rate_reranking, + "retrieval.full_failure_rate": full_retrieval_eval_failure_rate, + "retrieval.full_failure_rate_reranking": full_retrieval_eval_failure_rate_reranking, + # Generation failure metrics + "generation.failure_rate_bad_answers": failure_rate_bad_answers, + "generation.failure_rate_bad_immediate": failure_rate_bad_immediate_responses, + "generation.failure_rate_good": failure_rate_good_responses, + # Quality metrics + 
"quality.toxicity": average_toxicity_score, + "quality.faithfulness": average_faithfulness_score, + "quality.helpfulness": average_helpfulness_score, + "quality.relevance": average_relevance_score, + # Composite scores + "composite.overall_quality": composite_overall_quality, + "composite.retrieval_effectiveness": composite_retrieval_effectiveness, + } + + # Log all metrics as metadata for dashboard visualization + log_metadata(metadata=metrics_metadata) + # Prepare data for visualization image1_labels = [ "Small Retrieval Eval Failure Rate", "Small Retrieval Eval Failure Rate Reranking", "Full Retrieval Eval Failure Rate", "Full Retrieval Eval Failure Rate Reranking", ] - image1_scores = normalized_scores + # Note: No need to normalize scores for Plotly visualization + image1_scores = [ + small_retrieval_eval_failure_rate, + small_retrieval_eval_failure_rate_reranking, + full_retrieval_eval_failure_rate, + full_retrieval_eval_failure_rate_reranking, + ] image2_labels = [ "Failure Rate Bad Answers", @@ -219,22 +503,16 @@ def visualize_evaluation_results( average_relevance_score, ] - image1 = create_image( + # Generate the HTML dashboard + html_content = generate_evaluation_html( + pipeline_run_name, image1_labels, image1_scores, - f"Retrieval Evaluation Metrics for {pipeline_run_name}", - alternate_colours=True, - ) - image2 = create_image( image2_labels, image2_scores, - f"Basic Generation Evaluation for {pipeline_run_name}", - percentage_scale=True, - ) - image3 = create_image( image3_labels, image3_scores, - f"Generation Evaluation (Average Scores for {pipeline_run_name})", + metrics_metadata, ) - return image1, image2, image3 + return html_content diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py index 0e41cff3a..4efed534c 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -48,15 +48,15 @@ def url_scraper( "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", "https://docs.zenml.io/how-to/debug-and-solve-issues", "https://docs.zenml.io/stack-components/step-operators/azureml", - # "https://docs.zenml.io/how-to/interact-with-secrets", - # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/service-connectors-guide", - # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/hyperai-service-connector", - # "https://docs.zenml.io/stack-components/data-validators/evidently", - # "https://docs.zenml.io/stack-components/data-validators", - # "https://docs.zenml.io/stack-components/step-operators/sagemaker", - # "https://docs.zenml.io/stack-components/alerters/slack", - # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/kubernetes-service-connector", - # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/azure-service-connector" + "https://docs.zenml.io/how-to/interact-with-secrets", + "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/service-connectors-guide", + "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/hyperai-service-connector", + "https://docs.zenml.io/stack-components/data-validators/evidently", + "https://docs.zenml.io/stack-components/data-validators", + "https://docs.zenml.io/stack-components/step-operators/sagemaker", + "https://docs.zenml.io/stack-components/alerters/slack", + "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/kubernetes-service-connector", + 
"https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/azure-service-connector" ] else: docs_urls = get_all_pages(docs_url) From 8fa36e2012e70cc8f1c236cce53963721b790695 Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Mon, 10 Mar 2025 11:53:22 +0100 Subject: [PATCH 2/2] Update llm-complete-guide/steps/url_scraper.py --- llm-complete-guide/steps/url_scraper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py index 4efed534c..0e41cff3a 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -48,15 +48,15 @@ def url_scraper( "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", "https://docs.zenml.io/how-to/debug-and-solve-issues", "https://docs.zenml.io/stack-components/step-operators/azureml", - "https://docs.zenml.io/how-to/interact-with-secrets", - "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/service-connectors-guide", - "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/hyperai-service-connector", - "https://docs.zenml.io/stack-components/data-validators/evidently", - "https://docs.zenml.io/stack-components/data-validators", - "https://docs.zenml.io/stack-components/step-operators/sagemaker", - "https://docs.zenml.io/stack-components/alerters/slack", - "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/kubernetes-service-connector", - "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/azure-service-connector" + # "https://docs.zenml.io/how-to/interact-with-secrets", + # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/service-connectors-guide", + # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/hyperai-service-connector", + # "https://docs.zenml.io/stack-components/data-validators/evidently", + # "https://docs.zenml.io/stack-components/data-validators", + # "https://docs.zenml.io/stack-components/step-operators/sagemaker", + # "https://docs.zenml.io/stack-components/alerters/slack", + # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/kubernetes-service-connector", + # "https://docs.zenml.io/how-to/infrastructure-deployment/auth-management/azure-service-connector" ] else: docs_urls = get_all_pages(docs_url)