diff --git a/llm-complete-guide/ZENML_VERSION.txt b/llm-complete-guide/ZENML_VERSION.txt
index b7c0622b..c52842c4 100644
--- a/llm-complete-guide/ZENML_VERSION.txt
+++ b/llm-complete-guide/ZENML_VERSION.txt
@@ -1 +1 @@
-0.74.0
+0.75.0
diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py
index 4c644344..9112a8cc 100644
--- a/llm-complete-guide/pipelines/llm_eval.py
+++ b/llm-complete-guide/pipelines/llm_eval.py
@@ -17,7 +17,7 @@
from typing import Optional
import click
-from steps.create_prompt import create_prompt
+from steps.create_prompt import PROMPT, create_prompt
from steps.eval_e2e import e2e_evaluation, e2e_evaluation_llm_judged
from steps.eval_retrieval import (
retrieval_evaluation_full,
@@ -26,14 +26,14 @@
retrieval_evaluation_small_with_reranking,
)
from steps.eval_visualisation import visualize_evaluation_results
-from zenml import pipeline
+from zenml import pipeline, save_artifact
@pipeline(enable_cache=True)
def llm_eval(after: Optional[str] = None) -> None:
"""Executes the pipeline to evaluate a RAG pipeline."""
# define prompt
- prompt = create_prompt()
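+    # Store the static prompt as a ZenML artifact named "prompt" instead of generating it in a step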
+ prompt = save_artifact(PROMPT, "prompt")
# Retrieval evals
failure_rate_retrieval = retrieval_evaluation_small(after=after)
diff --git a/llm-complete-guide/steps/create_prompt.py b/llm-complete-guide/steps/create_prompt.py
index e2450012..61465592 100644
--- a/llm-complete-guide/steps/create_prompt.py
+++ b/llm-complete-guide/steps/create_prompt.py
@@ -16,6 +16,13 @@
from zenml import log_metadata, step
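+# System prompt for the RAG assistant; imported by the evaluation pipeline and saved as an artifact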
+PROMPT = """
+You are a friendly chatbot. \
+You can answer questions about ZenML, its features and its use cases. \
+You respond in a concise, technically credible tone. \
+You ONLY use the context from the ZenML documentation to provide relevant answers. \
+"""
@step
def create_prompt() -> str:
diff --git a/llm-complete-guide/steps/eval_visualisation.py b/llm-complete-guide/steps/eval_visualisation.py
index 1a582490..65b26fd0 100644
--- a/llm-complete-guide/steps/eval_visualisation.py
+++ b/llm-complete-guide/steps/eval_visualisation.py
@@ -12,97 +12,384 @@
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
-import io
-from typing import Annotated, Tuple
+from typing import Annotated, Dict, List, Optional, Tuple
-import matplotlib.pyplot as plt
-import numpy as np
-from PIL import Image
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
from zenml import ArtifactConfig, get_step_context, log_metadata, step
+from zenml.types import HTMLString
-
-def create_image(
- labels: list,
- scores: list,
+def create_plotly_bar_chart(
+ labels: List[str],
+ scores: List[float],
title: str,
- alternate_colours: bool = False,
+ alternate_colors: bool = False,
percentage_scale: bool = False,
-) -> Image.Image:
+    descriptions: Optional[Dict[str, str]] = None,
+) -> go.Figure:
"""
- Create a horizontal bar chart image from the given labels, scores, and title.
+ Create a horizontal bar chart using Plotly.
Args:
- labels (list): List of labels for the y-axis.
- scores (list): List of scores corresponding to each label.
+ labels (List[str]): List of labels for the y-axis.
+ scores (List[float]): List of scores corresponding to each label.
title (str): Title of the chart.
- alternate_colours (bool): Whether to alternate colours for the bars.
+ alternate_colors (bool): Whether to alternate colors for the bars.
percentage_scale (bool): Whether to use a percentage scale (0-100) for the x-axis.
+        descriptions (Optional[Dict[str, str]]): Optional descriptions for hover text.
Returns:
- Image.Image: The generated chart image.
+ go.Figure: Plotly figure object
"""
- # Create a new figure and axis with a smaller left margin
- fig, ax = plt.subplots(figsize=(10, 6))
- fig.subplots_adjust(left=0.2) # Adjust left margin
-
- # Plot the horizontal bar chart
- y_pos = np.arange(len(labels))
- if alternate_colours:
- colors = ["blue" if i % 2 == 0 else "red" for i in range(len(labels))]
- ax.barh(y_pos, scores, align="center", color=colors)
+ # Generate colors for bars
+ if alternate_colors:
+ colors = ["rgba(66, 133, 244, 0.8)" if i % 2 == 0 else "rgba(219, 68, 55, 0.8)" for i in range(len(labels))]
else:
- ax.barh(y_pos, scores, align="center")
-
- # Display the actual value to the left of each bar, or to the right if value is 0
- for i, v in enumerate(scores):
- if v == 0:
- ax.text(
- 0.3, # Position the text label slightly to the right of 0
- i,
- f"{v:.1f}",
- color="black",
- va="center",
- fontweight="bold",
- )
- else:
- colors[i] if alternate_colours else "blue"
- text_color = "white"
- ax.text(
- v
- - 0.1, # Adjust the x-position of the text labels to the left
- i,
- f"{v:.1f}",
- color=text_color,
- va="center",
- fontweight="bold",
- ha="right", # Align the text to the right
- )
-
- ax.set_yticks(y_pos)
- ax.set_yticklabels(labels)
- ax.invert_yaxis() # Labels read top-to-bottom
- ax.set_xlabel("Score")
- if percentage_scale:
- ax.set_xlim(0, 100) # Set x-axis limits to 0-100 for percentage scale
- ax.set_xlabel("Percentage")
+ colors = ["rgba(66, 133, 244, 0.8)" for _ in range(len(labels))]
+
+ # Prepare hover text
+ if descriptions:
+        hover_text = [f"{label}<br>Value: {score:.2f}<br>{descriptions.get(label, '')}"
+                      for label, score in zip(labels, scores)]
else:
- ax.set_xlim(0, 5) # Set x-axis limits based on maximum score
- ax.set_xlabel("Score")
+        hover_text = [f"{label}<br>Value: {score:.2f}" for label, score in zip(labels, scores)]
+
+ # Create figure
+ fig = go.Figure()
+
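+    # One horizontal bar per label, with the score printed on the bar and full details on hover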
+ fig.add_trace(
+ go.Bar(
+ y=labels,
+ x=scores,
+ orientation='h',
+ marker_color=colors,
+ text=[f"{score:.2f}" for score in scores],
+ textposition='auto',
+ hovertext=hover_text,
+ hoverinfo='text',
+ )
+ )
+
+ # Set layout
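+    # Percentage charts get a fixed 0-100 x-axis; score charts scale to the data with a minimum range of 5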
+ max_value = max(scores) if scores else 5
+ xaxis_range = [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)]
+ xaxis_title = "Percentage (%)" if percentage_scale else "Score"
+
+ fig.update_layout(
+ title=title,
+ xaxis=dict(
+ title=xaxis_title,
+ range=xaxis_range,
+ showgrid=True,
+ gridcolor='rgba(230, 230, 230, 0.8)',
+ ),
+ yaxis=dict(
+ autorange="reversed", # Make labels read top-to-bottom
+ ),
+ margin=dict(l=20, r=20, t=60, b=20),
+ height=max(300, 70 * len(labels)),
+ plot_bgcolor='rgba(255, 255, 255, 1)',
+ )
+
+ return fig
+
+
+def generate_evaluation_html(
+ pipeline_run_name: str,
+ retrieval_labels: List[str],
+ retrieval_scores: List[float],
+ generation_basic_labels: List[str],
+ generation_basic_scores: List[float],
+ generation_quality_labels: List[str],
+ generation_quality_scores: List[float],
+ metrics_metadata: Dict[str, float],
+) -> str:
+ """
+ Generate a comprehensive HTML report with all evaluation visualizations.
+
+ Args:
+ pipeline_run_name (str): Name of the pipeline run
+ retrieval_labels (List[str]): Labels for retrieval metrics
+ retrieval_scores (List[float]): Scores for retrieval metrics
+ generation_basic_labels (List[str]): Labels for basic generation metrics
+ generation_basic_scores (List[float]): Scores for basic generation metrics
+ generation_quality_labels (List[str]): Labels for generation quality metrics
+ generation_quality_scores (List[float]): Scores for generation quality metrics
+ metrics_metadata (Dict[str, float]): All metrics for displaying in the summary
+
+ Returns:
+ str: HTML string containing the interactive dashboard
+ """
+ # Metric descriptions for hovering
+ metric_descriptions = {
+ "Small Retrieval Eval Failure Rate":
+ "Percentage of small test cases where retrieval failed to find relevant documents.",
+ "Small Retrieval Eval Failure Rate Reranking":
+ "Percentage of small test cases where retrieval with reranking failed to find relevant documents.",
+ "Full Retrieval Eval Failure Rate":
+ "Percentage of all test cases where retrieval failed to find relevant documents.",
+ "Full Retrieval Eval Failure Rate Reranking":
+ "Percentage of all test cases where retrieval with reranking failed to find relevant documents.",
+ "Failure Rate Bad Answers":
+ "Percentage of responses that were factually incorrect or misleading.",
+ "Failure Rate Bad Immediate Responses":
+ "Percentage of immediate responses that did not adequately address the query.",
+ "Failure Rate Good Responses":
+ "Percentage of responses rated as good by evaluators.",
+ "Average Toxicity Score":
+ "Average score measuring harmful, offensive, or inappropriate content (lower is better).",
+ "Average Faithfulness Score":
+ "Average score measuring how accurately the response represents the source material (higher is better).",
+ "Average Helpfulness Score":
+ "Average score measuring the practical utility of responses to users (higher is better).",
+ "Average Relevance Score":
+ "Average score measuring how well responses address the specific query intent (higher is better).",
+ }
- ax.set_title(title)
+ # Create individual charts
+ retrieval_fig = create_plotly_bar_chart(
+ retrieval_labels,
+ retrieval_scores,
+ f"Retrieval Evaluation Metrics",
+ alternate_colors=True,
+ descriptions=metric_descriptions
+ )
+
+ generation_basic_fig = create_plotly_bar_chart(
+ generation_basic_labels,
+ generation_basic_scores,
+ f"Basic Generation Metrics",
+ percentage_scale=True,
+ descriptions=metric_descriptions
+ )
+
+ generation_quality_fig = create_plotly_bar_chart(
+ generation_quality_labels,
+ generation_quality_scores,
+ f"Generation Quality Metrics",
+ descriptions=metric_descriptions
+ )
- # Adjust the subplot parameters
- plt.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)
+ # Create summary metrics cards
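+    # Composite scores arrive precomputed in metrics_metadata and are shown as headline cards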
+ composite_quality = metrics_metadata.get("composite.overall_quality", 0)
+ retrieval_effectiveness = metrics_metadata.get("composite.retrieval_effectiveness", 0)
+
+    # Pre-render the per-metric table rows
+    metric_rows = "".join(
+        f"<tr><td>{k}</td><td>{v:.4f}</td></tr>" for k, v in metrics_metadata.items()
+    )
+
+    # Combine into complete HTML report
+    html = f"""
+    <html>
+    <body>
+        <h1>RAG Evaluation Results: {pipeline_run_name}</h1>
+
+        <div class="summary-cards">
+            <div class="card"><h3>Overall Quality</h3>
+                <p class="value">{composite_quality:.2f}</p>
+                <p>Average of faithfulness, helpfulness, and relevance</p></div>
+            <div class="card"><h3>Retrieval Effectiveness</h3>
+                <p class="value">{retrieval_effectiveness:.2f}</p>
+                <p>Average success rate across retrieval tests</p></div>
+            <div class="card"><h3>Toxicity</h3>
+                <p class="value">{metrics_metadata.get('Average Toxicity Score', 0):.2f}</p>
+                <p>Average toxicity score (lower is better)</p></div>
+        </div>
+
+        <h2>All Metrics</h2>
+        <table>
+            <tr><th>Metric</th><th>Value</th></tr>
+            {metric_rows}
+        </table>
+
+        <h2>Retrieval Evaluation</h2>
+        <p>These metrics measure how effectively the system retrieves relevant documents for answering queries.
+        Lower failure rates indicate better retrieval performance. Reranking shows the impact of the reranking
+        algorithm on improving retrieval quality.</p>
+        {retrieval_fig.to_html(full_html=False, include_plotlyjs='cdn')}
+
+        <h2>Basic Generation Metrics</h2>
+        <p>These metrics measure different types of failures in response generation:</p>
+        {generation_basic_fig.to_html(full_html=False, include_plotlyjs=False)}
+
+        <h2>Generation Quality Metrics</h2>
+        <p>These metrics evaluate the quality of generated responses across different dimensions:</p>