From dc4ebd9ce03cbd183ee9f12d41dde744fecaf433 Mon Sep 17 00:00:00 2001 From: Joseph Berry Date: Thu, 27 Nov 2025 16:49:47 +0200 Subject: [PATCH 1/4] fix html rendering when multiple percentiles to collapse to the same value Signed-off-by: Joseph Berry --- src/guidellm/benchmark/outputs/html.py | 49 ++++++- tests/unit/benchmark/test_html_output.py | 162 +++++++++++++++++++++++ 2 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 tests/unit/benchmark/test_html_output.py diff --git a/src/guidellm/benchmark/outputs/html.py b/src/guidellm/benchmark/outputs/html.py index 34cf71073..909a85da8 100644 --- a/src/guidellm/benchmark/outputs/html.py +++ b/src/guidellm/benchmark/outputs/html.py @@ -29,7 +29,7 @@ GenerativeBenchmark, GenerativeBenchmarksReport, ) -from guidellm.schemas import DistributionSummary +from guidellm.schemas import DistributionSummary, Percentiles from guidellm.settings import settings from guidellm.utils import camelize_str, recursive_key_update from guidellm.utils.text import load_text @@ -190,6 +190,24 @@ def percentile_rows(self) -> list[dict[str, str | float]]: filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows) ) + def model_dump(self, **kwargs) -> dict: + """ + Override model_dump to filter duplicate consecutive percentile values. + + This prevents visualization errors when distributions have limited data + points causing multiple percentiles to collapse to the same value. + + :param kwargs: Arguments to pass to parent model_dump + :return: Dictionary with filtered percentiles + """ + data = super().model_dump(**kwargs) + + if "percentiles" in data and data["percentiles"]: + filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"]) + data["percentiles"] = filtered_percentiles + + return data + @classmethod def from_distribution_summary( cls, distribution: DistributionSummary @@ -222,6 +240,35 @@ def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path: return output_path +def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]: + """ + Filter out consecutive duplicate percentile values. + + When distributions have very few data points, multiple percentiles can have + the same value, which causes visualization libraries to fail. This function + keeps only the first occurrence of consecutive duplicate values. + + :param percentiles: Dictionary of percentile names to values + :return: Filtered percentiles dictionary with no consecutive duplicates + """ + if not percentiles: + return percentiles + + percentile_order = list(Percentiles.model_fields.keys()) + + filtered = {} + previous_value = None + + for key in percentile_order: + if key in percentiles: + current_value = percentiles[key] + if previous_value is None or current_value != previous_value: + filtered[key] = current_value + previous_value = current_value + + return filtered + + def _inject_data(js_data: dict[str, str], html: str) -> str: """ Inject JavaScript data into HTML head section. diff --git a/tests/unit/benchmark/test_html_output.py b/tests/unit/benchmark/test_html_output.py new file mode 100644 index 000000000..67bda8a64 --- /dev/null +++ b/tests/unit/benchmark/test_html_output.py @@ -0,0 +1,162 @@ +from guidellm.benchmark.outputs.html import _filter_duplicate_percentiles +from guidellm.schemas import Percentiles + + +def test_filter_all_same_values(): + """Test filtering when all percentiles have the same value.""" + percentiles = { + "p001": 15.288091352804853, + "p01": 15.288091352804853, + "p05": 15.288091352804853, + "p10": 15.288091352804853, + "p25": 15.288091352804853, + "p50": 15.288091352804853, + "p75": 15.288091352804853, + "p90": 15.288091352804853, + "p95": 15.288091352804853, + "p99": 15.288091352804853, + "p999": 15.288091352804853, + } + + filtered = _filter_duplicate_percentiles(percentiles) + + # Should only keep the first one + assert filtered == {"p001": 15.288091352804853} + + +def test_filter_consecutive_duplicates(): + """Test filtering when some consecutive percentiles have the same value.""" + percentiles = { + "p001": 15.288091352804853, + "p01": 15.288091352804853, + "p05": 15.288091352804853, + "p10": 15.288091352804853, + "p25": 15.288091352804853, + "p50": 16.41327511776994, # Different value + "p75": 16.41327511776994, + "p90": 17.03541629998259, # Different value + "p95": 17.03541629998259, + "p99": 17.03541629998259, + "p999": 17.03541629998259, + } + + filtered = _filter_duplicate_percentiles(percentiles) + + # Should keep first of each group + assert filtered == { + "p001": 15.288091352804853, + "p50": 16.41327511776994, + "p90": 17.03541629998259, + } + + +def test_no_duplicates(): + """Test that unique values are all preserved.""" + percentiles = { + "p001": 13.181080445834912, + "p01": 13.181080445834912, # Same as p001 + "p05": 13.530595573836457, # Different + "p10": 13.843972502554365, + "p25": 14.086376978251748, + "p50": 14.403258051191058, + "p75": 14.738608817056042, + "p90": 15.18136631856698, + "p95": 15.7213110894772, + "p99": 15.7213110894772, # Same as p95 + "p999": 15.7213110894772, # Same as p99 + } + + filtered = _filter_duplicate_percentiles(percentiles) + + assert filtered == { + "p001": 13.181080445834912, + "p05": 13.530595573836457, + "p10": 13.843972502554365, + "p25": 14.086376978251748, + "p50": 14.403258051191058, + "p75": 14.738608817056042, + "p90": 15.18136631856698, + "p95": 15.7213110894772, + } + + +def test_empty_percentiles(): + """Test with empty percentiles dictionary.""" + filtered = _filter_duplicate_percentiles({}) + assert filtered == {} + + +def test_single_percentile(): + """Test with only one percentile.""" + percentiles = {"p50": 14.403258051191058} + filtered = _filter_duplicate_percentiles(percentiles) + assert filtered == {"p50": 14.403258051191058} + + +def test_two_different_values(): + """Test with two different values.""" + percentiles = { + "p25": 14.086376978251748, + "p50": 14.403258051191058, + } + filtered = _filter_duplicate_percentiles(percentiles) + assert filtered == percentiles + + +def test_partial_percentiles(): + """Test that order is maintained even with partial percentiles.""" + percentiles = { + "p50": 16.41327511776994, + "p10": 15.288091352804853, + "p90": 17.03541629998259, + } + + filtered = _filter_duplicate_percentiles(percentiles) + + # Should maintain order from percentile_order list + assert list(filtered.keys()) == ["p10", "p50", "p90"] + + +def test_model_dump_filters_duplicates(): + """Test that model_dump applies percentile filtering.""" + from guidellm.benchmark.outputs.html import _TabularDistributionSummary + + # Create a distribution with duplicate percentiles (typical of small datasets) + dist = _TabularDistributionSummary( + mean=15.5, + median=15.288091352804853, + mode=15.288091352804853, + variance=0.1, + std_dev=0.316, + min=15.288091352804853, + max=17.03541629998259, + count=3, + total_sum=46.5, + percentiles=Percentiles( + p001=15.288091352804853, + p01=15.288091352804853, + p05=15.288091352804853, + p10=15.288091352804853, + p25=15.288091352804853, + p50=16.41327511776994, + p75=16.41327511776994, + p90=17.03541629998259, + p95=17.03541629998259, + p99=17.03541629998259, + p999=17.03541629998259, + ), + ) + + data = dist.model_dump() + + # Check that percentiles were filtered + assert data["percentiles"] == { + "p001": 15.288091352804853, + "p50": 16.41327511776994, + "p90": 17.03541629998259, + } + + # Ensure other fields remain unchanged + assert data["mean"] == 15.5 + assert data["median"] == 15.288091352804853 + assert data["count"] == 3 From 92389345ea2d926a1f28e9304d62c883746f0b5d Mon Sep 17 00:00:00 2001 From: Joseph Berry Date: Thu, 27 Nov 2025 22:52:03 +0200 Subject: [PATCH 2/4] add AI authorship comment to test_html_output.py Signed-off-by: Joseph Berry --- tests/unit/benchmark/test_html_output.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/benchmark/test_html_output.py b/tests/unit/benchmark/test_html_output.py index 67bda8a64..efd6727f9 100644 --- a/tests/unit/benchmark/test_html_output.py +++ b/tests/unit/benchmark/test_html_output.py @@ -1,3 +1,4 @@ +## WRITTEN BY AI ## from guidellm.benchmark.outputs.html import _filter_duplicate_percentiles from guidellm.schemas import Percentiles From 175611c9500c78d384b9023afd463a52ec661078 Mon Sep 17 00:00:00 2001 From: Joseph Berry Date: Sun, 7 Dec 2025 10:50:59 +0200 Subject: [PATCH 3/4] fix: update duplicate percentile filtering to retain largest values for accuracy Signed-off-by: Joseph Berry --- src/guidellm/benchmark/outputs/html.py | 12 ++++++++---- tests/unit/benchmark/test_html_output.py | 25 ++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/guidellm/benchmark/outputs/html.py b/src/guidellm/benchmark/outputs/html.py index 909a85da8..318d9d4de 100644 --- a/src/guidellm/benchmark/outputs/html.py +++ b/src/guidellm/benchmark/outputs/html.py @@ -246,27 +246,31 @@ def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, fl When distributions have very few data points, multiple percentiles can have the same value, which causes visualization libraries to fail. This function - keeps only the first occurrence of consecutive duplicate values. + keeps only the largest percentile for consecutive duplicate values, which is + more mathematically accurate as higher percentiles have greater statistical + significance. :param percentiles: Dictionary of percentile names to values :return: Filtered percentiles dictionary with no consecutive duplicates """ if not percentiles: return percentiles - + percentile_order = list(Percentiles.model_fields.keys()) + # Iterate in reverse to keep the largest percentile for each value filtered = {} previous_value = None - for key in percentile_order: + for key in reversed(percentile_order): if key in percentiles: current_value = percentiles[key] if previous_value is None or current_value != previous_value: filtered[key] = current_value previous_value = current_value - return filtered + # Restore original order + return {key: filtered[key] for key in percentile_order if key in filtered} def _inject_data(js_data: dict[str, str], html: str) -> str: diff --git a/tests/unit/benchmark/test_html_output.py b/tests/unit/benchmark/test_html_output.py index efd6727f9..f5ce146c8 100644 --- a/tests/unit/benchmark/test_html_output.py +++ b/tests/unit/benchmark/test_html_output.py @@ -21,8 +21,8 @@ def test_filter_all_same_values(): filtered = _filter_duplicate_percentiles(percentiles) - # Should only keep the first one - assert filtered == {"p001": 15.288091352804853} + # Should only keep the largest (p999) for mathematical accuracy + assert filtered == {"p999": 15.288091352804853} def test_filter_consecutive_duplicates(): @@ -43,11 +43,11 @@ def test_filter_consecutive_duplicates(): filtered = _filter_duplicate_percentiles(percentiles) - # Should keep first of each group + # Should keep largest of each group for mathematical accuracy assert filtered == { - "p001": 15.288091352804853, - "p50": 16.41327511776994, - "p90": 17.03541629998259, + "p25": 15.288091352804853, + "p75": 16.41327511776994, + "p999": 17.03541629998259, } @@ -69,15 +69,16 @@ def test_no_duplicates(): filtered = _filter_duplicate_percentiles(percentiles) + # Should keep largest of each duplicate group (p01 instead of p001, p999 instead of p95) assert filtered == { - "p001": 13.181080445834912, + "p01": 13.181080445834912, "p05": 13.530595573836457, "p10": 13.843972502554365, "p25": 14.086376978251748, "p50": 14.403258051191058, "p75": 14.738608817056042, "p90": 15.18136631856698, - "p95": 15.7213110894772, + "p999": 15.7213110894772, } @@ -150,11 +151,11 @@ def test_model_dump_filters_duplicates(): data = dist.model_dump() - # Check that percentiles were filtered + # Check that percentiles were filtered, keeping largest of each group assert data["percentiles"] == { - "p001": 15.288091352804853, - "p50": 16.41327511776994, - "p90": 17.03541629998259, + "p25": 15.288091352804853, + "p75": 16.41327511776994, + "p999": 17.03541629998259, } # Ensure other fields remain unchanged From 65a60233de329000d9b7ac3802d1ba8397029d52 Mon Sep 17 00:00:00 2001 From: Joseph Berry Date: Sun, 7 Dec 2025 12:04:31 +0200 Subject: [PATCH 4/4] fix: line length E501 Signed-off-by: Joseph Berry --- tests/unit/benchmark/test_html_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/benchmark/test_html_output.py b/tests/unit/benchmark/test_html_output.py index f5ce146c8..39c46a763 100644 --- a/tests/unit/benchmark/test_html_output.py +++ b/tests/unit/benchmark/test_html_output.py @@ -69,7 +69,7 @@ def test_no_duplicates(): filtered = _filter_duplicate_percentiles(percentiles) - # Should keep largest of each duplicate group (p01 instead of p001, p999 instead of p95) + # Should keep largest of each duplicate group (e.g. p999 instead of p95) assert filtered == { "p01": 13.181080445834912, "p05": 13.530595573836457,