From ad15be54704ff572fcd092420726b0430702f5de Mon Sep 17 00:00:00 2001 From: hafezparast Date: Fri, 24 Apr 2026 11:15:00 +0800 Subject: [PATCH] fix: use str.split() for accurate word count in PruningContentFilter _compute_composite_score() used text.count(" ") + 1 to count words, which overcounts on runs of consecutive spaces (common in HTML-extracted text), undercounts words separated only by tabs or newlines, and yields 1 for an empty string instead of 0, so empty nodes pass the filter when min_word_threshold is 1. len(text.split()) splits on any run of whitespace, returns 0 for empty text, and is consistent with the identical word-count logic already used at lines 268 and 302 in the same file. Ref: #1838 (community PR open since Mar 16 without review) Co-Authored-By: Claude Sonnet 4.6 --- crawl4ai/content_filter_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 0909be33d..5e954c31b 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -739,7 +739,7 @@ def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): if self.min_word_threshold: # Get raw text from metrics node - avoid extra processing text = metrics["node"].get_text(strip=True) - word_count = text.count(" ") + 1 + word_count = len(text.split()) if word_count < self.min_word_threshold: return -1.0 # Guaranteed removal score = 0.0