From ad15be54704ff572fcd092420726b0430702f5de Mon Sep 17 00:00:00 2001
From: hafezparast <maysam@kidocode.com>
Date: Fri, 24 Apr 2026 11:15:00 +0800
Subject: [PATCH] fix: use str.split() for accurate word count in
 PruningContentFilter

_compute_composite_score() used text.count(" ") + 1 to count words,
which overcounts on consecutive spaces (common in HTML-extracted text)
and returns 1 for empty strings instead of 0, allowing empty nodes to
slip through min_word_threshold filtering when the threshold is 1.

len(text.split()) splits on any run of whitespace (spaces, tabs,
newlines) and yields 0 for empty or whitespace-only text. It is also
consistent with the word-count logic already used at lines 268 and 302
of the same file.

Ref: #1838 (community PR open since Mar 16 without review)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crawl4ai/content_filter_strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 0909be33d..5e954c31b 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -739,7 +739,7 @@ def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
         if self.min_word_threshold:
             # Get raw text from metrics node - avoid extra processing
             text = metrics["node"].get_text(strip=True)
-            word_count = text.count(" ") + 1
+            word_count = len(text.split())
             if word_count < self.min_word_threshold:
                 return -1.0  # Guaranteed removal
         score = 0.0