tale-project · larryro · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/services/crawler/app/services/crawler_service.py b/services/crawler/app/services/crawler_service.py
@@ -231,69 +231,68 @@ async def discover_urls(
         if not self.initialized:
             await self.initialize()
 
-        from crawl4ai import SeedingConfig
+        from crawl4ai import AsyncUrlSeeder, SeedingConfig
 
-        # Try with sitemap+cc first, then fallback to sitemap-only if Common Crawl fails.
-        # We also rate-limit discovery to avoid hammering sites and triggering 429s.
-        sources_to_try = ["sitemap+cc", "sitemap"]
+        # Try sitemap first (fastest, most reliable), then fall back to Common Crawl
+        # (covers sites without sitemaps). Re-initialize seeder between retries to
+        # avoid stale httpx client state from crawl4ai's async generator cleanup bug.
+        sources_to_try = ["sitemap", "cc"]
 
         for source in sources_to_try:
             try:
-                # Configure URL discovery
                 config = SeedingConfig(
                     source=source,
-                    extract_head=True,  # Get metadata for filtering
+                    extract_head=True,
                     max_urls=max_urls if max_urls > 0 else -1,
-                    filter_nonsense_urls=True,  # Skip robots.txt, .js, .css, etc.
+                    filter_nonsense_urls=True,
                     pattern=pattern,
                     query=query,
                     scoring_method="bm25" if query else None,
                     score_threshold=0.3 if query else None,
-                    concurrency=3,  # Be polite to target sites
-                    hits_per_sec=1,  # Explicit rate limit to reduce 429s
+                    concurrency=3,
+                    hits_per_sec=1,
                     verbose=True,
                 )
 
                 logger.info(f"Discovering URLs from {domain} using source: {source} with timeout: {timeout}s...")
 
-                # Add timeout to prevent hanging
                 urls = await asyncio.wait_for(
                     self._seeder.urls(domain, config),
                     timeout=timeout,
                 )
 
                 logger.info(f"Seeder returned {len(urls)} URLs for {domain} from source {source}")
 
-                # Keep all URLs returned by the seeder; let the crawler handle transient failures.
-                filtered_urls = urls
-
-                logger.info(
-                    "Discovered %s URLs from %s (no additional filtering applied)",
-                    len(filtered_urls),
-                    domain,
-                )
-
-                # Cleanup memory after discovery
-                _cleanup_memory()
+                if urls:
+                    logger.info(
+                        "Discovered %s URLs from %s (source: %s)",
+                        len(urls),
+                        domain,
+                        source,
+                    )
+                    _cleanup_memory()
+                    return urls
 
-                return filtered_urls
+                logger.info(f"Source '{source}' returned 0 URLs for {domain}, trying next source...")
 
             except TimeoutError:
                 logger.warning(f"Timeout discovering URLs with source '{source}', trying next source...")
-                _cleanup_memory()
-                if source == sources_to_try[-1]:
-                    raise Exception(f"All discovery sources timed out for {domain}") from None
-                continue
 
             except Exception as e:
                 logger.warning(f"Error discovering URLs with source '{source}': {e}")
-                _cleanup_memory()
-                if source == sources_to_try[-1]:
-                    raise Exception(f"Failed to discover URLs from {domain}: {e!s}") from e
-                continue
 
-        _cleanup_memory()
-        return []
+            # Re-initialize seeder before next retry to get a fresh httpx client
+            _cleanup_memory()
+            if source != sources_to_try[-1]:
+                try:
+                    if self._seeder:
+                        await self._seeder.__aexit__(None, None, None)
+                except Exception:
+                    pass
+                self._seeder = AsyncUrlSeeder()
+                await self._seeder.__aenter__()
+
+        raise Exception(f"Failed to discover URLs from {domain}: all sources exhausted")
 
     async def crawl_urls(
         self,
@@ -314,6 +313,8 @@ async def crawl_urls(
             await self.initialize()
 
         from crawl4ai import CrawlerRunConfig
+        from crawl4ai.content_filter_strategy import PruningContentFilter
+        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 
         config = CrawlerRunConfig(
             only_text=False,  # We need HTML to extract structured data
@@ -324,6 +325,15 @@ async def crawl_urls(
             # Disable screenshot capture to save memory
             screenshot=False,
             pdf=False,
+            # Exclude structural/navigational HTML elements to reduce noise in markdown
+            excluded_tags=["nav", "footer", "header", "aside", "select", "option"],
+            exclude_external_links=True,
+            exclude_social_media_links=True,
+            # Use PruningContentFilter for fit_markdown: density-based main content extraction
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(threshold=0.4),
+                options={"ignore_links": True},
+            ),
         )
 
         logger.info(f"Crawling {len(urls)} URLs...")
@@ -332,8 +342,8 @@ async def crawl_urls(
         try:
             async for result in await self._crawler.arun_many(urls, config=config):
                 if result.success:
-                    # Use the new 'markdown' attribute instead of deprecated 'markdown_v2'
-                    markdown_content = result.markdown.raw_markdown
+                    # Prefer fit_markdown (density-filtered main content) over raw_markdown
+                    markdown_content = result.markdown.fit_markdown or result.markdown.raw_markdown
 
                     # Extract structured data (price, images, etc.) from HTML
                     structured_data = self._extract_structured_data_from_html(result.html)

diff --git a/services/crawler/tests/test_crawler_service.py b/services/crawler/tests/test_crawler_service.py
@@ -0,0 +1,194 @@
+"""
+Tests for CrawlerService content extraction and configuration.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.services.crawler_service import CrawlerService
+
+
+def _make_crawl_result(
+    url: str = "https://example.com",
+    title: str = "Test Page",
+    raw_markdown: str = "raw content",
+    fit_markdown: str | None = "fit content",  # pass None or "" to exercise the raw_markdown fallback
+    html: str = "<html></html>",
+    success: bool = True,
+    error_message: str | None = None,
+):  # returns a SimpleNamespace that duck-types the result; no real crawl4ai object is built
+    """Build a fake crawl4ai CrawlResult."""
+    md = SimpleNamespace(raw_markdown=raw_markdown, fit_markdown=fit_markdown)  # stand-in for result.markdown
+    metadata = {"title": title}  # title is the only metadata key supplied
+    return SimpleNamespace(
+        url=url,
+        markdown=md,
+        metadata=metadata,
+        html=html,
+        success=success,
+        error_message=error_message,
+    )
+
+
+class TestCrawlUrlsMarkdownSelection:
+    """Verify that crawl_urls prefers fit_markdown and falls back to raw_markdown."""  # NOTE(review): tests are async with no visible asyncio marker; presumably pytest-asyncio auto mode — confirm
+
+    async def _run_crawl(self, results):
+        """Helper: run crawl_urls with mocked arun_many returning given results."""
+        service = CrawlerService()
+        service.initialized = True  # presumably skips the lazy initialize() call in crawl_urls — confirm
+        service._crawl_count = 0  # NOTE(review): presumably read by crawl_urls; its use is not shown here — confirm
+
+        async def fake_arun_many(urls, config):  # coroutine that returns an async generator, so "async for ... in await arun_many(...)" works
+            async def gen():
+                for r in results:
+                    yield r
+
+            return gen()
+
+        service._crawler = MagicMock()
+        service._crawler.arun_many = fake_arun_many  # replace the auto-created mock attribute with the fake
+
+        with patch(  # stub structured-data extraction so only markdown selection is under test
+            "app.services.crawler_service.CrawlerService._extract_structured_data_from_html",
+            return_value={},
+        ):
+            return await service.crawl_urls(["https://example.com"])  # yielded results come from `results`, not from this URL
+
+    async def test_uses_fit_markdown_when_available(self):
+        result = _make_crawl_result(fit_markdown="clean content", raw_markdown="noisy content")
+        pages = await self._run_crawl([result])
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == "clean content"  # "content" carries the selected markdown
+
+    async def test_falls_back_to_raw_markdown_when_fit_is_none(self):
+        result = _make_crawl_result(fit_markdown=None, raw_markdown="raw content")
+        pages = await self._run_crawl([result])
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == "raw content"
+
+    async def test_falls_back_to_raw_markdown_when_fit_is_empty(self):
+        result = _make_crawl_result(fit_markdown="", raw_markdown="raw content")  # "" is falsy, so `fit or raw` picks raw
+        pages = await self._run_crawl([result])
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == "raw content"
+
+    async def test_skips_failed_results(self):
+        result = _make_crawl_result(success=False, error_message="404")
+        pages = await self._run_crawl([result])
+
+        assert len(pages) == 0  # unsuccessful results are expected to produce no page entry
+
+    async def test_word_count_uses_selected_markdown(self):
+        result = _make_crawl_result(fit_markdown="one two three")  # raw_markdown keeps its two-word default "raw content"
+        pages = await self._run_crawl([result])
+
+        assert pages[0]["word_count"] == 3  # 3 (not 2) shows the count came from fit_markdown
+
+
+class TestCrawlerRunConfigSetup:
+    """Verify that CrawlerRunConfig is created with the correct filtering options."""
+
+    async def test_config_has_excluded_tags(self):
+        """Ensure excluded_tags and content filter are set in the config."""  # NOTE(review): no assertion below checks the content filter / markdown_generator
+        captured_config = {}  # filled by fake_arun_many with the config that crawl_urls passes in
+
+        async def fake_arun_many(urls, config):
+            captured_config["config"] = config
+
+            async def gen():
+                return  # finish immediately so the stream yields no results
+                yield  # unreachable; its presence is what makes gen() an async generator
+
+            return gen()
+
+        service = CrawlerService()
+        service.initialized = True  # presumably skips the lazy initialize() call in crawl_urls — confirm
+        service._crawl_count = 0  # NOTE(review): presumably read by crawl_urls; its use is not shown here — confirm
+        service._crawler = MagicMock()
+        service._crawler.arun_many = fake_arun_many
+
+        await service.crawl_urls(["https://example.com"])  # return value ignored; only the captured config matters
+
+        config = captured_config["config"]
+        assert "nav" in config.excluded_tags
+        assert "footer" in config.excluded_tags
+        assert "header" in config.excluded_tags
+        assert "aside" in config.excluded_tags
+        assert "select" in config.excluded_tags
+        assert "option" in config.excluded_tags
+        assert config.exclude_external_links is True
+        assert config.exclude_social_media_links is True
+
+
+class TestExtractStructuredData:
+    """Verify structured data extraction from HTML."""
+
+    def test_extracts_json_ld(self):
+        service = CrawlerService()  # no initialize(): only the HTML parsing helper is exercised
+        html = """
+        <html>
+        <head>
+            <script type="application/ld+json">{"@type": "Product", "name": "Test"}</script>
+        </head>
+        <body></body>
+        </html>
+        """
+        data = service._extract_structured_data_from_html(html)
+        assert "json_ld" in data
+        assert data["json_ld"][0]["@type"] == "Product"  # json_ld is indexable; entry 0 is the parsed object
+
+    def test_extracts_opengraph(self):
+        service = CrawlerService()
+        html = """
+        <html>
+        <head>
+            <meta property="og:title" content="Test Product" />
+            <meta property="og:price:amount" content="49.00" />
+        </head>
+        <body></body>
+        </html>
+        """
+        data = service._extract_structured_data_from_html(html)
+        assert "opengraph" in data
+        assert data["opengraph"]["title"] == "Test Product"  # key is "title", i.e. without the "og:" prefix
+
+    def test_extracts_meta_tags(self):
+        service = CrawlerService()
+        html = """
+        <html>
+        <head>
+            <meta name="description" content="A test page" />
+            <meta name="keywords" content="test, page" />
+        </head>
+        <body></body>
+        </html>
+        """
+        data = service._extract_structured_data_from_html(html)
+        assert "meta" in data
+        assert data["meta"]["description"] == "A test page"
+        assert data["meta"]["keywords"] == "test, page"  # content string is kept verbatim, not split on commas
+
+    def test_returns_empty_dict_for_no_structured_data(self):
+        service = CrawlerService()
+        html = "<html><head></head><body><p>Hello</p></body></html>"
+        data = service._extract_structured_data_from_html(html)
+        assert data == {}  # no empty "json_ld"/"opengraph"/"meta" placeholders are expected
+
+    def test_handles_malformed_json_ld(self):
+        service = CrawlerService()
+        html = """
+        <html>
+        <head>
+            <script type="application/ld+json">not valid json</script>
+        </head>
+        <body></body>
+        </html>
+        """
+        data = service._extract_structured_data_from_html(html)  # must not raise on the unparseable script body
+        assert "json_ld" not in data  # malformed JSON-LD is expected to be dropped
diff --git a/services/platform/app/features/chat/components/message-info-dialog.tsx b/services/platform/app/features/chat/components/message-info-dialog.tsx
@@ -22,7 +22,6 @@ import { SubAgentDetailsDialog } from './sub-agent-details-dialog';
 
 function formatAgentName(toolName: string): string {
   const nameMap: Record<string, string> = {
-    web_assistant: 'Web',
     document_assistant: 'Document',
     crm_assistant: 'CRM',
     integration_assistant: 'Integration',

diff --git a/services/platform/app/features/chat/components/sub-agent-details-dialog.tsx b/services/platform/app/features/chat/components/sub-agent-details-dialog.tsx
@@ -11,7 +11,6 @@ import type { SubAgentUsage } from '../hooks/queries';
 
 function formatAgentName(toolName: string): string {
   const nameMap: Record<string, string> = {
-    web_assistant: 'Web',
     document_assistant: 'Document',
     crm_assistant: 'CRM',
     integration_assistant: 'Integration',

diff --git a/services/platform/app/features/custom-agents/components/tool-selector.tsx b/services/platform/app/features/custom-agents/components/tool-selector.tsx
@@ -38,7 +38,6 @@ const TOOL_CATEGORIES: Record<string, ToolName[]> = {
   Data: ['database_schema'],
   Assistants: [
     'crm_assistant',
-    'web_assistant',
     'document_assistant',
     'workflow_assistant',
     'integration_assistant',