Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 44 additions & 34 deletions services/crawler/app/services/crawler_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,69 +231,68 @@ async def discover_urls(
if not self.initialized:
await self.initialize()

from crawl4ai import SeedingConfig
from crawl4ai import AsyncUrlSeeder, SeedingConfig

# Try with sitemap+cc first, then fallback to sitemap-only if Common Crawl fails.
# We also rate-limit discovery to avoid hammering sites and triggering 429s.
sources_to_try = ["sitemap+cc", "sitemap"]
# Try sitemap first (fastest, most reliable), then fall back to Common Crawl
# (covers sites without sitemaps). Re-initialize seeder between retries to
# avoid stale httpx client state from crawl4ai's async generator cleanup bug.
sources_to_try = ["sitemap", "cc"]

for source in sources_to_try:
try:
# Configure URL discovery
config = SeedingConfig(
source=source,
extract_head=True, # Get metadata for filtering
extract_head=True,
max_urls=max_urls if max_urls > 0 else -1,
filter_nonsense_urls=True, # Skip robots.txt, .js, .css, etc.
filter_nonsense_urls=True,
pattern=pattern,
query=query,
scoring_method="bm25" if query else None,
score_threshold=0.3 if query else None,
concurrency=3, # Be polite to target sites
hits_per_sec=1, # Explicit rate limit to reduce 429s
concurrency=3,
hits_per_sec=1,
verbose=True,
)

logger.info(f"Discovering URLs from {domain} using source: {source} with timeout: {timeout}s...")

# Add timeout to prevent hanging
urls = await asyncio.wait_for(
self._seeder.urls(domain, config),
timeout=timeout,
)

logger.info(f"Seeder returned {len(urls)} URLs for {domain} from source {source}")

# Keep all URLs returned by the seeder; let the crawler handle transient failures.
filtered_urls = urls

logger.info(
"Discovered %s URLs from %s (no additional filtering applied)",
len(filtered_urls),
domain,
)

# Cleanup memory after discovery
_cleanup_memory()
if urls:
logger.info(
"Discovered %s URLs from %s (source: %s)",
len(urls),
domain,
source,
)
_cleanup_memory()
return urls

return filtered_urls
logger.info(f"Source '{source}' returned 0 URLs for {domain}, trying next source...")

except TimeoutError:
logger.warning(f"Timeout discovering URLs with source '{source}', trying next source...")
_cleanup_memory()
if source == sources_to_try[-1]:
raise Exception(f"All discovery sources timed out for {domain}") from None
continue

except Exception as e:
logger.warning(f"Error discovering URLs with source '{source}': {e}")
_cleanup_memory()
if source == sources_to_try[-1]:
raise Exception(f"Failed to discover URLs from {domain}: {e!s}") from e
continue

_cleanup_memory()
return []
# Re-initialize seeder before next retry to get a fresh httpx client
_cleanup_memory()
if source != sources_to_try[-1]:
try:
if self._seeder:
await self._seeder.__aexit__(None, None, None)
except Exception:
pass
self._seeder = AsyncUrlSeeder()
await self._seeder.__aenter__()

raise Exception(f"Failed to discover URLs from {domain}: all sources exhausted")

async def crawl_urls(
self,
Expand All @@ -314,6 +313,8 @@ async def crawl_urls(
await self.initialize()

from crawl4ai import CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

config = CrawlerRunConfig(
only_text=False, # We need HTML to extract structured data
Expand All @@ -324,6 +325,15 @@ async def crawl_urls(
# Disable screenshot capture to save memory
screenshot=False,
pdf=False,
# Exclude structural/navigational HTML elements to reduce noise in markdown
excluded_tags=["nav", "footer", "header", "aside", "select", "option"],
exclude_external_links=True,
exclude_social_media_links=True,
# Use PruningContentFilter for fit_markdown: density-based main content extraction
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(threshold=0.4),
options={"ignore_links": True},
),
)

logger.info(f"Crawling {len(urls)} URLs...")
Expand All @@ -332,8 +342,8 @@ async def crawl_urls(
try:
async for result in await self._crawler.arun_many(urls, config=config):
if result.success:
# Use the new 'markdown' attribute instead of deprecated 'markdown_v2'
markdown_content = result.markdown.raw_markdown
# Prefer fit_markdown (density-filtered main content) over raw_markdown
markdown_content = result.markdown.fit_markdown or result.markdown.raw_markdown

# Extract structured data (price, images, etc.) from HTML
structured_data = self._extract_structured_data_from_html(result.html)
Expand Down
194 changes: 194 additions & 0 deletions services/crawler/tests/test_crawler_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
"""
Tests for CrawlerService content extraction and configuration.
"""

from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from app.services.crawler_service import CrawlerService


def _make_crawl_result(
url: str = "https://example.com",
title: str = "Test Page",
raw_markdown: str = "raw content",
fit_markdown: str | None = "fit content",
html: str = "<html></html>",
success: bool = True,
error_message: str | None = None,
):
"""Build a fake crawl4ai CrawlResult."""
md = SimpleNamespace(raw_markdown=raw_markdown, fit_markdown=fit_markdown)
metadata = {"title": title}
return SimpleNamespace(
url=url,
markdown=md,
metadata=metadata,
html=html,
success=success,
error_message=error_message,
)


class TestCrawlUrlsMarkdownSelection:
    """Check which markdown rendition crawl_urls stores as the page content.

    ``fit_markdown`` should be used when it is truthy; ``raw_markdown`` is the
    fallback when it is ``None`` or empty.
    """

    async def _run_crawl(self, results):
        """Run ``crawl_urls`` against a stubbed crawler that streams ``results``."""

        async def stream_results():
            for item in results:
                yield item

        async def fake_arun_many(urls, config):
            # Parameter names matter: the stub is invoked with ``config`` as a keyword.
            # A fresh async generator is handed back on every call.
            return stream_results()

        service = CrawlerService()
        service.initialized = True
        service._crawl_count = 0

        crawler_stub = MagicMock()
        crawler_stub.arun_many = fake_arun_many
        service._crawler = crawler_stub

        # Structured-data extraction is stubbed out so only markdown selection is exercised.
        extractor_path = "app.services.crawler_service.CrawlerService._extract_structured_data_from_html"
        with patch(extractor_path, return_value={}):
            return await service.crawl_urls(["https://example.com"])

    async def test_uses_fit_markdown_when_available(self):
        crawl_result = _make_crawl_result(
            fit_markdown="clean content",
            raw_markdown="noisy content",
        )
        pages = await self._run_crawl([crawl_result])

        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == "clean content"

    async def test_falls_back_to_raw_markdown_when_fit_is_none(self):
        crawl_result = _make_crawl_result(fit_markdown=None, raw_markdown="raw content")
        pages = await self._run_crawl([crawl_result])

        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == "raw content"

    async def test_falls_back_to_raw_markdown_when_fit_is_empty(self):
        crawl_result = _make_crawl_result(fit_markdown="", raw_markdown="raw content")
        pages = await self._run_crawl([crawl_result])

        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == "raw content"

    async def test_skips_failed_results(self):
        failed = _make_crawl_result(success=False, error_message="404")
        pages = await self._run_crawl([failed])

        assert len(pages) == 0

    async def test_word_count_uses_selected_markdown(self):
        crawl_result = _make_crawl_result(fit_markdown="one two three")
        pages = await self._run_crawl([crawl_result])

        first_page = pages[0]
        assert first_page["word_count"] == 3


class TestCrawlerRunConfigSetup:
    """Verify that CrawlerRunConfig is created with the correct filtering options."""

    async def test_config_has_excluded_tags(self):
        """Ensure excluded_tags, link exclusions and the content filter are set in the config."""
        captured_config = {}

        async def fake_arun_many(urls, config):
            # Record the config that crawl_urls builds so it can be inspected below.
            captured_config["config"] = config

            async def gen():
                # The unreachable ``yield`` makes this an async generator that produces nothing.
                return
                yield

            return gen()

        service = CrawlerService()
        service.initialized = True
        service._crawl_count = 0
        service._crawler = MagicMock()
        service._crawler.arun_many = fake_arun_many

        await service.crawl_urls(["https://example.com"])

        config = captured_config["config"]

        # Structural/navigational elements that should be stripped before markdown generation.
        for tag in ("nav", "footer", "header", "aside", "select", "option"):
            assert tag in config.excluded_tags
        assert config.exclude_external_links is True
        assert config.exclude_social_media_links is True

        # The docstring promises content-filter coverage: check that a pruning filter
        # is attached to the markdown generator. Compared by class name so this test
        # module does not need to import crawl4ai directly.
        content_filter = config.markdown_generator.content_filter
        assert type(content_filter).__name__ == "PruningContentFilter"


class TestExtractStructuredData:
    """Exercise ``_extract_structured_data_from_html`` on small HTML documents."""

    def _extract(self, html):
        """Run the extractor on ``html`` using a fresh CrawlerService."""
        service = CrawlerService()
        return service._extract_structured_data_from_html(html)

    def test_extracts_json_ld(self):
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<script type="application/ld+json">{"@type": "Product", "name": "Test"}</script>\n'
            "</head>\n"
            "<body></body>\n"
            "</html>\n"
        )
        data = self._extract(html)
        assert "json_ld" in data
        first_entry = data["json_ld"][0]
        assert first_entry["@type"] == "Product"

    def test_extracts_opengraph(self):
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<meta property="og:title" content="Test Product" />\n'
            '<meta property="og:price:amount" content="49.00" />\n'
            "</head>\n"
            "<body></body>\n"
            "</html>\n"
        )
        data = self._extract(html)
        assert "opengraph" in data
        assert data["opengraph"]["title"] == "Test Product"

    def test_extracts_meta_tags(self):
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<meta name="description" content="A test page" />\n'
            '<meta name="keywords" content="test, page" />\n'
            "</head>\n"
            "<body></body>\n"
            "</html>\n"
        )
        data = self._extract(html)
        assert "meta" in data
        meta = data["meta"]
        assert meta["description"] == "A test page"
        assert meta["keywords"] == "test, page"

    def test_returns_empty_dict_for_no_structured_data(self):
        data = self._extract("<html><head></head><body><p>Hello</p></body></html>")
        assert data == {}

    def test_handles_malformed_json_ld(self):
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<script type="application/ld+json">not valid json</script>\n'
            "</head>\n"
            "<body></body>\n"
            "</html>\n"
        )
        data = self._extract(html)
        assert "json_ld" not in data
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import { SubAgentDetailsDialog } from './sub-agent-details-dialog';

function formatAgentName(toolName: string): string {
const nameMap: Record<string, string> = {
web_assistant: 'Web',
document_assistant: 'Document',
crm_assistant: 'CRM',
integration_assistant: 'Integration',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import type { SubAgentUsage } from '../hooks/queries';

function formatAgentName(toolName: string): string {
const nameMap: Record<string, string> = {
web_assistant: 'Web',
document_assistant: 'Document',
crm_assistant: 'CRM',
integration_assistant: 'Integration',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ const TOOL_CATEGORIES: Record<string, ToolName[]> = {
Data: ['database_schema'],
Assistants: [
'crm_assistant',
'web_assistant',
'document_assistant',
'workflow_assistant',
'integration_assistant',
Expand Down
Loading
Loading