In [2]:
from pydantic.v1 import BaseModel, Field, PrivateAttr
from crewai_tools import BaseTool
from typing import Optional, Type, Any
import logging
from collections import deque
from urllib.parse import urlparse

from browser_tool import BrowserTool  # Make sure this import works



logger = logging.getLogger(__name__)

class CrawlWebsiteToolSchema(BaseModel):
    """Input for CrawlWebsiteTool."""
    website_url: str = Field(..., description="Mandatory website url to crawl and read content")

class FixedCrawlWebsiteToolSchema(BaseModel):
    """Input for CrawlWebsiteTool when website_url is fixed."""
    pass


class CrawlWebsiteTool(BaseTool):
    name: str = "Crawl and read website content"
    description: str = "A tool that can be used to crawl a website and read its content, including content from internal links on the same page."
    args_schema: Type[BaseModel] = CrawlWebsiteToolSchema

    website_url: Optional[str] = None
    max_pages: int = 10
    _browser_tool: BrowserTool = PrivateAttr()

    def __init__(self, website_url: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        logger.info("Initializing CrawlWebsiteTool")
        if website_url is not None:
            self.website_url = website_url

    def _run(self, website_url: str) -> str:
        logger.info(f"Processing {website_url}")
        content = self._crawl_website(website_url)
        return content

    def _crawl_website(self, url: str) -> str:
        content = ""
        visited_urls = set()
        urls_to_visit = deque([url])
        base_domain = urlparse(url).netloc

        while urls_to_visit and len(visited_urls) < self.max_pages:
            current_url = urls_to_visit.popleft()
            if current_url in visited_urls:
                continue

            logger.info(f"Visiting: {current_url}")
            try:
                page_content = self._browser_tool.get_content(current_url)
                content += f"---link: {current_url}\n{page_content}\n---page-end---\n"
                visited_urls.add(current_url)

                links = self._browser_tool.get_links(current_url)
                for link in links:
                    if link not in visited_urls and urlparse(link).netloc == base_domain:
                        urls_to_visit.append(link)
            except Exception as e:
                logger.error(f"Error processing {current_url}: {e}")
                content += f"---link: {current_url}\nError: Failed to process this page\n---page-end---\n"

        return content


In [3]:
# Add this code block to test the CrawlWebsiteTool on the specified URL

# Create an instance of the CrawlWebsiteTool
crawler_tool = CrawlWebsiteTool()

# Run the crawler on the specified website
extracted_content = crawler_tool._run("https://brandon.neuralami.com")

# Print the extracted content
print(extracted_content)

2024-10-17 13:50:43,793 - 126323511457600 - 3650673014.py-3650673014:65 - ERROR: Error processing https://brandon.neuralami.com: 'ModelPrivateAttr' object has no attribute 'get_content'


---link: https://brandon.neuralami.com
Error: Failed to process this page
---page-end---

