In [None]:
from urllib.parse import urljoin, urlparse

import bs4
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader

def recursive_web_scraper(start_url, domain, max_depth=3):
    """Crawl pages on one domain, starting at ``start_url``, and load them.

    Each visited page is loaded with ``WebBaseLoader``, parsing only the
    elements whose CSS class is ``post-title``, ``post-content`` or
    ``post-header``. The page is then fetched again with ``requests`` to
    collect its ``<a href>`` links, and every link whose host equals
    ``domain`` is crawled depth-first.

    Args:
        start_url: URL of the first page to load (depth 0).
        domain: Host (``netloc``) that followed links must match. When empty
            or ``None``, the host of ``start_url`` is used.
        max_depth: Maximum number of link hops away from ``start_url`` that
            will still be loaded. A negative value loads nothing.

    Returns:
        A list of the documents returned by ``WebBaseLoader.load()`` for
        every page that was loaded successfully, in crawl order.

    Errors raised while loading or parsing a page are printed and that page
    is skipped; they do not abort the crawl.
    """
    request_timeout = 10  # seconds; keeps one stalled server from hanging the crawl

    # Honour the caller's domain; only derive it from start_url when omitted.
    domain = domain or urlparse(start_url).netloc
    visited = set()
    documents = []

    def scrape(url, depth):
        # "page#a" and "page#b" are the same resource: drop the fragment so
        # in-page anchors do not cause the page to be loaded again.
        url = urlparse(url)._replace(fragment="").geturl()

        if depth > max_depth or url in visited:
            return

        visited.add(url)

        try:
            loader = WebBaseLoader(
                web_paths=(url,),
                bs_kwargs=dict(
                    parse_only=bs4.SoupStrainer(
                        class_=("post-title", "post-content", "post-header")
                    )
                ),
            )
            documents.extend(loader.load())

            # The loader only keeps the strained content, so fetch the full
            # page separately to discover outgoing links.
            response = requests.get(url, timeout=request_timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

            for link in soup.find_all('a', href=True):
                try:
                    next_url = urljoin(url, link['href'])
                    same_domain = urlparse(next_url).netloc == domain
                except ValueError:
                    # A malformed href must not stop the remaining links
                    # on this page from being followed.
                    continue
                if same_domain:
                    scrape(next_url, depth + 1)

        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error scraping {url}: {e}")

    scrape(start_url, 0)
    return documents

# Example run: crawl the blog starting from a single post, restricted to the
# post's own host, then print a short summary of every loaded document.
start_url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
documents = recursive_web_scraper(
    start_url,
    urlparse(start_url).netloc,
)

print(f"Total documents scraped: {len(documents)}")
for doc in documents:
    # One print call, newline-separated: source URL, first 100 characters
    # of the content, then a divider line.
    print(
        f"URL: {doc.metadata['source']}",
        f"Content snippet: {doc.page_content[:100]}...",
        "-" * 50,
        sep="\n",
    )