In [1]:
import requests
from bs4 import BeautifulSoup
import os

def get_sitemap_data(url):
    """
    Retrieves the sitemap.xml data from the given URL.

    Args:
        url (str): The base URL of the documentation website.

    Returns:
        str: The content of the sitemap.xml file.
    """

    sitemap_url = f"{url}/sitemap.xml"
    try:
        response = requests.get(sitemap_url)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return None

def extract_urls_from_sitemap(sitemap_data):
    """
    Extracts URLs from the given sitemap.xml data.

    Args:
        sitemap_data (str): The content of the sitemap.xml file.

    Returns:
        list: A list of URLs extracted from the sitemap.
    """

    soup = BeautifulSoup(sitemap_data, 'xml')
    urls = []
    for url_tag in soup.find_all('url'):
        loc_tag = url_tag.find('loc')
        if loc_tag:
            urls.append(loc_tag.text)
    return urls

def fetch_and_store_documentation(base_url, save_dir="docs"):
    """
    Fetches documentation content from URLs and stores them in a directory.

    Args:
        base_url (str): The base URL of the documentation website.
        save_dir (str, optional): The directory to store the documentation files. Defaults to "docs".
    """

    sitemap_data = get_sitemap_data(base_url)
    if sitemap_data:
        urls = extract_urls_from_sitemap(sitemap_data)
        os.makedirs(save_dir, exist_ok=True)

        for url in urls:
            try:
                response = requests.get(url)
                response.raise_for_status()
                
                # Extract filename from URL
                filename = url.split('/')[-1]
                if not filename:
                    filename = 'index.html'  # Default for root page
                
                with open(os.path.join(save_dir, filename), 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"Saved {url} to {os.path.join(save_dir, filename)}")

            except requests.exceptions.RequestException as e:
                print(f"Error fetching {url}: {e}")

In [2]:
# Example usage:
base_url = "https://python.langchain.com"
fetch_and_store_documentation(base_url)

Saved https://python.langchain.com/v0.2/search/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/additional_resources/arxiv_references/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/additional_resources/dependents/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/additional_resources/tutorials/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/additional_resources/youtube/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/changes/changelog/core/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/changes/changelog/langchain/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/concepts/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/contributing/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/contributing/code/ to docs/index.html
Saved https://python.langchain.com/v0.2/docs/contributing/documentation/style_guide/ to docs/index.html
Saved https://python.la

KeyboardInterrupt: 