In [1]:
import requests
from bs4 import BeautifulSoup
import os

def get_sitemap_data(url, timeout=10):
    """
    Retrieves the sitemap.xml data from the given URL.

    Args:
        url (str): The base URL of the documentation website, with or
            without a trailing slash.
        timeout (float | tuple): Seconds to wait for the server before
            giving up; passed unchanged to ``requests.get``.

    Returns:
        str | None: The content of the sitemap.xml file, or None when the
        request fails (the error is printed rather than raised).
    """

    # Drop trailing slashes so "https://host/" does not turn into
    # "https://host//sitemap.xml".
    sitemap_url = f"{url.rstrip('/')}/sitemap.xml"
    try:
        # requests waits forever unless a timeout is given, which would
        # freeze the notebook cell on an unresponsive server.
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return None

def extract_urls_from_sitemap(sitemap_data):
    """
    Extracts URLs from the given sitemap.xml data.

    Only ``<url><loc>...</loc></url>`` entries are read; ``<url>`` elements
    without a ``<loc>`` child, or with a blank one, are skipped.

    Args:
        sitemap_data (str): The content of the sitemap.xml file. An empty
            string or None yields an empty list.

    Returns:
        list: A list of URLs extracted from the sitemap, in document order,
        with surrounding whitespace removed.
    """

    # Nothing to parse (e.g. the sitemap download failed and returned None).
    if not sitemap_data:
        return []

    # The 'xml' feature makes BeautifulSoup use its XML parser.
    soup = BeautifulSoup(sitemap_data, 'xml')
    urls = []
    for url_tag in soup.find_all('url'):
        loc_tag = url_tag.find('loc')
        if loc_tag is None:
            continue
        # Pretty-printed sitemaps put newlines/indentation inside <loc>;
        # strip them so the value is a usable URL and a clean dict key.
        loc = loc_tag.text.strip()
        if loc:
            urls.append(loc)
    return urls

def fetch_and_store_documentation(base_url, timeout=10):
    """
    Fetches every page listed in the site's sitemap and stores its text.

    Args:
        base_url (str): The base URL of the documentation website.
        timeout (float | tuple): Seconds to wait on each page request before
            giving up; passed unchanged to ``requests.get``.

    Returns:
        dict | None: Maps each successfully fetched URL to the text of that
        page (markup removed, with script/style/nav/aside/footer elements
        dropped first). URLs whose request fails are reported via print and
        left out of the dictionary. None is returned when the sitemap itself
        could not be retrieved.
    """

    sitemap_data = get_sitemap_data(base_url)
    if not sitemap_data:
        return None

    docs = {}
    for url in extract_urls_from_sitemap(sitemap_data):
        try:
            # Without a timeout one unresponsive page would stall the whole crawl.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and carry on with the next URL.
            print(f"Error fetching {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # Filter out unwanted tags using BeautifulSoup (adjust as needed)
        for tag in ['script', 'style', 'nav', 'aside', 'footer']:
            for element in soup.find_all(tag):
                element.decompose()

        # Store the remaining text, with a space between adjacent text nodes.
        docs[url] = soup.get_text(separator=' ')
        print(f"Fetched and stored content from: {url}")

    return docs

In [4]:
# Example usage:
# No trailing slash: the sitemap URL is built as f"{base_url}/sitemap.xml",
# so a trailing "/" here would request ".../​/sitemap.xml".
base_url = "https://llama-cpp-python.readthedocs.io"
documentation_data = fetch_and_store_documentation(base_url)

# fetch_and_store_documentation returns None when the sitemap cannot be
# fetched; stop here with a clear message instead of failing in a later cell.
assert documentation_data is not None, f"Could not retrieve the sitemap for {base_url}"

Fetched and stored content from: https://llama-cpp-python.readthedocs.io/en/stable/
Fetched and stored content from: https://llama-cpp-python.readthedocs.io/en/latest/


In [6]:
# Show the text captured for the "latest" documentation landing page.
latest_docs_url = 'https://llama-cpp-python.readthedocs.io/en/latest/'
print(documentation_data[latest_docs_url])


 
 
 
 
 
 
 
 
 Getting Started - llama-cpp-python 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
          Skip to content
         
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 🦙 Python Bindings for  llama.cpp 
 
 
 
 
 
 
 
 Simple Python bindings for  @ggerganov 's   llama.cpp  library.
This package provides: 
 
 Low-level access to C API via  ctypes  interface. 
 High-level Python API for text completion 
 OpenAI-like API 
 LangChain compatibility 
 LlamaIndex compatibility 
 
 
 OpenAI compatible web server 
 Local Copilot replacement 
 Function Calling support 
 Vision API support 
 Multiple Models 
 
 
 
 Documentation is available at  https://llama-cpp-python.readthedocs.io/en/latest . 
 Installation 
 Requirements: 
 
 Python 3.8+ 
 C compiler 
 Linux: gcc or clang 
 Windows: Visual Studio or MinGW 
 MacOS: Xcode 
 
 
 
 To install the package, run: 
 pip   install   llama-cpp-python
 
 This will also build  llama.cpp  from source and install it alongside this python package. 
 If this