### import libraries

In [25]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urlunparse
import os
import re
import pickle

#### 1. Scrape the documentation from the URL

In [26]:
# Url to scrape
docs_url = "https://fastapi.tiangolo.com/tutorial/"

In [27]:
def is_valid_url(url, base_url):
    """
    Helper method to check if the URL is valid and belongs to the same domain as the base URL.
    """
    parsed = urlparse(url)
    base_parsed = urlparse(base_url)
    return bool(parsed.netloc) and parsed.netloc == base_parsed.netloc

In [28]:
def extract_links(soup, base_url):
    """
    Helper method to extract all valid links from the BeautifulSoup object.
    """
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        full_url = urljoin(base_url, href)
        if is_valid_url(full_url, base_url):
            links.add(full_url)
    return links

In [29]:
def get_html_content(soup):
    """Extract relevant main HTML content and convert it to Markdown-like text."""
    # Locate main content container
    main_content = (
        soup.find("main")
        or soup.find("article")
        or soup.find("div", class_="content")
        or soup.find("div", class_="main-content")
        or soup.body
    )
    
    # Remove navigation/sidebars/menus
    for tag in main_content.find_all(["nav", "aside", "header", "footer"]):
        tag.decompose()
    for tag in main_content.find_all(attrs={"class": lambda c: c and ("sidebar" in c or "menu" in c)}):
        tag.decompose()
    for tag in main_content.find_all(attrs={"id": lambda i: i and ("sidebar" in i or "menu" in i)}):
        tag.decompose()
    
    # Convert remaining elements to Markdown
    parts = []
    for tag in main_content.find_all(["h1", "h2", "h3", "p", "pre", "code", "li"]):
        if tag.name == "h1":
            parts.append(f"# {tag.get_text(strip=True)}")
        elif tag.name == "h2":
            parts.append(f"## {tag.get_text(strip=True)}")
        elif tag.name == "h3":
            parts.append(f"### {tag.get_text(strip=True)}")
        elif tag.name == "p":
            parts.append(tag.get_text(strip=True))
        elif tag.name in ["pre", "code"]:
            code_lang = tag.get('class', [''])[0] if tag.get('class') else ''
            parts.append(f"```{code_lang}\n{tag.get_text(strip=True)}\n```")
        elif tag.name == "li":
            parts.append(f"- {tag.get_text(strip=True)}")
    
    return "\n\n".join(parts)

In [30]:

def normalize_url(url):
    """Normalize URL by removing fragment and query, ensuring consistent format."""
    parsed = urlparse(url)
    # Remove query and fragment to avoid duplicates like /page?id=1 or /page#section
    normalized = parsed._replace(query="", fragment="")
    # Ensure scheme and netloc are lowercase, path stays the same
    return urlunparse(normalized).rstrip("/")

def crawl_and_scrape(base_url, max_pages=250):
    """
    Crawl the site starting from the base URL and extract the HTML content of each page.
    """
    visited = set()
    to_visit = {normalize_url(base_url)}
    all_content = dict()

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop()
        if current_url in visited:
            continue

        print(f"{len(visited)}/{max_pages} Crawling: {current_url}")
        visited.add(current_url)

        try:
            response = requests.get(current_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Normalize and filter links before adding
            links = {
                normalize_url(link)
                for link in extract_links(soup, base_url)
            }
            to_visit.update(links - visited)

            html_content = get_html_content(soup)
            all_content[current_url] = html_content

        except requests.RequestException as e:
            print(f"An error occurred while fetching the URL: {e}")

    return all_content


In [31]:
doc_sites = crawl_and_scrape(docs_url)

0/250 Crawling: https://fastapi.tiangolo.com/tutorial
1/250 Crawling: https://fastapi.tiangolo.com/advanced/security/http-basic-auth
2/250 Crawling: https://fastapi.tiangolo.com/advanced/using-request-directly
3/250 Crawling: https://fastapi.tiangolo.com/tutorial/query-params-str-validations
4/250 Crawling: https://fastapi.tiangolo.com/ur
5/250 Crawling: https://fastapi.tiangolo.com/reference/dependencies
6/250 Crawling: https://fastapi.tiangolo.com/de
7/250 Crawling: https://fastapi.tiangolo.com/tutorial/how-to/testing-database
An error occurred while fetching the URL: 404 Client Error: Not Found for url: https://fastapi.tiangolo.com/tutorial/how-to/testing-database
8/250 Crawling: https://fastapi.tiangolo.com/deployment/manually
9/250 Crawling: https://fastapi.tiangolo.com/tutorial/tutorial/response-model
An error occurred while fetching the URL: 404 Client Error: Not Found for url: https://fastapi.tiangolo.com/tutorial/tutorial/response-model
10/250 Crawling: https://fastapi.tiangol

#### 2. store the clean html in a folder

In [32]:
folder_name = docs_url.split("https://")[1].strip("/").strip()

# create a folder for the domain name
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# create an html file inside the folder for each page
for i, site in enumerate(doc_sites):
    with open(f"{folder_name}/{i}.md", "w") as f:
        f.write(site)

# for eas of testing store the docs url dict in a pickle file
with open("docs_url.pickle", "wb") as f:
    pickle.dump(doc_sites, f)