<a href="https://colab.research.google.com/github/thor4/scripts/blob/master/doc_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Documentation Scraper
---

Install required libraries

In [None]:
# Install the two third-party packages imported below: beautifulsoup4 (bs4) and requests
!pip install beautifulsoup4 requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Load libraries and mount Google Drive

In [None]:
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive; main() writes its output below this path
drive.mount('/content/drive')

Mounted at /content/drive


Create scraping functions

In [None]:
def get_soup(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    # NOTE: the HTTP status code is not checked; an error page is parsed
    # like any other response.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

def get_links(soup, base_url="https://scikit-learn.org/stable/"):
    """Collect the absolute page URLs listed in the table of contents.

    Looks inside the first ``div`` with class ``toctree-wrapper`` for
    anchors whose class is ``reference internal``.

    Args:
        soup: Parsed index page, as returned by ``get_soup``.
        base_url: URL against which relative ``href`` values are resolved.

    Returns:
        A list of absolute URLs in first-seen order. ``#fragment`` parts are
        dropped and each page appears only once, so a page that is listed
        once per section is not downloaded repeatedly. Returns an empty list
        when the page has no table-of-contents wrapper.
    """
    toc = soup.find("div", {"class": "toctree-wrapper"})
    if toc is None:
        return []

    # A dict keeps insertion order while discarding duplicate pages.
    pages = {}
    for anchor in toc.find_all("a", {"class": "reference internal"}):
        href = anchor.get("href")
        if not href:
            continue
        # The fragment only selects a section within the page; the page
        # itself is what gets scraped.
        page = href.partition("#")[0]
        if not page:
            # Pure "#section" link pointing back into the index page.
            continue
        pages[urljoin(base_url, page)] = None
    return list(pages)

def scrape_page(url):
    """Fetch a documentation page and flatten its main content to Markdown-like text.

    The page's ``div`` with class ``body`` is rewritten in place: headings,
    inline code, links, MathJax previews and preformatted blocks have their
    text replaced by a Markdown equivalent, after which all remaining markup
    is discarded.

    Args:
        url: Address of the page to scrape. It is also used to resolve
            relative links found on the page.

    Returns:
        The converted text, or an empty string when the page has no
        ``div`` with class ``body``.
    """
    soup = get_soup(url)
    content = soup.find("div", {"class": "body"})

    if content is None:
        return ""

    # Remove elements whose text is not documentation content
    for tag in content.find_all(["script", "style"]):
        tag.extract()

    # Remove permalink anchors (class "headerlink"); their marker text would
    # otherwise be appended to every heading
    for permalink in content.find_all("a", {"class": "headerlink"}):
        permalink.extract()

    # Convert headers to markdown: <h3> becomes "### title"
    for header in content.find_all(re.compile(r'^h[1-6]$')):
        header_markdown = '#' * int(header.name[1])
        header.string = f"{header_markdown} {header.text}"

    # Convert inline code to markdown. This runs before the link pass so a
    # link that wraps a <code> element keeps the backticks in its label.
    for code in content.find_all("code"):
        code.string = f"`{code.text}`"

    # Convert links to markdown, skipping same-page "#anchor" links.
    # Relative targets are resolved against the page URL so they stay valid
    # once the text is stored outside the original site.
    for link in content.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('#'):
            link.string = f"[{link.text}]({urljoin(url, href)})"

    # Wrap MathJax preview text in $$ delimiters
    # NOTE(review): MathJax_Preview spans are presumably inserted by MathJax
    # in the browser -- confirm they are present in the raw HTML fetched here.
    for mathjax in content.find_all('span', {'class': 'MathJax_Preview'}):
        mathjax.string = f"$$ {mathjax.text.strip()} $$"

    # Convert preformatted code blocks to fenced markdown blocks
    for pre in content.find_all("pre"):
        pre.string = f"```\n{pre.text}\n```"

    return content.get_text()

def main(output_file="/content/drive/MyDrive/scikit_learn_docs.md"):
    """Scrape every page linked from the scikit-learn user guide into one file.

    Args:
        output_file: Path of the Markdown file to (over)write. The default
            points into the mounted Google Drive; change it as needed.

    Pages are written in table-of-contents order, each followed by a blank
    line. A page whose download fails is reported and skipped so that one
    bad request does not abort the whole run.
    """
    index_url = "https://scikit-learn.org/stable/user_guide.html"
    links = get_links(get_soup(index_url))

    with open(output_file, "w", encoding="utf-8") as f:
        for link in links:
            try:
                markdown_content = scrape_page(link)
            except requests.RequestException as err:
                print(f"Skipping {link}: {err}")
                continue
            f.write(markdown_content)
            f.write("\n\n")
    print("Scraping completed")

Run the script. It writes the scraped documentation as a single Markdown file to Google Drive.

In [None]:
# Scrape the pages linked from the user guide and write the combined Markdown file
main()

Scraping completed
