In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import threading

def normalize_url(url):
    if url.endswith('/'):
        return url[:-1]
    return url

def extract_links(url, domain):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()

        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                parsed_url = urljoin(url, href)
                if parsed_url.startswith(domain):
                    links.add(normalize_url(parsed_url))

        return links
    except Exception as e:
        return set()

def crawl_domain(start_url, domain, max_workers=10):
    start_url = normalize_url(start_url)
    domain = normalize_url(domain)
    visited = set()
    to_visit = {start_url}
    lock = threading.Lock()

    class UrlsDiscoveredTqdm(tqdm):
        def update_to(self, b=1, bsize=1, tsize=None):
            if tsize is not None:
                self.total = tsize
            self.update(b * bsize - self.n)

    def worker(url, pbar):
        nonlocal visited, to_visit, lock
        with lock:
            visited.add(url)
            to_visit.discard(url)

        links = extract_links(url, domain)
        with lock:
            new_links = links - visited
            to_visit.update(new_links)
            pbar.update_to(tsize=len(visited) + len(to_visit))

    with UrlsDiscoveredTqdm(desc='URLs discovered', total=1, unit='urls') as pbar:
        while to_visit:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                urls = list(to_visit)
                for url in urls:
                    executor.submit(worker, url, pbar)
                executor.shutdown()

    return visited

url = 'https://vectorbt.pro/pvt_d904e513'

links = crawl_domain(url, url)

URLs discovered:   5%|▍         | 1/22 [00:00<00:07,  2.75urls/s]

In [None]:
links