In [6]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def extract_links_from_page(url, timeout=10):
    """Download ``url`` and return the links on that page that extend ``url``.

    A link is kept only when its ``href`` is present, starts with ``url``
    itself (so relative links and links outside this prefix are ignored) and
    contains no ``#``. A single trailing ``/`` is stripped so that
    ``.../page`` and ``.../page/`` collapse into one entry.

    Args:
        url: Page to fetch; also the prefix every returned link must start with.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block the crawl forever.

    Returns:
        A set of URL strings. The set is empty when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Best-effort crawl: a page that cannot be fetched contributes no
        # links. ``Exception`` (rather than a bare ``except``) still lets
        # KeyboardInterrupt / SystemExit stop the run.
        return set()

    soup = BeautifulSoup(response.content, 'html.parser')
    links = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and href.startswith(url) and '#' not in href:
            # Normalise the trailing slash so duplicates are not re-crawled.
            links.add(href[:-1] if href.endswith('/') else href)
    return links

def crawl_domain(base_url, max_workers=10):
    """Crawl outward from ``base_url`` and return every URL that was visited.

    The crawl proceeds in rounds: all currently queued URLs are fetched
    concurrently on a thread pool, and any links they yield that have not
    been visited yet form the queue for the next round. It stops when a
    round discovers nothing new.

    Args:
        base_url: URL to start from. It is added to the result as given.
        max_workers: Maximum number of pages fetched at the same time.

    Returns:
        The set of visited URL strings, including ``base_url``.
    """
    visited = set()
    queue = {base_url}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while queue:
            # Hand the whole frontier to the pool at once; waiting on each
            # future right after submitting it would serialise the requests.
            batch = list(queue)
            queue.clear()
            visited.update(batch)
            for links in executor.map(extract_links_from_page, batch):
                queue.update(links - visited)
                tqdm.write(f"{len(visited)} discovered from {len(queue)} queued urls", end='\r')
    return visited


In [7]:
# Crawl everything reachable from the base URL and keep the set of visited URLs.
# NOTE(review): the `pvt_...` path segment looks like it could be a private access
# token -- confirm it is safe to commit, or load the base URL from configuration.
links = crawl_domain(base_url='https://vectorbt.pro/pvt_d904e513')

160 discovered from 0 queued urlss

In [11]:
# Persist the crawled URLs (a set of strings) to disk with pickle so they can
# be reloaded later without repeating the crawl.
import pickle
with open('list_file.pickle', 'wb') as pickle_file:
    pickle.dump(links, pickle_file)