In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Remote(
        command_executor=f'http://localhost:4444/wd/hub',
        options=options
    )

    return driver

In [12]:
driver = get_driver()

base_url = "https://www.gov.uk/"

driver.get(base_url)

page_source = driver.page_source

with open("page_content.html", "w", encoding="utf-8") as file:
    file.write(page_source)

driver.quit()

In [10]:
def get_section_headers(element):
    """Recursively collect text from parent section headers."""
    parent = element.find_parent()
    if parent is None:
        return []

    # Check if the parent is a header tag
    if parent.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        header_text = parent.get_text(strip=True)
        return get_section_headers(parent) + [header_text]

    # Check if the parent is a section tag
    elif parent.name == "section":
        # Find the first header tag within the section
        header = parent.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        if header:
            header_text = header.get_text(strip=True)
            return get_section_headers(parent) + [header_text]

    return get_section_headers(parent)


soup = BeautifulSoup(page_source, "html.parser")

links = soup.find_all("a")

for link in links:
    url = link.get("href")
    text = link.get_text(strip=True)
    full_url = urljoin(base_url, url)

    # parent_text_tree = get_section_headers(link)
    # print(f"URL: {url}, Text: {text}, Parent Text Tree: {parent_text_tree}")
    print(f"URL: {full_url}, Text: {text}")

URL: /help/cookies, Text: change your cookie settings, Parent Text Tree: []
URL: https://www.gov.uk/help/cookies, Text: change your cookie settings
URL: /help/cookies, Text: change your cookie settings, Parent Text Tree: []
URL: https://www.gov.uk/help/cookies, Text: change your cookie settings
URL: /help/cookies, Text: View cookies, Parent Text Tree: []
URL: https://www.gov.uk/help/cookies, Text: View cookies
URL: #content, Text: Skip to main content, Parent Text Tree: []
URL: https://www.gov.uk/#content, Text: Skip to main content
URL: https://www.gov.uk, Text: GOV.UK, Parent Text Tree: []
URL: https://www.gov.uk, Text: GOV.UK
URL: /browse, Text: Menu, Parent Text Tree: []
URL: https://www.gov.uk/browse, Text: Menu
URL: /search, Text: Search GOV.UK, Parent Text Tree: []
URL: https://www.gov.uk/search, Text: Search GOV.UK
URL: https://www.gov.uk/browse/benefits, Text: Benefits, Parent Text Tree: []
URL: https://www.gov.uk/browse/benefits, Text: Benefits
URL: https://www.gov.uk/browse/

In [3]:
base_url = "https://www.gov.uk/"

driver = get_driver()

found_urls = set()
urls_currently_in_queue_or_already_visited = set()

counter = 0
urls_queue = [base_url]
while urls_queue:
    counter += 1
    if counter > 10:
        break

    url = urls_queue.pop(0)
    print(f"Scraping {url}")
    driver.get(url)
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, "html.parser")

    links = soup.find_all("a")

    for link in links:
        url = link.get("href")
        text = link.get_text(strip=True)
        full_url = urljoin(base_url, url)

        if full_url not in urls_currently_in_queue_or_already_visited:
            urls_queue.append(full_url)
            urls_currently_in_queue_or_already_visited.add(full_url)

        found_urls.add(full_url)

print(found_urls)

driver.quit()

Scraping https://www.gov.uk/
Scraping https://www.gov.uk/help/cookies
Scraping https://www.gov.uk/#content
Scraping https://www.gov.uk
Scraping https://www.gov.uk/browse
Scraping https://www.gov.uk/search
Scraping https://www.gov.uk/browse/benefits
Scraping https://www.gov.uk/browse/births-deaths-marriages
Scraping https://www.gov.uk/browse/business
Scraping https://www.gov.uk/browse/childcare-parenting
{'https://www.gov.uk/browse/business/manufacturing', 'https://www.gov.uk/find-energy-certificate', 'https://www.gov.uk/sign-in-childcare-account', 'https://www.gov.uk/browse/business/imports', 'https://www.gov.uk/browse/childcare-parenting/financial-help-children', 'https://www.gov.uk/browse/childcare-parenting/divorce-separation-legal', 'https://www.gov.uk/browse/births-deaths-marriages/child', 'https://www.gov.uk/search', 'https://www.gov.uk/browse/business/science', 'https://www.smartsurvey.co.uk/s/gov-uk-banner/?c=/', 'https://www.gov.uk/vehicle-tax', 'https://www.gov.uk/browse/birt

In [4]:
found_urls

{'https://www.gov.uk',
 'https://www.gov.uk/',
 'https://www.gov.uk/#content',
 'https://www.gov.uk/apply-renew-passport',
 'https://www.gov.uk/browse',
 'https://www.gov.uk/browse/abroad',
 'https://www.gov.uk/browse/benefits',
 'https://www.gov.uk/browse/benefits/bereavement',
 'https://www.gov.uk/browse/benefits/disability',
 'https://www.gov.uk/browse/benefits/families',
 'https://www.gov.uk/browse/benefits/help-for-carers',
 'https://www.gov.uk/browse/benefits/looking-for-work',
 'https://www.gov.uk/browse/benefits/low-income',
 'https://www.gov.uk/browse/benefits/manage-your-benefit',
 'https://www.gov.uk/browse/benefits/unable-to-work',
 'https://www.gov.uk/browse/births-deaths-marriages',
 'https://www.gov.uk/browse/births-deaths-marriages/child',
 'https://www.gov.uk/browse/births-deaths-marriages/child-adoption',
 'https://www.gov.uk/browse/births-deaths-marriages/death',
 'https://www.gov.uk/browse/births-deaths-marriages/lasting-power-attorney',
 'https://www.gov.uk/browse/