In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

def get_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Remote(
        command_executor=f'http://localhost:4444/wd/hub',
        options=options
    )

    return driver

In [12]:
driver = get_driver()

base_url = "https://www.gov.uk/"

driver.get(base_url)

page_source = driver.page_source

with open("page_content.html", "w", encoding="utf-8") as file:
    file.write(page_source)

driver.quit()

In [None]:
def get_section_headers(element):
    """Recursively collect text from parent section headers."""
    parent = element.find_parent()
    if parent is None:
        return []

    # Check if the parent is a header tag
    if parent.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        header_text = parent.get_text(strip=True)
        return get_section_headers(parent) + [header_text]

    # Check if the parent is a section tag
    elif parent.name == "section":
        # Find the first header tag within the section
        header = parent.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        if header:
            header_text = header.get_text(strip=True)
            return get_section_headers(parent) + [header_text]

    return get_section_headers(parent)


soup = BeautifulSoup(page_source, "html.parser")

links = soup.find_all("a")

for link in links:
    url = link.get("href")
    text = link.get_text(strip=True)
    full_url = urljoin(base_url, url)

    # parent_text_tree = get_section_headers(link)
    # print(f"URL: {url}, Text: {text}, Parent Text Tree: {parent_text_tree}")
    print(f"URL: {full_url}, Text: {text}")

In [12]:
base_url = "https://www.gov.uk/"

driver = get_driver()

found_urls = set()
urls_currently_in_queue_or_already_visited = set()


def write_to_csv(url_text_pairs):
    with open("links.csv", "w", newline="", encoding="utf-8") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Text", "URL"])

        for url, text in url_text_pairs:
            csvwriter.writerow([text, url])


url_text_pairs = [(base_url, "Home")]
counter = 0
urls_queue = [base_url]
while urls_queue:
    counter += 1
    print(f"Scraping {counter} urls")
    # if counter > 10:
    #     break

    url = urls_queue.pop(0)

    if not url.startswith(base_url):
        continue

    print(f"Scraping {url}")
    try:
        driver.get(url)
        page_source = driver.page_source
    except Exception as e:
        print(f"    Error scraping {url}: {e}")
        continue

    soup = BeautifulSoup(page_source, "html.parser")

    links = soup.find_all("a")

    for link in links:
        url = link.get("href")
        text = " ".join(link.stripped_strings)
        full_url = urljoin(base_url, url)

        if full_url not in urls_currently_in_queue_or_already_visited:
            urls_queue.append(full_url)
            urls_currently_in_queue_or_already_visited.add(full_url)

            url_text_pairs.append((full_url, text))
            if len(url_text_pairs) % 1000 == 0:
                write_to_csv(url_text_pairs)

    found_urls.add(full_url)

print(found_urls)

driver.quit()

Scraping https://www.gov.uk/
Scraping https://www.gov.uk/help/cookies
Scraping https://www.gov.uk/#content
Scraping https://www.gov.uk/browse
Scraping https://www.gov.uk/search
Scraping https://www.gov.uk/browse/benefits
Scraping https://www.gov.uk/browse/births-deaths-marriages
Scraping https://www.gov.uk/browse/business
Scraping https://www.gov.uk/browse/childcare-parenting
Scraping https://www.gov.uk/browse/citizenship
Scraping https://www.gov.uk/browse/justice
Scraping https://www.gov.uk/browse/disabilities
Scraping https://www.gov.uk/browse/driving
Scraping https://www.gov.uk/browse/education
Scraping https://www.gov.uk/browse/employing-people
Scraping https://www.gov.uk/browse/environment-countryside
Scraping https://www.gov.uk/browse/housing-local-services
Scraping https://www.gov.uk/browse/tax
Scraping https://www.gov.uk/browse/abroad
Scraping https://www.gov.uk/browse/visas-immigration
Scraping https://www.gov.uk/browse/working
Scraping https://www.gov.uk/government/organisati