In [7]:
import csv
from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

def get_driver(command_executor="http://localhost:4444/wd/hub"):
    """Open a remote Chrome WebDriver session.

    Args:
        command_executor: URL of the Selenium server / Grid hub to connect to.
            Defaults to the local hub address this notebook was written against.

    Returns:
        The ``webdriver.Remote`` instance. The caller owns the session and is
        responsible for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    # NOTE(review): the bare flag looks redundant next to the valued form
    # below; confirm before removing it.
    options.add_argument("--disable-blink-features")
    # presumably hides the browser's automation marker from visited pages; verify
    options.add_argument("--disable-blink-features=AutomationControlled")

    return webdriver.Remote(
        command_executor=command_executor,
        options=options,
    )

In [23]:
# Download the rendered homepage once and keep a copy on disk for inspection.
# base_url = "https://www.gov.uk/"
base_url = "https://vlada.gov.hr/"

driver = get_driver()
try:
    driver.get(base_url)
    page_source = driver.page_source
finally:
    # Always release the remote browser session, even when the page load
    # fails; otherwise the session stays open on the Selenium server.
    driver.quit()

with open("page_content.html", "w", encoding="utf-8") as file:
    file.write(page_source)

In [24]:
def get_section_headers(element):
    """Return the header texts found on the ancestors of ``element``.

    Every ancestor is inspected, starting with the direct parent:

    * an ancestor that is itself an ``h1``-``h6`` tag contributes its own text;
    * a ``section`` ancestor contributes the text of the first ``h1``-``h6``
      tag found inside it, if there is one.

    The returned list is ordered from the outermost ancestor to the innermost.
    """
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Collected innermost-first while climbing, flipped before returning.
    climbing_order = []

    ancestor = element.find_parent()
    while ancestor is not None:
        if ancestor.name in heading_tags:
            climbing_order.append(ancestor.get_text(strip=True))
        elif ancestor.name == "section":
            first_heading = ancestor.find(heading_tags)
            if first_heading:
                climbing_order.append(first_heading.get_text(strip=True))
        ancestor = ancestor.find_parent()

    climbing_order.reverse()
    return climbing_order


# Parse the downloaded page and list every anchor as an absolute URL plus its
# visible text.
soup = BeautifulSoup(page_source, "html.parser")
links = soup.find_all("a")

for anchor in links:
    href = anchor.get("href")
    label = anchor.get_text(strip=True)
    absolute_url = urljoin(base_url, href)

    # Optional: also show the enclosing section headers.
    # parent_text_tree = get_section_headers(anchor)
    # print(f"URL: {href}, Text: {label}, Parent Text Tree: {parent_text_tree}")
    print(f"URL: {absolute_url}, Text: {label}")

URL: https://plus.google.com/u/0/103623098085519322705, Text: 
URL: https://vlada.gov.hr/#content, Text: Preskoči na glavni sadržaj
URL: https://gov.hr/, Text: 
URL: https://www.facebook.com/wwwvladahr, Text: 
URL: https://twitter.com/VladaRH, Text: 
URL: https://www.youtube.com/user/wwwvladahr, Text: 
URL: https://vlada.gov.hr/?big=0, Text: A
URL: https://vlada.gov.hr/?big=1, Text: A
URL: https://vlada.gov.hr/rss/18210, Text: RSS
URL: https://vlada.gov.hr/?impaired=1, Text: Pristupačnost
URL: https://vlada.gov.hr/en, Text: English
URL: https://vlada.gov.hr/, Text: 
URL: javascript:;, Text: 
URL: https://vlada.gov.hr/vijesti/8, Text: Vijesti
URL: https://vlada.gov.hr/sjednice/9, Text: Sjednice
URL: https://vlada.gov.hr/dokumenti/10, Text: Dokumenti
URL: https://vlada.gov.hr/pristup-informacijama/11859, Text: Pristup informacijama
URL: https://vlada.gov.hr/europski-semestar/19453, Text: Europski semestar
URL: https://vlada.gov.hr/istaknute-teme/11, Text: Istaknute teme
URL: https://vlad

In [None]:
# Root of the site to crawl; the crawl loop only fetches URLs that start with it.
base_url = "https://www.gov.uk/"

# Bookkeeping sets filled in by the crawl loop.
found_urls = set()
urls_currently_in_queue_or_already_visited = set()

# Remote browser session used to fetch the pages.
driver = get_driver()


def write_to_csv(url_text_pairs, path="links.csv"):
    """Write the collected links to a CSV file, replacing any previous contents.

    Args:
        url_text_pairs: Iterable of ``(url, text)`` tuples.
        path: Destination file. Defaults to ``links.csv`` in the current
            working directory.

    The file starts with a ``Text,URL`` header row; each pair is then written
    with the text in the first column and the URL in the second.
    """
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Text", "URL"])
        csvwriter.writerows((text, url) for url, text in url_text_pairs)


# Breadth-first crawl of the pages under base_url. Every newly discovered link
# is recorded together with its anchor text; only URLs that start with
# base_url are actually fetched.
url_text_pairs = [(base_url, "Home")]
counter = 0
urls_queue = [base_url]
# Mark the start page as seen so a link back to it is not queued a second time.
urls_currently_in_queue_or_already_visited.add(base_url)

try:
    while urls_queue:
        page_url = urls_queue.pop(0)

        # Off-site links are recorded when discovered but never fetched.
        if not page_url.startswith(base_url):
            continue

        # Count only the pages that are actually fetched.
        counter += 1
        print(f"Scraping {counter} urls")
        # if counter > 10:
        #     break

        print(f"Scraping {page_url}")
        try:
            driver.get(page_url)
            page_source = driver.page_source
        except Exception as e:
            # Best effort: a page that fails to load is skipped, not fatal.
            print(f"    Error scraping {page_url}: {e}")
            continue

        soup = BeautifulSoup(page_source, "html.parser")

        for link in soup.find_all("a"):
            href = link.get("href")
            if not href:
                # An anchor without a target would only resolve back to this page.
                continue

            text = " ".join(link.stripped_strings)
            # Resolve relative links against the page they appear on, and drop
            # the #fragment so the same document is not fetched twice.
            full_url = urldefrag(urljoin(page_url, href)).url
            found_urls.add(full_url)

            if full_url not in urls_currently_in_queue_or_already_visited:
                urls_queue.append(full_url)
                urls_currently_in_queue_or_already_visited.add(full_url)

                url_text_pairs.append((full_url, text))
                # Periodic checkpoint so a long crawl leaves partial results on disk.
                if len(url_text_pairs) % 1000 == 0:
                    write_to_csv(url_text_pairs)
finally:
    try:
        # Flush whatever was collected, including the tail after the last
        # checkpoint and the results of an interrupted crawl.
        write_to_csv(url_text_pairs)
    finally:
        # Release the remote browser session no matter how the crawl ended.
        driver.quit()

print(found_urls)

# Single-page scraping

In [25]:
# Download one rendered article page and keep a copy on disk for inspection.
# base_url = "https://www.gov.uk/guidance/register-an-unincorporated-association-for-corporation-tax"
base_url = "https://vlada.gov.hr/vijesti/hrvatskom-saboru-upucen-konacni-prijedlog-izmjena-zakona-o-sluzbenicima-i-namjestenicima/43519"

driver = get_driver()
try:
    driver.get(base_url)
    page_source = driver.page_source
finally:
    # Always release the remote browser session, even when the page load
    # fails; otherwise the session stays open on the Selenium server.
    driver.quit()

with open("page_content.html", "w", encoding="utf-8") as file:
    file.write(page_source)

In [29]:
# Re-parse the most recently downloaded page so this cell does not depend on
# whatever `soup` an earlier cell happened to leave in the kernel.
soup = BeautifulSoup(page_source, "html.parser")

for element in soup.find_all(True):  # True matches every tag
    # Only the text nodes that are direct children of this tag; nested tags
    # are visited by the loop on their own. `string=` is the current name of
    # the deprecated `text=` keyword.
    text = ''.join(element.find_all(string=True, recursive=False)).strip()
    if text:
        print(f"{element.name.upper()}: {text}")

HTML: <![endif]
TITLE: Vlada Republike Hrvatske - Hrvatskom saboru upućen Konačni prijedlog izmjena Zakona o službenicima i namještenicima
STYLE: .website-translator.left-aligned .language-menu>.website-translator-select>.selected-item{padding:0 10px}.website-translator.left-aligned>.language-menu>.website-translator-select>.selected-item>.menu-text{text-align:left}.website-translator.inverted>.language-menu>.website-translator-select>.options{transform:translateY(calc(-100% - 45px))}.website-translator>.language-menu .icon *,.website-translator>.language-list .icon *{height:1em}.website-translator>.language-list .icon{margin-left:5px}.website-translator>.language-menu{position:relative;user-select:none;width:200px}.website-translator>.language-menu :focus{outline:none}.website-translator>.language-menu>.website-translator-select{position:relative;display:flex;flex-direction:column;font-weight:bold;font-family:Roboto}.website-translator>.language-menu>.website-translator-select.open>.s

  text = ''.join(element.find_all(text=True, recursive=False)).strip()


# Using the packaged code (`src.main`)

In [22]:
# Call the project's `main` entry point from the `src.main` module.
from src.main import main as run_main

run_main()


Scraping 1 urls
Scraping https://www.gov.uk/
Scraping 2 urls
Scraping https://www.gov.uk/help/cookies
Scraping 3 urls
Scraping https://www.gov.uk/#content
Scraping 4 urls
Scraping 5 urls
Scraping https://www.gov.uk/browse
Scraping 6 urls
Scraping https://www.gov.uk/search
Scraping 7 urls
Scraping https://www.gov.uk/browse/benefits
Scraping 8 urls
Scraping https://www.gov.uk/browse/births-deaths-marriages
Scraping 9 urls
Scraping https://www.gov.uk/browse/business
Scraping 10 urls


FileNotFoundError: [Errno 2] No such file or directory: 'data/browse/tax.txt'