# Scrape oepul regulatory text from the web

The AMA website (https://www.ama.at/formulare-merkblaetter) publishes the ÖPUL regulation documents as PDF files. This notebook downloads all of these PDFs into the `data/pdfs` folder and then extracts their text into `data/text`.

In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests


def get_all_oepul_pdf_anchor_tags(timeout=60):
    """Collect every anchor tag from the OEPUL section of the AMA forms page.

    The page fills the target container via JavaScript, so a real browser
    (Firefox through Selenium) is used and the function waits until the
    container is present in the DOM.

    Args:
        timeout: Maximum number of seconds to wait for the container with
            id "ui-id-21" to appear.

    Returns:
        The BeautifulSoup result set of all ``<a>`` tags found inside the
        container.

    Raises:
        selenium.common.exceptions.TimeoutException: If the container does
            not appear within ``timeout`` seconds.
    """
    # Local imports: these Selenium helpers are only needed by this function.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    url = "https://www.ama.at/formulare-merkblaetter#18053"

    # Requires a Firefox installation and geckodriver to be available.
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Bounded explicit wait instead of an endless polling loop with a
        # bare ``except`` that would also hide unrelated failures.
        link_div = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "ui-id-21"))
        )

        # Grab the markup while the browser session is still alive.
        html = link_div.get_attribute("innerHTML")
    finally:
        # Always close the browser, even if loading or waiting failed.
        driver.quit()

    # Parse the container's HTML and return all links inside it.
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('a')


def download_pdfs(links):
    """Download every PDF referenced by ``links`` into ``data/pdfs``.

    Args:
        links: Iterable of anchor-like objects exposing ``.get('href')``
            (e.g. BeautifulSoup tags).

    Anchors without an ``href``, links that do not end in ".pdf" and
    responses with a status code other than 200 are skipped. Each file is
    stored under the last path segment of its URL.
    """
    # Local imports keep this cell self-contained.
    import os
    from urllib.parse import urljoin

    url_base = "https://www.ama.at/"
    output_dir = "data/pdfs"

    for link in links:
        href = link.get('href')
        # Anchors without an href attribute cannot be downloaded.
        if not href:
            continue

        # Resolve relative links against the site root; absolute URLs are
        # returned unchanged and root-relative ones do not get a double slash.
        href = urljoin(url_base, href)

        # Only PDF documents are of interest.
        if not href.endswith(".pdf"):
            continue

        # A timeout prevents one stalled connection from hanging the run.
        response = requests.get(href, timeout=30)
        if response.status_code != 200:
            continue

        # Create the target folder on demand so a fresh checkout works.
        os.makedirs(output_dir, exist_ok=True)
        filename = href.split("/")[-1]
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)


# Scrape the anchor tags first, then fetch every PDF they point to.
# `links` stays a notebook-level variable so it can be inspected afterwards.
links = get_all_oepul_pdf_anchor_tags()
download_pdfs(links=links)



In [43]:
import fitz  # PyMuPDF
import os
import pypandoc
import tempfile

def get_pdf_files(folder):
    """Return the names of all entries in ``folder`` that end with ".pdf".

    Only bare file names are returned (no directory part), in the order
    reported by ``os.listdir``.
    """
    pdf_names = []
    for entry in os.listdir(folder):
        if entry.endswith(".pdf"):
            pdf_names.append(entry)
    return pdf_names

def extract_text_from_pdf(pdf_file, folder=None):
    """Open a PDF file and return the plain text of all its pages.

    Args:
        pdf_file: File name of the PDF, relative to ``folder``.
        folder: Directory containing ``pdf_file``. When omitted, the
            module-level ``pdf_folder`` variable is used, which keeps
            existing single-argument calls working.

    Returns:
        The concatenated text of every page, in page order.
    """
    if folder is None:
        # Backward compatibility: older callers relied on the global.
        folder = pdf_folder
    pdf_path = os.path.join(folder, pdf_file)

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string page by page.
        return "".join(
            pdf_document.load_page(page_num).get_text("text")
            for page_num in range(pdf_document.page_count)
        )

def save_text(text_content, text_path):
    """Write ``text_content`` to ``text_path`` as UTF-8, replacing any existing file."""
    with open(text_path, "w", encoding="utf-8") as text_file:
        text_file.write(text_content)

def main(pdf_folder, text_folder):
    """Convert every PDF in ``pdf_folder`` to a ".txt" file in ``text_folder``.

    Args:
        pdf_folder: Directory that is scanned for ".pdf" files.
        text_folder: Output directory; created if it does not exist.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(text_folder, exist_ok=True)

    for pdf_file in get_pdf_files(pdf_folder):
        text_file = os.path.splitext(pdf_file)[0] + ".txt"
        text_path = os.path.join(text_folder, text_file)

        # Pass the folder explicitly so the argument given to main() is the
        # one actually read from, not whatever the global happens to be.
        text_content = extract_text_from_pdf(pdf_file, pdf_folder)

        save_text(text_content, text_path)

        print(f"Converted {pdf_file} to {text_file}")

if __name__ == "__main__":
    # Input and output locations for the conversion run. Both names are
    # assigned at module level, so they remain visible as globals.
    pdf_folder, text_folder = "data/pdfs", "data/text"
    main(pdf_folder, text_folder)

# Printed unconditionally, i.e. also when the guard above is skipped.
print("Conversion complete.")


Converted O6_17_Humuserhalt_und_Bodenschutz_auf_umbruchsfaehigem-Gruenland_2022_12.pdf to O6_17_Humuserhalt_und_Bodenschutz_auf_umbruchsfaehigem-Gruenland_2022_12.txt
Converted O6_23_Natura2000_und_andere_Schutzgebiete-Landwirtschaft_2022_12.pdf to O6_23_Natura2000_und_andere_Schutzgebiete-Landwirtschaft_2022_12.txt
Converted O6_15_Tierwohl-Behirtung_2023_04.pdf to O6_15_Tierwohl-Behirtung_2023_04.txt
Converted O6_3_Heuwirtschaft_2022_12.pdf to O6_3_Heuwirtschaft_2022_12.txt
Converted O6_5_Erhaltung_gefaehrdeter_Nutztierrassen-_2023_04.pdf to O6_5_Erhaltung_gefaehrdeter_Nutztierrassen-_2023_04.txt
Converted O6_10_Erosionsschutz_Wein_Obst_Hopfen_2022_12.pdf to O6_10_Erosionsschutz_Wein_Obst_Hopfen_2022_12.txt
Converted O6_4_Bewirtschaftung_von_Bergmaehdern_2022_12.pdf to O6_4_Bewirtschaftung_von_Bergmaehdern_2022_12.txt
Converted O6_18_Naturschutz_2022_12.pdf to O6_18_Naturschutz_2022_12.txt
Converted O6_21_Tierwohl-Stallhaltung_Rinder_2023_04.pdf to O6_21_Tierwohl-Stallhaltung_Rinder_2