In [None]:
# Web scraping script that downloads PDFs from the Muntinlupa City ordinance website. The code is written specifically for this site.

import time
import requests
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Chrome preferences: download silently into the data folder and hand PDFs
# to the downloader rather than rendering them in the browser.
chrome_prefs = {
    "download.default_directory": "C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text",  # Change to desired save directory
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
    "plugins.always_open_pdf_externally": True,  # Open PDFs externally instead of in-browser
}
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', chrome_prefs)

# Launch Chrome using a driver binary resolved by webdriver_manager
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Reference point for the elapsed-time report printed at the end of the run
start_time = time.time()

# Entry page of the ordinance listing
URL = 'https://muntinlupacity.gov.ph/city-ordinances-03-078-to-06-091/'
driver.get(URL)
driver.maximize_window()

# Dismiss the cookie popup via its "Reject All" button. This is best-effort:
# any failure is reported and the script carries on.
cookie_reject_locator = (By.XPATH, '/html/body/div[1]/div[1]/div/div/div[2]/button[1]')
try:
    cookie_wait = WebDriverWait(driver, 10)
    reject_button = cookie_wait.until(EC.element_to_be_clickable(cookie_reject_locator))
    reject_button.click()
    print("Cookie popup handled by rejecting all cookies.")
except Exception as e:
    print(f"An error occurred while handling the cookie popup: {e}")

# Link texts of the ordinance pages to visit, in order
pages = [
    f'City Ordinances ({ordinance_span})'
    for ordinance_span in (
        '97-01 to 02-077',
        '03-078 to 06-091',
        '06-092 to 11-051',
        '12-052 to 2021-307',
        '2022-001 to 2023-144',
        '2024-145 to 2024-227',
    )
]

def download_file(url, save_dir, timeout=30):
    """Download ``url`` into ``save_dir`` and report whether it succeeded.

    The file is named after the last ``/``-separated segment of ``url``.
    Data is streamed to a temporary ``<name>.part`` file and only renamed to
    its final name once the whole body has been written, so an interrupted
    download never leaves a truncated PDF under the real file name.

    Args:
        url: Absolute URL of the file to fetch.
        save_dir: Existing directory in which to store the file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the whole run.

    Returns:
        True when the file was fully written, False on any failure (the
        error is printed, never raised).
    """
    import os  # function-scope import: os is not among the imports visible at the top of the file

    partial_path = None
    try:
        file_name = url.split("/")[-1]
        file_path = f"{save_dir}/{file_name}"
        partial_path = f"{file_path}.part"
        # The context manager releases the streamed connection even when
        # writing fails half-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the finished file under its real name in one step.
        os.replace(partial_path, file_path)
        print(f"PDF downloaded successfully: {file_path}")
        return True
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result, not exceptions.
        if partial_path is not None:
            try:
                os.remove(partial_path)
            except OSError:
                # The partial file may never have been created; nothing to clean up.
                pass
        print(f"Failed to download {url}. Error: {e}")
        return False

# Folder that receives the PDFs fetched by download_file (the same path the
# Chrome download preference points at).
SAVE_DIR = "C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text"
# Anchors inside the ordinance table; each is expected to link to one PDF.
PDF_LINK_XPATH = '//*[@id="myTable"]/tbody/tr/td/a'

total_downloads = 0  # Running count of PDFs saved across all pages

try:
    # Visit each ordinance page by clicking the element carrying its title
    for page in pages:
        try:
            tab = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, f"//strong[contains(text(), '{page}')]"))
            )
            # Click through JavaScript rather than a native click
            # (presumably to sidestep overlays intercepting the click; TODO confirm).
            driver.execute_script("arguments[0].click();", tab)
            print(f"Switched to page: {page}")
            time.sleep(2)  # Allow time for the content to load after clicking the tab
        except Exception as e:
            print(f"An error occurred while switching to the page {page}: {e}")
            continue

        # Each page is a single table with no pagination, so one pass over
        # its links is enough.
        download_count = 0
        try:
            pdf_links = driver.find_elements(By.XPATH, PDF_LINK_XPATH)
            if pdf_links:
                for link in pdf_links:
                    href = link.get_attribute('href')
                    if href and href.endswith('.pdf'):
                        pdf_href = urljoin(URL, href)
                        if download_file(pdf_href, SAVE_DIR):
                            download_count += 1

                if download_count > 0:
                    print(f"Downloaded {download_count} PDFs for {page}")

                print(f"No 'Next' button handling required for {page}.")
            else:
                print("No PDF links found on the page.")
        except TimeoutException:
            # Safety net only: nothing in this block waits explicitly, so this
            # fires only if Selenium itself reports a timeout.
            print("Timed out waiting for elements on the page.")
        except Exception as e:
            print(f"An error occurred while processing PDFs: {e}")
        finally:
            # Count files saved before any failure too, so the final total
            # matches what is actually on disk.
            total_downloads += download_count

finally:
    driver.quit()
    # Stop the timer and calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total time elapsed: {elapsed_time:.2f} seconds")
    print(f"Total number of PDFs downloaded: {total_downloads}")


Cookie popup handled by rejecting all cookies.
Switched to page: City Ordinances (97-01 to 02-077)
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-01-1.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-02-1.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-03.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-04.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-05.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-06.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/Full Text/blg-97-07.pdf
PDF downloaded successfully: C:/Users/denis/Downloads/Thesis/Muntinlupa City Data_09032024/F