In [None]:
# This web-scraping script extracts ordinance records (number, description, category, and date enacted) from the Pasay City ordinance website. The code is written specifically for this site.

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import os

# Chrome preferences controlling download behaviour (target folder, no prompt,
# safe browsing, and the PDF-plugin setting), applied to a fresh options object.
chrome_prefs = {
    "download.default_directory": "C:/Users/denis/Downloads/Thesis/Pasay City Data_090224",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
    "plugins.always_open_pdf_externally": True,
}
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', chrome_prefs)

# Launch Chrome using the driver binary that webdriver_manager installs
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Timing begins once the browser is up, so it covers page load and scraping
start_time = time.time()

# Open the ordinance list page and maximize the browser window
URL = 'https://www.pasaycitysecretariat.com/_Public/ordinancelist.aspx'
driver.get(URL)
driver.maximize_window()

# Parallel accumulators: index i across the four lists describes one table row
numbers, descriptions, categories, dates_enacted = [], [], [], []

# Scrape every page of the ordinance grid, then (always) close the browser and
# write whatever was collected to an Excel workbook.
try:
    # Generalized XPath to match rows with IDs like "gvwList_DXDataRowX"
    rows_xpath = '//*[starts-with(@id, "gvwList_DXDataRow")]'
    # Pager "Next" image; loop-invariant, so it is defined once up front
    next_button_xpath = '//img[@class="dxWeb_pNext_Office2010Black" and @alt="Next"]'

    # Loop through pages to extract data
    while True:
        try:
            # Find all rows matching the generalized XPath
            rows = driver.find_elements(By.XPATH, rows_xpath)
            if not rows:
                print("No rows found.")
                break

            # Extract data from each row
            for row in rows:
                try:
                    # Read all four cells first so that a missing cell skips the
                    # whole row and the four lists stay the same length.
                    number = row.find_element(By.XPATH, './td[1]').text.strip()
                    description = row.find_element(By.XPATH, './td[2]').text.strip()
                    category = row.find_element(By.XPATH, './td[3]').text.strip()
                    date_enacted = row.find_element(By.XPATH, './td[4]').text.strip()
                except NoSuchElementException:
                    print("Error finding data in one of the cells.")
                    continue

                # Append extracted data to lists
                numbers.append(number)
                descriptions.append(description)
                categories.append(category)
                dates_enacted.append(date_enacted)

            print(f"Extracted {len(numbers)} entries so far...")

            # Handle pagination: Click the "Next" button if available
            try:
                # Locate the image element. A timeout here is also how the loop
                # ends when no matching "Next" image exists (presumably on the
                # last page -- confirm against the site's pager markup).
                next_button_img = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, next_button_xpath))
                )

                # Find the enclosing <a> element and click it
                parent_button = next_button_img.find_element(By.XPATH, './ancestor::a[1]')
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(parent_button)
                ).click()

                print("Navigating to the next page...")
                # Wait until the old pager image is detached from the DOM before
                # reading rows again, so the previous page is not re-read.
                WebDriverWait(driver, 10).until(
                    EC.staleness_of(next_button_img)
                )

            except TimeoutException:
                print("Timed out waiting for the 'Next' button.")
                break
            except StaleElementReferenceException:
                print("Stale element reference. The button may have changed.")
                break
            except Exception as e:
                print(f"Error clicking the next button: {e}")
                break

        except TimeoutException:
            print("Timed out waiting for elements.")
            break
        except Exception as e:
            print(f"An error occurred while extracting data: {e}")
            break

finally:
    # Closing the browser is best-effort: a failure here (e.g. the window was
    # already closed) must not prevent the scraped data from being saved.
    try:
        driver.quit()
    except Exception as quit_error:
        print(f"Warning: could not close the browser cleanly: {quit_error}")

    # Stop the timer and calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total time elapsed: {elapsed_time:.2f} seconds")

    # Create a DataFrame from the parallel lists collected above
    df_main = pd.DataFrame({
        'Number': numbers,
        'Ordinance Description': descriptions,
        'Category': categories,
        'Date Enacted': dates_enacted
    })

    # Save the DataFrame to an Excel file, creating the target folder if it is
    # missing so a completed scrape is not lost to a FileNotFoundError/OSError.
    output_directory = "C:/Users/denis/Downloads/Thesis/Pasay City Data_090224"
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, 'ordinances_1.xlsx')

    with pd.ExcelWriter(output_file) as writer:
        df_main.to_excel(writer, sheet_name='Ordinances', index=False)

    print(f"Data has been saved to {output_file}")


Extracted 25 entries so far...
Navigating to the next page...
Extracted 50 entries so far...
Navigating to the next page...
Extracted 75 entries so far...
Navigating to the next page...
Extracted 100 entries so far...
Navigating to the next page...
Extracted 125 entries so far...
Navigating to the next page...
Extracted 150 entries so far...
Navigating to the next page...
Extracted 175 entries so far...
Navigating to the next page...
Extracted 200 entries so far...
Navigating to the next page...
Extracted 225 entries so far...
Navigating to the next page...
Extracted 250 entries so far...
Navigating to the next page...
Extracted 275 entries so far...
Navigating to the next page...
Extracted 300 entries so far...
Navigating to the next page...
Extracted 325 entries so far...
Navigating to the next page...
Extracted 350 entries so far...
Navigating to the next page...
Extracted 375 entries so far...
Navigating to the next page...
Extracted 400 entries so far...
Navigating to the next pag