# Simple scraper

The following cell includes code to scrape the first page of the Yarra City Council planning applications list. It provides a skeleton of the data processing required to scrape results. It does not provide support for navigating results or fetching links for the info page for each application.

In [35]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import datetime
from bs4 import BeautifulSoup


# Explicit-wait helpers used only by this cell; imported here so the cell
# stays self-contained.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

start_url = "https://eservices.yarracity.vic.gov.au/WebApps/eProperty/P1/eTrack/eTrackApplicationSearch.aspx?Custom=Yes&ApplicationID=P&r=P1.WEBGUEST&f=P1.ETR.SEARCH.ALL"

# DOM id of the results grid on the search page.
results_table_id = 'ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView'
# Maximum time (seconds) to wait for the results grid to appear.
page_load_timeout = 30

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

# Defined before the browser session so the print loop below always has a
# list to iterate.
applications = []

with webdriver.Chrome(options=chrome_options) as driver:
    driver.get(start_url)

    # An implicit wait only affects find_element* calls, so it would not delay
    # reading page_source. Wait explicitly for the results grid instead.
    try:
        WebDriverWait(driver, page_load_timeout).until(
            EC.presence_of_element_located((By.ID, results_table_id))
        )
    except TimeoutException:
        print("Results table not found within %d seconds" % page_load_timeout)

    page_html = driver.page_source
    if "access denied" in page_html.lower():
        print("403 Forbidden: Access Denied")

    soup = BeautifulSoup(page_html, 'html.parser')
    table = soup.find('table', {'id': results_table_id})

    if table:
        # One scrape date for the whole run, so rows agree even across midnight.
        date_scraped = datetime.now().strftime('%Y-%m-%d')
        rows = table.find_all('tr', class_=['normalRow', 'alternateRow'])
        for row in rows:
            columns = row.find_all('td')
            if len(columns) < 4:
                # Not a full data row; skip rather than crash with IndexError.
                continue
            application = {
                'council_reference': columns[0].text.strip(),
                # The site shows dates as dd/mm/yyyy; store them as ISO 8601.
                'date_received': datetime.strptime(columns[1].text.strip(), '%d/%m/%Y').strftime('%Y-%m-%d'),
                'description': columns[2].text.strip(),
                'address': columns[3].text.strip(),
                'info_url': 'URL Construction Logic Here',  # Placeholder for info URL logic
                'date_scraped': date_scraped
            }
            applications.append(application)

for app in applications:
    print(app)

{'council_reference': 'PLN23/0800', 'date_received': '2023-11-17', 'description': 'Partial demolition, ground floor extension to the existing dwelling and construction of a double-storey dwelling to the rear, including a reduction in the car parking requirements and front fence', 'address': '34 Shelley St Richmond VIC 3121', 'info_url': 'URL Construction Logic Here', 'date_scraped': '2023-11-21'}
{'council_reference': 'PLN23/0799', 'date_received': '2023-11-16', 'description': 'PROPOSED OUTDOOR OVERHEAD LIGHTBOX SIGN - 1M(W) X 1M(H) X 0.15M(THK), NEW COLOUR TO GROUND FLOOR SHOPFRONT & NEW RETRACTABLE AWNING.', 'address': '191 Brunswick St Fitzroy VIC 3065', 'info_url': 'URL Construction Logic Here', 'date_scraped': '2023-11-21'}
{'council_reference': 'PLN22/0545.01', 'date_received': '2023-11-16', 'description': 'Installing a Round window', 'address': '6 Bliss St Richmond VIC 3121', 'info_url': 'URL Construction Logic Here', 'date_scraped': '2023-11-21'}
{'council_reference': 'PLN21/03

# Navigating the site

This script uses Selenium to navigate between result pages (BeautifulSoup is still used to parse each page's HTML), so we can grab multiple pages of results. We also build the `info_url` by concatenating the prefix URL and the council reference, with `/` percent-encoded as `%2f`.

In [76]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime
import time

# Explicit-wait helpers used only by this cell; imported here so the cell
# stays self-contained.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = "https://eservices.yarracity.vic.gov.au/WebApps/eProperty/P1/eTrack/eTrackApplicationSearch.aspx?Custom=Yes&ApplicationID=P&r=P1.WEBGUEST&f=P1.ETR.SEARCH.ALL"
info_url_prefix = 'https://eservices.yarracity.vic.gov.au/WebApps/eProperty/P1/eTrack/eTrackApplicationDetails.aspx?r=P1.WEBGUEST&f=%24P1.ETR.APPDET.VIW&ApplicationId='

# DOM id of the results grid on the search page.
results_table_id = 'ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView'

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

applications = []
n_pages = 5  # Number of result pages to scrape

# The context manager quits the browser even if scraping raises part-way.
with webdriver.Chrome(options=chrome_options) as driver:
    driver.get(url)

    # An implicit wait only affects find_element* calls, so it would not delay
    # reading page_source. Wait explicitly for the results grid instead.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, results_table_id))
        )
    except TimeoutException:
        print("Results table not found within 10 seconds")

    # One scrape date for the whole run, so rows agree even across midnight.
    date_scraped = datetime.now().strftime('%Y-%m-%d')

    for page in range(1, n_pages + 1):  # Iterate over the first n_pages pages
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', {'id': results_table_id})
        if table:
            rows = table.find_all('tr', class_=['normalRow', 'alternateRow'])
            for row in rows:
                columns = row.find_all('td')
                if len(columns) < 5:
                    # Not a full data row; skip rather than crash with IndexError.
                    continue

                council_reference = columns[0].text.strip()
                applications.append({
                    'council_reference': council_reference,
                    # The site shows dates as dd/mm/yyyy; store them as ISO 8601.
                    'date_received': datetime.strptime(columns[1].text.strip(), '%d/%m/%Y').strftime('%Y-%m-%d'),
                    'description': columns[2].text.strip(),
                    'address': columns[3].text.strip(),
                    'stage': columns[4].text.strip(),
                    # prefix + council_reference, with '/' percent-encoded
                    'info_url': info_url_prefix + council_reference.replace('/', '%2f'),
                    'date_scraped': date_scraped
                })

        # Navigate to next page if not on the last page
        if page < n_pages:
            links = driver.find_elements(By.CSS_SELECTOR, "tr.pagerRow > td td a")
            # NOTE(review): indexing by page - 1 assumes the current page is
            # not rendered as a link in the pager — confirm against the site.
            if len(links) < page:
                # Fewer result pages than n_pages; stop instead of IndexError.
                break
            links[page - 1].click()
            time.sleep(5)  # Wait for the next page to load

for app in applications:
    print(app)


{'council_reference': 'PLN23/0800', 'date_received': '2023-11-17', 'description': 'Partial demolition, ground floor extension to the existing dwelling and construction of a double-storey dwelling to the rear, including a reduction in the car parking requirements and front fence', 'address': '34 Shelley St Richmond VIC 3121', 'stage': 'Preliminary Assessment', 'info_url': 'https://eservices.yarracity.vic.gov.au/WebApps/eProperty/P1/eTrack/eTrackApplicationDetails.aspx?r=P1.WEBGUEST&f=%24P1.ETR.APPDET.VIW&ApplicationId=PLN23%2f0800', 'date_scraped': '2023-11-21'}
{'council_reference': 'PLN23/0786', 'date_received': '2023-11-09', 'description': 'Partial demolition and construction of rear ground and first floor extension and roof terrace', 'address': '260 Amess St Carlton North VIC 3054', 'stage': 'Preliminary Assessment', 'info_url': 'https://eservices.yarracity.vic.gov.au/WebApps/eProperty/P1/eTrack/eTrackApplicationDetails.aspx?r=P1.WEBGUEST&f=%24P1.ETR.APPDET.VIW&ApplicationId=PLN23%2