# Jobs and Skills Atlas Scraper

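A simple Selenium scraper that opens the Jobs and Skills Atlas occupations page, clicks the first few occupations in the table, and prints each occupation's time-series data table: https://www.jobsandskills.gov.au/jobs-and-skills-atlas/region/occupations?regionType=state&regionValue=aus&regionMetric=pop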

In [106]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Scrape the first few occupations from the Jobs and Skills Atlas table.
#
# For each of the first `num_rows_to_process` rows of the occupations table,
# the script clicks the row, opens the Highcharts context menu, chooses
# "View data table", and prints every (header, value) pair of the resulting
# data table. The browser is always closed at the end, even on failure.

# Imported here so this cell stays self-contained; it lets the cleanup step
# ignore only the timeout raised by WebDriverWait.until.
from selenium.common.exceptions import TimeoutException

url = "https://www.jobsandskills.gov.au/jobs-and-skills-atlas/region/occupations?regionType=state&regionValue=aus&regionMetric=pop"

options = Options()
# options.add_argument("--headless")
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Short wait for elements expected almost immediately; longer wait for the
# chart menu item and data table. Both are loop-invariant, so build them once.
wait = WebDriverWait(driver, 2)
longer_wait = WebDriverWait(driver, 10)

# Process occupations by index to avoid stale element issues
num_rows_to_process = 3
processed_occupations = set()  # Track processed occupations to avoid duplicates

try:
    driver.get(url)

    # Wait for the main table to be present
    table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.occupation.dataTable')))

    for i in range(num_rows_to_process):
        try:
            # Re-find the table and rows each iteration to avoid stale references
            current_table = driver.find_element(By.CSS_SELECTOR, 'table.occupation.dataTable')
            current_tbody = current_table.find_element(By.CSS_SELECTOR, 'tbody')
            current_rows = current_tbody.find_elements(By.CSS_SELECTOR, 'tr')

            if i >= len(current_rows):
                print(f"No more rows available (requested {i}, only {len(current_rows)} rows)")
                break

            row = current_rows[i]
            # The occupation name is read from the <p> inside the row's first cell.
            cols = row.find_element(By.CSS_SELECTOR, 'td')
            p = cols.find_element(By.TAG_NAME, 'p')
            occupation_name = p.text.strip()

            # Skip if we've already processed this occupation
            if occupation_name in processed_occupations:
                print(f"Skipping duplicate: {occupation_name}")
                continue

            processed_occupations.add(occupation_name)
            print(f"\n--- Processing Occupation {i+1}: {occupation_name} ---")

            # Scroll to and click the row using ActionChains
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", row)
            ActionChains(driver).move_to_element(row).click().perform()

            # Wait for the chart/popup to appear and then find the dropdown button
            data_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(@class, 'highcharts-button')] | //*[contains(@class, 'highcharts-contextbutton')]")))
            ActionChains(driver).move_to_element(data_btn).click().perform()

            try:
                view_data_table_item = longer_wait.until(
                    EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'highcharts-menu-item') and contains(text(), 'View data table')]"))
                )
                ActionChains(driver).move_to_element(view_data_table_item).click().perform()

                # Wait for the specific occupation's data table to appear
                data_table = longer_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table[id*='highcharts-data-table']")))

                # Parse a snapshot of the table HTML with BeautifulSoup so the
                # row walk below does not touch live (possibly stale) elements.
                table_html = data_table.get_attribute('outerHTML')
                soup = BeautifulSoup(table_html, 'html.parser')
                tbody = soup.find('tbody')

                if tbody:
                    print("Time Series Data:")
                    for data_row in tbody.find_all('tr'):
                        th = data_row.find('th')
                        td = data_row.find('td')
                        # Only rows with both a header cell and a value cell are printed.
                        if th and td:
                            th_text = th.get_text(strip=True)
                            td_text = td.get_text(strip=True)
                            print(f"  {th_text}: {td_text}")
                else:
                    print("No data table found")

            except Exception as e:
                # Best effort: report and move on to cleanup for this occupation.
                print(f"Could not find or click 'View data table' option: {e}")
                print("Skipping data extraction for this occupation")

            # More robust cleanup - close all popups and dialogs
            try:
                # Close any modal/popup by pressing Escape
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.ESCAPE)

                # Click outside to close any remaining elements
                driver.execute_script("document.body.click();")

                # Wait for any menus to disappear; a menu that never hides is
                # tolerated, but only the timeout itself is ignored.
                try:
                    longer_wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.highcharts-menu')))
                except TimeoutException:
                    pass

                # Additional wait to ensure DOM is stable
                WebDriverWait(driver, 2).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.occupation.dataTable'))
                )
            except Exception as cleanup_error:
                print(f"Cleanup warning: {cleanup_error}")

        except Exception as e:
            # A failure on one occupation must not stop the remaining ones.
            print(f"Error processing occupation {i+1}: {e}")
finally:
    # Always release the browser, even if the initial page load or table wait
    # fails or the run is interrupted.
    driver.quit()


--- Processing Occupation 1: General Clerks ---
Time Series Data:
  Apr 2020: 3,064
  May 2020: 3,421
  Jun 2020: 3,868
  Jul 2020: 4,512
  Aug 2020: 4,787
  Sep 2020: 4,942
  Oct 2020: 5,199
  Nov 2020: 5,538
  Dec 2020: 5,735
  Jan 2021: 5,599
  Feb 2021: 6,232
  Mar 2021: 7,042
  Apr 2021: 7,479
  May 2021: 7,331
  Jun 2021: 7,486
  Jul 2021: 7,263
  Aug 2021: 7,205
  Sep 2021: 7,520
  Oct 2021: 7,739
  Nov 2021: 7,885
  Dec 2021: 7,829
  Jan 2022: 7,506
  Feb 2022: 7,767
  Mar 2022: 7,949
  Apr 2022: 8,208
  May 2022: 8,678
  Jun 2022: 8,979
  Jul 2022: 8,859
  Aug 2022: 8,868
  Sep 2022: 7,676
  Oct 2022: 8,769
  Nov 2022: 8,650
  Dec 2022: 9,114
  Jan 2023: 9,185
  Feb 2023: 9,404
  Mar 2023: 9,443
  Apr 2023: 9,693
  May 2023: 9,798
  Jun 2023: 9,920
  Jul 2023: 9,916
  Aug 2023: 10,342
  Sep 2023: 9,944
  Oct 2023: 10,039
  Nov 2023: 9,990
  Dec 2023: 9,748
  Jan 2024: 9,703
  Feb 2024: 9,400
  Mar 2024: 9,176
  Apr 2024: 9,139
  May 2024: 8,233
  Jun 2024: 7,149
  Jul 2024: 6