In [1]:
# !pip install selenium==4.20.0 pandas==2.2.2 webdriver-manager==4.0.1
# !pip install selenium pandas webdriver-manager


In [2]:
# Australian Entry-Level IT Jobs Scraper from Jora.com

# Required Libraries
import re
import time
import traceback
from urllib.parse import urlparse, urlunparse

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Function to clean URLs by removing query params
def clean_url(url):
    """Return ``url`` reduced to its scheme, host and path.

    The params, query string and fragment are dropped so that the same
    page reached through different query parameters compares equal.

    Args:
        url: The URL to normalise. May be ``None`` or empty (for example
            when an anchor has no ``href``).

    Returns:
        The cleaned URL as a ``str``; ``''`` when ``url`` is ``None`` or empty.
    """
    # urlparse() treats a non-str argument as bytes, so passing None through
    # would hand the caller b'' instead of a text string.
    if not url:
        return ''
    parsed = urlparse(url)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))

# Setup Selenium WebDriver (using Chrome)
# `options` is created without any extra arguments, so Chrome starts with its
# default settings. `driver` is the module-level browser session that the
# following cells navigate with; the scraping cell calls driver.quit() on it
# when it finishes.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

In [3]:
# Open the Jora Australia home page and submit the job search.
JORA_HOME_URL = 'https://au.jora.com/'
SEARCH_BOX_LOCATOR = (By.XPATH, "//input[@placeholder='Job title, company, keyword']")

driver.get(JORA_HOME_URL)
# Shared explicit wait (20 second timeout), reused by the scraping cell.
wait = WebDriverWait(driver, 20)

# Type the query into the search box and press Enter to run the search.
search_job = wait.until(EC.presence_of_element_located(SEARCH_BOX_LOCATOR))
search_job.clear()
search_job.send_keys('Entry Level IT')
search_job.send_keys(Keys.RETURN)
# Fixed pause to let the results page load before scraping starts.
time.sleep(5)

# Accumulators filled by the scraping loop:
#   data                 - one dict per scraped job
#   visited_urls         - cleaned job URLs already handled
#   visited_combinations - (title, company) pairs already recorded
data, visited_urls, visited_combinations = [], set(), set()


In [4]:
# Tech skill keywords (roughly 130 entries) that the scraping loop looks for
# in each job description. Entries are plain display names; several overlap
# (e.g. "Java"/"JavaScript", "Git"/"GitHub", "Linux"/"Linux Shell").
keyword_list = [
    "Python", "SQL", "Java", "JavaScript", "C++", "C#", "Ruby", "PHP", "Swift", "Kotlin",
    "Go", "R", "MATLAB", "Perl", "Scala", "Shell", "Bash", "HTML", "CSS", "SASS",
    "Django", "Flask", "Spring", "Node.js", "React", "Angular", "Vue.js", "Express.js",
    "AWS", "Azure", "GCP", "Firebase", "Heroku", "Linux", "Windows Server", "Git", "GitHub", "Bitbucket",
    "Jira", "Confluence", "Docker", "Kubernetes", "Terraform", "Ansible", "CI/CD", "Jenkins",
    "Power BI", "Tableau", "Looker", "Excel", "Pandas", "NumPy", "TensorFlow", "PyTorch",
    "NLP", "Machine Learning", "Deep Learning", "ETL", "BigQuery", "Snowflake",
    "Splunk", "ServiceNow", "Selenium", "Appium", "QTP", "Postman", "Rest API", "SOAP",
    "Agile", "Scrum", "Kanban", "DevOps", "ITIL", "VMware", "Citrix", "Azure DevOps",
    "Salesforce", "Zoho", "HubSpot", "Dynamics 365", "Shopify", "WordPress", "WooCommerce",
    "Magento", "Bootstrap", "Tailwind", "Webpack", "LESS", "GraphQL", "Redux",
    "Linux Shell", "Zsh", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "DB2", "MariaDB",
    "Airflow", "Kafka", "Spark", "Hadoop", "Hive", "Cassandra", "Elasticsearch", "Redis",
    "RESTful Services", "Microservices", "OAuth", "SAML", "SSO", "LDAP", "Bamboo", "Nagios",
    "Prometheus", "Grafana", "New Relic", "Datadog", "Logstash", "Snort", "Wireshark",
    "IT Support", "Service Desk", "Help Desk", "Desktop Support", "Active Directory", "Group Policy",
    "Networking", "Switching", "Routing", "Firewalls", "VPN", "Remote Desktop"
]

# --- Scrape settings ---------------------------------------------------------
MAX_JOBS = 200             # stop once this many job records have been collected
PAGE_SETTLE_SECONDS = 3    # fixed pause after opening a detail tab or paging
JORA_HOST = "au.jora.com"  # detail tabs that end up on another host are skipped

# The first page line containing one of these marks the start of the description.
DESCRIPTION_MARKERS = ["Description", "Responsibilities", "What you’ll do", "What You'll Do", "Your Role"]
# Lines containing any of these are site navigation, not job content.
BOILERPLATE_SNIPPETS = ['Search jobs', 'Browse salaries', 'Find recruiters', 'Go to Employer site', 'Log in']

# Whole-word matchers: a plain substring test would find "SA" in "SALARY",
# "NT" in "ACCOUNTANT" and "day" in "Monday".
STATE_RE = re.compile(r"\b(?:NSW|VIC|QLD|SA|WA|TAS|ACT|NT)\b")
POSTED_RE = re.compile(r"\b(?:ago|today|yesterday|days?)\b", re.IGNORECASE)


def _compile_skill(skill):
    """Build a whole-token regex for one skill name."""
    # \b is unreliable around names such as "C++", "C#" or "Node.js", so the
    # token edges are expressed as "not adjacent to a letter or digit".
    # One- and two-character names ("R", "Go", "C#") are matched
    # case-sensitively; otherwise almost any text would contain them.
    flags = 0 if len(skill) <= 2 else re.IGNORECASE
    return re.compile(r"(?<![A-Za-z0-9])" + re.escape(skill) + r"(?![A-Za-z0-9])", flags)


# Compiled once; reused for every job description.
SKILL_PATTERNS = [(skill, _compile_skill(skill)) for skill in keyword_list]


def _find_skills(text):
    """Return the sorted, comma-separated skills mentioned in ``text`` ("N/A" if none).

    NOTE: skills that are also ordinary words (e.g. "LESS", "Excel") can
    still match prose; only substring hits inside other words are removed.
    """
    found = sorted({skill for skill, pattern in SKILL_PATTERNS if pattern.search(text)})
    return ", ".join(found) if found else "N/A"


def _first_matching_line(lines, pattern):
    """Return the first line that ``pattern`` matches, or "N/A"."""
    return next((line for line in lines if pattern.search(line)), "N/A")


def _close_detail_tabs(results_handle):
    """Close every tab except the results tab and give focus back to it."""
    for handle in driver.window_handles:
        if handle != results_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(results_handle)


def _scrape_job(url, cleaned_url, title, results_handle):
    """Open one job in a new tab and return its record dict.

    Returns None when the job should be skipped: the tab did not open, it
    landed outside Jora, or the (title, company) pair was already recorded.
    The caller is responsible for closing the tab afterwards.
    """
    driver.execute_script("window.open(arguments[0]);", url)
    detail_tabs = [h for h in driver.window_handles if h != results_handle]
    if not detail_tabs:
        return None
    driver.switch_to.window(detail_tabs[-1])
    time.sleep(PAGE_SETTLE_SECONDS)

    # Compare the host itself; a substring test on the whole URL would also
    # accept an external site carrying "au.jora.com" in its query string.
    host = urlparse(driver.current_url).hostname or ""
    if host != JORA_HOST and not host.endswith("." + JORA_HOST):
        return None

    body_text = driver.find_element(By.TAG_NAME, 'body').text
    page_lines = body_text.split('\n')

    # The company is taken as the first non-empty line in the <h1>'s parent
    # that is not the job title itself.
    try:
        heading = driver.find_element(By.TAG_NAME, 'h1')
        heading_text = heading.text.strip()
        header_lines = heading.find_element(By.XPATH, '..').text.split('\n')
    except WebDriverException:
        heading_text, header_lines = "", []
    if not title:
        # Some result links carry no text; fall back to the page heading so
        # the dedupe key and the company lookup are not keyed on "".
        title = heading_text
    not_company = {title, heading_text}
    company = next(
        (text for text in (line.strip() for line in header_lines) if text and text not in not_company),
        "N/A",
    )

    if (title, company) in visited_combinations:
        return None
    visited_combinations.add((title, company))

    location = _first_matching_line(page_lines, STATE_RE)
    date = _first_matching_line(page_lines, POSTED_RE)

    # Description = everything from the first marker line (or the top of the
    # page when no marker is present), minus navigation boilerplate.
    desc_index = next(
        (i for i, line in enumerate(page_lines) if any(marker in line for marker in DESCRIPTION_MARKERS)),
        0,
    )
    description = " ".join(
        line for line in page_lines[desc_index:]
        if not any(snippet in line for snippet in BOILERPLATE_SNIPPETS)
    ).strip()

    return {
        'Title': title,
        'Company': company,
        'Location': location,
        'Date Posted': date,
        'Description': description,
        'Skills': _find_skills(description),
        'Job URL': cleaned_url
    }


# Handle of the tab showing the search results; it must never be closed
# while paging is still in progress.
results_window = driver.current_window_handle

try:
    while len(data) < MAX_JOBS:
        try:
            job_cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/job/']")))

            for job in job_cards:
                if len(data) >= MAX_JOBS:
                    break

                try:
                    url = job.get_attribute("href")
                    title = job.text.strip()
                    cleaned_url = clean_url(url)
                except Exception as e:
                    # No detail tab has been opened yet, so there is nothing to close.
                    print("Error scraping job details:", e)
                    continue

                if not cleaned_url or cleaned_url in visited_urls:
                    continue
                visited_urls.add(cleaned_url)

                try:
                    record = _scrape_job(url, cleaned_url, title, results_window)
                    if record is not None:
                        data.append(record)
                except Exception as e:
                    print("Error scraping job details:", e)
                finally:
                    # Runs on success, skip and error alike, and only ever
                    # closes tabs other than the results tab.
                    _close_detail_tabs(results_window)

            if len(data) >= MAX_JOBS:
                break  # target reached; no need to load another page

            try:
                next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="Next"]')))
                driver.execute_script("arguments[0].click();", next_button)
                time.sleep(PAGE_SETTLE_SECONDS)
            except WebDriverException:
                # Includes the wait timing out when there is no "Next" link.
                print("No more pages or navigation issue.")
                break

        except Exception as e:
            print("Could not load job cards.")
            traceback.print_exc()
            break
finally:
    # Closing the driver, even when the cell is interrupted
    driver.quit()

No more pages or navigation issue.


In [5]:
# Build the results table, keep one row per cleaned job URL, and save it as CSV.
JOB_COLUMNS = ['Title', 'Company', 'Location', 'Date Posted', 'Description', 'Skills', 'Job URL']
OUTPUT_CSV = 'australian_entry_level_it_jobs.csv'

# Passing `columns` keeps the schema (including the 'Job URL' key used for
# de-duplication) in place even when `data` is empty; without it an empty
# scrape raises KeyError in drop_duplicates.
# keep='first' retains one copy of each URL. keep=False would throw away
# every copy of a duplicated job, not just the extras.
df_jobs = (
    pd.DataFrame(data, columns=JOB_COLUMNS)
    .drop_duplicates(subset=['Job URL'], keep='first')
)
df_jobs.to_csv(OUTPUT_CSV, index=False)
print(f"Scraping complete. Found and saved {len(df_jobs)} unique jobs to CSV file: {OUTPUT_CSV}")

Scraping complete. Found and saved 13 unique jobs to CSV file: australian_entry_level_it_jobs.csv
