# **Web Scraping with Selenium**
---


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

In [None]:
# Start a Chrome session. ChromeDriverManager().install() supplies the
# chromedriver location that the Service wrapper is built from.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

# Search phrases; scrape_job_links() runs one search per phrase.
keywords = [
    "data scientist",
    "data analyst",
    "machine learning",
    "data engineer",
    "data science",
]

# Function for scraping per country
def scrape_job_links(domain, country_label, search_terms=None, timeout=10):
    """Collect job-detail links from one site's search-result pages.

    For every search term, the shared module-level ``driver`` opens
    ``https://{domain}/en/job-search/{term-with-dashes}-jobs/`` and waits
    for at least one job-title anchor to be present before reading links.

    Args:
        domain: Host name of the job site, e.g. ``"jobstreet.com.my"``.
        country_label: Label stored next to every link that is found.
        search_terms: Iterable of search phrases. Defaults to the
            module-level ``keywords`` list.
        timeout: Seconds to wait for the first job-title anchor.

    Returns:
        A list of ``(href, country_label)`` tuples in page order.
        Duplicates across search terms are NOT removed here.
    """
    if search_terms is None:
        search_terms = keywords

    title_link_locator = (By.CSS_SELECTOR, "a[data-automation='jobTitle']")

    all_hrefs = []
    for keyword in search_terms:
        search_keyword = keyword.replace(" ", "-")
        search_url = f"https://{domain}/en/job-search/{search_keyword}-jobs/"
        print(f"\nSearching on {country_label.upper()} for: {keyword.upper()} jobs")
        try:
            # driver.get() can itself raise TimeoutException on a page-load
            # timeout, so it is guarded the same way as the explicit wait.
            driver.get(search_url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located(title_link_locator)
            )
            hrefs = []
            for link_element in driver.find_elements(*title_link_locator):
                # Read the attribute once: each call is a WebDriver round trip.
                href = link_element.get_attribute("href")
                if href:
                    hrefs.append((href, country_label))
            all_hrefs.extend(hrefs)
            print(f"  Found {len(hrefs)} job links in {country_label} for '{keyword}'")
        except TimeoutException:
            # Best effort: report the failed search and move on to the next term.
            print(f"  Timeout loading results for: {keyword} in {country_label}")
    return all_hrefs

# Full scraping run: collect search-result links per country, deduplicate
# them, visit every job page, and build a DataFrame from the records.
# The whole browser-driven part is wrapped in try/finally so the Chrome
# session is closed even when an unexpected error or interrupt stops the run.
scraped_jobs = []
try:
    # Scrape per country
    all_job_hrefs_malaysia = scrape_job_links("jobstreet.com.my", "malaysia")
    all_job_hrefs_singapore = scrape_job_links("jobstreet.com.sg", "singapore")
    all_job_hrefs_indonesia = scrape_job_links("id.jobstreet.com", "indonesia")

    # Combine all countries
    all_job_hrefs = all_job_hrefs_malaysia + all_job_hrefs_singapore + all_job_hrefs_indonesia

    # Deduplicate: the first country seen for a given link wins.
    unique_href_map = {}
    for href, country in all_job_hrefs:
        if href:
            unique_href_map.setdefault(href, country)

    # Scrape the detail page of every unique job link.
    # (To limit a trial run, slice the items, e.g. list(unique_href_map.items())[:10].)
    for i, (href, country_name) in enumerate(unique_href_map.items()):
        if not href.startswith("http"):
            continue

        print(f"\n[{i+1}] Navigating to: {href}")
        try:
            driver.get(href)
            # The title heading is used as the signal that the page has rendered.
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h1[data-automation='job-detail-title']"))
            )

            title = driver.find_element(By.CSS_SELECTOR, "h1[data-automation='job-detail-title']").text
            company = driver.find_element(By.CSS_SELECTOR, "span[data-automation='advertiser-name']").text
            location = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-location']").text
            category = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-classifications']").text
            work_type = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-work-type']").text
            description = driver.find_element(By.CSS_SELECTOR, "div[data-automation='jobAdDetails']").text

            scraped_jobs.append({
                "url": href,
                "country": country_name,
                "title": title.strip(),
                "company": company.strip(),
                "location": location.strip(),
                "category": category.strip(),
                "work_type": work_type.strip(),
                "description": description.strip(),
            })

            print(f"  Title: {title}")
            print(f"  Company: {company}")
            print(f"  Location: {location}")
            print(f"  Category: {category}")
            print(f"  Work Type: {work_type}")
            print(f"  Description preview: {description[:200]}...")

        except (TimeoutException, NoSuchElementException) as e:
            # A page that fails to load or lacks any expected field is
            # reported and skipped; no partial record is stored for it.
            print(f"  Error scraping {href}: {e}")
finally:
    # Always release the browser process, including on failure.
    driver.quit()

df_jobs = pd.DataFrame(scraped_jobs)
print("\nTotal jobs scraped:", len(df_jobs))

In [None]:
# Build the DataFrame from the scraped records; the bare expression on the
# last line is there so a notebook renders the table as the cell output.
df_jobs = pd.DataFrame(scraped_jobs)
df_jobs