In [2]:
import json
import os
import time
from urllib.parse import urlencode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [None]:

# Path to the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run on a machine where the driver lives elsewhere; when it is
# unset the original Homebrew location is used.
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "/opt/homebrew/bin/chromedriver")

chrome_options = Options()
# Open the browser window maximized.
chrome_options.add_argument("--start-maximized")
# NOTE(review): presumably meant to make the automated browser less
# detectable by the target site -- confirm this is still needed.
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Shared browser session used by every later cell in the notebook.
driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

In [6]:
# Search parameters. Edit these two values to crawl a different role/region;
# urlencode() turns spaces into "+" so the values can be written naturally.
SEARCH_QUERY = "data engineer"
SEARCH_LOCATION = "united states"

# First results page for the search above.
start_url = "https://www.simplyhired.com/search?" + urlencode({"q": SEARCH_QUERY, "l": SEARCH_LOCATION})
driver.get(start_url)
# Fixed pause after navigation before later cells query the page.
time.sleep(5)

In [None]:
# XPath selectors for the results list and the job-detail view.
_JOB_CARD_XPATH = "//div[@data-testid='searchSerpJob']"
_TITLE_XPATH = "//h2[@data-testid='viewJobTitle']"
_HEADER_DETAILS_XPATH = "//div[@data-testid='viewJobCompanyDetailsContainer']//span[@data-testid='detailText']"
_WORK_TYPE_XPATH = "//div[@data-testid='viewJobBodyJobDetailsContainer']//span[@data-testid='detailText']"
_SALARY_XPATH = "(//span[@data-testid='detailText' and contains(text(), '$')])[1]"
_BENEFITS_XPATH = "//span[@data-testid='viewJobBenefitItem']"
_QUALIFICATIONS_XPATH = "//span[@data-testid='viewJobQualificationItem']"
# Only <p> and <li> are selected: a <ul>'s text already contains the text of
# its <li> children, so selecting both would emit every bullet twice.
_DESCRIPTION_XPATH = (
    "//div[@data-testid='viewJobBodyJobFullDescriptionContent']//p"
    " | //div[@data-testid='viewJobBodyJobFullDescriptionContent']//li"
)


def _read_texts(xpath):
    """Return the stripped text of every element matching ``xpath``.

    Returns ``None`` (rather than raising) when the lookup itself fails, so
    callers can tell "lookup failed" apart from "nothing matched" (``[]``).
    Only ``Exception`` is caught; ``KeyboardInterrupt`` still propagates so
    the cell can be interrupted.
    """
    try:
        return [element.text.strip() for element in driver.find_elements(By.XPATH, xpath)]
    except Exception:
        return None


def _nth_text(texts, index, default):
    """Return ``texts[index]``, or ``default`` when the lookup failed or is too short."""
    if texts is None or len(texts) <= index:
        return default
    return texts[index]


def _joined_text(texts, separator, default):
    """Join ``texts`` with ``separator``; return ``default`` only when the lookup failed."""
    if texts is None:
        return default
    return separator.join(texts)


def _scrape_job(job):
    """Open one job card and return its details as a dict.

    Errors from scrolling or clicking propagate to the caller; each
    individual field falls back to its placeholder when it cannot be read.
    """
    driver.execute_script("arguments[0].scrollIntoView();", job)
    time.sleep(1)
    job.click()
    time.sleep(2)  # fixed pause after the click before the detail fields are read

    # URL of the browser after the card has been opened.
    job_url = driver.current_url

    # Company and location come from the same lookup: first match is used as
    # the company name, second as the location.
    header_details = _read_texts(_HEADER_DETAILS_XPATH)

    return {
        "Job Title": _nth_text(_read_texts(_TITLE_XPATH), 0, "N/A"),
        "Company": _nth_text(header_details, 0, "N/A"),
        "Location": _nth_text(header_details, 1, "N/A"),
        "Work Type": _nth_text(_read_texts(_WORK_TYPE_XPATH), 0, "N/A"),
        "Salary": _nth_text(_read_texts(_SALARY_XPATH), 0, "Not listed"),
        "Benefits": _joined_text(_read_texts(_BENEFITS_XPATH), ", ", ""),
        "Qualifications": _joined_text(_read_texts(_QUALIFICATIONS_XPATH), ", ", "N/A"),
        "Full Job Description": _joined_text(_read_texts(_DESCRIPTION_XPATH), " ", "N/A"),
        "URL": job_url,
    }


def crawl_single_page():
    """Scrape every job card on the results page currently loaded in ``driver``.

    Each card is scrolled into view and clicked, then the detail fields are
    read. A card that fails as a whole is reported and skipped; a field that
    cannot be read gets its placeholder value instead.

    Returns:
        list[dict]: one dict per successfully scraped card with the keys
        "Job Title", "Company", "Location", "Work Type", "Salary",
        "Benefits", "Qualifications", "Full Job Description" and "URL".
    """
    job_elements = driver.find_elements(By.XPATH, _JOB_CARD_XPATH)
    print(f"✅ 当前页找到 {len(job_elements)} 个职位")
    jobs_on_page = []

    for job in job_elements:
        try:
            jobs_on_page.append(_scrape_job(job))
        except Exception as e:
            # Best effort: one broken card must not abort the whole page.
            print(f" 抓取单个职位失败: {e}")

    return jobs_on_page

In [10]:
# Stop paginating once at least this many postings have been collected.
# Whole pages are appended, so the final count may exceed the target.
TARGET_JOB_COUNT = 500

all_jobs = []
page_count = 0

while len(all_jobs) < TARGET_JOB_COUNT:
    page_count += 1
    print(f"📄 开始抓取第 {page_count} 页...")
    current_jobs = crawl_single_page()
    all_jobs.extend(current_jobs)

    # Enough postings collected: leave before navigating to another page.
    if len(all_jobs) >= TARGET_JOB_COUNT:
        break

    try:
        # Scroll to the bottom of the page before looking up the "next" link.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        next_page_button = driver.find_element(By.XPATH, "//a[@data-testid='pageNumberBlockNext']")
        next_page_button.click()
        time.sleep(5)  # fixed pause after the click before the next page is scraped
    except Exception:
        # Catch Exception, not everything: a bare except would also swallow
        # KeyboardInterrupt and report Ctrl-C as a missing next-page button.
        print("❌ 没有找到下一页按钮，提前结束。")
        break

print(f"✅ 已完成，共抓取 {len(all_jobs)} 条职位数据")

📄 开始抓取第 1 页...
✅ 当前页找到 20 个职位
📄 开始抓取第 2 页...
✅ 当前页找到 20 个职位
📄 开始抓取第 3 页...
✅ 当前页找到 20 个职位
✅ 已完成，共抓取 60 条职位数据


In [12]:
# Write the collected postings to disk as indented JSON; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
with open("simplyhired_jobs_50.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(all_jobs, ensure_ascii=False, indent=4))
print("✅ 数据已保存为 simplyhired_jobs_50.json")

✅ 数据已保存为 simplyhired_jobs_50.json
