# Selenium Web Crawler

In [6]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [8]:
# Absolute path to the chromedriver executable used to launch Chrome.
chromedriver_path = "/opt/homebrew/bin/chromedriver"

# Command-line switches handed to Chrome: open maximized and turn off the
# "AutomationControlled" Blink feature.
chrome_options = Options()
for browser_flag in (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
):
    chrome_options.add_argument(browser_flag)

# Single shared browser session; the crawling cells below use this global.
driver = webdriver.Chrome(
    service=Service(chromedriver_path),
    options=chrome_options,
)

In [10]:
# Search keywords: one SimplyHired query is built for each title.
job_titles = [
    "Software Engineer",
    "Full-Stack Developer",
    "Frontend Developer",
    "Backend Developer",
    "Mobile App Developer",
    "Web Developer",
    "Game Developer",
    "DevOps Engineer",
    "Cloud Engineer",
    "Systems Engineer",
    "Cybersecurity Engineer",
    "Test Automation Engineer",
    "Data Scientist",
    "Data Analyst",
    "Machine Learning Engineer",
    "AI Researcher",
    "NLP Engineer",
    "Computer Vision Engineer",
    "Data Engineer",
    "Business Intelligence Engineer",
    "Quantitative Researcher",
    "AI Product Manager",
    "Database Administrator",
    "Big Data Engineer",
    "Cloud Data Engineer",
    "ETL Developer",
    "MLOps Engineer",
    "Security Engineer",
    "SOC Analyst",
    "IT Support Engineer",
    "Network Engineer",
    "Technical Program Manager",
    "Project Manager",
    "UX Engineer",
    "Bioinformatics Engineer",
    "Quantum Computing Engineer",
    "High-Performance Computing Engineer",
]

# One search URL per title, in the same order as `job_titles`; every space in
# a title becomes "+" in the query string and the location is fixed to
# "united states".
urls = [
    "https://www.simplyhired.com/search?q="
    + "+".join(title.split(" "))
    + "&l=united+states"
    for title in job_titles
]

In [None]:
def _text_at(xpath, default="N/A", index=0):
    """Return the stripped text of the ``index``-th element matching ``xpath``.

    Uses the module-level ``driver``. Falls back to ``default`` when no such
    element exists or its text cannot be read. Only ``Exception`` subclasses
    are swallowed, so ``KeyboardInterrupt`` (a kernel interrupt) still
    propagates and stops the crawl.
    """
    try:
        return driver.find_elements(By.XPATH, xpath)[index].text.strip()
    except Exception:
        return default


def _joined_texts(xpath, separator, default):
    """Join the stripped text of every element matching ``xpath``.

    Returns an empty string when nothing matches, and ``default`` when the
    lookup or text extraction raises an ``Exception``.
    """
    try:
        return separator.join(
            element.text.strip() for element in driver.find_elements(By.XPATH, xpath)
        )
    except Exception:
        return default


def crawl_single_page():
    """Scrape every job card on the search-results page currently loaded.

    Relies on the module-level ``driver``. Each card is scrolled into view and
    clicked, then the detail fields are read from the page after fixed sleeps
    (1 s after scrolling, 2 s after clicking).

    Returns:
        list[dict]: one record per successfully processed card, with the keys
        "Job Title", "Company", "Location", "Work Type", "Salary", "Benefits",
        "Qualifications", "Full Job Description" and "URL". A missing field
        gets a placeholder ("N/A", "Not listed" for salary, "" for benefits)
        instead of aborting the record.

    A card whose scroll/click fails with an ``Exception`` is reported and
    skipped. ``KeyboardInterrupt`` is not caught, so interrupting the kernel
    stops the crawl instead of being recorded as a missing field.
    """
    job_elements = driver.find_elements(By.XPATH, "//div[@data-testid='searchSerpJob']")
    # Runtime message (Chinese): "found N jobs on the current page".
    print(f"✅ 当前页找到 {len(job_elements)} 个职位")
    jobs_on_page = []

    # Company name and location share one selector: first match is used as
    # the company, second match as the location.
    company_details_xpath = (
        "//div[@data-testid='viewJobCompanyDetailsContainer']"
        "//span[@data-testid='detailText']"
    )
    description_root = "//div[@data-testid='viewJobBodyJobFullDescriptionContent']"
    # NOTE(review): both <ul> nodes and their <li> descendants are selected, so
    # list text may appear twice in the joined description — confirm intent.
    description_xpath = (
        f"{description_root}//p | {description_root}//ul | {description_root}//li"
    )

    for job in job_elements:
        try:
            driver.execute_script("arguments[0].scrollIntoView();", job)
            time.sleep(1)
            job.click()
            time.sleep(2)
            job_url = driver.current_url

            jobs_on_page.append({
                "Job Title": _text_at("//h2[@data-testid='viewJobTitle']"),
                "Company": _text_at(company_details_xpath),
                "Location": _text_at(company_details_xpath, index=1),
                "Work Type": _text_at(
                    "//div[@data-testid='viewJobBodyJobDetailsContainer']"
                    "//span[@data-testid='detailText']"
                ),
                "Salary": _text_at(
                    "(//span[@data-testid='detailText' and contains(text(), '$')])[1]",
                    default="Not listed",
                ),
                "Benefits": _joined_texts(
                    "//span[@data-testid='viewJobBenefitItem']", ", ", ""
                ),
                "Qualifications": _joined_texts(
                    "//span[@data-testid='viewJobQualificationItem']", ", ", "N/A"
                ),
                "Full Job Description": _joined_texts(description_xpath, " ", "N/A"),
                "URL": job_url,
            })

        except Exception as e:
            # Best-effort: report the failed card and move on to the next one.
            # Runtime message (Chinese): "failed to scrape a single job".
            print(f" 抓取单个职位失败: {e}")
            continue

    return jobs_on_page

In [14]:
# Accumulates the scraped records for every keyword, in crawl order.
all_jobs = []

for search_url in urls:
    driver.get(search_url)
    time.sleep(5)  # fixed pause after navigation; no explicit wait is used
    local_count = 0  # records collected so far for this keyword
    # Runtime message (Chinese): "start crawling keyword".
    print(f"\n🚀 开始抓取关键词: {search_url}\n")

    # Walk the result pages until 200 records are collected for this keyword
    # or pagination fails.
    while local_count < 200:
        jobs_on_page = crawl_single_page()
        all_jobs.extend(jobs_on_page)
        local_count += len(jobs_on_page)

        if local_count >= 200:
            # Runtime message (Chinese): "keyword finished, 200 records".
            print(f"✅ 关键词 {search_url} 已完成 200 条")
            break

        try:
            # Scroll to the bottom so the pagination control is in view, then
            # move on to the next results page.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            next_page_button = driver.find_element(By.XPATH, "//a[@data-testid='pageNumberBlockNext']")
            next_page_button.click()
            time.sleep(5)
        except Exception:
            # Best-effort pagination: any failure to find or click "next" ends
            # this keyword. `Exception` (not a bare except) is caught so that a
            # kernel interrupt is not mistaken for "no next page".
            # Runtime message (Chinese): "next-page button not found, ending
            # this keyword early".
            print("❌ 没有找到下一页按钮，提前结束当前关键词")
            break


🚀 开始抓取关键词: https://www.simplyhired.com/search?q=Software+Engineer&l=united+states

✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 关键词 https://www.simplyhired.com/search?q=Software+Engineer&l=united+states 已完成 200 条

🚀 开始抓取关键词: https://www.simplyhired.com/search?q=Full-Stack+Developer&l=united+states

✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 关键词 https://www.simplyhired.com/search?q=Full-Stack+Developer&l=united+states 已完成 200 条

🚀 开始抓取关键词: https://www.simplyhired.com/search?q=Frontend+Developer&l=united+states

✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 当前页找到 20 个职位
✅ 关键词 https://www.simplyhired.com/search?q=Frontend+Developer&l=united+states 已完成 200 条

🚀 开始抓取关键词: https://www.simpl

In [16]:
# Persist every scraped record as pretty-printed JSON; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
with open("simplyhired_all_keywords_200each.json", mode="w", encoding="utf-8") as out_file:
    json.dump(all_jobs, out_file, indent=4, ensure_ascii=False)

# Runtime message (Chinese): "all keywords crawled, N records in total, saved as ...".
print(f"✅ 全部关键词抓取完成，共计 {len(all_jobs)} 条数据，已保存为 simplyhired_all_keywords_200each.json")

✅ 全部关键词抓取完成，共计 7400 条数据，已保存为 simplyhired_all_keywords_200each.json


In [18]:
# End the WebDriver session created in the setup cell.
driver.quit()