In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm

In [2]:
# Chrome launch configuration for the scraping session.
options = Options()
# options.add_argument("--headless")  # enable headless mode
for chrome_flag in (
    "--no-sandbox",  # option required in some Docker environments
    "--disable-dev-shm-usage",  # may be needed to reduce memory usage
    "--lang=ko-KR",  # Korean language setting
):
    options.add_argument(chrome_flag)

In [3]:
# Start a Chrome WebDriver session using the `options` object configured in the previous cell.
driver = webdriver.Chrome(options=options)

In [4]:
# Navigate the browser session to the Wanted landing page.
WANTED_URL = "https://www.wanted.co.kr/"
driver.get(WANTED_URL)

In [5]:
# Open the search overlay from the navigation bar.
# Wait until the button is clickable instead of looking it up immediately:
# when this cell runs right after driver.get() (e.g. "Run All"), the element
# may not be rendered yet and find_element would raise NoSuchElementException.
search_box_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="__next"]/div[1]/div[2]/nav/aside/ul/li[1]')
    )
)
search_box_button.click()

In [6]:
# Locate the search input inside the search bar.
# The input is looked up right after the click that opens the search bar, so
# wait until it is visible and enabled; otherwise the lookup (or the
# send_keys() calls in the following cells) can fail on a slow render.
search_box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="nav_searchbar"]/div/div[2]/div/form/input')
    )
)

In [7]:
# Type the search keyword into the search input.
SEARCH_KEYWORD = "데이터 분석가"
search_box.send_keys(SEARCH_KEYWORD)

In [8]:
# Press Enter in the search input to submit the query typed in the previous cell.
search_box.send_keys(Keys.ENTER)

In [9]:
# Click the button in the overview tab panel that opens the positions tab.
# An explicit wait replaces the fixed one-second sleep: it returns as soon as
# the button is clickable and tolerates result pages that take longer than
# one second to render.
more_position_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="search_tabpanel_overview"]/div[1]/div[3]/button/div')
    )
)
more_position_button.click()

In [10]:
# Element in the positions tab header whose text is the number of results.
# The positions tab panel is shown by the click in the previous cell, so wait
# for the counter to be visible before reading it; the crawl loop later
# converts `num_of_position.text` to an int.
num_of_position = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="search_tabpanel_position"]/div/div[1]/h2/span')
    )
)
num_of_position.text

'109'

In [11]:
# Scroll to the very bottom of the page, repeating until the document height
# stops changing between two consecutive scrolls.
def _read_scroll_height():
    """Return the current value of document.body.scrollHeight."""
    return driver.execute_script("return document.body.scrollHeight")


last_height = None
new_height = _read_scroll_height()

while new_height != last_height:
    last_height = new_height

    # Jump to the current end of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # give the page time to load after scrolling (tune as needed)

    # Re-measure; an unchanged height ends the loop.
    new_height = _read_scroll_height()

In [12]:
# Result columns, one entry per scraped posting. These names are read by the
# DataFrame cell that follows, so they must stay defined as lists.
job_cards = []
company_names = []
locations = []
careers = []
main_jobs = []
check_lists = []
good_lists = []

# Base XPaths of the posting detail page.
_HEADER_XPATH = '//*[@id="__next"]/main/div[1]/div/section/header'
_ARTICLE_XPATH = '//*[@id="__next"]/main/div[1]/div/section/section/article[1]/div'

# (target list, XPath of the element whose text goes into that list), in the
# order the fields are scraped.
_DETAIL_FIELDS = [
    (job_cards, f'{_HEADER_XPATH}/h1'),
    (company_names, f'{_HEADER_XPATH}/div/div[1]/a'),
    (locations, f'{_HEADER_XPATH}/div/div[1]/span[2]'),
    (careers, f'{_HEADER_XPATH}/div/div[1]/span[4]'),
    (main_jobs, f'{_ARTICLE_XPATH}/div[1]'),
    (check_lists, f'{_ARTICLE_XPATH}/div[2]'),
    (good_lists, f'{_ARTICLE_XPATH}/div[3]'),
]


def _wait_for_text(xpath, timeout=10):
    """Wait until the element at `xpath` is present in the DOM and return its text.

    Raises selenium's TimeoutException if the element does not appear within
    `timeout` seconds.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )
    return element.text


# Parse the result count once, before any navigation. Stripping "," is a
# precaution in case the counter is rendered with thousands separators.
total_positions = int(num_of_position.text.replace(",", ""))

for num in tqdm(range(total_positions), desc='crawling...'):
    try:
        # Open the (num+1)-th posting card once it is clickable.
        picture_click_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, f'//*[@id="search_tabpanel_position"]/div/div[3]/div[{num+1}]'))
        )
        picture_click_button.click()
        time.sleep(1)

        # Expand the full description. Wait for the "see more" button to be
        # clickable (not merely present in the DOM) before clicking it.
        see_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, f'{_ARTICLE_XPATH}/button/span[2]'))
        )
        see_more_button.click()

        # Scrape every field first and append afterwards, so a failure in the
        # middle of a posting cannot leave the result lists with unequal lengths.
        scraped_texts = [_wait_for_text(xpath) for _, xpath in _DETAIL_FIELDS]
        for (target_list, _), text in zip(_DETAIL_FIELDS, scraped_texts):
            target_list.append(text)

        # Return to the result list and give it time to settle.
        driver.back()
        time.sleep(1)

    except Exception as e:
        # Report how far the crawl got, then re-raise so the failure is visible.
        print(f"job_titles: {len(job_cards)}")
        print(f"company_names: {len(company_names)}")
        print(f'locations: {len(locations)}')
        print(f'careers: {len(careers)}')
        print(f"main_jobs: {len(main_jobs)}")
        print(f"check_lists: {len(check_lists)}")
        print(f"good_lists: {len(good_lists)}")
        print(f"error: {e}")
        raise

crawling...: 100%|██████████| 109/109 [04:09<00:00,  2.29s/it]


In [13]:
# Build the DataFrame: pair each output column name with its scraped list.
_column_names = ["직무명", "회사명", "위치", "경력", "주요업무", "자격요건", "우대사항"]
_column_values = [
    job_cards,
    company_names,
    locations,
    careers,
    main_jobs,
    check_lists,
    good_lists,
]
data = dict(zip(_column_names, _column_values))
df = pd.DataFrame(data)

# Save as job_data.csv in the current directory, without the index column.
df.to_csv("job_data.csv", index=False)