In [None]:
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# Web driver setup: attach to an already-running Chrome through its remote
# debugging port.
# add_experimental_option() mutates the options object and returns None, so
# the options must be built first and then passed in; chaining the call
# inline would hand `options=None` to webdriver.Chrome and silently drop the
# debugger address. Only one driver is created so no stray browser is leaked.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("debuggerAddress", "localhost:9222")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)

site_url = 'url'
driver.get(site_url)

# Every distinct href collected across all result pages.
hrefs = set()

while True:
    # Parse the HTML of the page currently shown in the browser.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Store the href of every <a> inside 'div.info_box'; the set drops
    # anything that was already collected on an earlier page.
    hrefs.update(a['href'] for a in soup.select('div.info_box a[href]'))

    # Move on to the next page.
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//a[@class="pg_next on"]')))
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(0.1)
    except WebDriverException:
        # No active "next" button appeared within the timeout (or the click
        # failed): stop paginating. Unlike a bare `except`, this does not
        # swallow KeyboardInterrupt or programming errors.
        break

In [None]:
# Prefix that is string-concatenated with each collected href to build the
# detail-page URL.
# NOTE(review): assumes every href is a relative query string that is valid
# directly after this prefix — confirm against the scraped site.
base_url = 'https://search.naver.com/search.naver?'

def parse_car_name_kind(html):
    """Extract the car's name, kind and year from a detail page.

    Args:
        html: HTML source of the detail page.

    Returns:
        A ``(car_name, car_kind, car_year)`` tuple of whitespace-stripped
        strings. The name is the first 'span.area_text_title ._text' match;
        kind and year are the first and second 'div.sub_title .txt' matches.

    Raises:
        IndexError: If the page lacks the title element or has fewer than
            two sub-title text elements.
    """
    document = BeautifulSoup(html, 'html.parser')
    title_nodes = document.select('span.area_text_title ._text')
    subtitle_nodes = document.select('div.sub_title .txt')

    name = title_nodes[0].text.strip()
    kind = subtitle_nodes[0].text.strip()
    year = subtitle_nodes[1].text.strip()
    return name, kind, year

def pares_basic_info(html):
    """Collect the <dd> value of every 'dl.info .info_group' entry.

    The (misspelled) function name is kept unchanged for existing callers.

    Args:
        html: HTML source of the page showing the basic-information list.

    Returns:
        A list with one whitespace-stripped string per info group, in
        document order. A group without a <dd> contributes an empty string
        so that later values keep their position in the output row.
    """
    soup = BeautifulSoup(html, 'lxml')

    basic_info = []
    for info_group in soup.select('dl.info .info_group'):
        # Only the value (<dd>) is used; the label (<dt>) is not looked up,
        # so a group without a <dt> no longer aborts the scrape.
        dd = info_group.select_one('dd')
        basic_info.append(dd.text.strip() if dd is not None else '')
    return basic_info




In [None]:
# Visit every collected detail page and write one CSV row per car.
with open("./car.csv", "w", newline="", encoding="utf-8-sig") as fw:
    writer = csv.writer(fw)
    headers = ["name", "kind", "year", "price", "fuel_kind", "km/l", "car_output", "torque", "cc",
               "engine", "driving_system", "gear", "full_length", "full_height", "full_width", "range"]
    writer.writerow(headers)

    for href in tqdm(hrefs):
        full_url = base_url + href
        driver.get(full_url)
        car_name, car_kind, car_year = parse_car_name_kind(driver.page_source)

        # Click the 'basic info' link (via JavaScript, as a.more_link).
        basic_info_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a.more_link')))
        driver.execute_script("arguments[0].click();", basic_info_link)

        # The click returns immediately; wait until the info list is in the
        # DOM before snapshotting page_source, otherwise the snapshot can be
        # taken before the content has loaded.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'dl.info .info_group')))
        except TimeoutException:
            # Keep going with whatever the page contains, but leave a trace
            # so the short row can be explained later.
            tqdm.write(f"basic info did not load in time: {full_url}")
        basic_info = pares_basic_info(driver.page_source)

        car_info = [car_name, car_kind, car_year] + basic_info
        writer.writerow(car_info)
