### <mark>**✅Crawling_Coupang**

##### **Library**

In [1]:
# selenium import
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium_stealth import stealth

# library import
from bs4 import BeautifulSoup
from dateutil import parser
import time
import pandas as pd
import platform
import os
from time import sleep
from glob import glob
from tqdm import tqdm
from datetime import datetime
import pyautogui as pg

##### **Accept**

In [2]:
os_name = platform.system().lower()
architecture = platform.machine()

driver_path = None
if os_name == 'darwin':
    driver_path = glob('../driver/**/chromedriver', recursive=True)[0]
else:
    driver_path = glob('../driver/**/chromedriver.exe', recursive=True)[0]

In [3]:
# 드라이버 파일의 권한을 확인하고 수정
if os_name == 'darwin':  # 맥 사용자
    driver_permissions = os.stat(driver_path).st_mode
    if not (driver_permissions & 0o100):  # 실행 권한 확인
        os.chmod(driver_path, driver_permissions | 0o111)  # 실행 권한 추가
else:  # 윈도우 사용자
    driver_permissions = os.stat(driver_path).st_mode
    if not (driver_permissions & 0o100):  # 실행 권한 확인
        os.chmod(driver_path, driver_permissions | 0o111)  # 실행 권한 추가

##### **Chrome Options**

In [4]:
service = Service(executable_path=driver_path)
chrome_options =  webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
#chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

##### **Define**

In [5]:
url = 'https://www.coupang.com/vp/products/8251537122'

In [6]:
# Web OPEN
driver = webdriver.Chrome(service=service, options=chrome_options)
stealth(driver,
        languages=["ko-KR", "ko"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )
driver.get(url)
time.sleep(3)

In [51]:
driver.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument', 
    {'source': """ Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) """}
)

{'identifier': '2'}

In [None]:
# Web ZOOM
driver.execute_script("document.body.style.zoom='10%'")
# Web Height
last_height = driver.execute_script("return document.body.scrollHeight")

# Page Loading

while True:
    # 끝까지 스크롤 다운
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # 1초 대기
    time.sleep(1)

    # 스크롤 다운 후 스크롤 높이 다시 가져옴
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

In [None]:
# Parsing
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []

In [None]:
review = '#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.js_reviewArticleListContainer'
count = (soup.select_one('#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__average.js_reviewAverageContainer > section.sdp-review__average__total-star > div.sdp-review__average__total-star__info > div.sdp-review__average__total-star__info-count').text)
selector_title = '#contents > div.prod-atf > div.prod-atf-main > div.prod-buy.new-oos-style.not-loyalty-member.eligible-address.without-subscribe-buy-type.DISPLAY_0.fix-verdor-section-display.prod-buy-alias.update-price-section-style-with-rds > div.prod-buy-header > h1'

In [None]:
# Count page
page_int = int(count.replace(',', ''))

def calculate_page_no(page_int):
    page_no = page_int // 5 if page_int % 5 == 0 else page_int // 5 + 1
    return page_no

page_no = calculate_page_no(page_int)

##### **Crawling**

In [None]:
for page in tqdm(range(1, page_no + 1), desc="현재 페이지", unit="page", leave=False):
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, review))
    )
    
    time.sleep(1)

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    empty_review_count = 0

    review_articles = soup.select(f"{review} article")

    for article in review_articles:
        try:
            id_selector = article.select_one('div.sdp-review__article__list__info > div.sdp-review__article__list__info__user > span')
            user_id = id_selector.get_text(strip=True) if id_selector else "N/A"

            created_selector = article.select_one('div.sdp-review__article__list__info > div.sdp-review__article__list__info__product-info > div.sdp-review__article__list__info__product-info__reg-date')
            created_at = parser.parse(created_selector.get_text(strip=True)) if created_selector else 'N/A'

            option_selector = article.select_one('div.sdp-review__article__list__info > div.sdp-review__article__list__info__product-info__name')
            option_text = option_selector.get_text(strip=True) if option_selector else None

            content_selector = article.select_one('div.sdp-review__article__list__review.js_reviewArticleContentContainer > div')
            review_text = content_selector.get_text(strip=True) if content_selector else None

            if review_text is None:
                empty_review_count += 1
                continue

            data.append({
                "ID": user_id,
                "CREATED": created_at,
                "OPTION": option_text,
                "REVIEW": review_text
            })
        except Exception as e:
            print(f"Error processing review: {e}")

    if empty_review_count == len(review_articles):
        break

    if page < page_no:
        try:
            page_number = (page % 9) + 3  # 3~11까지 반복됨
            next_page_selector = f"#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.js_reviewArticleListContainer > div.sdp-review__article__page.js_reviewArticlePagingContainer > button:nth-child({page_number})"
            
            # 페이지 번호 버튼이 있는 경우 클릭
            next_page_elements = driver.find_elements(By.CSS_SELECTOR, next_page_selector)
            if next_page_elements:
                driver.execute_script("arguments[0].click();", next_page_elements[0])
                time.sleep(3)  # 페이지 로딩 대기
            else:
                # 마지막 "다음" 버튼 클릭 시도
                next_button = driver.find_elements(By.CSS_SELECTOR, ".js_reviewArticlePageNextBtn")
                if next_button:
                    driver.execute_script("arguments[0].click();", next_button[0])
                    time.sleep(3)  # 페이지 로딩 대기
                else:
                    print(f"페이지 {page}에서 '다음' 버튼을 찾을 수 없음. 크롤링 종료.")
                    break
        except Exception as e:
            print(f"Error clicking next page: {e}")
            break


                                                             

##### **To CSV**

In [None]:
result_df = pd.DataFrame(data)
result_df['CREATED'] = result_df['CREATED'].apply(lambda x: f"{x.day:02d}-{x.month:02d}-{str(x.year)[2:]}")

In [None]:
driver.close()
driver.quit()

In [None]:
split_text = selector_title.text.split(" ")
if len(split_text) >= 3:
    file_name = " ".join(split_text[:2])
else:
    file_name = selector_title.text

AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
today_str = datetime.today().strftime("%m%d")

In [None]:
result_df.to_csv(rf'..\data\review\{today_str}_.csv', index=False, encoding='utf-8-sig')