In [8]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
from multiprocessing import Pool
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Run Chrome headless so that no browser window is shown while scraping
options = webdriver.ChromeOptions()
options.add_argument('--headless')

In [9]:
def get_sneakers_soup():
    '''
    Collect the listing HTML of every sneaker product with 50 or more reviews.

    The first page of the Musinsa sneakers category (018) is fetched to read
    the total page count from its ``span.totalPagingNum`` element. Every
    listing page is then downloaded and each ``div.article_info`` element
    whose ``span.count`` review counter is 50 or more is kept.

    Returns:
        list: the matching ``div.article_info`` BeautifulSoup elements, in the
        order they appear on the listing pages.

    Raises:
        AttributeError: if the first page has no ``span.totalPagingNum``.
        requests.RequestException: if a request fails or times out.
    '''
    base_url = 'https://www.musinsa.com/categories/item/018'
    page_url = (
        'https://www.musinsa.com/categories/item/018?d_cat_cd=018&brand='
        '&list_kind=small&sort=pop_category&sub_sort=&page={page}'
        '&display_cnt=90&group_sale=&exclusive_yn=&sale_goods=&timesale_yn='
        '&ex_soldout=&plusDeliveryYn=&kids=&color=&price1=&price2='
        '&shoeSizeOption=&tags=&campaign_id=&includeKeywords=&measure='
    )
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 30
    min_reviews = 50
    non_digits = re.compile(r'[^0-9]')

    product_soup = []

    # One session for all pages so the connection to the site is reused.
    with requests.Session() as session:
        session.headers.update(headers)

        first_page = session.get(base_url, timeout=timeout)
        first_soup = BeautifulSoup(first_page.content, 'html.parser')
        paging = first_soup.find('span', {'class': 'totalPagingNum'})
        last_page = int(non_digits.sub('', paging.get_text()))

        # Walk every listing page, 1 .. last_page inclusive
        for page_no in range(1, last_page + 1):
            page = session.get(page_url.format(page=page_no), timeout=timeout)
            page_soup = BeautifulSoup(page.content, 'html.parser')

            # Each div.article_info holds the listing info of one product
            for product in page_soup.find_all('div', {'class': 'article_info'}):
                count_tag = product.find('span', {'class': 'count'})
                try:
                    review_num = int(non_digits.sub('', count_tag.get_text()))
                except (AttributeError, ValueError):
                    # No review counter (find() returned None) or no digits
                    # in it: treat the product as having zero reviews.
                    review_num = 0

                if review_num >= min_reviews:
                    product_soup.append(product)

    return product_soup

In [10]:
def _in_stock(option_text):
    # True when the option text contains neither '옵션' (option) nor
    # '품절' (sold out), i.e. it is kept as an available size.
    return '옵션' not in option_text and '품절' not in option_text


def _first_text(elements):
    # Text of the first matched element, or '' when nothing matched.
    return elements[0].get_text() if elements else ''


def get_product_df(product_soup):
    '''
    Build a DataFrame of product details from listing elements.

    Brand, name, price and product URL are read from each listing element.
    Every product URL is then opened with Selenium Chrome (using the
    module-level ``options``) to read gender, available sizes, tags and the
    view / purchase / like / review counters from the rendered product page.

    Args:
        product_soup: iterable of BeautifulSoup ``div.article_info`` elements,
            as returned by ``get_sneakers_soup``. A ``<strong>`` child of the
            product link, when present, is removed from the element in place.

    Returns:
        pandas.DataFrame: one row per element with the columns brand, name,
        tag, sex, price, size, view, buy, like, review and url. ``sex``,
        ``view``, ``buy``, ``like`` and ``review`` are '' when the matching
        element is not found on the product page.
    '''
    brand_list = []
    name_list = []
    all_tag_list = []
    sex_list = []
    price_list = []
    all_size_list = []
    view_num = []
    buy_num = []
    review_num = []
    like_num = []
    url_list = []

    for product in product_soup:
        # Product URL (the href is protocol-relative, so prefix the scheme)
        url = product.select('p.list_info > a')[0].attrs['href']
        url_list.append('https:'+url)

        # Product name, without the <strong> child when there is one
        name = product.find('a', {'name': 'goods_link'})
        strong = name.find('strong')
        if strong is not None:
            strong.decompose()
        name_list.append(name.get_text().replace('\n', ' ').strip())

        # Brand name
        brand_list.append(product.select('p.item_title > a')[0].get_text())

        # Price: a single token is the current price; otherwise the second
        # token is the discounted price.
        price = product.find('p', {'class': 'price'}).get_text().split()
        price_list.append(price[0] if len(price) == 1 else price[1])

    # One browser is reused for every product page instead of launching a new
    # Chrome per URL; nothing is launched when there is nothing to visit.
    driver = webdriver.Chrome(options=options) if url_list else None
    try:
        for url in url_list:
            driver.get(url)
            sel_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Gender
            sex_list.append(_first_text(sel_soup.select('#product-right-top > div.product-detail__sc-achptn-0.eXRtIE > ul > li:nth-child(2) > div.product-detail__sc-achptn-6.gfoaTb > span:nth-child(3)')))

            # Available sizes
            if sel_soup.select('#option2 > option'):
                # Two selects: #option1 is treated as the color and #option2
                # as the sizes for the chosen color. Index 0 of #option1 is
                # skipped.
                color_size = []
                color_num = len(sel_soup.select('#option1 > option'))
                for i in range(1, color_num):
                    select = Select(driver.find_element(By.CSS_SELECTOR, '#option1'))
                    select.select_by_index(i)
                    # Give the page time to refresh #option2 for this color
                    time.sleep(0.5)

                    color_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    color = color_soup.select('#option1 > option:nth-child('+str(i+1)+')')[0].get_text()
                    color = str(color.split()[0])

                    size_list = []
                    for option in color_soup.select('#option2 > option'):
                        size = option.get_text()
                        if _in_stock(size):
                            size_list.append(size)

                    color_size.append({color:size_list})

                all_size_list.append(color_size)

            else:
                # Single select: #option1 lists the sizes directly
                size_list = []
                for option in sel_soup.select('#option1 > option'):
                    # Keep only digits, Hangul and parentheses
                    size = re.sub(r'[^0-9가-힣\(\)]', '', option.get_text())
                    if _in_stock(size):
                        size_list.append(size)

                all_size_list.append(size_list)

            # Tags shown on the product page; the first character of each tag
            # text is dropped (presumably a leading '#' -- TODO confirm)
            tag_html = sel_soup.find_all('a', {'class': 'product-detail__sc-uwu3zm-1 hhzMHa'})
            all_tag_list.append([tag.get_text()[1:] for tag in tag_html])

            # Like count
            like_num.append(_first_text(sel_soup.find_all('span', {'class':'product-detail__sc-achptn-4 flUHrZ'})))

            # View count
            view_num.append(_first_text(sel_soup.select('#product-right-top > div.product-detail__sc-achptn-0.eXRtIE > ul > li:nth-child(3) > div.product-detail__sc-achptn-6.gfoaTb > span')))

            # Cumulative sales count
            buy_num.append(_first_text(sel_soup.select('#product-right-top > div.product-detail__sc-achptn-0.eXRtIE > ul > li:nth-child(4) > div.product-detail__sc-achptn-6.gfoaTb > span')))

            # Review count
            review_num.append(_first_text(sel_soup.find_all('span', {'id':'review_total'})))

            # Short pause between product pages
            time.sleep(0.01)
    finally:
        # quit() ends the whole browser session, and running it in finally
        # releases the browser even when a page raises.
        if driver is not None:
            driver.quit()

    df = pd.DataFrame(zip(brand_list, name_list, all_tag_list, sex_list, price_list, all_size_list, view_num, buy_num, like_num, review_num, url_list),
                      columns = ['brand', 'name', 'tag', 'sex', 'price', 'size', 'view', 'buy', 'like', 'review', 'url'])

    return df

In [15]:
def musinsa_sneakers_df():
    '''
    Scrape the sneaker listing and return the product DataFrame.

    Calls ``get_sneakers_soup`` to collect the listing elements and
    ``get_product_df`` to turn them into a DataFrame, printing the elapsed
    seconds after each step and the number of listing elements collected.

    Returns:
        pandas.DataFrame: the result of ``get_product_df``.
    '''
    # No multiprocessing pool is created here: the work below runs
    # sequentially, so a pool would only spawn idle worker processes that are
    # never closed.
    started_at = int(time.time())
    product_soup = get_sneakers_soup()
    print('soup을 가져오는 시간:', int(time.time()-started_at))
    print('리뷰 수가 50개 이상이 상품:', len(product_soup))
    df = get_product_df(product_soup)
    # Measured from the same starting point, so this figure also includes the
    # time spent collecting the listing elements above.
    print('dataframe 만드는 시간 :', int(time.time())-started_at)
    return df

In [16]:
def save_to_csv(df):
    '''
    Write ``df`` to ``musinsa_sneakers.csv`` in the current directory.

    The file is encoded as UTF-8 with a BOM and the index is not written.
    A progress message is printed before writing.
    '''
    print('제품 정보 csv 변환 중...')
    output_path = 'musinsa_sneakers.csv'
    df.to_csv(output_path, encoding='utf-8-sig', index=False)

In [17]:
def start():
    '''
    Run the whole pipeline: build the product DataFrame with
    ``musinsa_sneakers_df``, write it out with ``save_to_csv``, then print a
    completion message.
    '''
    save_to_csv(musinsa_sneakers_df())
    print('완료되었습니다')

In [18]:
# Run the full pipeline: scrape the products, build the DataFrame, write the CSV
start()

soup을 가져오는 시간: 71
리뷰 수가 50개 이상이 상품: 1261
dataframe 만드는 시간 : 4139
제품 정보 csv 변환 중...
완료되었습니다
