# 강의 게시판 스크래핑

웹 문서를 파싱한 뒤 원하는 태그를 추출할 때는 BeautifulSoup의 'find'와 'select' 메서드를 사용합니다.
'select'는 크롬 개발자 도구에서 복사한 CSS 선택자를 그대로 사용할 수 있어 'find'보다 원하는 데이터를 쉽게 가져올 수 있으며, 수행 시간이 더 짧고 메모리를 적게 사용하기 때문에 여기서는 'select' 메서드를 사용하였습니다.

In [1]:
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager 
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings(action='ignore')

In [5]:
# Selenium 4 expects the chromedriver path to be wrapped in a Service object;
# passing the path as the first positional argument is deprecated.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.implicitly_wait(10)

# Load the course listing once so the number of listing pages can be read.
driver.get('https://www.inflearn.com/courses')
time.sleep(2)  # fixed pause before reading page_source (presumably lets the page finish rendering)

html = driver.page_source
soup = bs(html, 'html.parser')

# One accumulator list per output column, in this order:
# main category / sub category / course title / review score / review count /
# student count / instructor / related tags / price / discount rate /
# sale price / section count and lecture time / access period limit /
# certificate availability / level
main_category_, sub_category_, course_title_, review_score_, review_cnt_, student_cnt_, instructor_, related_tags_, price_, discount_rate_, sale_price_, course_info_, limit_time_, certificates_, level_ = [[] for _ in range(15)]

# The text of the 12th pagination item is used as the last page number.
last_page_link = soup.select_one('#courses_section > div > div > div > footer > nav > div > ul > li:nth-of-type(12) > a')
if last_page_link is None:
    # Without the pagination link there is nothing to iterate over; release the
    # browser before failing instead of raising AttributeError on None.
    driver.quit()
    raise RuntimeError('Could not find the last-page link on the course listing page.')
last_page = int(last_page_link.text)

# Shared CSS selector prefixes for the course detail page. Every selector used
# below is one of these prefixes plus a short suffix.
HEADER_SEL = '#main > section > div.cd-sticky-wrapper > div.cd-header.cd-header__not-owned-course > div > div > div.cd-header__right.ac-cd-7.ac-ct-12'
SIDEBAR_SEL = '#main > section > div.cd-content > div > div > div.ac-cd-4.ac-ct-12 > div > div'
CARD_SEL = SIDEBAR_SEL + ' > div.cd-floating__card'
CARD_TOP_SEL = CARD_SEL + ' > div.cd-floating__card--top'
CARD_INFO_SEL = CARD_SEL + ' > div.cd-floating__card--bottom > div'
# The regular/free price selectors do not require the cd-floating__card class.
BARE_CARD_TOP_SEL = SIDEBAR_SEL + ' > div > div.cd-floating__card--top'

RATING_SEL = HEADER_SEL + ' > div.cd-header__info-cover'
NO_RATING_STUDENT_SEL = HEADER_SEL + ' > div.cd-header__student_cnt.cd-header__sub-row > span:nth-of-type(2) > strong'

PRICE_INSTALLMENT_SEL = CARD_TOP_SEL + ' > div.cd-floating__price.cd-floating__price--installment'
PRICE_DIS_INSTALLMENT_SEL = CARD_TOP_SEL + ' > div.cd-floating__price.cd-floating__price--dis-installment'
PRICE_DIS_SEL = CARD_TOP_SEL + ' > div.cd-floating__price.cd-floating__price--dis'
PRICE_REG_SEL = BARE_CARD_TOP_SEL + ' > div.cd-floating__price.cd-floating__price--reg'
PRICE_FREE_SEL = BARE_CARD_TOP_SEL + ' > div.cd-floating__price.cd-floating__price--free'

for page_no in range(1, last_page + 1):

    # Fetch one listing page and collect the course slugs it links to.
    url = 'https://www.inflearn.com/courses?order=seq&page=' + str(page_no)
    driver.get(url)

    html = driver.page_source
    soup = bs(html, 'html.parser')

    course_page_list = [
        card.select_one('a')['href'].split('/course/')[1]
        for card in soup.select('div.card.course.course_card_item')
    ]

    for post_id in course_page_list:

        content_url = 'https://www.inflearn.com/course/' + post_id
        driver.get(content_url)

        course_html = driver.page_source
        course_soup = bs(course_html, 'html.parser')

        # Breadcrumb: first element is the main category, second the sub category.
        category = course_soup.select('span.cd-header__breadcrumb-el')
        main_category_.append(category[0].text)
        sub_category_.append(category[1].text)
        course_title_.append(course_soup.select_one(HEADER_SEL + ' > div.cd-header__title').text)

        # select_one returns None when nothing matches, so the None checks
        # below are meaningful (select would return a list, never None).
        has_rating = course_soup.select_one(RATING_SEL)
        no_rating = course_soup.select_one('div.cd-header__student_cnt.cd-header__sub-row')

        if has_rating is not None:  # star rating and reviews are present
            review_score_.append(course_soup.select_one(RATING_SEL + ' > span.cd-header__info--star > strong').text)
            review_cnt_.append(course_soup.select_one(RATING_SEL + ' > span:nth-of-type(2)').text)
            student_cnt_.append(course_soup.select_one(RATING_SEL + ' > span:nth-of-type(3) > strong').text)

        elif no_rating is not None:  # no star rating or reviews yet
            review_score_.append(np.nan)
            review_cnt_.append('0')
            student_cnt_.append(course_soup.select_one(NO_RATING_STUDENT_SEL).text)

        else:
            print('새로운 수강평가 구조가 존재합니다.')
            print(content_url)
            # Keep every column list the same length even for unknown layouts.
            review_score_.append(np.nan)
            review_cnt_.append(np.nan)
            student_cnt_.append(np.nan)

        instructor_.append(course_soup.select_one(CARD_INFO_SEL + ' > div:nth-of-type(1) > a').text)

        # Each tag becomes a list of whitespace-separated tokens; a course with
        # no tags is recorded as the string 'None'.
        related_tags_container = course_soup.select('a.cd-header__tag')
        if related_tags_container:
            related_tags_.append([
                tag.text.replace('\n', '').split()
                for tag in related_tags_container
            ])
        else:
            related_tags_.append('None')

        # Exactly one of these price containers is expected to exist.
        price_regular_installment = course_soup.select_one(PRICE_INSTALLMENT_SEL)
        price_discount_installment = course_soup.select_one(PRICE_DIS_INSTALLMENT_SEL)
        price_discount = course_soup.select_one(PRICE_DIS_SEL)
        price_regular = course_soup.select_one(PRICE_REG_SEL)
        price_free = course_soup.select_one(PRICE_FREE_SEL)

        if price_regular_installment is not None:  # installment / not on sale
            amount = course_soup.select_one(PRICE_INSTALLMENT_SEL + ' > div.cd-floating__price--top > span').text
            discount_rate_.append('0%')
            sale_price_.append(amount)
            price_.append(amount)

        elif price_discount_installment is not None:  # installment / on sale
            # The discount rate is read from the cd-mb-information block in the
            # sticky wrapper, not from the floating card.
            discount_rate_.append(course_soup.select_one('#main > section > div.cd-sticky-wrapper > div.cd-mb-information > div.cd-floating__price.cd-floating__price--dis-installment > div.cd-floating__price--top > span.cd-price__discount-rate').text)
            price_.append(course_soup.select_one(PRICE_DIS_INSTALLMENT_SEL + ' > div.cd-floating__price--top > del').text)
            sale_price_.append(course_soup.select_one(PRICE_DIS_INSTALLMENT_SEL + ' > div.cd-floating__price--top > span.cd-price__pay-price').text)

        elif price_discount is not None:  # no installment / on sale
            discount_rate_.append(course_soup.select_one(PRICE_DIS_SEL + ' > h4.cd-price__discount-rate').text)
            sale_price_.append(course_soup.select_one(PRICE_DIS_SEL + ' > h4:nth-of-type(2)').text)
            price_.append(course_soup.select_one(PRICE_DIS_SEL + ' > del').text)

        elif price_regular is not None:  # no installment / not on sale
            amount = course_soup.select_one(PRICE_REG_SEL + ' > h4').text
            discount_rate_.append('0%')
            sale_price_.append(amount)
            price_.append(amount)

        elif price_free is not None:  # free course
            amount = course_soup.select_one(PRICE_FREE_SEL + ' > h4').text
            discount_rate_.append('0%')
            sale_price_.append(amount)
            price_.append(amount)

        else:
            print('새로운 price 구조가 존재합니다.')
            print(content_url)
            # Keep every column list the same length even for unknown layouts.
            discount_rate_.append(np.nan)
            sale_price_.append(np.nan)
            price_.append(np.nan)

        # Rows 2-4 of the card's bottom section, stored in the order
        # course info / limit time / certificates.
        course_info_.append(course_soup.select_one(CARD_INFO_SEL + ' > div:nth-of-type(2)').text)
        limit_time_.append(course_soup.select_one(CARD_INFO_SEL + ' > div:nth-of-type(3)').text)
        certificates_.append(course_soup.select_one(CARD_INFO_SEL + ' > div:nth-of-type(4)').text)

        level_.append(course_soup.select_one(CARD_INFO_SEL + ' > div.cd-floating__info-row.cd-floating__info-row--levels > div > span.cd-floating__info--bold').text)

# Build the table directly from the column lists. Going through np.array fails
# on recent NumPy because related_tags_ holds nested lists (ragged input), and
# a dict of columns needs no transpose.
course_df = pd.DataFrame({
    'main_category': main_category_,
    'sub_category': sub_category_,
    'course_title': course_title_,
    'review_score': review_score_,
    'review_cnt': review_cnt_,
    'student_cnt': student_cnt_,
    'instructor': instructor_,
    'related_tags': related_tags_,
    'price': price_,
    'discount_rate': discount_rate_,
    'sale_price': sale_price_,
    'course_info': course_info_,
    'limit_time': limit_time_,
    'certificates': certificates_,
    'level': level_,
})
course_df.to_csv('o1_course.csv', encoding = 'utf-8-sig')

time.sleep(1)
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window.
driver.quit()



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\TaeSoo\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
