# 질문 게시판 스크래핑

질문 게시판은 서비스 시작부터 있었던 모든 질문 데이터를 스크래핑해야 하므로 오랜 실행시간이 필요합니다. 

실행 시간이 길기 때문에 '컴퓨터 부하 발생으로 인한 종료' 등의 변수로 스크래핑한 데이터가 저장되지 않는 것을 방지하고자, 세이브 포인트를 설정하여 50페이지마다 현재까지 스크래핑된 데이터를 저장하도록 하였습니다.

In [6]:
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager 
import pandas as pd
import math
import numpy as np
import re
import warnings

# Ignore every warning raised from here on (action='ignore' with no category
# or module filter) -- presumably to keep the notebook output uncluttered.
warnings.filterwarnings(action='ignore')

In [16]:
# Scrape every post on the Inflearn question board into per-column lists and
# write them to CSV, with intermediate checkpoints every 50 list pages.

# Shared CSS-selector prefixes for the question-detail page.
POST_SEL = '#main > section.community-post-detail__section.community-post-detail__post > div.section__content > div'
ANSWER_SEL = '#main > section.community-post-detail__section.community-post-detail__answer > div > div'
FIRST_ANSWER_SEL = ANSWER_SEL + ' > div.community-post-info__content > div > div:nth-of-type(1) > div.comment__card'

# Column order of the resulting CSV files.
COLUMNS = ('title', 'questioner', 'q_date', 'q_content', 'course_title', 'section', 'answer', 'a_date', 'a_content')


def _text_or_placeholder(soup, selector, placeholder='None'):
    """Return the text of the first element matching ``selector``.

    Falls back to ``placeholder`` when nothing matches, so that every column
    list receives exactly one value per post and the columns stay aligned.
    """
    node = soup.select_one(selector)
    return node.text if node is not None else placeholder


driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.implicitly_wait(10)
    driver.get('https://www.inflearn.com/community/questions')
    time.sleep(2)

    html = driver.page_source
    soup = bs(html, 'html.parser')

    # title / questioner / question date / question body / course title /
    # course section / answerer / answer date / answer body
    title_, questioner_, q_date_, q_content_, course_title_, section_, answer_, a_date_, a_content_ = [[] for _ in range(9)]
    # Built from a dict (not np.array): the answer columns mix plain strings
    # with lists, which NumPy cannot turn into a regular 2-D array.
    scraped = dict(zip(COLUMNS, (title_, questioner_, q_date_, q_content_, course_title_, section_, answer_, a_date_, a_content_)))

    last_page = int(soup.select_one('#main > section.community-body > div.community-body__content > nav > ul > li:nth-of-type(12) > a').text)
    # Page numbers (1-based) after which a checkpoint CSV is written.
    save_point = [(50 * i) for i in range(1, math.ceil(last_page/50))]

    for page_no in range(1, last_page + 1):

        url = 'https://www.inflearn.com/community/questions?page=' + str(page_no)
        driver.get(url)

        soup = bs(driver.page_source, 'html.parser')

        # Collect the id of every question post linked from this list page.
        question_list = [
            container.select_one('a')['href'].split('/questions/')[1]
            for container in soup.select('li.question-container')
        ]

        # Scrape each post individually.
        for post_id in question_list:

            content_url = 'https://www.inflearn.com/questions/' + post_id
            driver.get(content_url)

            questions_soup = bs(driver.page_source, 'html.parser')

            title_.append(_text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__header > div.header__title > h1'))

            questioner = _text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__header > div.header__sub-title > h6')
            questioner_.append(questioner.replace(', ', ''))

            q_date = _text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__header > div.header__sub-title > span')
            q_date_.append(q_date.replace('\xa0· ', ''))

            q_content = _text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__content > div.content__body.markdown-body')
            q_content_.append(q_content.replace('\n', ' '))

            course_title_.append(_text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__content > div.content__sub-info-none-member > a > div > p'))
            section_.append(_text_or_placeholder(questions_soup, POST_SEL + ' > div.community-post-info__content > div.content__sub-info-none-member > a.sub-info-none-member__unit > p.sub-info-none-member__unit-title'))

            # The answer count starts at offset 4 of the header text. Read
            # the whole run of digits so that counts of 10 or more are not
            # mistaken for a single answer.
            count_node = questions_soup.select_one(ANSWER_SEL + ' > div.answer-info__header > div')
            count_match = re.match(r'\d+', count_node.text[4:]) if count_node is not None else None
            answer_count = count_match.group() if count_match else ''

            if answer_count == '1':
                # Exactly one answer: store plain strings.
                answer_.append(_text_or_placeholder(questions_soup, FIRST_ANSWER_SEL + ' > div.comment__header.flex-row > div > div > a').replace('\n', ' '))
                a_date_.append(_text_or_placeholder(questions_soup, FIRST_ANSWER_SEL + ' > div.comment__header.flex-row > div > span'))
                a_content_.append(_text_or_placeholder(questions_soup, FIRST_ANSWER_SEL + ' > div.comment__body.markdown-body'))
            elif answer_count == '0':
                answer_.append('None')
                a_date_.append('None')
                a_content_.append('None')
            else:
                # Several answers (or an unreadable count): store one list
                # per column holding every match on the page.
                answer_.append([a.text.replace('\n', ' ') for a in questions_soup.select('a.comment__user-name')])
                a_date_.append([d.text for d in questions_soup.select('span.comment__updated-at')])
                # The answer body is a <div> carrying both classes, the same
                # element the single-answer branch reads.
                a_content_.append([c.text for c in questions_soup.select('div.comment__body.markdown-body')])

        # The run is long and may be interrupted, so write a checkpoint CSV
        # after every 50th list page.
        if page_no in save_point:
            q_df = pd.DataFrame(scraped)
            q_df.to_csv('q_' + str(page_no) + '.csv', encoding = 'utf-8-sig')

    # Save the complete result.
    q_df = pd.DataFrame(scraped)
    q_df.to_csv('o1_question.csv', encoding = 'utf-8-sig')

    time.sleep(1)
finally:
    # Always shut the browser and the chromedriver process down, even when
    # scraping fails part-way through.
    driver.quit()



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\TaeSoo\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
