# Web Crawler

In [1]:
import math
import time

# A compendium of commonly-used regular expressions.
import re

# The fundamental package for scientific computing with Python.
import numpy as np

# Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
import pandas as pd

In [2]:
# A Fast, Extensible Progress Bar for Python and CLI
from tqdm import tqdm

# A simple, yet elegant HTTP library.
import requests

# A browser automation framework and ecosystem.
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Beautiful Soup is a library that makes it easy to scrape information from web pages.
from bs4 import BeautifulSoup

In [3]:
# The old socket.ssl() support for TLS over sockets is being superseded in Python 2.6 by a new ‘ssl’ module. This package brings that module to older Python releases, 2.3.5 and up (it may also work on older versions of 2.3, but we haven’t tried it).
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

# WebDriver is an open source tool for automated testing of webapps across many browsers.
import chromedriver_autoinstaller

chromedriver_autoinstaller.install()

chrome_options = Options()
chrome_options.add_argument('headless')
chrome_options.add_argument('disable-gpu')
chrome_options.add_argument("--lang=ko_KR")
chrome_options.add_argument('content-type=application/x-www-form-urlencoded; charset=utf-8')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36')

In [4]:
driver = webdriver.Chrome(options=chrome_options)

params = {
    'section': 'qna', # kin, qna, ency
    'period': '1w', # 1w, 1m, 2002.01.01.%7C2020.12.09.
    'page': 1,
    'query': '여자친구+선물',
}
params['query'] = re.sub(' ', '+', params['query'])

endpoint = 'https://kin.naver.com/search/list.nhn?section={}&period={}&page={}&query={}'\
    .format(params['section'], params['period'], params['page'], params['query'])

driver.switch_to.window(driver.window_handles[0])
driver.get(endpoint)

WebDriverWait(driver, 10)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# paging
page_tag = soup.select_one('h2 span.number').get_text(strip=True)
page_txt = re.sub('\(|\)', '', page_tag).split('/')
num_of_rows = int(page_txt[0].split('-')[1])
total_count = int(page_txt[1])
total_page = math.ceil(total_count / num_of_rows)

# series
dataset = []

for page_no in tqdm(np.arange(1, total_page+1)):
    try:

        params.update({'page': page_no})

        endpoint = 'https://kin.naver.com/search/list.nhn?section={}&period={}&page={}&query={}'\
            .format(params['section'], params['period'], params['page'], params['query'])

        driver.switch_to.window(driver.window_handles[0])
        driver.get(endpoint)

        WebDriverWait(driver, 10)

        # parse
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        menuitem = soup.select('#s_content ul.basic1 li')

        for item in menuitem:
            try:

                element = item.select_one('._searchListTitleAnchor')
                text = element.get_text(strip=True)
                href = element.get('href')

                data = {}
                data.update({ 'title':text, 'href':href })

                for arr in re.sub('.*\?', '', href).split('&'):
                    txt = arr.split('=')
                    data.update({ txt[0]:txt[1] })

                dataset.append(data)

            except Exception as e:
                print(e)

    except Exception as e:
        print(page_no, e)

# quit() is a webdriver command which calls the driver.dispose method, which in turn closes all the browser windows and terminates the WebDriver session. If we do not use quit() at the end of program, the WebDriver session will not be closed properly and the files will not be cleared off memory. This may result in memory leak errors.
driver.quit()

# dataframe
df1 = pd.DataFrame(dataset)
df1.to_csv('./tmp/df1.csv', index=False)

100%|██████████| 8/8 [00:04<00:00,  1.96it/s]


In [5]:
driver = webdriver.Chrome(options=chrome_options)

# series
dataset = []

for page_no in tqdm(df1.index):
    try:

        endpoint = df1['href'][page_no]

        driver.switch_to.window(driver.window_handles[0])
        driver.get(endpoint)

        WebDriverWait(driver, 10)

        # parse
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # readmore
        num_of_more = int(soup.select_one('#nextPageButton ._currentAnswerCount').get_text(strip=True))
        total_more_count = int(soup.select_one('#nextPageButton ._totalAnswerCount').get_text(strip=True))
        total_more = math.floor(total_more_count / num_of_more)

        time.sleep(1)
        for idx in np.arange(0, total_more):
            driver.find_element_by_css_selector('#nextPageButton').click()
            time.sleep(1)

        # data
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        menuitem = soup.select('.answer-content__list .answer-content__item')

        for item in menuitem:
            try:

                element = item.select_one('._endContentsText')
                content = element.get_text(strip=True)

                data = {}
                data.update({ 'content':content })

                for arr in re.sub('.*\?', '', endpoint).split('&'):
                    txt = arr.split('=')
                    data.update({ txt[0]:txt[1] })

                dataset.append(data)

            except Exception as e:
                print(e)

    except Exception as e:
        print(page_no, e)

# quit() is a webdriver command which calls the driver.dispose method, which in turn closes all the browser windows and terminates the WebDriver session. If we do not use quit() at the end of program, the WebDriver session will not be closed properly and the files will not be cleared off memory. This may result in memory leak errors.
driver.quit()

# dataframe
df2 = pd.DataFrame(dataset)
df2.to_csv('./tmp/df2.csv', index=False)

  3%|▎         | 2/63 [00:08<04:45,  4.68s/it]1 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

  5%|▍         | 3/63 [00:11<04:16,  4.27s/it]'NoneType' object has no attribute 'get_text'
  6%|▋         | 4/63 [00:14<03:40,  3.74s/it]'NoneType' object has no attribute 'get_text'
  8%|▊         | 5/63 [00:18<03:43,  3.85s/it]4 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

 13%|█▎        | 8/63 [00:28<03:04,  3.35s/it]7 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

 14%|█▍        | 9/63 [00:30<02:46,  3.08s/it]8 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

 22%|██▏       | 14/63 [00:51<02:58,  3.64s/it]13 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

 24%|██▍       | 15/63 [00:55<03:05,  3.86s/it]14 Message: element not interactable
  (Session info: headless chrome=87.0.4280.88)

 27%|██▋       | 17/63 [01:04<03

In [6]:
# export
df = pd.merge(df1, df2, on='docId', how='right')
df = df[['docId', 'title', 'content', 'href']]
df.to_csv('./data/kin.csv', index=False)
df

Unnamed: 0,docId,title,content,href
0,374420755,여자친구 선물,여자친구 선물 고르실때쿠팡 참고하시면 좋습니다.​가격도 저렴한 상품이 많고생각 못했...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
1,374420755,여자친구 선물,어머낰!!그러시지 말고 차라리 마카롱을 직접 만들어 드리는 것은 어떨까요?감동일것 ...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
2,374420755,여자친구 선물,세상에 단 하나 뿐인 거라면 주문제작아이템이 좋죠1만원대 이하로 저렴하게 만들 수 ...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
3,374420755,여자친구 선물,​20대 여자 생일선물 추천 - 추천하는 선물 BEST (ver.5만원 이하)안녕하...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
4,374420755,여자친구 선물,안녕하세요 여자친구 선물이 고민이시군요 ^^먼저 아름다운 사랑을 응원합니다저도 여자...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
...,...,...,...,...
243,375159660,나이키조거팬츠 상품 문의드려요. 20,여자친구 선물해주시면 좋아할 것 같네요. ^^​AI 기술로 분석한 키워드 검색으로 ...,https://kin.naver.com/qna/detail.nhn?d1id=5&di...
244,374899456,여자친구 생일선물 로이드 목걸이 줄길이...,총길이는 40cm에 37에 여유줄이 3cm인것 같아요 ~~마르셨다면 너무 길게 착용...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
245,374899456,여자친구 생일선물 로이드 목걸이 줄길이...,"추우니까 따뜻한 담요 선물해주는 건 어떨까요? 연말 ,크리스마스 선물 겸 주기도 좋...",https://kin.naver.com/qna/detail.nhn?d1id=8&di...
246,374899456,여자친구 생일선물 로이드 목걸이 줄길이...,안녕하세요 여자친구 선물이 고민이시군요 ^^먼저 아름다운 사랑을 응원합니다저도 여자...,https://kin.naver.com/qna/detail.nhn?d1id=8&di...
