In [7]:
# ! pip install selenium

# Utility Functions for dev

In [1]:
import selenium
from bs4 import BeautifulSoup
import requests
import tqdm
import pprint

In [87]:
def HTML_string_pprint(HTML_str):
    soup = BeautifulSoup(HTML_str)
    print(soup.prettify())

# NewsCrawler_1.0

In [8]:
import os
import time
from IPython.display import clear_output #Ipython 환경에서만 필요
from abc import *
from bs4 import BeautifulSoup
import requests


class NewsCrawler(metaclass=ABCMeta):
    @classmethod
    def crawl(cls, max_num = 1):
        start_time = time.time()
        articles_list = []
        
        try:
            cls._crawl(max_num, articles_list)
        except StopIteration:
            pass
        except BaseException as e:
            raise e
        finally:
            print(f"execution time : {round(time.time() - start_time, 2)}s")
            return articles_list
        
    @classmethod
    @abstractmethod
    def _bs4_element2article_json(cls, bs4_element):
        pass
    
    @classmethod
    @abstractmethod
    def _crawl(cls, max_num, articles_list):
        pass
        
    @staticmethod
    def url2soup(url):
        req = requests.get(url)
        html = req.text
        return BeautifulSoup(html, "html.parser")
    

class HankyorehCrawler(NewsCrawler):
    home_url = "https://www.hani.co.kr"
    
    @classmethod
    def _bs4_element2article_json(cls, bs4_element):
        try:
            article_json = {}
            article_json["title"] = bs4_element.select_one(".article-title a").text if bs4_element.select_one(".article-title a") else None
            article_json["datetime_str"] = bs4_element.select_one(".date").text if bs4_element.select_one(".date") else None
            article_json["preview_prologue"] = bs4_element.select_one(".article-prologue a").text if bs4_element.select_one(".article-prologue a") else None
            article_json["detail_link_postfix"] = bs4_element.select_one(".article-title a")["href"] if bs4_element.select_one(".article-title a") else None
            article_json["preview_img_path"] = bs4_element.select_one(".article-photo img")["src"] if bs4_element.select_one(".article-photo img") else None
            article_json["journal_name"] = "한겨레"

            detail_url_str = cls.home_url + article_json["detail_link_postfix"]
            soup = NewsCrawler.url2soup(detail_url_str)

            article_json["detail_img_path"] = soup.select_one(".article-text .image img")["src"] if soup.select_one(".article-text .image img") else None
            article_json["detail_text"] = soup.select_one(".article-text .text").text if soup.select_one(".article-text .text") else None
        
            return article_json
        
        except BaseException as e:
            print("Error occured at ...")
            print(article_json)
            raise e
                        
    @classmethod
    def _crawl_from_page(cls, page_num, max_num, articles_list):
        url_str = f"{cls.home_url}/arti/politics/list{page_num}.html"
        soup = NewsCrawler.url2soup(url_str)
        elements = soup.select(".article-area")
        
        for element in elements:
            articles_list.append(cls._bs4_element2article_json(element))

            # progress checker
            # os.system('clear') # for pycharm, vscode etc...
            clear_output(wait=True) # for Ipython
            print(f"Crawled {len(articles_list)} / {max_num} articles.")

            if len(articles_list) >= max_num:
                raise StopIteration
                
    @classmethod
    def _crawl(cls, max_num, articles_list):
        page_num = 0
        
        while True:
            page_num += 1
            cls._crawl_from_page(page_num, max_num, articles_list)

In [9]:
articles_list = HankyorehCrawler.crawl(10)

Crawled 10 / 10 articles.
execution time : 1.32s


In [10]:
articles_list

[{'title': '[공덕포차] 윤석열 X 이재명...‘협치 임파서블’',
  'datetime_str': '2022-10-27 20:42',
  'preview_prologue': '지난 24일 검찰이 민주당사 민주연구원을 압수수색했죠. 민주당은 윤석열 대통령의 국회 시정연설을 전면 보이콧하며 맞섰는...',
  'detail_link_postfix': '/arti/politics/politics_general/1064607.html',
  'preview_img_path': '//flexible.img.hani.co.kr/flexible/normal/212/112/imgdb/original/2022/1027/20221027503837.jpg',
  'journal_name': '한겨레',
  'detail_img_path': '//flexible.img.hani.co.kr/flexible/normal/970/546/imgdb/original/2022/1027/20221027503837.jpg',
  'detail_text': '\n\n\n\n지난 24일 검찰이 민주당사 민주연구원을 압수수색했죠. 민주당은 윤석열 대통령의 국회 시정연설을 전면 보이콧하며 맞섰는데요. 여야가 강대강으로 대치하고 있는 상황! 말뿐인 “민생”, ‘협치’는 온데간데없고 그 피해는 오로지 국민이 입고 있습니다. 톰 크루즈에게 주어진 ‘미션’보다 어렵게 느껴지는 ‘협치’, 불가능한 걸까요? 〈공덕포차 ‘협치 임파서블’〉편에서 이야기해봤습니다. 지금 바로 확인하세요~\n\n한겨레TV ‘공덕포차’ 라이브방송 보러 가기\nhttps://youtu.be/CxVz7kpfNN4\n\n\n\n\n\n\n\n\n\n제작진\n기획 : 송호진\n프로듀서 : 이경주 김도성\n작가 : 박연신\n행정: 김양임\n타이틀 : 문석진\n기술: 박성영\n음향 : 장지남\n카메라 : 장승호 윤제욱 신형철\n자막그래픽디자인: 김수경\n연출 : 정주용 이규호 임여경 (graceyyk0826@hani.co.kr)\n제작:

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--window-size=1920,1080")

chosun_politics_home = "https://www.chosun.com/politics/"
sample_Xpath1 = '//*[@id="main"]/div[8]/section/div/div/div/div[1]/div/div/div/div[1]'
sample_Xpath2 = '//*[@id="main"]/div[8]/section/div/div/div/div[2]/div/div/div/div[1]'
sample_Xpath3 = '//*[@id="main"]/div[8]/section/div/div/div/div[5]/div/div/div/div[1]'
driver = webdriver.Chrome(options=chrome_options)
driver.get(chosun_politics_home)
driver.implicitly_wait(3)
# html = driver.find_element(By.XPATH, sample_Xpath)

In [8]:
# 광고 윗쪽 영역
css_selector1 = "#main > div.flex-chain-wrapper.lg.\|.width--100.box--pad-bottom-xl.box--bg-grey-10.box--border.box--border-grey-40.box--border-xs.box--border-horizontal.box--border-horizontal-bottom.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(1) > div > div > div > div.story-card.story-card--art-left.\|.flex.flex--wrap"
X_path1 = '//*[@id="main"]/div[8]/section/div/div/div/div[1]/div/div/div/div[1]'
css_selector2 = "#main > div.flex-chain-wrapper.lg.\|.width--100.box--pad-bottom-xl.box--bg-grey-10.box--border.box--border-grey-40.box--border-xs.box--border-horizontal.box--border-horizontal-bottom.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(2) > div > div > div > div.story-card.story-card--art-left.\|.flex.flex--wrap"
X_path2 = '//*[@id="main"]/div[8]/section/div/div/div/div[2]/div/div/div/div[1]'
css_selector_parent_upper = "#main > div.flex-chain-wrapper.lg.\|.width--100.box--pad-bottom-xl.box--bg-grey-10.box--border.box--border-grey-40.box--border-xs.box--border-horizontal.box--border-horizontal-bottom.box--hidden-sm.box--hidden-md-only > section > div > div > div"

css_selector_sample = "main > div.flex-chain-wrapper.lg.\|.width--100.box--pad-bottom-xl.box--bg-grey-10.box--border.box--border-grey-40.box--border-xs.box--border-horizontal.box--border-horizontal-bottom.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(1) > div > div > div >"

# 광고 아래쪽 영역 (더보기에 의해 추가 되는 영역)
css_selector3 = "#main > div.flex-chain-wrapper.lg.\|.box--margin-none.width--100.box--pad-top-xl.box--bg-undefined.box--border.box--border-black.box--border-xs.box--border-horizontal.box--border-horizontal-top.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(1) > div > div > div > div.story-card.story-card--art-right.\|.flex.flex--wrap"
X_path3 = '//*[@id="main"]/div[10]/section/div/div/div/div[1]/div/div/div/div[1]'
# 아래는 사진 없는것
css_selector4 = "#main > div.flex-chain-wrapper.lg.\|.box--margin-none.width--100.box--pad-top-xl.box--bg-undefined.box--border.box--border-black.box--border-xs.box--border-horizontal.box--border-horizontal-top.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(3) > div > div > div > div.story-card.story-card--none.story-card--no-art.\|.flex.flex--wrap"
# 더보기 직전 마지막
css_selector5 = "#main > div.flex-chain-wrapper.lg.\|.box--margin-none.width--100.box--pad-top-xl.box--bg-undefined.box--border.box--border-black.box--border-xs.box--border-horizontal.box--border-horizontal-top.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(10) > div > div > div > div"

#더보기 이후
css_selector6 = "#main > div.flex-chain-wrapper.lg.\|.box--margin-none.width--100.box--pad-top-xl.box--bg-undefined.box--border.box--border-black.box--border-xs.box--border-horizontal.box--border-horizontal-top.box--hidden-sm.box--hidden-md-only > section > div > div > div > div:nth-child(19) > div > div > div > div.story-card.story-card--art-right.\|.flex.flex--wrap"





In [9]:
# elements = driver.find_elements(By.CSS_SELECTOR, css_selector_parent)
element = driver.find_element(By.XPATH, X_path1)
# element = driver.find_element(By.CSS_SELECTOR, css_selector_sample)

In [10]:
html_parent_str = element.get_attribute('innerHTML')
HTML_string_pprint(html_parent_str)

<html>
 <body>
  <div class="story-card-block story-card-left story-card-block--art | grid__col--sm-3 grid__col--md-3 grid__col--lg-3">
   <div class="story-card-component story-card__art | box--position-relative" id="artwrapper">
    <div class="image-wrapper | box--position-relative">
     <figure class="story-card__art-image | visual__image">
      <div class="width--100 box--margin-center">
       <div class="article-print-sty1 | box--position-relative width--100" style="padding-bottom: 56.25%;">
        <div class="article-print-sty2 | width--100 height--100 box--position-absolute box--position-absolute-top box--position-absolute-left">
         <a class="" href="/politics/assembly/2022/10/26/DGCOUVXEEZAJVFYEVBF7K2GB2M/">
          <div class="lazyload-wrapper">
           <img alt="尹 “과거 제시한 탄소감축 목표, 국민 부담 감안했는지 의문”" class="box--display-block" src="https://images.chosun.com/resizer/gdTSCzndc6HgWgMQa5L2REdm7-0=/400x225/smart/cloudfront-ap-northeast-1.images.arcpublishing.com/chosu

In [13]:
## 사진이 왼쪽에 있는 것들
bs4_element = BeautifulSoup(html_parent_str, "html.parser")
title = bs4_element.select_one("div>div>a>span")
# datetime_str =
preview_prologue = bs4_element.select_one("div>div>span")
detail_link_postfix = bs4_element.select_one("div>div>a")["href"]
preview_img_path = bs4_element.select_one("img")["src"]
journal_name = "조선일보"

detail_url_str = "https://www.chosun.com" + detail_link_postfix
soup = NewsCrawler.url2soup(detail_url_str)

detail_img_path = soup.select_one(".article-body img")
# soup
print(title)
print(preview_prologue)

<span>尹 “과거 제시한 탄소감축 목표, 국민 부담 감안했는지 의문”</span>
<span>윤석열 대통령이 26일 ‘2050 탄소중립녹색성장위원회’ 오찬 간담회에서 전임 문재인 정부가 설정한 탄소 배출 감축 목표치에 대해 “국민 부담이 어떤 것인지 과연 제대로 짚어보고 한 것인지 의문”이라고 말했다. 산업계에 미치는 영향을 충분히 고려하지 않고 설정한 목표치였다는 것이다. 윤 대통령은 이날 서울 용산 대통령실 청사에서 열린 오찬 간담회에서 “우리...</span>


In [23]:
class ChosunCrawler(NewsCrawler):
    home_url = "https://www.chosun.com"
    @classmethod
    def 
    
    @classmethod
    def _bs4_element2article_json(cls, bs4_element):
        try:
            article_json = {}
            article_json["title"] = bs4_element.select_one(".article-title a").text if bs4_element.select_one(".article-title a") else None
            article_json["datetime_str"] = bs4_element.select_one(".date").text if bs4_element.select_one(".date") else None
            article_json["preview_prologue"] = bs4_element.select_one(".article-prologue a").text if bs4_element.select_one(".article-prologue a") else None
            article_json["detail_link_postfix"] = bs4_element.select_one(".article-title a")["href"] if bs4_element.select_one(".article-title a") else None
            article_json["preview_img_path"] = bs4_element.select_one(".article-photo img")["src"] if bs4_element.select_one(".article-photo img") else None
            article_json["journal_name"] = "한겨레"

            detail_url_str = cls.home_url + article_json["detail_link_postfix"]
            soup = NewsCrawler.url2soup(detail_url_str)

            article_json["detail_img_path"] = soup.select_one(".article-text .image img")["src"] if soup.select_one(".article-text .image img") else None
            article_json["detail_text"] = soup.select_one(".article-text .text").text if soup.select_one(".article-text .text") else None
        
            return article_json
        
        except BaseException as e:
            print("Error occured at ...")
            print(article_json)
            raise e
                        
    @classmethod
    def _crawl_from_page(cls, page_num, max_num, articles_list):
        url_str = f"{cls.home_url}/arti/politics/list{page_num}.html"
        soup = NewsCrawler.url2soup(url_str)
        elements = soup.select(".article-area")
        
        for element in elements:
            articles_list.append(cls._bs4_element2article_json(element))

            # progress checker
            # os.system('clear') # for pycharm, vscode etc...
            clear_output(wait=True) # for Ipython
            print(f"Crawled {len(articles_list)} / {max_num} articles.")

            if len(articles_list) >= max_num:
                raise StopIteration
                
    @classmethod
    def _crawl(cls, max_num, articles_list):
        page_num = 0
        
        while True:
            page_num += 1
            cls._crawl_from_page(page_num, max_num, articles_list)
        

# NewsCrawler_2.0

In [1]:
import os
import time
from IPython.display import clear_output #Ipython 환경에서만 필요
from abc import *
from bs4 import BeautifulSoup
import requests


class NewsCrawler(metaclass=ABCMeta):
    @classmethod
    def crawl(cls, max_num = 1):
        start_time = time.time()
        articles_list = []
        
        try:
            cls._crawl(max_num, articles_list)
        except StopIteration as e:
            print(e)
            pass
        except BaseException as e:
            print(e)
        finally:
            print(f"execution time : {round(time.time() - start_time, 2)}s")
            return articles_list
        
    @classmethod
    @abstractmethod
    def _bs4_element2article_json(cls, bs4_element):
        pass
    
    @classmethod
    @abstractmethod
    def _crawl(cls, max_num, articles_list):
        pass
        
    @staticmethod
    def url2soup(url):
        req = requests.get(url, headers={'User-Agent':'Mozilla/5.0'})
        html = req.text
        return BeautifulSoup(html, "html.parser")
    

class NaverCrawler(NewsCrawler):
    home_url = "https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=100"
    
    @classmethod
    def _bs4_element2article_json(cls, bs4_element):
        try:
            article_json = {}
            article_json["datetime_str"] = bs4_element.select_one("span.date").text if bs4_element.select_one("span.date") else None
            article_json["preview_prologue"] = bs4_element.select_one("span.lede").text if bs4_element.select_one("span.lede") else None
            article_json["detail_link"] = bs4_element.select_one("dt.photo a")["href"] if bs4_element.select_one("dt.photo a") else None
            article_json["preview_img_path"] = bs4_element.select_one("dt.photo a img")["src"] if bs4_element.select_one("dt.photo a img") else None
            article_json["journal_name"] = bs4_element.select_one("span.writing").text if bs4_element.select_one("span.writing") else None

            detail_url_str = article_json["detail_link"]
            soup = NewsCrawler.url2soup(detail_url_str)

            article_json["title"] = soup.select_one(".media_end_head_title .media_end_head_headline").text if soup.select_one(".media_end_head_title .media_end_head_headline") else None
            article_json["detail_img_path"] = soup.select_one(".end_photo_org img._LAZY_LOADING")["data-src"] if soup.select_one(".end_photo_org img._LAZY_LOADING") else None
            article_json["detail_text"] = soup.select_one("div#dic_area").text if soup.select_one("div#dic_area") else None
        
            return article_json
        
        except BaseException as e:
            print("Error occured at ...")
            print(article_json)
            raise e
                    
    @classmethod
    def _crawl_from_page(cls, url_str, max_num, articles_list):    #TODO : add press 'see more'
        soup = NewsCrawler.url2soup(url_str)
        
        article_element_list = soup.select(".type06_headline li dl")
        
        for article_element in article_element_list:
            articles_list.append(cls._bs4_element2article_json(article_element))

            # progress checker
            # os.system('clear') # for pycharm, vscode etc...
            clear_output(wait=True) # for Ipython
            print(f"Crawled {len(articles_list)} / {max_num} articles.")

            if len(articles_list) >= max_num:
                raise StopIteration
                
    @classmethod
    def _crawl(cls, max_num, articles_list):
        soup = NewsCrawler.url2soup(cls.home_url)
        elements = soup.select("div.cluster_group .cluster_foot a")
        
        for element in elements:
            articles_page_url = "https://news.naver.com" + element["href"]
            cls._crawl_from_page(articles_page_url, max_num, articles_list)

        

In [2]:
a = NaverCrawler.crawl(5)

Crawled 5 / 5 articles.

execution time : 2.07s


In [4]:
a

[{'datetime_str': '2022.11.08',
  'preview_prologue': '외교부는 일본 강제징용 배상 문제에 대한 해결 방안을 모색하기 위해 "공청회뿐만 아니라 다양한 형태의 의견 수렴 절차를 계속 검토 중"이라고 밝혔습니다. 외교부 당국자는 "네 차례 민관협의회 이후에 확장된 형태의 의',
  'detail_link': 'https://n.news.naver.com/mnews/article/214/0001233845?sid=100',
  'preview_img_path': 'https://imgnews.pstatic.net/image/origin/214/2022/11/08/1233845.jpg?type=ofullfill106_72',
  'journal_name': 'MBC',
  'title': '외교부 "강제징용 관련 공청회뿐 아니라 다양한 형태 의견 수렴 추진"',
  'detail_img_path': 'https://imgnews.pstatic.net/image/214/2022/11/08/0001233845_001_20221108173401605.jpg?type=w647',
  'detail_text': '\n\n\n\n\n외교부 [자료사진]외교부는 일본 강제징용 배상 문제에 대한 해결 방안을 모색하기 위해 "공청회뿐만 아니라 다양한 형태의 의견 수렴 절차를 계속 검토 중"이라고 밝혔습니다.외교부 당국자는 "네 차례 민관협의회 이후에 확장된 형태의 의견 수렴 절차를 거치겠다고 이미 여러 차례 밝혀왔다"며, "이 같은 형식의 방식을 현재 검토하고 있고, 또 다른 형태의 의견 수렴하는 방식에 대해서도 검토를 하고 있다"고 말했습니다.이는 정부가 그동안 개최를 추진해 온 공청회나 공개토론회 등 확장된 형태의 의견 수렴과는 또 다른 형태의 의견 수렴 방식을 언급한 것으로 풀이됩니다.당국자는 "확장된 형태로 의견을 수렴하는 공청회 등의 방식에 대해서는 아직 시기 등이 정해진 것이 없다"며, "적절한 시기가 도래해 발표할 수 있도록 추진

## filter 추가


In [5]:
import os
import sys
import time
from IPython.display import clear_output #Ipython 환경에서만 필요
from abc import *
from bs4 import BeautifulSoup
import requests  
import pprint

class NewsCrawler(metaclass=ABCMeta):
    @classmethod
    def crawl(cls, max_num = 1):
        start_time = time.time()
        articles_list = []
        
        try:
            cls._crawl(max_num, articles_list)
        except StopIteration as e:
            print(e)
            pass
        except BaseException as e:
            print(e)
        finally:
            print(f"execution time : {round(time.time() - start_time, 2)}s")
            return articles_list
        
    @classmethod
    @abstractmethod
    def _bs4_element2article_json(cls, bs4_element):
        pass
    
    @classmethod
    @abstractmethod
    def _crawl(cls, max_num, articles_list):
        pass
        
    @staticmethod
    def url2soup(url):
        req = requests.get(url, headers={'User-Agent':'Mozilla/5.0'})
        html = req.text
        return BeautifulSoup(html, "html.parser")
    

class NaverCrawler(NewsCrawler):
    home_url = "https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=100"
    
    @classmethod
    def _bs4_element2article_json(cls, bs4_element):
        try:
            article_json = {}
            article_json["datetime"] = bs4_element.select_one("span.date").text if bs4_element.select_one("span.date") else None
            article_json["preview_prologue"] = bs4_element.select_one("span.lede").text if bs4_element.select_one("span.lede") else None
            article_json["detail_link"] = bs4_element.select_one("dt.photo a")["href"] if bs4_element.select_one("dt.photo a") else None
            article_json["preview_img_path"] = bs4_element.select_one("dt.photo a img")["src"] if bs4_element.select_one("dt.photo a img") else None
            article_json["journal_name"] = bs4_element.select_one("span.writing").text if bs4_element.select_one("span.writing") else None

            detail_url_str = article_json["detail_link"]
            soup = NewsCrawler.url2soup(detail_url_str)

            article_json["title"] = soup.select_one(".media_end_head_title .media_end_head_headline").text if soup.select_one(".media_end_head_title .media_end_head_headline") else None
            article_json["detail_img_path"] = soup.select_one(".end_photo_org img._LAZY_LOADING")["data-src"] if soup.select_one(".end_photo_org img._LAZY_LOADING") else None
            article_json["detail_text"] = soup.select_one("div#dic_area").text if soup.select_one("div#dic_area") else None
        
            return article_json
        
        except BaseException as e:
            print("Error occured at ...")
            print(article_json)
            print(e)
            raise e
                    
    @classmethod
    def _crawl_from_page(cls, url_str, max_num, articles_list):    #TODO : add press 'see more'
        soup = NewsCrawler.url2soup(url_str)
        
        article_element_list = soup.select(".type06_headline li dl")
        
        for article_element in article_element_list:
            article_json = cls._bs4_element2article_json(article_element)
            if article_json['journal_name'] in ['한겨레', '조선일보']:
              articles_list.append(article_json)
              print(f"Crawled {len(articles_list)} / {max_num} articles.")

            # progress checker
            # os.system('clear') # for pycharm, vscode etc...
            clear_output(wait=True) # for Ipython
            
            if len(articles_list) >= max_num:
                raise StopIteration
                
    @classmethod
    def _crawl(cls, max_num, articles_list):
        soup = NewsCrawler.url2soup(cls.home_url)
        elements = soup.select("div.cluster_group .cluster_foot a")
        
        for element in elements:
            articles_page_url = "https://news.naver.com" + element["href"]
            cls._crawl_from_page(articles_page_url, max_num, articles_list)

인자로 받을 개수를 적어야 합니다.


NameError: name 'number_of_articles' is not defined

: 