In [11]:
from selenium import webdriver as wd
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib
from selenium.webdriver.common.by import By
import os

def get_article_info(driver, crawl_date, press_list, title_list, link_list, date_list, more_news_base_url=None, more_news=False):
    more_news_url_list = []
    while True:    
        page_html_source = driver.page_source
        url_soup = BeautifulSoup(page_html_source, 'lxml')
        
        more_news_infos = url_soup.select('a.news_more')
        
        if more_news:
            for more_news_info in more_news_infos:
                more_news_url = f"{more_news_base_url}{more_news_info.get('href')}"

                more_news_url_list.append(more_news_url)

        article_infos = url_soup.select("div.news_area")
        
        if not article_infos:
            break

        for article_info in article_infos:  
            press_info = article_info.select_one("div.info_group > a.info.press")
            
            if press_info is None:
                press_info = article_info.select_one("div.info_group > span.info.press")
            article = article_info.select_one("a.news_tit")
            
            press = press_info.text.replace("언론사 선정", "")
            title = article.get('title')
            link = article.get('href')

#             print(f"press - {press} / title - {title} / link - {link}")
            press_list.append(press)
            title_list.append(title)
            link_list.append(link)
            date_list.append(crawl_date)

        time.sleep(2.0)
                      
                      
        next_button_status = url_soup.select_one("a.btn_next").get("aria-disabled")
        
        if next_button_status == 'true':
            break
        
        time.sleep(1.0)
        next_page_btn = driver.find_element(By.CSS_SELECTOR,"a.btn_next").click()      
         
    
    return press_list, title_list, link_list, more_news_url_list
    
    

def get_naver_news_info_from_selenium(keyword, save_path, target_date, ds_de, sort=0, remove_duplicate=False):
    crawl_date = f"{target_date[:4]}.{target_date[4:6]}.{target_date[6:]}"
    driver = wd.Chrome("../../ssuyan/PAST_NEWS/chromedriver_win32.zip/chromedriver") # chromedriver 파일 경로

    encoded_keyword = urllib.parse.quote(keyword)
    url = f"https://search.naver.com/search.naver?where=news&query={encoded_keyword}&sm=tab_opt&sort={sort}&photo=0&field=0&pd=3&ds={ds_de}&de={ds_de}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom{target_date}to{target_date}&is_sug_officeid=0"
    
    more_news_base_url = "https://search.naver.com/search.naver"

    driver.get(url)
    
    press_list, title_list, link_list, date_list, more_news_url_list = [], [], [], [], []
    
    press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver, 
                                                                             crawl_date=crawl_date, 
                                                                             press_list=press_list, 
                                                                             title_list=title_list, 
                                                                             link_list=link_list,
                                                                             date_list=date_list,
                                                                             more_news_base_url=more_news_base_url,
                                                                             more_news=True)
    driver.close()
    
    if len(more_news_url_list) > 0:
        print(len(more_news_url_list))
        more_news_url_list = list(set(more_news_url_list))
        print(f"->{len(more_news_url_list)}")
        for more_news_url in more_news_url_list:
            driver = wd.Chrome("../../ssuyan/PAST_NEWS/chromedriver_win32.zip/chromedriver")
            driver.get(more_news_url)
            
            press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver, 
                                                                             crawl_date=crawl_date, 
                                                                             press_list=press_list, 
                                                                             title_list=title_list, 
                                                                             link_list=link_list,
                                                                             date_list=date_list)
            driver.close()
    article_df = pd.DataFrame({"날짜": date_list, "언론사": press_list, "제목": title_list, "링크": link_list})
    
    print(f"extract article num : {len(article_df)}")
    if remove_duplicate:
        article_df = article_df.drop_duplicates(['링크'], keep='first')
        print(f"after remove duplicate -> {len(article_df)}")
    
    article_df.to_excel(save_path, index=False)

In [12]:
from datetime import datetime
from tqdm import tqdm

def crawl_news_data(keyword, year, month, start_day, end_day, save_path):
    for day in tqdm(range(start_day, end_day+1)):
        date_time_obj = datetime(year=year, month=month, day=day)
        target_date = date_time_obj.strftime("%Y%m%d")
        ds_de = date_time_obj.strftime("%Y.%m.%d")

        get_naver_news_info_from_selenium(keyword=keyword, save_path=f"{save_path}/{keyword}/{target_date}_{keyword}_.xlsx", target_date=target_date, ds_de=ds_de, remove_duplicate=False)

In [13]:
keywords = ['방위']
save_path = "./naver_news_article_2022"

for keyword in keywords:
    os.makedirs(f"{save_path}/{keyword}")

---
##### 2022년 1월 1일 부터 13일까지 값을 크롤링
---

In [14]:
for keyword in keywords:
    print(f"start keyword - {keyword} crawling ...")
    crawl_news_data(keyword=keyword, year=2022, month=1, start_day=1, end_day=13, save_path=save_path)

start keyword - 방위 crawling ...


  driver = wd.Chrome("../../ssuyan/PAST_NEWS/chromedriver_win32.zip/chromedriver") # chromedriver 파일 경로


4
->4


  driver = wd.Chrome("../../ssuyan/PAST_NEWS/chromedriver_win32.zip/chromedriver")


extract article num : 100


  8%|▊         | 1/13 [01:07<13:27, 67.30s/it]

1
->1
extract article num : 56


 15%|█▌        | 2/13 [01:41<08:48, 48.01s/it]

63
->54


 15%|█▌        | 2/13 [06:13<34:12, 186.63s/it]


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00E7A813+48355]
	(No symbol) [0x00E0C4B1]
	(No symbol) [0x00D15358]
	(No symbol) [0x00CFD293]
	(No symbol) [0x00D5E37B]
	(No symbol) [0x00D6C473]
	(No symbol) [0x00D5A536]
	(No symbol) [0x00D382DC]
	(No symbol) [0x00D393DD]
	GetHandleVerifier [0x010DAABD+2539405]
	GetHandleVerifier [0x0111A78F+2800735]
	GetHandleVerifier [0x0111456C+2775612]
	GetHandleVerifier [0x00F051E0+616112]
	(No symbol) [0x00E15F8C]
	(No symbol) [0x00E12328]
	(No symbol) [0x00E1240B]
	(No symbol) [0x00E04FF7]
	BaseThreadInitThunk [0x76B27D59+25]
	RtlInitializeExceptionChain [0x77AFB74B+107]
	RtlClearBits [0x77AFB6CF+191]
