In [10]:
import os
import time
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# ----------------------------------
# 날짜 입력받기
#   -> 입력하지 않으면 오늘날짜 반환
#   -> 잘못된 형식이면 None 반환
# ----------------------------------
def date_input():
    """Prompt for a date and return it as a zero-padded ``YYYY-MM-DD`` string.

    Leading/trailing whitespace in the answer is ignored. An empty answer is
    replaced by today's date. If the text cannot be parsed with the
    ``%Y-%m-%d`` format, a message is printed and ``None`` is returned.

    Returns:
        str | None: the validated date in ``YYYY-MM-DD`` form, or ``None``
        when the input is not a valid date.
    """
    date_str = input("날짜:(형식:2020-01-05)").strip()

    if date_str == "":
        date_str = datetime.now().strftime('%Y-%m-%d')

    try:
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        # strptime signals a format mismatch or an impossible date with
        # ValueError; anything else (e.g. KeyboardInterrupt) should propagate.
        print("날짜 형식이 올바르지 않습니다.")
        return None

    # Re-emit in canonical form so inputs like "2020-1-5" come back zero-padded.
    return parsed.date().isoformat()

# Keep asking until a valid date is entered: date_input() returns None for
# malformed input, and a None here would be interpolated into the request
# URL and the output file name as the literal text "None".
date = date_input()
while date is None:
    date = date_input()



In [11]:
# Seconds to wait for the server before giving up on a page request.
REQUEST_TIMEOUT = 10

# One dict per article: title, summary, press, wdate, link.
data = []
i = 1
while True:

    # Random 1-3 second pause before every request to throttle the crawl.
    time.sleep(np.random.uniform(1.0, 3.0))

    # ----------------------------------
    # Request page `i` of the main-news list for the chosen date
    # ----------------------------------
    url = f'https://finance.naver.com/news/mainnews.naver?date={date}&page={i}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were an (empty) result page.
    response.raise_for_status()

    # ----------------------------------
    # Parse the response body into a BeautifulSoup tree
    # ----------------------------------
    soup = BeautifulSoup(response.text, 'html.parser')

    # ----------------------------------
    # Extract the list of article blocks
    # ----------------------------------
    articles = soup.select(".block1")

    # ----------------------------------
    # Pull the text fields and link attribute out of each article block
    # ----------------------------------
    for article in articles:
        subject = article.select_one(".articleSubject>a")
        title = subject.text
        # Only the first child node of the summary element is used.
        summary = article.select_one(".articleSummary").contents[0].text.strip()
        press = article.select_one(".press").text.strip()
        wdate = article.select_one(".wdate").text.strip()

        # Look the ids up by parameter name rather than by position so the
        # result does not depend on the order of the query parameters.
        query = parse_qs(urlparse(subject.attrs['href']).query)
        article_id = query['article_id'][0]
        office_id = query['office_id'][0]
        link = f'https://n.news.naver.com/mnews/article/{office_id}/{article_id}'

        data.append({"title":title, "summary":summary, "press":press, "wdate":wdate, "link":link})

    i += 1
    # Stop once the page no longer contains a '.pgRR' element.
    if soup.select_one('.pgRR') is None:
        break


In [12]:
# Collect the scraped records into a table; each dict in `data` becomes one row.
df = pd.DataFrame(data)

# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    155 non-null    object
 1   summary  155 non-null    object
 2   press    155 non-null    object
 3   wdate    155 non-null    object
 4   link     155 non-null    object
dtypes: object(5)
memory usage: 6.2+ KB


In [13]:
# The Unix-timestamp suffix gives each run for the same date its own file name.
suffix = f'{date}_{int(time.time())}'
file_name = f'data/네이버증권뉴스/주요뉴스_모든페이지_{suffix}'

# Create the output folder first; writing fails if the directory is missing.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# Save the same table in both CSV and Excel form.
df.to_csv(f'{file_name}.csv')
df.to_excel(f'{file_name}.xlsx')