In [1]:
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

def make_pg_num(num):
    """Convert a 1-based result page number into the value of the `start` query parameter.

    Page 1 maps to 1, page 2 to 11, page 3 to 21, and so on (steps of 10).
    """
    items_before_page = (num - 1) * 10
    return items_before_page + 1

def make_url(search, start_pg, end_pg):
    """Build the Naver news search URLs for pages start_pg..end_pg (inclusive).

    Args:
        search: Raw search keyword. It is URL-encoded here, so callers should
            pass the plain text, not a pre-encoded string.
        start_pg: First 1-based result page to include.
        end_pg: Last 1-based result page to include.

    Returns:
        A list with one URL string per page; empty when start_pg > end_pg.
    """
    # Encode the keyword so characters such as '&', '#', '+' or spaces cannot
    # terminate the `query` parameter or inject extra parameters.
    encoded_query = quote_plus(str(search))
    return [
        f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query={encoded_query}&start={make_pg_num(page)}"
        for page in range(start_pg, end_pg + 1)
    ]

def articles_crawler(url, headers):
    """Fetch one search-result page and extract its article links.

    Args:
        url: Search-result page URL to request.
        headers: HTTP headers passed to `requests.get`.

    Returns:
        A list of (title, href) tuples for the anchors matched by the CSS
        selector below. Returns an empty list (after printing the error) when
        the request fails, the server answers with an HTTP error status, or
        parsing raises.
    """
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        html = BeautifulSoup(response.text, "html.parser")
        anchors = html.select("div.group_news > ul.list_news > li div.news_area > a")

        results = []
        for anchor in anchors:
            link = anchor.get("href")
            if not link:
                # An anchor without a target cannot be crawled; skip it rather
                # than letting a KeyError discard the whole page.
                continue
            # Fall back to the visible anchor text when `title` is absent.
            title = anchor.get("title") or anchor.get_text(strip=True)
            results.append((title, link))
        return results
    except Exception as e:
        # Best-effort crawl: report the problem and let the caller continue.
        print(f"Error occurred when fetching content from {url}: {str(e)}")
        return []

def news_contents_crawler(news_url, headers):
    """Download an article page and return the text of all its <p> elements.

    Args:
        news_url: URL of the article page.
        headers: HTTP headers passed to `requests.get`.

    Returns:
        The text of every <p> element joined with single spaces, or an empty
        string (after printing the error) when the request fails, the server
        answers with an HTTP error status, or parsing raises.
    """
    try:
        news = requests.get(news_url, headers=headers, timeout=5)
        # Do not store the body of a 4xx/5xx error page as article content.
        news.raise_for_status()
        # When the server declares no charset, requests falls back to
        # ISO-8859-1 for text/* responses, which garbles non-Latin pages.
        # Use the encoding detected from the body in that case.
        if "charset" not in news.headers.get("Content-Type", "").lower():
            news.encoding = news.apparent_encoding
        news_html = BeautifulSoup(news.text, "html.parser")
        paragraphs = news_html.find_all('p')
        return " ".join(p.get_text() for p in paragraphs)
    except Exception as e:
        # Best-effort crawl: report which article failed and keep going.
        print(f"Error occurred when fetching content from {news_url}: {str(e)}")
        return ""

# Ask the user for the keyword and the inclusive range of result pages.
search = input("검색할 키워드를 입력해주세요:")
start_page = int(input("\n크롤링할 시작 페이지를 입력해주세요. ex)1:"))
end_page = int(input("\n크롤링할 종료 페이지를 입력해주세요. ex)3:"))

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# One search-result URL per requested page.
urls = make_url(search, start_page, end_page)

# Parallel accumulators: index i of each list describes the same article.
news_titles, news_links, news_contents = [], [], []

# Visit each result page, then fetch every article it lists.
for url in urls:
    page_articles = articles_crawler(url, headers)
    for title, link in page_articles:
        news_titles.append(title)
        news_links.append(link)
        article_text = news_contents_crawler(link, headers)
        news_contents.append(article_text)
        # Pause one second after each article request.
        time.sleep(1)

# Assemble the collected columns into a single table and display it.
news_df = pd.DataFrame(
    {
        'title': news_titles,
        'link': news_links,
        'content': news_contents,
    }
)
news_df



Unnamed: 0,title,link,content


In [2]:
# Save the data
# The keyword comes straight from user input, so replace path separators and
# characters that are invalid in file names before building the path.
safe_search = re.sub(r'[\\/:*?"<>|]', "_", search)
# utf-8-sig writes a BOM at the start of the file (presumably so spreadsheet
# applications detect the encoding of the Korean text - confirm if needed).
news_df.to_csv(f"{safe_search}_news.csv", encoding='utf-8-sig', index=False)