# NAVER 뉴스 검색 자동화
Naver Open API를 활용하여 여러 검색어에 대해 뉴스를 수집하고, 결과를 CSV로 저장합니다.

In [1]:
import urllib.request
import urllib.parse
import json
import pandas as pd
import re
import time

# 변수 정의 예시
search_terms = ["진학사", "모의지원", "합격예측", "수시", "정시"]
client_id = "PKDCU1T1VNA6sqY1ylD3"
client_secret = "9Jn6qTVfnI"
display = 100
sort = "date"

all_news = []

for query in search_terms:
    encoded_query = urllib.parse.quote(query)
    start = 1
    end = 1000

    for start_index in range(start, end, display):
        url = f"https://openapi.naver.com/v1/search/news?query={encoded_query}&display={display}&start={start_index}&sort={sort}"
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()
        if rescode == 200:
            response_body = response.read()
            response_dict = json.loads(response_body.decode('utf-8'))
            items = response_dict['items']
            for item in items:
                remove_tag = re.compile('<.*?>')
                title = re.sub(remove_tag, '', item['title'])
                original_link = item.get('originallink', '')
                link = item['link']
                description = re.sub(remove_tag, '', item['description'])
                pub_date = item['pubDate']
                all_news.append({
                    "Title": title,
                    "Original Link": original_link,
                    "Link": link,
                    "Description": description,
                    "Publication Date": pub_date
                })
        else:
            print("Error Code:", rescode)
        time.sleep(0.1)  # API 호출 제한 방지

all_news_df = pd.DataFrame(all_news)
all_news_df = all_news_df.drop_duplicates(subset=["Title", "Original Link"])

csv_file_name = "test_data.csv"
all_news_df.to_csv(csv_file_name, index=False, encoding='utf-8-sig')

print(f"CSV 파일 저장 완료: {csv_file_name}")

CSV 파일 저장 완료: test_data.csv


# 뉴스 본문 크롤링 자동화 (BeautifulSoup 활용)
이 코드는 test_data.csv에 저장된 뉴스 기사 링크를 바탕으로, 각 기사 페이지에서 본문 내용을 자동으로 크롤링하여 새로운 CSV로 저장합니다.

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# CSV 파일 읽기
file_path = 'test_data.csv'
news_df = pd.read_csv(file_path)

# 기사 본문을 저장할 리스트
article_contents = []

# 각 링크에 대한 웹 스크래핑
for link in news_df['Link']:
    try:
        # HTTP 요청
        response = requests.get(link)
        soup = BeautifulSoup(response.content, 'html.parser')

        # 기사 내용 추출 (HTML 구조에 따라 다를 수 있음)
        article_content = soup.find('div', class_='article-body')  # 예시 셀렉터
        if article_content:
            article_contents.append(article_content.get_text(strip=True))
        else:
            article_contents.append("내용 없음")
    except Exception as e:
        article_contents.append("오류: " + str(e))

# 기사 본문을 DataFrame에 추가
news_df['Article Content'] = article_contents

# 결과 저장
new_csv_file_path = 'test_data_with_content.csv'
news_df.to_csv(new_csv_file_path, index=False, encoding='utf-8-sig')

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

# 뉴스 크롤링 결과 자동 이메일 전송
Python을 활용해 뉴스 크롤링 결과를 매일 정해진 시간(11:00) 에 자동으로 이메일로 전송하는 자동화 시스템입니다.
첨부파일에는 test_data_with_content.csv가 포함되며, Gmail SMTP 서버를 이용합니다.

In [3]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
import os
import urllib.parse

def send_email_with_attachment(subject, body, to_email, from_email, password, file_path):
    msg = MIMEMultipart()
    msg['From'] = from_email
    msg['To'] = to_email
    msg['Subject'] = subject

    # 본문에 utf-8 인코딩 명시
    msg.attach(MIMEText(body, 'plain', 'utf-8'))

    # 첨부파일 추가 (한글 파일명 대응)
    filename = os.path.basename(file_path)
    if not os.path.exists(file_path):
        print(f"[ERROR] File not found: {file_path}")
        return

    with open(file_path, "rb") as attachment:
        part = MIMEBase("application", "octet-stream")
        part.set_payload(attachment.read())
    encoders.encode_base64(part)
    part.add_header(
        "Content-Disposition",
        f"attachment; filename*=UTF-8''{urllib.parse.quote(filename)}",
    )
    msg.attach(part)

    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.starttls()
    server.login(from_email, password)
    server.send_message(msg)
    server.quit()

In [4]:
import schedule
import time
import sys
import smtplib # Ensure smtplib is imported if job() calls send_email_with_attachment defined elsewhere
import datetime

from datetime import datetime

now = datetime.now()

# Assuming send_email_with_attachment is defined in a previous cell and available.
# If not, make sure it's imported or included in this cell.

def job():
    # Define the password here. Ensure this string literal contains only ASCII characters.
    password_value = "vglombbslblnperv" # Double-check this value

    # Add diagnostic prints
    print("-" * 30)
    print("Inside job() function before sending email:")
    print(f"Password variable name in job(): {password_value=}") # Use f-string for variable name and value
    print(f"Password type in job(): {type(password_value)}")
    # Attempt to encode to UTF-8 and print the bytes. This will show non-ASCII chars.
    try:
        password_bytes_utf8 = password_value.encode('utf-8')
        print(f"Password bytes (UTF-8): {password_bytes_utf8}")
        # If the character \uc571 appears here, the string literal itself contains it.
    except Exception as e:
        print(f"Error encoding password to UTF-8 for inspection: {e}")

    # Call the email sending function with the defined password_value
    try:
        send_email_with_attachment(
            subject = "news scrapping result",
            body = "check csv file.",
            to_email = "yewonjc@naver.com",
            from_email = "yewhwang@gmail.com",
            password = "vglombbslblnperv", # Use the variable here
            file_path = "test_data_with_content.csv"
        )
        print("Email sent successfully (or send_email_with_attachment call finished)")
    except Exception as e:
        print(f"An error occurred during send_email_with_attachment call: {e}")
        # Optionally, re-raise the exception if you want the script to stop
        # raise
    print("-" * 30)


# 매일 실행
# Check the time here, maybe adjust for testing
print(f"Scheduling job at {datetime.now().strftime('%H:%M')}")
schedule.every().day.at("13:40").do(job) # Keep your original schedule or adjust for testing

print("Scheduler started. Waiting for job...")

while True:
    schedule.run_pending()
    time.sleep(60)

Scheduling job at 13:30
Scheduler started. Waiting for job...
------------------------------
Inside job() function before sending email:
Password variable name in job(): password_value='vglombbslblnperv'
Password type in job(): <class 'str'>
Password bytes (UTF-8): b'vglombbslblnperv'
Email sent successfully (or send_email_with_attachment call finished)
------------------------------


KeyboardInterrupt: 