In [1]:
import requests
import sqlite3
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [2]:
def absurl(host, path):
    """Resolve ``path`` against ``host`` and return the resulting URL.

    Args:
        host: Base URL used as the reference point.
        path: Relative or absolute URL to resolve against ``host``.

    Returns:
        The URL produced by ``urllib.parse.urljoin`` for the two arguments.
    """
    return urljoin(base=host, url=path)

def beautiful_soup(response):
    """Parse the text body of ``response`` into a BeautifulSoup tree.

    Args:
        response: Object exposing the page markup through its ``text``
            attribute.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    markup = response.text
    return BeautifulSoup(markup, features='html.parser')

def url_request(url, headers, timeout=10, verify=False):
    """Send an HTTP GET request and return the response.

    Args:
        url: Address to fetch.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.
        verify: Whether to verify the server's TLS certificate. Defaults to
            False so existing callers keep their current behaviour.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: If the request fails, for
            example on a connection error or when ``timeout`` elapses.
    """
    # NOTE(review): certificate verification is disabled by default, which
    # leaves the connection open to interception. Pass verify=True once the
    # target sites are confirmed to present valid certificate chains.
    return requests.get(url, headers=headers, timeout=timeout, verify=verify)

In [3]:
# 데이터 추출 함수

def find_cctvNews_data(soup, idxno):
    """Extract the link, title and body text from a parsed cctvNews page.

    Args:
        soup: Parsed article page, queried through ``find`` and ``find_all``.
        idxno: Article index; only used in the message printed on a skip.

    Returns:
        A dict with the keys "link", "title" and "content", or None when the
        page has neither a canonical/amphtml ``<link>`` tag or no ``<title>``
        tag.
    """
    # Prefer the canonical link and fall back to the AMP link.
    link_tag = soup.find('link', rel='canonical')
    if not link_tag:
        link_tag = soup.find('link', rel='amphtml')
    title_tag = soup.find('title')
    paragraphs = soup.find_all('p')

    # Without both a link and a title the page is not a usable article.
    if not (link_tag and title_tag):
        print(f"Invalid link or title for index {idxno}. Skipping...")
        return None

    href = link_tag.get('href')
    headline = str(title_tag.text)
    # The article body is every <p> text on the page joined by single spaces.
    body = ' '.join(paragraph.text for paragraph in paragraphs)

    return {
        "link": absurl("https://www.cctvnews.co.kr", href),
        "title": headline,
        "content": body,
    }

def find_dailySecu_data(soup, idxno):
    """Extract the link, title and body text from a parsed dailySecu page.

    Args:
        soup: Parsed article page, queried through ``find`` and ``find_all``.
        idxno: Article index; only used in the message printed on a skip.

    Returns:
        A dict with the keys "link", "title" and "content", or None when the
        page has neither a canonical/amphtml ``<link>`` tag or no ``<title>``
        tag.
    """
    # Prefer the canonical link and fall back to the AMP link.
    link_tag = soup.find('link', rel='canonical')
    if not link_tag:
        link_tag = soup.find('link', rel='amphtml')
    title_tag = soup.find('title')
    paragraphs = soup.find_all('p')

    # Without both a link and a title the page is not a usable article.
    if not (link_tag and title_tag):
        print(f"Invalid link or title for index {idxno}. Skipping...")
        return None

    href = link_tag.get('href')
    headline = str(title_tag.text)
    # The article body is every <p> text on the page joined by single spaces.
    body = ' '.join(paragraph.text for paragraph in paragraphs)

    return {
        "link": absurl("https://www.dailysecu.com", href),
        "title": headline,
        "content": body,
    }

def find_boanNews_data(soup, idx):
    """Extract the link, title and body text from a parsed boanNews page.

    Args:
        soup: Parsed article page, queried through ``find``.
        idx: Article index; only used in the message printed on a skip.

    Returns:
        A dict with the keys "link", "title" and "content", or None when the
        canonical/amphtml ``<link>`` tag, the ``<title>`` tag, or the
        ``<div id="news_content">`` element is missing.
    """
    # Prefer the canonical link and fall back to the AMP link.
    link_tag = soup.find('link', rel='canonical')
    if not link_tag:
        link_tag = soup.find('link', rel='amphtml')
    title_tag = soup.find('title')
    body_div = soup.find('div', id='news_content')

    # All three elements are required for a usable article.
    if not (link_tag and title_tag and body_div):
        print(f"Invalid link, title, or content for index {idx}. Skipping...")
        return None

    href = link_tag.get('href')
    headline = str(title_tag.text)
    raw_text = body_div.get_text(separator=' ', strip=True)
    # Collapse every run of whitespace into a single space.
    body = ' '.join(raw_text.split())

    return {
        "link": absurl("https://www.boannews.com", href),
        "title": headline,
        "content": body,
    }

In [4]:
# Store one crawled record in the database.
def insert_data(conn, cursor, table_name, data):
    """Insert ``data`` as a single row of ``table_name`` and commit.

    Args:
        conn: Database connection; committed after the insert.
        cursor: Cursor used to execute the INSERT statement.
        table_name: Name of the target table, interpolated into the SQL text.
        data: Mapping of column name to value. Nothing is written when it is
            None or empty.

    Every value is converted with ``str`` and bound through ``?``
    placeholders; only the table and column names are formatted into the
    statement.
    """
    if not data:
        return

    column_names = list(data)
    column_list = ', '.join(column_names)
    markers = ', '.join('?' * len(column_names))
    row = tuple(str(data[name]) for name in column_names)

    cursor.execute(
        f"INSERT INTO {table_name} ({column_list}) VALUES ({markers})",
        row,
    )
    conn.commit()

In [5]:
# Delete every record from a table (reset the table).
def clear_table(conn, cursor, table_name):
    """Remove all rows from ``table_name`` and commit the change.

    Args:
        conn: Database connection; committed after the delete.
        cursor: Cursor used to execute the DELETE statement.
        table_name: Name of the table to empty, interpolated into the SQL
            text.
    """
    cursor.execute(f"DELETE FROM {table_name}")
    conn.commit()

In [6]:
# Main function: crawl the news sites and store the results in the database.
def main():
    """Crawl ten articles from each news site and store them in ``news.db``.

    The ``news`` table is created when missing and emptied at the start of
    every run. The sites are crawled in the order cctvNews, dailySecu,
    boanNews, each over a fixed, descending range of article indexes. A page
    whose request fails, or from which no article can be extracted, is
    skipped so the remaining pages are still processed. The database
    connection is closed even when an unexpected error interrupts the crawl.
    """
    # The same browser-like User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

    # (base URL, article indexes to visit, extraction function) per site.
    sources = (
        (
            "https://www.cctvnews.co.kr/news/articleView.html?idxno=",
            range(236274, 236264, -1),
            find_cctvNews_data,
        ),
        (
            "https://www.dailysecu.com/news/articleView.html?idxno=",
            range(151095, 151085, -1),
            find_dailySecu_data,
        ),
        (
            "https://www.boannews.com/media/view.asp?idx=",
            range(123585, 123575, -1),
            find_boanNews_data,
        ),
    )

    conn = sqlite3.connect('news.db')
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS news (link TEXT, title TEXT, content TEXT)")

        # Start every run from an empty table.
        clear_table(conn, cursor, 'news')

        for base_url, article_ids, extract in sources:
            for article_id in article_ids:
                try:
                    response = url_request(f"{base_url}{article_id}", headers)
                except requests.RequestException as exc:
                    # One unreachable page must not abort the whole crawl.
                    print(f"Request failed for index {article_id}: {exc}. Skipping...")
                    continue
                soup = beautiful_soup(response)
                # insert_data ignores None, so pages that failed extraction
                # are skipped without a separate check here.
                insert_data(conn, cursor, 'news', extract(soup, article_id))
    finally:
        # Release the database handle on both success and failure.
        conn.close()


In [7]:
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



Invalid link or title for index 236271. Skipping...




Invalid link or title for index 236269. Skipping...


