<a href="https://colab.research.google.com/github/ukkyukang/ML/blob/main/news_crawller.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# sqlite

In [10]:
import sqlite3


# 데이터베이스 파일 경로
db_path = 'urls.db'

try:
    # 데이터베이스 연결 (파일이 없으면 생성)
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    print("데이터베이스 연결 성공")

    # 테이블 생성 (이미 존재하지 않는 경우)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS urls (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL
        )
    ''')
    print("테이블 생성 성공")
except sqlite3.Error as e:
    print(f"데이터베이스 연결 또는 테이블 생성 실패: {e}")

try:
    # 샘플 URL 데이터
    sample_urls = [
        "https://www.example.com",
        "https://www.google.com",
        "https://www.example.com/page1",
        "https://www.example.com/page2",
        "https://www.google.com/search",
      "https://www.example.com"  # 중복 URL (삽입되지 않음),
         "https://www.breitbart.com/latin-america/2025/01/14/biden-removes-pro-hamas-hezbollah-tied-cuba-from-state-sponsor-of-terror-list/",
    "https://www.breitbart.com/national-security/2025/01/14/national-sheriffs-association-urges-senate-expeditiously-confirm-trump-nominee-tulsi-gabbard-dni/",

    ]

    # 샘플 URL 삽입
    for url in sample_urls:
        try:
            cursor.execute("INSERT INTO urls (url) VALUES (?)", (url,))
        except sqlite3.IntegrityError:
            print(f"중복 URL 발견: {url}. 삽입되지 않음.")
        except sqlite3.Error as e:
          print(f"데이터 삽입 실패: {e}")


    connection.commit() # 변경사항 저장
    print("샘플 URL 삽입 완료")
except sqlite3.Error as e:
    print(f"샘플 URL 삽입 실패: {e}")

try:
    # 모든 URL 조회
    cursor.execute("SELECT * FROM urls")
    results = cursor.fetchall()

    print("URL 데이터 조회 결과:")
    for row in results:
        print(f"ID: {row[0]}, URL: {row[1]}")
except sqlite3.Error as e:
    print(f"URL 데이터 조회 실패: {e}")




데이터베이스 연결 성공
테이블 생성 성공
샘플 URL 삽입 완료
URL 데이터 조회 결과:
ID: 1, URL: https://www.example.com
ID: 2, URL: https://www.google.com
ID: 3, URL: https://www.example.com/page1
ID: 4, URL: https://www.example.com/page2
ID: 5, URL: https://www.google.com/search
ID: 6, URL: https://www.example.comhttps://www.breitbart.com/latin-america/2025/01/14/biden-removes-pro-hamas-hezbollah-tied-cuba-from-state-sponsor-of-terror-list/
ID: 7, URL: https://www.breitbart.com/national-security/2025/01/14/national-sheriffs-association-urges-senate-expeditiously-confirm-trump-nominee-tulsi-gabbard-dni/


# Fox News Link Extraction

In [15]:
import requests
from bs4 import BeautifulSoup
import re

def print_sentences_with_period(text):
    # 정규 표현식을 사용하여 문장을 분할
    sentences = re.split(r'(?<!\bU\.S)(?<!\bMr)(?<!\bMrs)(?<!\bDr)\.(?!\w)', text)
    for sentence in sentences:
        if sentence.strip():  # 빈 문자열이 아닌 경우에만 출력
            print(sentence.strip() + '.')

def fox_news_link_extraction(category, url):

  print("Category: ", category)
  print("URL:", url)
    # 웹 페이지 요청
  response = requests.get(url, verify=True)

  # 페이지의 HTML 콘텐츠 파싱
  soup = BeautifulSoup(response.content, 'html.parser')

  links = []
  for a_tag in soup.find_all('a'):
    link = a_tag.get('href')
    if link:  # 링크가 있는 경우만 추가
      links.append(link)

  # print(links)


  # category 추출
  keyword = f"/{category}/"
  result = []

  for item in links:
      if keyword in item:
          result.append(item)

  # 중복제거
  unique_list = list(set(result))

  # for r in unique_list:
  #   print(f"{r}")

  # 실제 url만 추출
  print("*Real url extraction")
  new_urls = []
  for url in unique_list:
    if url.startswith(f"/{category}/"):
        new_urls.append("https://www.foxnews.com" + url)

  for r in new_urls:
    print(f"{r}")

#
# test main
#
# Fox News 웹사이트의 URL

urls = [

"https://www.foxnews.com/politics"


]

url_base = "https://www.foxnews.com/"
fox_news_category = ["us","politics","world"]

for category in fox_news_category:
    fox_news_link_extraction(category,url_base+category)

# for url in urls:
#   fox_news_link_extraction("politics",url)

Category:  us
URL: https://www.foxnews.com/us
*Real url extraction
https://www.foxnews.com/us/map-shows-extent-wildfire-devastation-homes-destroyed
https://www.foxnews.com/us/teen-dead-ski-resort-near-posh-mountain-town
https://www.foxnews.com/us/watch-city-bus-comes-within-inches-disaster-elevated-overpass-during-rush-hour
https://www.foxnews.com/us/spring-health-commits-500k-free-therapy-sessions-ceo-calls-mental-health-community-join
https://www.foxnews.com/us/frantic-911-call-captures-moments-after-toddlers-freak-sledding-accident
https://www.foxnews.com/us/los-angeles-county-da-says-nine-charged-looting-homes-wildfire-zones-one-arson
https://www.foxnews.com/us/wealthy-florida-real-estate-brothers-trophies-uncovered-judge-denies-bail
https://www.foxnews.com/us/florida-man-attacks-elderly-woman-robs-her-lottery-winnings-surveillance-footage
https://www.foxnews.com/us/man-black-jeep-wanted-california-after-stalking-girls-leaving-school-police-say
https://www.foxnews.com/us/second-arr

# FOXNEWS

In [7]:
import requests
from bs4 import BeautifulSoup
import re

def print_sentences_with_period(text):
    # 정규 표현식을 사용하여 문장을 분할
    sentences = re.split(r'(?<!\bU\.S)(?<!\bMr)(?<!\bMrs)(?<!\bDr)\.(?!\w)', text)
    for sentence in sentences:
        if sentence.strip():  # 빈 문자열이 아닌 경우에만 출력
            print(sentence.strip() + '.')

def extract_news(url):

  print(url)
    # 웹 페이지 요청
  response = requests.get(url, verify=True)

  # 페이지의 HTML 콘텐츠 파싱
  soup = BeautifulSoup(response.content, 'html.parser')

  # test
  head_tag = soup.find('head')
  if head_tag:
    title_tag = head_tag.find('title')
    if title_tag:
      print(title_tag.text)
    else:
      print("head 태그 안에서 title 태그를 찾을 수 없습니다.")
  else:
    print("head 태그를 찾을 수 없습니다.")

  # get time
  time = soup.find_all("time")
  print(f"new_time : {time[0].get_text()}")

  # Get title
  title = soup.find_all('title')
  # print(f"title: {title[0].get_text()}")
  print(f"{title[0].get_text()}")

  #Get sub_headline
  sub_headline = soup.find_all("h2", class_="sub-headline")
  # print(f"sub headline : {sub_headline[0].get_text()}")
  print(f"{sub_headline[0].get_text()}")


  # 예시: 헤드라인 뉴스 크롤링
  headlines = soup.find_all('div', class_='article-body')

  for headline in headlines:
      # print_sentences_with_period(headline.get_text())
      print (headline.get_text())

# Fox News 웹사이트의 URL

urls = [

"https://www.foxnews.com/politics/hirono-ripped-for-opening-confirmation-hearing-question-to-burgum-this-lady-has-issues"


]

for url in urls:
  extract_news(url)



https://www.foxnews.com/politics/hirono-ripped-for-opening-confirmation-hearing-question-to-burgum-this-lady-has-issues
Hirono ripped for 'deranged' opening confirmation hearing question to Burgum: 'This lady has issues' | Fox News
new_time :  January 16, 2025 3:12pm EST
Hirono ripped for 'deranged' opening confirmation hearing question to Burgum: 'This lady has issues' | Fox News
Hirono has asked the same question to other nominees in the past
 close      Video Hawaii Senator Mazie Hirono asks Governor Doug Burgum if he has made unwanted sexual advances on others Hawaii Senator Mazie Hirono asks Governor Doug Burgum if he has made unwanted sexual advances on others during his confirmation hearing. Democrat Sen. Mazie Hirono of Hawaii faced mockery on social media on Thursday over her opening questions to Interior Secretary nominee Doug Burgum during his confirmation hearing.     "As part of my responsibilities to ensure the fitness of nominees before any of the committees on which I s

# BREITBART

In [8]:
import requests
from bs4 import BeautifulSoup
import re

def extract_breitbart_news(url):

  print(url)

  # 웹 페이지 요청
  response = requests.get(url, verify=True)

  # 페이지의 HTML 콘텐츠 파싱
  soup = BeautifulSoup(response.content, 'html.parser')

  # Get title
  title = soup.find_all('title')
  # print(f"title: {title[0].get_text()}")
  print(f"{title[0].get_text()}")

   # get time
  time = soup.find_all("time")
  print(f"new_time : {time[0].get_text()}")

  #Get sub_headline
  # sub_headline = soup.find_all("p", class_="subheading")
  # print(f"sub headline : {sub_headline[0].get_text()}")
  # for text in sub_headline:
    # print(text.get_text())

  # print("\n")
  # print(f"{sub_headline[0].get_text()}")


  # 예시: 헤드라인 뉴스 크롤링
  headlines = soup.find_all('p')

  for headline in headlines:
      # print_sentences_with_period(headline.get_text())
      print (headline.get_text())

# start
urls = [
    "https://www.breitbart.com/latin-america/2025/01/14/biden-removes-pro-hamas-hezbollah-tied-cuba-from-state-sponsor-of-terror-list/",
    "https://www.breitbart.com/national-security/2025/01/14/national-sheriffs-association-urges-senate-expeditiously-confirm-trump-nominee-tulsi-gabbard-dni/",
    # "https://www.breitbart.com/politics/2025/01/14/exclusive-sen-jim-banks-pete-hegseth-excelled-hearing-democrats-repeated-same-bullsh-anonymous-accusations/",
    # "https://www.breitbart.com/politics/2025/01/14/sen-markwayne-mullin-slams-hypocritical-democrats-during-hegseth-hearing-how-many-senators-have-showed-up-drunk-to-vote-at-night/"

]

for url in urls:
  extract_breitbart_news(url)

https://www.breitbart.com/latin-america/2025/01/14/biden-removes-pro-hamas-hezbollah-tied-cuba-from-state-sponsor-of-terror-list/
Biden Removes Pro-Hamas, Hezbollah-Tied Cuba from State Sponsor of Terror List
new_time : 14 Jan 2025
Leftist President Joe Biden announced on Tuesday he would remove Cuba from the State Department’s list of state sponsors of terrorism, leaving only Iran, Syria, and North Korea on the list.
Biden claimed in his declaration that Cuba had not sponsored terrorism for six months, apparently enough to remove it from the list.
Cuba has been on the list for most of the years between 1982 and 2025, with the exception of six years between 2015 and 2021 after leftist President Barack Obama – whom Biden served as vice president – removed the communist regime from a list as part of a larger package of concessions to Havana. The Obama concession policy resulted in a dramatic increase in political and religious persecution on the island and served to embolden Cuba to forg