## 1-1. Daum 뉴스제목 가져오기
* user-agent 요청헤더를 반드시 설정해야 한다.

In [None]:
import requests
from bs4 import BeautifulSoup

# Daum economy news URL.
url = 'https://news.daum.net/economy'

print(url)

# Request headers: identify the client as a desktop Chrome browser.
req_header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

# requests has no default timeout; without one an unresponsive server would
# block this cell forever.
res = requests.get(url, headers=req_header, timeout=10)
print(type(res))
print(res.status_code)

# Parse the body only when the response reports success.
if res.ok:
    # Decode the response body as UTF-8 before reading it as text.
    res.encoding = 'utf-8'
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')

    # Every <li> under ul.list_newsheadline2 is treated as one headline.
    li_tag_list = soup.select('ul.list_newsheadline2 li')

    print(type(li_tag_list), len(li_tag_list))

    for li_tag in li_tag_list:
        a_tag = li_tag.find('a')
        strong_tag = li_tag.select_one('div.cont_thumb strong.tit_txt')

        # find()/select_one() return None when nothing matches; skip list
        # items that lack a link or a title instead of crashing on None.
        if a_tag is None or not a_tag.has_attr('href') or strong_tag is None:
            continue

        print(a_tag['href'])

        title = strong_tag.text.strip()
        print(title)

else:
    print(f'에러코드 = {res.status_code}')



In [12]:
# Maps the Korean section label to the URL path segment used for that section.
section_dict = {
    '기후/환경': 'climate',
    '사회': 'society',
    '경제': 'economy',
    '정치': 'politics',
    '국제': 'world',
    '문화': 'culture',
    '생활': 'life',
    'IT/과학': 'tech',
    '인물': 'people',
}
section_dict

{'기후/환경': 'climate',
 '사회': 'society',
 '경제': 'economy',
 '정치': 'politics',
 '국제': 'world',
 '문화': 'culture',
 '생활': 'life',
 'IT/과학': 'tech',
 '인물': 'people'}

## 1-2. Daum 뉴스제목 가져오기
* 함수로 선언하여 처리하기

In [13]:
import requests
from bs4 import BeautifulSoup

# '사회': 'society'
def print_news(section_name):
    """Print the link and title of each headline in a Daum news section.

    A banner line with the section URL and name is printed first, followed by
    the href and the stripped title text of every headline item found. When
    the response is not successful, the HTTP status code is printed instead.

    Args:
        section_name: A key of the module-level ``section_dict`` (for example
            '경제'); the mapped value becomes the path of the section URL.

    Raises:
        KeyError: If ``section_name`` is not a key of ``section_dict``.
    """
    section = section_dict[section_name]
    url = f'https://news.daum.net/{section}'

    print(f'======> {url} {section_name} 뉴스 <======')

    # Request headers: identify the client as a desktop Chrome browser.
    req_header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }

    # requests has no default timeout; without one an unresponsive server
    # would block the call forever.
    res = requests.get(url, headers=req_header, timeout=10)

    if not res.ok:
        print(f'에러코드 = {res.status_code}')
        return

    # Decode the response body as UTF-8 before reading it as text.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Every <li> under ul.list_newsheadline2 is treated as one headline.
    for li_tag in soup.select('ul.list_newsheadline2 li'):
        a_tag = li_tag.find('a')
        strong_tag = li_tag.select_one('div.cont_thumb strong.tit_txt')

        # find()/select_one() return None when nothing matches; skip list
        # items that lack a link or a title instead of crashing on None.
        if a_tag is None or not a_tag.has_attr('href') or strong_tag is None:
            continue

        print(a_tag['href'])
        print(strong_tag.text.strip())


In [None]:
# Print the headlines of the '경제' section ('economy' in section_dict).
print_news('경제')

In [None]:
# Print the headlines of the '인물' section ('people' in section_dict).
print_news('인물')

In [None]:
# Print the headlines of the '국제' section ('world' in section_dict).
print_news('국제')

In [None]:
# Print the headlines of every section; iterating a dict yields its keys, so
# an explicit .keys() call is unnecessary.
for section in section_dict:
    print_news(section)

### 2-1. Nate 뉴스제목과 Image 출력하기

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.display import Image, display

# Nate "recent news" listing for menu id n0100.
url = 'https://news.nate.com/recent?mid=n0100'
print(url)

# Request headers: identify the client as a desktop Chrome browser.
req_header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

# requests has no default timeout; without one an unresponsive server would
# block this cell forever.
res = requests.get(url, headers=req_header, timeout=10)
print(type(res))
print(res.status_code)

if res.ok:
    # Decode the response body as EUC-KR before reading it as text.
    res.encoding = 'euc-kr'
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')

    # Each div.mlt01 inside the post list is treated as one article entry.
    tags = soup.select("div.postListType.noListTitle div.mlt01")
    print(len(tags))

    for idx, div_tag in enumerate(tags, 1):
        print(f'============>> {idx}')

        # find()/select_one() return None when nothing matches, so every
        # lookup is checked before it is dereferenced.
        a_tag = div_tag.find('a')
        if a_tag is not None and a_tag.has_attr('href'):
            # The href may be relative or scheme-less; resolve it against
            # the page URL.
            a_join_url = urljoin(url, a_tag['href'])
            print(a_join_url)

        # The thumbnail is optional; show it inline only when present.
        img_tag = div_tag.select_one('span.ib img')
        if img_tag is not None and img_tag.has_attr('src'):
            photo_url = urljoin(url, img_tag['src'])
            print(photo_url)
            display(Image(url=photo_url))

        h2_tag = div_tag.select_one('span.tb h2.tit')
        if h2_tag is not None:
            title = h2_tag.text
            print(title)

else:
    print(f'에러코드 = {res.status_code}')

### 2-2. Nate 뉴스제목과 Image 출력하기
* 함수로 선언하여 처리하기

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.display import Image, display

# Maps the Korean section label to the Nate `mid` query value for that section.
# NOTE: this rebinds the module-level name `section_dict` that the earlier
# Daum cells also read.
section_dict = {
    '최신뉴스': 'n0100',
    '정치': 'n0200',
    '경제': 'n0300',
    '사회': 'n0400',
    '세계': 'n0500',
    'IT/과학': 'n0600',
}
print(section_dict)

def print_news(section_name):
    """Print link, thumbnail and title of each article in a Nate news section.

    The section URL, response type and status code are printed first. For a
    successful response, every article entry is printed with its index, its
    absolute link, its thumbnail URL (also displayed inline when present) and
    its title. Otherwise the HTTP status code is printed.

    Args:
        section_name: A key of the module-level ``section_dict`` (for example
            '경제'); the mapped value is sent as the ``mid`` query parameter.

    Raises:
        KeyError: If ``section_name`` is not a key of ``section_dict``.
    """
    m_id = section_dict[section_name]

    url = f'https://news.nate.com/recent?mid={m_id}'
    print(url)

    # Request headers: identify the client as a desktop Chrome browser.
    req_header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }

    # requests has no default timeout; without one an unresponsive server
    # would block the call forever.
    res = requests.get(url, headers=req_header, timeout=10)
    print(type(res))
    print(res.status_code)

    if not res.ok:
        print(f'에러코드 = {res.status_code}')
        return

    # Decode the response body as EUC-KR before reading it as text.
    res.encoding = 'euc-kr'
    soup = BeautifulSoup(res.text, 'html.parser')

    print(f'======> {url} {section_name} 뉴스 <======')

    # Each div.mlt01 inside the post list is treated as one article entry.
    tags = soup.select("div.postListType.noListTitle div.mlt01")
    print(len(tags))

    for idx, div_tag in enumerate(tags, 1):
        print(f'============>> {idx}')

        # find()/select_one() return None when nothing matches, so every
        # lookup is checked before it is dereferenced.
        a_tag = div_tag.find('a')
        if a_tag is not None and a_tag.has_attr('href'):
            # The href may be relative or scheme-less; resolve it against
            # the page URL.
            print(urljoin(url, a_tag['href']))

        # The thumbnail is optional; show it inline only when present.
        img_tag = div_tag.select_one('span.ib img')
        if img_tag is not None and img_tag.has_attr('src'):
            photo_url = urljoin(url, img_tag['src'])
            print(photo_url)
            display(Image(url=photo_url))

        h2_tag = div_tag.select_one('span.tb h2.tit')
        if h2_tag is not None:
            print(h2_tag.text)

# Print the Nate articles of the '경제' section (mid 'n0300' in section_dict).
print_news('경제')