# 네이버 자주 본 뉴스 크롤링

1) 수집내용

    1) 많이 본 뉴스 – 섹션별(정치 ~ IT/과학) Top 5 기사 제목, 신문사, 뷰
    2) 해당 기사별 기사 내용, 리액션 (좋아요 ~ 후속기사 원해요)
    
2) 수집방법(택1)

    1) [기본] Requests , BeautifulSoup, Selenium
    2) [심화] Requests, BeautifulSoup (+ 멀티프로세싱)
    
3) 수집범위 및 저장

    1) 2019년 7월 21일 ~ 2020년 8월 20일 (동작 가능, 실제 구동 x)
    2) 하나의 파일로 저장 (방식 자유)
    3) Ex) 총 6섹션 * Top 5 * 365일 = 10950 rows

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from IPython.display import Image
import os
from pathlib import Path
import glob

import datetime

from multiprocess import Pool
import concurrent

from tqdm import tqdm

## 날짜 String 생성

In [2]:
# First day of the crawl range; only the date part survives the format below.
start_date = datetime.datetime(year=2019, month=7, day=21, hour=3, minute=22, second=32)
start_date.strftime("%Y%m%d")  # strftime usage example

'20190721'

In [3]:
def get_date_string(start_date, period=365):
    """Build a list of consecutive ``YYYYMMDD`` date strings.

    Args:
        start_date: ``datetime`` of the first day in the list.
        period: Number of consecutive days to generate (default 365).

    Returns:
        A list of ``period`` strings such as ``"20190721"``, one per day,
        beginning with ``start_date`` itself.
    """
    stamps = []
    for offset in range(period):
        shifted = start_date + datetime.timedelta(days=offset)
        stamps.append(shifted.strftime("%Y%m%d"))
    return stamps

In [4]:
# One YYYYMMDD string per day for 365 consecutive days starting at start_date.
date_string_list = get_date_string(start_date, period=365)

In [5]:
date_string_list[0], date_string_list[-1] # first and last generated date (generation check)

('20190721', '20200719')

## Part 1. Request를 이용한 크롤링

In [6]:
def get_top_news(date, timeout=10):
    """Return the top-5 most-viewed Naver news articles of each section for one day.

    Six ranking blocks are scanned (the 4th through 9th child ``div`` of the
    page's content area), five articles each, so a complete page yields
    30 rows.

    Args:
        date: Day to query as a ``YYYYMMDD`` string, e.g. ``"20190721"``.
        timeout: Seconds to wait for the ranking page before giving up.

    Returns:
        A list of ``[title, link, company, views]`` lists. ``link`` is the
        ``href`` value exactly as found on the page and ``views`` is the
        view-count text of the article box.

    Raises:
        IndexError: If an expected element is missing from the page.
        requests.exceptions.RequestException: If the page cannot be fetched
            within ``timeout`` seconds.
    """
    # Imported inside the function as well so the names are available in the
    # worker processes when multiprocess is used on Windows.
    import requests
    from bs4 import BeautifulSoup

    url = f"https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={date}"
    # Without a timeout a stalled connection would block its pool worker forever.
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, "html.parser")

    result = []

    for section in range(4, 10):
        for i in range(1, 6):
            # Select the box of the i-th ranked article in this section
            box = soup.select(f'#wrap > table > tr > td.content > div > div:nth-child({section}) > ol > li.num{i}')[0]
            # The anchor inside it carries both the title and the link
            tmp = box.select('dl > dt > a')[0]
            title = tmp.text
            link = tmp['href']
            # Press name (second-to-last span in the box) and view count
            company = box.select('span')[-2].text.strip()
            views = box.select('.count_view')[0].text
            # Collect the row
            result.append([title, link, company, views])

    return result

In [7]:
# Fetch the ranking page of every date in parallel with 14 worker processes;
# imap keeps the results in the same order as date_string_list.
with Pool(processes=14) as pool:
    news_data = list(
        tqdm(
            pool.imap(get_top_news, date_string_list),
            total=len(date_string_list),
        )
    )

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:15<00:00, 24.30it/s]


In [8]:
def flatten(l):
    """Concatenate the items of every inner iterable of ``l`` into one list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    merged = []
    for inner in l:
        merged.extend(inner)
    return merged

# Collapse the per-day result lists into a single list of article rows.
# The name is rebound in place, so running this line a second time would
# flatten the rows themselves.
news_data = flatten(news_data)

In [9]:
# Row-count check: 6 sections * 5 articles * 365 days = 10950
len(news_data)

10950

In [10]:
news_data[:5] # sample of the result

[['[단독] 황교안 딸 운영 사이트, 대학 진학 후 왜 문 닫았나',
  '/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002952683&date=20190721&type=1&rankingSectionId=100&rankingSeq=1',
  '경향신문',
  '201,039'],
 ['조국 폭풍페북, 日주장 정면반박…"친일파" 표현은 野 반발',
  '/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=008&aid=0004251344&date=20190721&type=1&rankingSectionId=100&rankingSeq=2',
  '머니투데이',
  '185,396'],
 ['조국, 연일 對日 \'항전\' 주문…"겁먹고 쫄지말자…싸워 이겨…',
  '/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010969325&date=20190721&type=1&rankingSectionId=100&rankingSeq=3',
  '연합뉴스',
  '130,198'],
 ['[김순덕의 도발]복수를 하려면 아일랜드처럼!',
  '/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=020&aid=0003230442&date=20190721&type=1&rankingSectionId=100&rankingSeq=4',
  '동아일보',
  '120,897'],
 ['조국, 또 페북에 反日 선전전..."文정부, 서희·이순신 역할…',
  '/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003462170&date=20190721&type=1&rankingSectio

In [11]:
# Turn the collected rows into a DataFrame and name its columns
column_names = ["title", "url", "press", "views"]
df_top_news = pd.DataFrame(data=news_data, columns=column_names)

In [12]:
# The scraped links are site-relative; prepend "https://news.naver.com" to each
df_top_news['url'] = df_top_news['url'].map(lambda path: 'https://news.naver.com' + path)

In [13]:
# Display the assembled table (notebook cell output)
df_top_news

Unnamed: 0,title,url,press,views
0,"[단독] 황교안 딸 운영 사이트, 대학 진학 후 왜 문 닫았나",https://news.naver.com/main/ranking/read.nhn?m...,경향신문,201039
1,"조국 폭풍페북, 日주장 정면반박…""친일파"" 표현은 野 반발",https://news.naver.com/main/ranking/read.nhn?m...,머니투데이,185396
2,"조국, 연일 對日 '항전' 주문…""겁먹고 쫄지말자…싸워 이겨…",https://news.naver.com/main/ranking/read.nhn?m...,연합뉴스,130198
3,[김순덕의 도발]복수를 하려면 아일랜드처럼!,https://news.naver.com/main/ranking/read.nhn?m...,동아일보,120897
4,"조국, 또 페북에 反日 선전전...""文정부, 서희·이순신 역할…",https://news.naver.com/main/ranking/read.nhn?m...,조선일보,119463
...,...,...,...,...
10945,"다시 출몰한 관악산 UFO 소동, 정체가 밝혀졌다?",https://news.naver.com/main/ranking/read.nhn?m...,경향신문,344464
10946,"""내가 알던 그 '한컴'이 맞나요?""… 4차산업혁명 유망주로 …",https://news.naver.com/main/ranking/read.nhn?m...,조선비즈,133197
10947,합격보너스 5000만원·기본 재택근무…코로나시국 귀한 직종…,https://news.naver.com/main/ranking/read.nhn?m...,머니투데이,108515
10948,[이기자의 유레카!] 中·日 덮친 폭우가 이것 때문?…코로나…,https://news.naver.com/main/ranking/read.nhn?m...,매일경제,90614


## Part 2. Selenium을 이용한 크롤링

In [14]:
# Work on a copy so the Selenium results do not modify df_top_news
df1 = df_top_news.copy()

In [15]:
# Peek at the URL column for index labels 0 through 10 (.loc slicing includes the end label)
df1.loc[:10, 'url']

0     https://news.naver.com/main/ranking/read.nhn?m...
1     https://news.naver.com/main/ranking/read.nhn?m...
2     https://news.naver.com/main/ranking/read.nhn?m...
3     https://news.naver.com/main/ranking/read.nhn?m...
4     https://news.naver.com/main/ranking/read.nhn?m...
5     https://news.naver.com/main/ranking/read.nhn?m...
6     https://news.naver.com/main/ranking/read.nhn?m...
7     https://news.naver.com/main/ranking/read.nhn?m...
8     https://news.naver.com/main/ranking/read.nhn?m...
9     https://news.naver.com/main/ranking/read.nhn?m...
10    https://news.naver.com/main/ranking/read.nhn?m...
Name: url, dtype: object

In [15]:
# Location of the chromedriver executable, passed to webdriver.Chrome below
driver_path = 'chromedriver'
# Run Chrome in headless mode
options = webdriver.ChromeOptions()
options.add_argument('headless')
# NOTE(review): no driver.quit() is visible in this part of the notebook — confirm the browser is closed later
driver = webdriver.Chrome(driver_path, options=options)

여기서는 일반 뉴스 형식에 맞지 않는 페이지들은 except 처리해서 크롤링하지 않고, 해당 링크만 프린트하도록 하였다.

아래 bs를 활용하여 크롤링한 코드에서는 다양한 예외를 처리한다.

In [18]:
# Visit every ranked article with the Selenium driver and store its body text
# and the five reaction counts in df1. A field that cannot be read is left
# unset and its name is printed together with the article URL.
for idx, news_url in tqdm(enumerate(df_top_news['url'].values), total=len(df_top_news)):
    # Navigate the driver to the article
    driver.get(news_url)
    # Article body
    try:
        df1.loc[idx, 'content'] = driver.find_element_by_css_selector('#articleBodyContents').text.strip()
    except Exception:
        # Exception rather than a bare except, so that an interrupt
        # (KeyboardInterrupt) can still stop this long-running loop.
        print('content', news_url)
    # Reactions
    for emotion in ['good', 'warm', 'sad', 'angry', 'want']:
        try:
            df1.loc[idx, emotion] = driver.find_element_by_css_selector(
                f'#spiLayer > div._reactionModule.u_likeit > ul > li.u_likeit_list.{emotion} > a > span.u_likeit_list_count._count'
            ).text.strip()
        except Exception:
            print(emotion, news_url)

  3%|██▏                                                                         | 319/10950 [02:48<1:27:16,  2.03it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=

  3%|██▌                                                                         | 378/10950 [03:16<1:12:14,  2.44it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=

 12%|████████▉                                                                  | 1308/10950 [11:36<1:24:17,  1.91it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=

 12%|█████████▏                                                                 | 1338/10950 [11:52<1:31:55,  1.74it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=

 24%|█████████████████▊                                                         | 2606/10950 [23:24<1:02:47,  2.21it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=

 24%|██████████████████▌                                                          | 2636/10950 [23:40<59:38,  2.32it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=

 24%|██████████████████▏                                                        | 2657/10950 [23:51<1:02:20,  2.22it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=

 25%|██████████████████▌                                                        | 2712/10950 [24:24<1:14:33,  1.84it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=102&rankingSeq=2
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=102&rankingSeq=2
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=102&rankingSeq=2
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=102&rankingSeq=2
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=102&rankingSeq=2
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019&type=1&rankingSectionId=

 25%|██████████████████▊                                                        | 2745/10950 [24:41<1:07:13,  2.03it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=102&rankingSeq=5
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=102&rankingSeq=5
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=102&rankingSeq=5
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=102&rankingSeq=5
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=102&rankingSeq=5
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001250536&date=20191020&type=1&rankingSectionId=

 29%|█████████████████████▊                                                     | 3183/10950 [28:46<1:33:45,  1.38it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=

 30%|██████████████████████▌                                                    | 3288/10950 [29:45<1:09:27,  1.84it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=103&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=103&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=103&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=103&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=103&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=018&aid=0004511119&date=20191107&type=1&rankingSectionId=

 54%|█████████████████████████████████████████▉                                   | 5960/10950 [54:43<51:51,  1.60it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=103&rankingSeq=5
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=103&rankingSeq=5
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=103&rankingSeq=5
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=103&rankingSeq=5
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=103&rankingSeq=5
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003063149&date=20200204&type=1&rankingSectionId=

 67%|██████████████████████████████████████████████████                         | 7306/10950 [1:06:49<30:34,  1.99it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=103&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=103&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=103&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=103&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=103&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001301587&date=20200320&type=1&rankingSectionId=

 82%|█████████████████████████████████████████████████████████████▊             | 9018/10950 [1:22:11<17:13,  1.87it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=103&rankingSeq=3
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=103&rankingSeq=3
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=103&rankingSeq=3
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=103&rankingSeq=3
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=103&rankingSeq=3
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001321801&date=20200516&type=1&rankingSectionId=

 85%|████████████████████████████████████████████████████████████████           | 9346/10950 [1:25:12<13:56,  1.92it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=103&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=103&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=103&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=103&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=103&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003534279&date=20200527&type=1&rankingSectionId=

 90%|███████████████████████████████████████████████████████████████████▎       | 9826/10950 [1:29:41<09:02,  2.07it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=103&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=103&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=103&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=103&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=103&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200612&type=1&rankingSectionId=

 90%|███████████████████████████████████████████████████████████████████▌       | 9857/10950 [1:29:59<10:02,  1.81it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=103&rankingSeq=2
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=103&rankingSeq=2
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=103&rankingSeq=2
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=103&rankingSeq=2
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=103&rankingSeq=2
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001331111&date=20200613&type=1&rankingSectionId=

 92%|████████████████████████████████████████████████████████████████████      | 10066/10950 [1:31:49<06:57,  2.12it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=103&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=103&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=103&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=103&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=103&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=005&aid=0001333664&date=20200620&type=1&rankingSectionId=

 94%|█████████████████████████████████████████████████████████████████████▍    | 10279/10950 [1:33:45<06:00,  1.86it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=103&rankingSeq=4
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=103&rankingSeq=4
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=103&rankingSeq=4
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=103&rankingSeq=4
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=103&rankingSeq=4
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003542126&date=20200627&type=1&rankingSectionId=

 95%|██████████████████████████████████████████████████████████████████████▍   | 10421/10950 [1:34:58<04:10,  2.11it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=102&rankingSeq=1
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=102&rankingSeq=1
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=102&rankingSeq=1
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=102&rankingSeq=1
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=102&rankingSeq=1
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014141&date=20200702&type=1&rankingSectionId=

 96%|██████████████████████████████████████████████████████████████████████▊   | 10487/10950 [1:35:35<04:22,  1.76it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=103&rankingSeq=2
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=103&rankingSeq=2
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=103&rankingSeq=2
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=103&rankingSeq=2
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=103&rankingSeq=2
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0003014675&date=20200704&type=1&rankingSectionId=

 96%|███████████████████████████████████████████████████████████████████████   | 10519/10950 [1:35:53<03:51,  1.87it/s]

content https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=103&rankingSeq=4
good https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=103&rankingSeq=4
warm https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=103&rankingSeq=4
sad https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=103&rankingSeq=4
angry https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=103&rankingSeq=4
want https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=023&aid=0003544102&date=20200705&type=1&rankingSectionId=

100%|██████████████████████████████████████████████████████████████████████████| 10950/10950 [1:39:45<00:00,  1.83it/s]


## Part3. bs

In [16]:
def get_content_reactions(url):
    """Fetch one article's body text and its reaction counts.

    The article page can use one of three layouts (general news,
    entertainment, sports), each with its own body selector and its own
    pair of service names for the reaction API. The layouts are tried in
    that order and the first one that yields both a body and a reaction
    payload wins.

    Args:
        url: Article URL carrying ``oid`` and ``aid`` query parameters.

    Returns:
        A ``(content, reactions)`` tuple. ``content`` is the stripped body
        text and ``reactions`` is a ``pd.Series`` of counts indexed by
        reaction type (0 for types without a reported count). Both are
        ``None`` when no layout could be crawled, which the caller treats
        as a deleted article.
    """
    # Imported inside the function so that multiprocess workers on Windows
    # have these names available.
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from urllib.parse import parse_qs, urlsplit

    def fetch_reactions(oid, aid, q, q2):
        """Query the reaction API and return the counts as a pd.Series."""
        # NOTE(review): the second bracket group is not closed with %5D;
        # kept exactly as the original request - confirm against the API.
        url_reaction = f'https://news-like.naver.com/v1/search/contents?q={q}%5Bne_{oid}_{aid}%5D%7C{q2}%5B{oid}_{aid}'
        payload = requests.request("GET", url_reaction).json()
        entry = payload['contents'][0]
        # Seed every reaction type listed in reactionTextMap with 0, then
        # overwrite the ones the API actually reports a count for.
        reactions = pd.Series(0, index=entry['reactionTextMap']['zh-hans'].keys())
        for reaction in entry['reactions']:
            reactions.loc[reaction['reactionType']] = reaction['count']
        return reactions

    # (body selector, q, q2, label printed on success); general news is silent.
    layouts = (
        ('#articleBodyContents', 'NEWS', 'NEWS_SUMMARY', None),
        ('#articeBody', 'ENTERTAIN', 'ENTERTAIN_MAIN', 'ENTERTAIN'),
        ('#newsEndContents', 'SPORTS', 'SPORTS_MAIN', 'SPORTS'),
    )

    try:
        # Read oid/aid from the query string instead of fixed character
        # offsets, so ids of any width and any parameter order work.
        query = parse_qs(urlsplit(url).query)
        oid = query['oid'][0]
        aid = query['aid'][0]
        # The page is the same for every layout attempt: fetch and parse once.
        response = requests.request('GET', url)
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Best-effort crawl: an unreadable article is reported as deleted.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        print('DELETED')
        return None, None

    for selector, q, q2, label in layouts:
        try:
            content = soup.select(selector)[0].text.strip()
            reactions = fetch_reactions(oid, aid, q, q2)
        except Exception:
            # Wrong layout (or no reaction data under these names): try next.
            continue
        if label is not None:
            print(label)
        return content, reactions

    # No layout matched: deleted article.
    print('DELETED')
    return None, None

In [17]:
# Crawl every article URL with a pool of 14 worker processes, showing progress
article_urls = df_top_news['url'].values
with Pool(14) as pool:
    lazy_results = pool.imap(get_content_reactions, article_urls)
    progress = tqdm(lazy_results, total=len(df_top_news))
    # Consuming the iterator inside the with-block drives the crawl to completion
    content_reactions = list(progress)

100%|████████████████████████████████████████████████████████████████████████████| 10950/10950 [02:45<00:00, 66.27it/s]


In [18]:
# Merge the crawled body text and reaction counts into a copy of the ranking table
df2 = df_top_news.copy()
for row, (body, counts) in tqdm(enumerate(content_reactions), total=len(content_reactions)):
    df2.loc[row, 'content'] = body
    if body is not None:
        # Deleted articles (body is None) have no reactions, so only the
        # others get their per-reaction columns filled in.
        for name in counts.index:
            df2.loc[row, name] = counts.loc[name]

100%|███████████████████████████████████████████████████████████████████████████| 10950/10950 [00:17<00:00, 628.16it/s]


In [19]:
# Preview the first rows of the merged table
df2.head()

Unnamed: 0,title,url,press,views,content,like,sad,angry,want,warm,cheer,congrats,expect,surprise,fan
0,"[단독] 황교안 딸 운영 사이트, 대학 진학 후 왜 문 닫았나",https://news.naver.com/main/ranking/read.nhn?m...,경향신문,201039,중 3 때 오빠와 장관상이어 고3 때도 ‘장함모’ 활동으로 자원봉사대회 금상“대체로...,131.0,17.0,2605.0,121.0,16.0,,,,,
1,"조국 폭풍페북, 日주장 정면반박…""친일파"" 표현은 野 반발",https://news.naver.com/main/ranking/read.nhn?m...,머니투데이,185396,"[머니투데이 김성휘 ,백지수 기자] [[the300]징용판결 해설 글 ""대통령 법...",1710.0,13.0,6058.0,48.0,32.0,,,,,
2,"조국, 연일 對日 '항전' 주문…""겁먹고 쫄지말자…싸워 이겨…",https://news.naver.com/main/ranking/read.nhn?m...,연합뉴스,130198,"""文정부, 서희와 이순신 역할 함께 수행…지레 겁먹지 말아야""""文정부 매도 정치인·...",1799.0,13.0,9120.0,30.0,19.0,,,,,
3,[김순덕의 도발]복수를 하려면 아일랜드처럼!,https://news.naver.com/main/ranking/read.nhn?m...,동아일보,120897,"친일잔재를 청산하고 한번도 경험하지 못한 나라로 가는 것이 목적이라면, 문재인 정부...",3017.0,16.0,623.0,37.0,19.0,,,,,
4,"조국, 또 페북에 反日 선전전...""文정부, 서희·이순신 역할…",https://news.naver.com/main/ranking/read.nhn?m...,조선일보,119463,"""문재인 정부, 국익 수호 위해 '서희' '이순신' 역할 함께 수행""""법적·외교적 ...",374.0,14.0,11468.0,38.0,13.0,,,,,


In [20]:
# URLs of deleted articles (rows whose 'content' is null)
df2[df2['content'].isnull()]['url'].values

array(['https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191015&type=1&rankingSectionId=105&rankingSeq=1',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=444&aid=0000001280&date=20191016&type=1&rankingSectionId=105&rankingSeq=1'],
      dtype=object)

In [21]:
# URLs of entertainment articles (rows whose 'cheer' reaction column is not null)
df2[~df2['cheer'].isnull()]['url'].values

array(['https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0010994889&date=20190731&type=1&rankingSectionId=103&rankingSeq=4',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=025&aid=0002926865&date=20190802&type=1&rankingSectionId=103&rankingSeq=3',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190902&type=1&rankingSectionId=103&rankingSeq=3',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=032&aid=0002960805&date=20190903&type=1&rankingSectionId=103&rankingSeq=3',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=009&aid=0004445870&date=20191017&type=1&rankingSectionId=103&rankingSeq=2',
       'https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=081&aid=0003037020&date=20191019

In [22]:
# URLs of sports articles (rows whose 'fan' reaction column is not null)
df2[~df2['fan'].isnull()]['url'].values

array(['https://news.naver.com/main/ranking/read.nhn?mid=etc&sid1=111&rankingType=popular_day&oid=001&aid=0011187676&date=20191104&type=1&rankingSectionId=100&rankingSeq=3'],
      dtype=object)

# 데이터 저장

> 파일 형태로 크롤링한 데이터를 저장

In [23]:
# Write the merged table to naver_top_news.csv (relative path; index written by pandas' default)
df2.to_csv('naver_top_news.csv')