# Scraping Korean News

In [2]:
import urllib.request, urllib.parse
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import csv
import random
from fake_useragent import UserAgent

## Scrape news links from the [Naver News portal](https://news.naver.com/)

In [3]:
# One accumulator list per output column; the scraping loop appends to them
# in lockstep, one entry per retained search hit.
article_title, article_outlet, article_date = [], [], []
article_contacttracing, article_lead, article_url = [], [], []

# Outlet labels (exactly as they appear in the search results) whose
# articles are kept.
news = [
    '조선일보언론사 선정',
    '디지틀조선일보',
    '조선일보',
    '중앙일보언론사 선정',
    '중앙일보',
    '동아일보언론사 선정',
    '동아일보',
    '한겨레언론사 선정',
    '한겨레',
    '경향신문언론사 선정',
    '경향신문',
]

In [4]:
# User-tunable settings for one scraping run.

# Inclusive search window, written as YYYYMMDD integers.
start_date, end_date = 20200215, 20200217

# Upper bound used by the scraping loop. Any more than 2000 and liable to
# run into issues.
num_articles_to_scrape = 500

In [5]:
# Walk the Naver news search result pages and record every hit published by
# one of the outlets listed in `news`.
#
# NOTE(review): despite its name, num_articles_to_scrape is used here as the
# exclusive upper bound on the *page* number (each page advances 'start' by
# 10) — confirm that this is the intended meaning.

# The 'ds'/'de' parameters take dotted dates. Derive them from the configured
# range so they can never disagree with the 'nso' filter below.
start_text, end_text = str(start_date), str(end_date)
search_from = f'{start_text[:4]}.{start_text[4:6]}.{start_text[6:]}'
search_to = f'{end_text[:4]}.{end_text[4:6]}.{end_text[6:]}'

# Loop-invariant helpers: one UserAgent instance is enough (.random still
# picks a fresh agent string per request), and the patterns never change.
ua = UserAgent()
result_id_pattern = re.compile(r'sp_nws')
title_class_pattern = re.compile(r'_sp_each_title')

for page in range(1, num_articles_to_scrape):
    # Random pause of up to three seconds between requests.
    time.sleep(random.random() * 3)

    params = urllib.parse.urlencode({
        'where': 'news',
        'query': '역학조사+역학조사+코로나',  # search terms (article body)
        'sm': 'tab_pge',
        'sort': '1',
        'photo': '0',
        'field': '0',
        'reporter_article': '',
        'pd': '3',
        'ds': search_from,  # search start date
        'de': search_to,  # search end date
        'docid': '',
        'nso': f'so:dd,p:from{start_date}to{end_date},a:all',  # start and end date
        'mynews': '0',
        'cluster_rank': (page - 1) * 14,
        'start': (page - 1) * 10 + 1,
        'refresh_start': '0',
    })

    print("Page", page)
    request_url = 'https://search.naver.com/search.naver?&%s' % params
    headers = {'User-Agent': ua.random}
    req = urllib.request.Request(request_url, headers=headers)

    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')
    mylist = soup.find_all('li', attrs={'id': result_id_pattern})
    print(len(mylist))

    for li in mylist:
        title_tag = li.find('a', attrs={'class': title_class_pattern})
        outlet_tag = li.find('span', attrs={'class': '_sp_each_source'})
        if title_tag is None or outlet_tag is None:
            # Not a regular article entry; skip it instead of aborting the run.
            continue

        outlet = outlet_tag.text.strip()
        if outlet not in news:
            continue

        title = (title_tag.get('title') or 'NA').strip()

        # The date is the bare text node that follows the 'bar' separator.
        date = 'NA'
        bar = li.find('span', attrs={'class': 'bar'})
        if bar is not None and isinstance(bar.next_sibling, str):
            date = bar.next_sibling.strip()

        contacttracing = '코로나 역학조사'
        url = li.find('a').get('href', 'NA')

        article_title.append(title)
        article_outlet.append(outlet)
        article_date.append(date)
        article_contacttracing.append(contacttracing)
        article_url.append(url)

Page 1


KeyboardInterrupt: 

In [6]:
# Convert the accumulated lists into a dataframe, one column per list.
# (The original assigned the column names to an undefined name `dfs`.)
df = pd.DataFrame({
    'outlet': article_outlet,
    'date': article_date,
    'keywords': article_contacttracing,
    'title': article_title,
    'url': article_url,
})

df.head()

Unnamed: 0,outlet,date,keywords,title,url


In [265]:
# Remove repeated hits first and renumber afterwards, so the index written
# to the CSV is contiguous (resetting before dropping leaves gaps).
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv(f"naver_news{start_date}-{end_date}.csv")

## Concatenate all the data

In [None]:
# Something like this for all the URLs gathered: load every per-run file,
# merge them, and write one de-duplicated, date-sorted link table.

link_files = [f"news_finished/naver_news_{n:02d}_text.csv" for n in range(1, 7)]
df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in link_files)

# All six files take part in the merge (df6 was previously read but left out).
df = pd.concat([df1, df2, df3, df4, df5, df6]).sort_values(by="date")
# "Unnamed: 0" is the index column saved by an earlier to_csv call.
df.drop(columns="Unnamed: 0", inplace=True)
df.drop_duplicates(inplace=True)

df.reset_index(drop=True, inplace=True)
df.to_csv("naver_news_links.csv")

## Scrape Content of Individual Articles

In [45]:
# Reload the merged link table and give it a fresh 0..n-1 index.
# NOTE(review): if the file was saved with its index, it comes back as an
# "Unnamed: 0" column here — confirm whether it should be dropped.
links_path = "naver_news_links.csv"
df = pd.read_csv(links_path).reset_index(drop=True)

In [178]:
def digital_chosun(url):
    """Fetch a 디지틀조선일보 (Digital Chosun) article page and extract its date and body.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string
        (four digits, two digits, two digits, each pair split by any single
        character) found in the ``div.inputdate`` element, and the stripped
        text of the first ``div.article`` element.

    Raises:
        AttributeError: If the date element or a date inside it is missing.
        IndexError: If the page has no ``div.article`` element.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Remove script/style elements so their contents do not leak into .text.
    for script in soup(["script", "style"]):
        script.decompose()

    a = soup.find('div', attrs={'class': 'inputdate'}).text
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', a).group()
    text = soup.find_all('div', attrs={'class': 'article'})[0].text.strip()
    return (date, text)


In [297]:
def donga(url):
    """Fetch a 동아일보 (Dong-A Ilbo) article page and extract its date and body.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string found in the
        first ``span`` whose class matches ``date01``, and the stripped text
        of the first ``div.article_txt`` element.

    Raises:
        AttributeError: If the date element or a date inside it is missing.
        IndexError: If the page has no ``div.article_txt`` element.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Remove script/style elements so their contents do not leak into .text.
    for script in soup(["script", "style"]):
        script.decompose()

    a = soup.find('span', attrs={'class': re.compile(r'date01')}).text
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', a).group()
    text = soup.find_all('div', attrs={'class': 'article_txt'})[0].text.strip()
    return (date, text)


In [332]:
def khan(url):
    """Fetch a 경향신문 (Kyunghyang Shinmun) article page and extract its date and body.

    The request is sent with an explicit ``Mozilla/5.0`` User-Agent header.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string found in the
        ``em`` inside ``div.byline``, and the text of every
        ``p.content_text`` paragraph joined with single spaces.

    Raises:
        AttributeError: If the byline, its ``em`` or a date inside it is
            missing.
    """
    agent = {"User-Agent": "Mozilla/5.0"}
    req = urllib.request.Request(url, headers=agent)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    byline = soup.find("div", attrs={"class": "byline"})
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', byline.find("em").text).group()

    # The headline lookup that used to sit here discarded its result and only
    # served to raise on pages without an <h1>, so it has been removed.
    paragraphs = soup.find_all("p", attrs={"class": "content_text"})
    text = ' '.join(paragraph.text for paragraph in paragraphs)
    return (date, text)
    

In [127]:
def jungang_ilbo(url):
    """Fetch a 중앙일보 (JoongAng Ilbo) article page and extract its date and body.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string found in the
        second ``em`` directly under the ``.byline`` element, and the
        stripped text of the first ``div.article_body`` element.

    Raises:
        IndexError: If the byline ``em`` or the article body is missing.
        AttributeError: If no date is found in the byline text.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Remove script/style elements so their contents do not leak into .text.
    for script in soup(["script", "style"]):
        script.decompose()

    a = soup.select(".byline > em:nth-of-type(2)")[0].text
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', a).group()
    text = soup.find_all('div', attrs={'class': 'article_body'})[0].text.strip()
    return (date, text)

In [104]:
def hangurae(url):
    """Fetch a 한겨레 (Hankyoreh) article page and extract its date and body.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string found in the
        first ``span`` of the ``p`` whose class matches ``date-time``, and
        the stripped text of the first ``div.text`` element.

    Raises:
        AttributeError: If the date paragraph, its ``span`` or a date inside
            it is missing.
        IndexError: If the page has no ``div.text`` element.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Remove script/style elements so their contents do not leak into .text.
    for script in soup(["script", "style"]):
        script.decompose()

    a = soup.find('p', attrs={'class': re.compile(r'date-time')})
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', a.span.text).group()
    text = soup.find_all('div', attrs={'class': 'text'})[0].text.strip()
    return (date, text)

In [179]:
def chosun_ilbo(url):
    """Fetch a 조선일보 (Chosun Ilbo) article page and extract its date and body.

    Args:
        url: Address of the article page.

    Returns:
        A ``(date, text)`` tuple: the first date-shaped string found in the
        ``div`` whose class matches ``news_date``, and the stripped text of
        the first ``div.par`` element.

    Raises:
        AttributeError: If the date element or a date inside it is missing.
        TypeError: If the date element's ``.string`` is ``None``.
        IndexError: If the page has no ``div.par`` element.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Remove script/style elements so their contents do not leak into .text.
    for script in soup(["script", "style"]):
        script.decompose()

    a = soup.find('div', attrs={'class': re.compile(r'news_date')}).string
    date = re.search(r'(\d\d\d\d.\d\d.\d\d)', a).group()
    text = soup.find_all('div', attrs={'class': 'par'})[0].text.strip()
    return (date, text)

In [189]:
def get_text(s):
    """Scrape the article behind one row of the link table.

    The row's date and URL are printed, then the scraper matching the row's
    outlet is called.

    Args:
        s: A row-like object exposing ``date``, ``url`` and ``outlet``
            attributes (e.g. a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        The ``(date, text)`` tuple produced by the outlet's scraper, or the
        string ``"NA"`` when the URL contains "seouland", the outlet is not
        recognised, or the scraper raised an exception.
    """
    print(s.date, s.url)
    try:
        if "seouland" in s.url:
            return "NA"
        # Test the longer name first: "조선일보" is a substring of
        # "디지틀조선일보", so the order of these two checks matters.
        if "디지틀조선일보" in s.outlet:
            return digital_chosun(s.url)
        if "조선일보" in s.outlet:
            return chosun_ilbo(s.url)
        if "한겨레" in s.outlet:
            return hangurae(s.url)
        if "중앙일보" in s.outlet:
            return jungang_ilbo(s.url)
        if "동아일보" in s.outlet:
            return donga(s.url)
        if "경향신문" in s.outlet:
            return khan(s.url)
        return "NA"
    except Exception as exc:
        # Best effort: one failing page must not abort the whole run.
        # Catching Exception (not a bare except) keeps Ctrl-C working, and
        # type(exc) prints the same class the old sys.exc_info()[0] call
        # was meant to, without needing the never-imported sys module.
        print("Exception", type(exc))
        return "NA"

In [None]:
# Run the scraper on every row; each result is stored whole (a (date, text)
# tuple, or "NA") in a temporary 'date_text' column.
df['date_text'] = df.apply(get_text, axis=1)

In [None]:
# Separate the (date, text) tuple into its own columns and convert the date
# to a proper datetime.
for index, row in df.iterrows():
    result = row['date_text']
    if isinstance(result, tuple):
        scraped_date, article_text = result
        parsed_date = pd.to_datetime(scraped_date)
        df.loc[index, 'date'] = parsed_date
        print(parsed_date)
        df.loc[index, 'text'] = article_text
    elif not pd.isnull(result):
        # A failed scrape is the plain string "NA". Unpacking it like a
        # tuple would split it into "N" and "A" and store "A" as the article
        # text, so keep the marker intact and leave the date untouched.
        df.loc[index, 'text'] = "NA"
df.drop(columns='date_text', inplace=True)



In [None]:
# Write the finished table (links plus scraped date and text) to disk.
output_path = "naver_news.csv"
df.to_csv(output_path)