In [1]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta

### 기본값 설정

In [2]:
# Search endpoint that get_data POSTs its JSON request to.
url = 'http://tools.kinds.or.kr:8888/search/news'
# Search expression sent as the "query" argument of every request.
query = '부동산 AND 아파트'
# Overall crawl range. get_data reads these two names as globals, and the main
# cell rebinds them for each two-month window while it is crawling.
start_ymd = '2020-01-01'   # set at day granularity
end_ymd = '2021-09-01'     # set at day granularity
# Unused paging knobs kept for reference; get_data hard-codes its own
# return_from / return_size values.
#return_from = 0
#return_size = 100000 # up to 10,000 items
# Accumulator for parsed rows; run() extends this list in place.
parsed_data = []

[get_ymd] 시작일부터 종료일까지를 2개월 간격으로 나눈 구간 시작일 목록 생성

In [3]:
def get_ymd(start_ymd, end_ymd) :
    """Return the start dates of consecutive two-month windows covering a range.

    Args:
        start_ymd: Start of the range as a 'YYYY-MM-DD' string.
        end_ymd: End of the range as a 'YYYY-MM-DD' string.

    Returns:
        A list of 'YYYY-MM-DD' strings: start_ymd, start_ymd + 2 months,
        start_ymd + 4 months, ... for every step that falls strictly before
        end_ymd. The list is empty when end_ymd is not after start_ymd.
        A span that is not a whole multiple of two months still gets a final
        (partial) window, so the tail of the range is not dropped; a caller
        that extends each start by two months should clamp the last window
        to end_ymd.

    Raises:
        ValueError: If either argument does not match '%Y-%m-%d'.
    """
    res = []
    start = datetime.strptime(start_ymd, '%Y-%m-%d')
    end = datetime.strptime(end_ymd, '%Y-%m-%d')

    step = 0
    window_start = start
    # Always offset from the original start (not from the previous window) so
    # that any end-of-month day adjustment in one window is not carried into
    # the following ones.
    while window_start < end :
        res.append(datetime.strftime(window_start, '%Y-%m-%d'))
        step += 1
        window_start = start + relativedelta(months = 2 * step)
    return res

[get_data] json형식으로 data 가져오기 

In [4]:
def get_data(url) :
    """POST one search request and return the matching documents.

    The request is built from the module-level ``query``, ``start_ymd`` and
    ``end_ymd`` as they are bound at call time, so callers select the date
    window by rebinding those globals before calling.

    Args:
        url: Search endpoint the JSON request is POSTed to.

    Returns:
        The value stored under ``return_object`` -> ``documents`` in the
        decoded JSON response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        KeyError: If the decoded response has no ``return_object`` or
            ``documents`` entry.
    """
    params = {
        # SECURITY: prefer supplying the key through the BIGKINDS_ACCESS_KEY
        # environment variable. The literal is kept only as a fallback so
        # existing runs keep working; it should be rotated and removed.
        "access_key": os.environ.get("BIGKINDS_ACCESS_KEY",
                                     "cef63bff-8972-4140-82b6-2be184378ccc"),
        "argument": {
            "query": query,
            "published_at": {
                "from": start_ymd,
                "until": end_ymd
            },
            "sort": {"date": "desc"},
            "hilight": 200,
            # Only the first 10000 hits of a window are requested; a window
            # with more matches is not paged through here.
            "return_from": 0,
            "return_size": 10000,
            # NOTE(review): parse_result also reads title, published_at,
            # provider and news_id, which are not listed here - confirm the
            # API returns them without being requested.
            "fields": [
                "byline",
                "category",
                "category_incident",
                "provider_news_id",
                "content",  
                "hilight",  
            ]
        }
    }

    headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.post(url, json=params, headers=headers, timeout=120)
    # Surface HTTP failures explicitly instead of as a JSON decoding error.
    r.raise_for_status()
    results = r.json()
    return results['return_object']['documents']

[parse_result] 데이터 파싱

In [5]:
def parse_result(result) :
    """Flatten one API document into a row dict with Korean column names.

    Args:
        result: Mapping for a single document. Missing keys are tolerated and
            yield ``None`` in the corresponding column.

    Returns:
        A dict with the keys, in order: '언론사', '뉴스식별번호', '뉴스출고시간',
        '카테고리', '카테고리체계', '뉴스제목', '뉴스본문', '하이라이트'.
        An empty ``category_incident`` is stored as ``""``.
    """
    category_incident = result.get('category_incident')
    # A decoded JSON array arrives as a Python list, so an empty value must be
    # recognised as [] as well as the literal string "[]".
    if category_incident == "[]" or category_incident == [] :
        category_incident = ""

    res = {'언론사' : result.get('provider'),
           '뉴스식별번호' : result.get('news_id'),
           '뉴스출고시간' : result.get('published_at'),
           '카테고리' : result.get('category'),
           '카테고리체계' : category_incident,
           '뉴스제목' : result.get('title'),
           '뉴스본문' : result.get('content'),
           '하이라이트' : result.get('hilight')}

    return res

[run] 실행함수

In [6]:
def run(url) :
    """Fetch one window of documents and append their parsed rows to ``parsed_data``.

    Args:
        url: Search endpoint forwarded to get_data.

    Returns:
        True when get_data returned a non-empty result and rows were appended,
        False otherwise.
    """
    documents = get_data(url)
    if not documents :
        return False

    # Parse everything first so the shared list is extended in a single step.
    rows = list(map(parse_result, documents))
    parsed_data.extend(rows)
    return True

[save_data] 데이터 저장

In [7]:
def save_data(parsed_data, path="data/bigkinds_news_202001_202109.csv") :
    """Write the collected rows to a CSV file and print the table shape.

    Args:
        parsed_data: Row records accepted by ``pd.DataFrame`` (here, a list of
            dicts).
        path: Destination CSV file. Defaults to the file name used so far, so
            existing calls are unaffected; pass another path when crawling a
            different date range.

    Returns:
        None. The ``(rows, columns)`` shape of the written table is printed.
    """
    df = pd.DataFrame(parsed_data)

    # Create the target folder up front so the collected rows are not lost to
    # a missing directory at write time.
    out_dir = os.path.dirname(path)
    if out_dir :
        os.makedirs(out_dir, exist_ok=True)

    # utf-8-sig writes a BOM in front of the UTF-8 text.
    df.to_csv(path, index=True, encoding='utf-8-sig')
    print(df.shape)

[main] 수집 실행 및 결과 저장

In [8]:
%%time

if __name__ == '__main__' : # entry point: crawl every window, save, then log

    # Remember the overall range: the loop below has to rebind the globals
    # start_ymd / end_ymd for each window because get_data reads them.
    range_start, range_end = start_ymd, end_ymd
    range_end_dt = datetime.strptime(range_end, '%Y-%m-%d')

    # Fail before the crawl, not after it, if the output folder is missing.
    os.makedirs('data', exist_ok=True)

    # Start from an empty accumulator so re-running this cell does not
    # duplicate rows collected by a previous run.
    parsed_data.clear()

    try :
        for st in get_ymd(range_start, range_end) : # window start dates
            window_start = datetime.strptime(st, '%Y-%m-%d')
            # Never let the last window run past the requested end date.
            window_end = min(window_start + relativedelta(months = 2), range_end_dt)
            start_ymd = st
            end_ymd = datetime.strftime(window_end, '%Y-%m-%d')
            # NOTE(review): consecutive windows share their boundary day;
            # confirm whether the API treats "until" as inclusive.
            print(start_ymd, end_ymd)
            run(url)
            time.sleep(0.1)
    finally :
        # Restore the configured range so the cell can be re-run and the log
        # below describes the whole crawl rather than only the last window.
        start_ymd, end_ymd = range_start, range_end

    save_data(parsed_data) # persist the collected rows
    # Record when this crawl was saved and which range it covered.
    now = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')

    with open('data/openapi_log'+ start_ymd + end_ymd +'.txt', 'a+') as f :
        f.write("크롤링날짜\t시작일\t종료일\n")
        f.write("{}\t{}\t{}\n".format(now, start_ymd, end_ymd))


2020-01-01 2020-03-01
2020-03-01 2020-05-01
2020-05-01 2020-07-01
2020-07-01 2020-09-01
2020-09-01 2020-11-01
2020-11-01 2021-01-01
2021-01-01 2021-03-01
2021-03-01 2021-05-01
2021-05-01 2021-07-01
2021-07-01 2021-09-01
(66736, 8)
Wall time: 1min 24s
