In [1]:
import os
import sys
import time
from datetime import datetime
from datetime import timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def check_xml(mykey, endpoint, stday, rows, dirty_optimize=False):
    """Probe the endpoint for one day, tune the page size, and fetch page 1.

    A first request is sent with ``rows`` rows per page. While the response
    body is empty the page size is reduced by 15 and the request repeated.
    The probe response is then handed to ``optimize_dirty`` (when
    ``dirty_optimize`` is true, falling back to ``optimize_row`` on any
    failure) or ``optimize_row`` to pick the final page size, and page 1 is
    fetched again with that size.

    Args:
        mykey: service key sent as ``serviceKey``.
        endpoint: URL of the API.
        stday: day to query; used for both ``stDt`` and ``edDt``.
        rows: initial ``numOfRows`` (an integer).
        dirty_optimize: try ``optimize_dirty`` before ``optimize_row``.

    Returns:
        ``(res, rpage, rows)``: the parsed first page, the number of pages
        needed to cover ``totalCount`` (at least 1), and the ``numOfRows``
        value reported by the response.

    Exits the process (``sys.exit(1)``) when the first request cannot be
    sent, when the body stays empty even for one row, when the retry after
    result code '04' fails, or on result code '10'.
    """
    # NOTE(review): dataType is 'json' but every response is parsed as XML
    # below - confirm the endpoint ignores this parameter.
    params = {'stDt':stday, 'edDt':stday, 'stTm':'00', 'edTm':'24', 'liIndDiv':None,
        'numOfRows':str(rows), 'pageNo':str(1), 'dataType':'json', 'serviceKey':mykey}

    try:
        r = requests.get(endpoint, params=params)
    except requests.RequestException:
        print("K-Water API 연결 실패")
        sys.exit(1)

    if r.status_code == 401:
        # Retry once on 401.
        r = requests.get(endpoint, params=params)
    elif r.status_code == 429:
        # Retry-After is an HTTP response header (seconds); fall back to one
        # second when it is missing or not an integer.
        try:
            wait = max(0, int(r.headers.get('Retry-After', 1)))
        except (TypeError, ValueError):
            wait = 1
        time.sleep(wait)
        r = requests.get(endpoint, params=params)
    elif r.status_code != 200:
        print('K-Water API 호출에러 발생!:',r.status_code)

    # An empty body is treated as "page too large": shrink the page size and
    # retry. `rows` is updated on every pass so the loop always terminates.
    while r.text == '' and rows > 1:
        newrows = max(1, rows - 15)
        print('Too much Rows! set rows {} -> {}'.format(rows, newrows))
        rows = newrows
        params['numOfRows'] = str(rows)
        r = requests.get(endpoint, params=params)
    if r.text == '':
        print('K-Water API returned an empty response')
        sys.exit(1)

    if dirty_optimize:
        probe = r
        try:
            params['numOfRows'] = optimize_dirty(probe)
            r = requests.get(endpoint, params=params)
            # Parse here only to verify that the tuned request is usable.
            BeautifulSoup(r.text, 'lxml-xml').find('header').find('resultCode').get_text()
        except Exception:
            print('dirty optimize failed retrying optimize...')
            params['numOfRows'] = optimize_row(probe)
            r = requests.get(endpoint, params=params)
    else:
        params['numOfRows'] = optimize_row(r)
        r = requests.get(endpoint, params=params)

    res = BeautifulSoup(r.text, 'lxml-xml')
    rcode = res.find('header').find('resultCode').get_text()

    if rcode == '03':
        print('조회기간 동안 발생한 기록이 없습니다. (code:{})'.format(rcode))
    elif rcode == '04':
        # Wait a second and retry once.
        try:
            time.sleep(1)
            r = requests.get(endpoint, params=params)
            res = BeautifulSoup(r.text, 'lxml-xml')
        except Exception:
            print('Open API HTTP Error!')
            sys.exit(1)
    elif rcode == '10':
        print('파라미터를 잘못 입력했습니다. 다시 시도해주세요!')
        sys.exit(1)
    elif rcode != '00':
        print('Open API Error! ({}: {})'.format(rcode, res.find('resultMsg')))

    rescount = int(res.find('body').find('totalCount').get_text())
    rows = int(res.find('body').find('numOfRows').get_text())
    # Ceiling division: an exact multiple must not add an empty trailing
    # page, and there is always at least one page.
    rpage = max(1, -(-rescount // rows)) if rows > 0 else 1

    return res, rpage, rows


In [3]:
def get_xml(mykey, endpoint, stday, rows, npage):
    """Fetch one result page for a single day and return it parsed.

    Args:
        mykey: service key sent as ``serviceKey``.
        endpoint: URL of the API.
        stday: day to query; used for both ``stDt`` and ``edDt``.
        rows: page size sent as ``numOfRows``.
        npage: page number sent as ``pageNo``.

    Returns:
        The response body parsed by BeautifulSoup with the 'lxml-xml' parser.

    Exits the process (``sys.exit(1)``) when the request cannot be sent,
    when the retry after result code '04' fails, or on result code '10'.
    """
    # NOTE(review): dataType is 'json' but the response is parsed as XML
    # below - confirm the endpoint ignores this parameter.
    params = {'stDt':stday, 'edDt':stday, 'stTm':'00', 'edTm':'24', 'liIndDiv':None,
    'numOfRows':str(rows), 'pageNo':str(npage), 'dataType':'json', 'serviceKey':mykey}

    try:
        r = requests.get(endpoint, params=params)
    except requests.RequestException:
        print("K-Water API 연결 실패")
        sys.exit(1)

    if r.status_code == 401:
        # Retry once on 401.
        r = requests.get(endpoint, params=params)
    elif r.status_code == 429:
        # Retry-After is an HTTP response header (seconds); fall back to one
        # second when it is missing or not an integer.
        try:
            wait = max(0, int(r.headers.get('Retry-After', 1)))
        except (TypeError, ValueError):
            wait = 1
        time.sleep(wait)
        r = requests.get(endpoint, params=params)
    elif r.status_code != 200:
        print('K-Water API 호출에러 발생!:',r.status_code)

    res = BeautifulSoup(r.text, 'lxml-xml')
    rcode = res.find('header').find('resultCode').get_text()

    if rcode == '03':
        print('조회기간 동안 발생한 기록이 없습니다. (code:{})'.format(rcode))
    elif rcode == '04':
        # Wait a second and retry once.
        try:
            time.sleep(1)
            r = requests.get(endpoint, params=params)
            res = BeautifulSoup(r.text, 'lxml-xml')
        except Exception:
            print('Open API HTTP Error!')
            sys.exit(1)
    elif rcode == '10':
        print('파라미터를 잘못 입력했습니다. 다시 시도해주세요!')
        sys.exit(1)
    elif rcode != '00':
        print('Open API Error! ({}: {})'.format(rcode, res.find('resultMsg')))
    return res

In [4]:
def get_waterdf(res):
    """Turn every ``item`` element of a parsed response into a DataFrame row.

    Args:
        res: parsed document exposing ``find_all('item')``; each item must
            expose ``find(tag).get_text()`` for the tags read below.

    Returns:
        A DataFrame with one row per item (``NO`` converted to int, every
        other column kept as text), or ``None`` when there are no items.
    """
    # Output column -> source tag, in output column order (after 'NO').
    text_fields = (
        ('Time', 'occrrncDt'),
        ('FcName', 'fcltyMngNm'),
        ('FcCode', 'fcltyMngNo'),
        ('FcLoc', 'fcltyAddr'),
        ('water_use', 'liIndDivName'),
        ('Cl_Val', 'clVal'),
        ('Cl_Unit', 'clUnit'),
        ('pH_Val', 'phVal'),
        ('pH_Unit', 'phUnit'),
        ('tb_Val', 'tbVal'),
        ('tb_Unit', 'tbUnit'),
    )

    records = []
    for item in res.find_all('item'):
        record = {'NO': int(item.find('no').get_text())}
        for column, tag in text_fields:
            record[column] = item.find(tag).get_text()
        records.append(record)

    if not records:
        return None
    return pd.DataFrame(columns=records[0].keys(), data=records)

In [5]:
def optimize_row(r, est_row_bytes=300, max_bytes=64000):
    """Pick a page size from the totals reported in a probe response.

    The total record count is split over the smallest number of pages for
    which ``totalCount * est_row_bytes`` stays within ``max_bytes`` per page.

    Args:
        r: response whose ``text`` contains ``body/numOfRows`` and
            ``body/totalCount``.
        est_row_bytes: multiplier applied to the record count; presumably an
            estimate of the bytes one row occupies - TODO confirm.
        max_bytes: divisor used as the per-page budget; presumably the
            largest response size the API delivers - TODO confirm.

    Returns:
        The suggested ``numOfRows`` value, never less than 1 so that an empty
        result set does not produce a zero page size.
    """
    res = BeautifulSoup(r.text, 'lxml-xml')
    body = res.find('body')
    rows = int(body.find('numOfRows').get_text())
    rcount = int(body.find('totalCount').get_text())
    minp = (rcount * est_row_bytes) // max_bytes + 1
    # totalCount == 0 would otherwise yield 0 rows per page.
    optimize_rows = max(1, rcount // minp)
    print('optimized rows {} -> {}'.format(rows, optimize_rows))
    return optimize_rows

In [6]:
def optimize_dirty(r, max_bytes=65535):
    """Pick a page size using the measured size of a probe response.

    The average bytes per row is taken as ``len(r.content) // numOfRows`` and
    the total record count is split over the smallest number of pages for
    which ``totalCount * average`` stays within ``max_bytes`` per page.

    Args:
        r: response whose ``text`` contains ``body/numOfRows`` and
            ``body/totalCount`` and whose ``content`` is the raw body.
        max_bytes: divisor used as the per-page budget; presumably the
            largest response size the API delivers - TODO confirm.

    Returns:
        The suggested ``numOfRows`` value, never less than 1 so that an empty
        result set does not produce a zero page size.

    Raises:
        ZeroDivisionError: when the response reports ``numOfRows`` as 0.
    """
    res = BeautifulSoup(r.text, 'lxml-xml')
    body = res.find('body')
    rows = int(body.find('numOfRows').get_text())
    rcount = int(body.find('totalCount').get_text())
    ravg = len(r.content) // rows
    minp = (rcount * ravg) // max_bytes + 1
    # totalCount == 0 would otherwise yield 0 rows per page.
    optimize_rows = max(1, rcount // minp)
    print('trying dirty optimize rows... {} -> {}'.format(rows, optimize_rows))
    return optimize_rows


In [7]:
def main():
    """Download yesterday's water-quality records and save them as a CSV.

    The service key is read from the ``KWATER_SERVICE_KEY`` environment
    variable so that the credential is not stored in the notebook. The first
    page comes from ``check_xml`` (which also settles the page size and page
    count); the remaining pages are fetched with ``get_xml`` one second
    apart. All pages are combined and written to ``./csv/api3/<date>.csv``.

    Returns:
        The combined DataFrame (empty when no page contained any item).
    """
    mykey = os.environ.get('KWATER_SERVICE_KEY')
    if not mykey:
        print('Set the KWATER_SERVICE_KEY environment variable to the API service key.')
        sys.exit(1)
    endpoint = 'http://apis.data.go.kr/B500001/rwis/waterQuality/list'
    stday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    initrows = 190
    res, rpage, rows = check_xml(mykey, endpoint, stday, initrows, dirty_optimize=False)
    npage = 1
    print('indexing {}/{} pages...'.format(npage, rpage))

    # Collect the per-page frames and concatenate once at the end.
    frames = [get_waterdf(res)]
    while npage < rpage:
        time.sleep(1)
        npage = npage + 1
        print('indexing {}/{} pages...'.format(npage, rpage))
        res = get_xml(mykey, endpoint, stday, rows, npage)
        frames.append(get_waterdf(res))

    # get_waterdf returns None for a page without items.
    frames = [frame for frame in frames if frame is not None]
    water_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    os.makedirs('./csv/api3', exist_ok=True)
    water_df.to_csv("./csv/api3/{}.csv".format(stday), encoding='utf-8')
    print('csv file upload completed ({}.csv)'.format(stday))
    return water_df

In [9]:
# Entry point: run the export when this code is executed directly rather than imported.
if __name__ == '__main__':
    main()

Too much Rows! set rows 190 -> 175
trying dirty optimize rows... 175 -> 180
indexing 1/7 pages...
indexing 2/7 pages...
indexing 3/7 pages...
indexing 4/7 pages...
indexing 5/7 pages...
indexing 6/7 pages...
indexing 7/7 pages...
csv file upload completed (2022-05-03.csv)
