In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; every path used
# later in this notebook lives under that mount point.
# force_remount=True is passed so that re-running this cell mounts again
# rather than relying on a mount left over from an earlier run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests, bs4
from lxml import html
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen, urlretrieve
from urllib.parse import urlencode, quote_plus, unquote, urlparse
import os

In [None]:
# --- Configuration ----------------------------------------------------------
# Root of the mounted Google Drive; all inputs/outputs are addressed below it.
path = '/content/drive/MyDrive'
# Directory that receives the per-station files written after preprocessing.
data_dir = path + '/DATA/weatherData'
# Endpoint of the daily ASOS service. It ends with '?' because get_url()
# appends the encoded query string directly to it.
url = "http://apis.data.go.kr/1360000/AsosDalyInfoService/getWthrDataList?"
# SECURITY: a service key is committed in this notebook. Supply your own key
# through the DATA_GO_KR_SERVICE_KEY environment variable; the literal below is
# kept only as a fallback so existing runs keep working, and should be rotated
# and removed. unquote() turns a percent-encoded key into its raw form (and
# leaves an already-decoded key unchanged) so urlencode() encodes it only once.
key = unquote(os.environ.get(
    'DATA_GO_KR_SERVICE_KEY',
    "fd7yiWzReD%2F9h0kE9KeIxpJRgIW7WoyTyr10Yov4chDZCooL53NoY%2BJQm1fXPMsD0BvNZkwOBbVdE%2FEbovhqww%3D%3D",
))

# Make sure the output directory exists before anything is written to it.
os.makedirs(data_dir, exist_ok=True)
# Station table; the loops below iterate over its 'stnId' column.
df_stations = pd.read_csv(path + '/DATA/stations_info.csv', header=0, index_col=None)

In [None]:
def preprocess_file(stnId, src_dir=None):
    """Load one station's raw daily CSV and add calendar columns.

    The raw file ``<src_dir>/<stnId>_data.csv`` must contain a ``date`` column
    formatted as ``YYYY-MM-DD``. The date is split into integer ``year``,
    ``month``, ``day`` and ``jday`` (day of year, 1-based) columns, and the
    frame is reindexed to a fixed column order. Columns that are not in that
    list (including ``date``) are dropped; listed columns missing from the
    file are created and filled with NaN.

    Parameters
    ----------
    stnId : int or str
        Station identifier used to build the file name.
    src_dir : str, optional
        Directory holding the raw file. Defaults to ``path + '/DATA'``.

    Returns
    -------
    pandas.DataFrame
        The reordered frame with the four calendar columns first.

    Raises
    ------
    FileNotFoundError
        If the raw file does not exist.
    ValueError
        If a ``date`` value does not match ``YYYY-MM-DD``.
    """
    if src_dir is None:
        src_dir = path + '/DATA'
    df_data = pd.read_csv('{}/{}_data.csv'.format(src_dir, stnId), header=0, index_col=0)

    # Parse the whole column at once instead of looping over rows: this keeps
    # the calendar fields numeric (the row-wise NumPy stacking used before
    # coerced them to strings such as '01') and avoids quadratic array growth.
    dates = pd.to_datetime(df_data['date'], format='%Y-%m-%d')
    # .to_numpy() keeps the assignment positional, independent of the index.
    df_data['year'] = dates.dt.year.to_numpy()
    df_data['month'] = dates.dt.month.to_numpy()
    df_data['day'] = dates.dt.day.to_numpy()
    df_data['jday'] = dates.dt.dayofyear.to_numpy()

    df_data = df_data.reindex(columns=['year', 'month', 'day', 'jday',
                                       'avgTa', 'minTa', 'maxTa',
                                       'sumRn', 'avgWs', 'minRhm',
                                       'avgRhm', 'sumGsr', 'sumSsHr',
                                       'avgTs', 'avgCm5Te', 'avgCm10Te',
                                       'avgCm20Te', 'avgCm30Te',
                                       'avgM05Te', 'avgM10Te', 'avgM15Te',
                                       'avgM30Te', 'avgM50Te'])
    return df_data

In [None]:
# Preprocess every station whose id is greater than 152 and store the result
# in data_dir. Stations with an id of 152 or lower are skipped.
# NOTE(review): preprocess_file() reads its input from path + '/DATA', which
# is not where the download loop further down writes - confirm the raw files
# are in place before running this cell.
for stnId in df_stations['stnId']:
    if not int(stnId) > 152:
        continue
    df_result = preprocess_file(stnId)
    df_result.to_csv(data_dir + '/{}_data.csv'.format(stnId))

In [None]:
def get_url(start_date, end_date, stnID, pageNo):
    """Return the full request URL for one page of daily ASOS records.

    The query string is appended to the module-level ``url`` and carries the
    module-level ``key`` as ``serviceKey``. Each request asks for 999 rows per
    page in XML.

    Parameters
    ----------
    start_date, end_date : str
        Values sent as ``startDt`` and ``endDt``.
    stnID : int or str
        Value sent as ``stnIds``.
    pageNo : int
        Value sent as ``pageNo``.
    """
    # Ordered (name, value) pairs; the order fixes the query-string order.
    fields = [
        ('serviceKey', key),
        ('numOfRows', 999),
        ('pageNo', pageNo),
        ('dataType', 'XML'),
        ('dataCd', 'ASOS'),
        ('dateCd', 'DAY'),
        ('startDt', start_date),
        ('endDt', end_date),
        ('stnIds', stnID),
    ]
    encoded = urlencode(
        [(quote_plus(name), value) for name, value in fields],
        encoding='UTF-8',
        doseq=True,
    )
    return url + encoded


def read_page(start_date, end_date, stnID, pageNo, timeout=30, max_retries=None, retry_wait=1.0):
    """Download one result page and return it as a parsed XML document.

    The request URL comes from ``get_url``. Connection errors, chunked
    encoding errors and timeouts are reported with ``print`` and retried after
    a pause that doubles on every failure (capped at 64 x ``retry_wait``).

    Parameters
    ----------
    start_date, end_date, stnID, pageNo
        Forwarded unchanged to ``get_url``.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block forever.
    max_retries : int or None, optional
        Maximum number of retries after the first attempt. ``None`` (default)
        retries indefinitely.
    retry_wait : float, optional
        Base pause in seconds between attempts.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``'lxml-xml'`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        The last transport error, once ``max_retries`` is exhausted.
    """
    import time  # local import: keeps this block self-contained

    targetURL = get_url(start_date, end_date, stnID, pageNo)
    # NOTE(review): the printed URL includes the serviceKey query parameter.
    print('The query URL is:{}'.format(targetURL))

    attempt = 0
    while True:
        try:
            # Pass the raw bytes to the parser so the XML decoder picks the
            # encoding, instead of round-tripping through requests' guessed
            # text encoding.
            response = requests.get(targetURL, timeout=timeout).content
            break
        except requests.exceptions.ConnectionError as e:
            print('Connection Error : {}'.format(e))
            last_error = e
        except requests.exceptions.ChunkedEncodingError as e:
            print('Chunked Encoding Error : {}'.format(e))
            last_error = e
        except requests.exceptions.Timeout as e:
            print('Timeout Error : {}'.format(e))
            last_error = e

        attempt += 1
        if max_retries is not None and attempt > max_retries:
            raise last_error
        # Back off instead of hammering the server in a tight loop.
        time.sleep(retry_wait * 2 ** min(attempt - 1, 6))

    xmlobj = bs4.BeautifulSoup(response, 'lxml-xml')
    return xmlobj


def find_items(xmlobj, item):
    """Collect the numeric values of every ``item`` element as a column vector.

    Parameters
    ----------
    xmlobj
        Parsed document exposing ``find_all(name)``; each returned element
        must expose its content as ``.text``.
    item : str
        Tag name to look up.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 1)``, one row per matching element, in
        document order. Elements whose text is empty or whitespace-only become
        ``nan``. When nothing matches, the shape is ``(0, 1)``.

    Raises
    ------
    ValueError
        If an element holds non-blank text that is not a valid float.
    """
    values = []
    for node in xmlobj.find_all(item):
        # Strip first so whitespace-only content counts as missing rather
        # than making float() raise.
        text = node.text.strip()
        values.append(float(text) if text else np.nan)
    return np.array(values, dtype=float).reshape(-1, 1)


def get_data(start_year, stnID, end_date='20210701'):
    """Download all daily records of one station and return them as one array.

    Records are requested from January 1st of ``start_year`` up to
    ``end_date`` through ``read_page``. The first page supplies
    ``totalCount``, from which the number of pages is derived, and is reused
    as page 1 rather than being downloaded a second time.

    Parameters
    ----------
    start_year : int or str
        First year to request; ``'0101'`` is appended to form the start date.
    stnID : int or str
        Station identifier.
    end_date : str, optional
        Last day to request, as ``YYYYMMDD``. Defaults to ``'20210701'``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, 20)``: column 0 is the ``tm`` text (the
        date) and columns 1-19 hold the weather fields in the order listed in
        ``fields``. Because the text column is stacked with float columns,
        NumPy stores every cell as a string (missing values appear as
        ``'nan'``). With zero records the result is an empty float array of
        shape ``(0, 20)``.

    Raises
    ------
    RuntimeError
        If the first response has no ``totalCount`` element.
    ValueError
        If the columns of a page have differing lengths.
    """
    fields = ('avgTa', 'minTa', 'maxTa', 'sumRn', 'avgWs', 'minRhm', 'avgRhm',
              'sumGsr', 'sumSsHr', 'avgTs', 'avgCm5Te', 'avgCm10Te',
              'avgCm20Te', 'avgCm30Te', 'avgM05Te', 'avgM10Te', 'avgM15Te',
              'avgM30Te', 'avgM50Te')
    rows_per_page = 999  # must match the numOfRows value sent by get_url()

    start_date = str(start_year) + '0101'
    first_page = read_page(start_date, end_date, str(stnID), 1)

    total_node = first_page.find('totalCount')
    if total_node is None:
        # NOTE(review): assumes the service reports failures in a resultMsg
        # element - confirm against the API documentation.
        msg_node = first_page.find('resultMsg')
        detail = msg_node.text if msg_node is not None else 'no totalCount element in response'
        raise RuntimeError('ASOS request for station {} failed: {}'.format(stnID, detail))
    total_rows = int(total_node.text)

    # Ceiling division: an exact multiple of the page size needs no extra page.
    page_nums = -(-total_rows // rows_per_page)

    pages = []
    for pageNo in range(1, page_nums + 1):
        if pageNo == 1:
            xml_page = first_page
        else:
            xml_page = read_page(start_date, end_date, str(stnID), pageNo)

        np_dates = np.array([node.text for node in xml_page.find_all('tm')]).reshape(-1, 1)
        columns = [np_dates] + [find_items(xml_page, name) for name in fields]
        np_page = np.hstack(columns)
        pages.append(np_page)

        print('Intermediate Checkpoint :')
        if len(np_page):
            print(np_page[-1, :])

    if not pages:
        return np.zeros((0, len(fields) + 1))
    # Stack once at the end instead of growing the array on every page.
    return np.vstack(pages)

In [None]:
# Download the raw daily records of every station, starting from 1920, and
# store one CSV per station.
# The files are written to path + '/DATA', the directory preprocess_file()
# reads '<stnId>_data.csv' from, instead of a hard-coded Drive root path, so
# the preprocessing step can find them without moving files by hand.
for stn in df_stations['stnId']:
    # True for every non-negative station id; presumably a manual threshold
    # to raise when resuming an interrupted download - TODO confirm.
    if stn >= 0:
        print('Station : {}'.format(stn))
        np_data = get_data(1920, int(stn))
        df_data = pd.DataFrame(np_data, index=None, columns=['date', 'avgTa', 'minTa', 'maxTa', 'sumRn', 'avgWs',
                                                             'minRhm', 'avgRhm', 'sumGsr', 'sumSsHr', 'avgTs',
                                                             'avgCm5Te', 'avgCm10Te', 'avgCm20Te', 'avgCm30Te',
                                                             'avgM05Te', 'avgM10Te', 'avgM15Te', 'avgM30Te',
                                                             'avgM50Te'])
        df_data.to_csv(path + '/DATA/{}_data.csv'.format(stn))