In [18]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import os

class KMAHourlyDataCollector:
    """기상청 API를 통해 시간 단위 강수 데이터를 수집하는 클래스"""
    
    def __init__(self, auth_key, stn_id, start_date, end_date, output_dir='./data'):
        """
        auth_key: 기상청 API 인증키
        stn_id: 관측소 ID (예: 108=서울, 159=부산)
        start_date: 시작 날짜 (datetime 객체)
        end_date: 종료 날짜 (datetime 객체)
        output_dir: 데이터 저장 디렉토리
        """
        self.auth_key = auth_key
        self.stn_id = stn_id
        self.start_date = start_date
        self.end_date = end_date
        self.output_dir = output_dir
        self.base_url = "https://apihub.kma.go.kr/api/typ01/url/kma_sfctm2.php"
        
        # 출력 디렉토리 생성
        os.makedirs(output_dir, exist_ok=True)
        
    def parse_line(self, line):
        """한 줄의 데이터를 파싱"""
        if line.startswith('#') or len(line) < 100:
            return None
            
        try:
            # 시간 정보 (YYMMDDHHMI)
            time_str = line[0:12]
            dt = datetime.strptime(time_str, '%Y%m%d%H%M')
            
            # 관측소 ID
            stn = line[12:17].strip()
            
            # 강수량 (RN - 60분 강수량, 위치: 93-98)
            # RN은 여러 개가 있는데, 첫 번째 RN (60분 강수량)을 사용
            rn_str = line[93:99].strip()
            
            # -9.0은 결측값
            if rn_str == '-9.0' or rn_str == '-9':
                precip = None
            else:
                precip = float(rn_str)
                
            return {
                'datetime': dt,
                'stn_id': stn,
                'precip_mm': precip
            }
        except Exception as e:
            return None
    
    def fetch_data_for_datetime(self, dt):
        """특정 시각의 데이터를 API로부터 가져오기"""
        tm = dt.strftime('%Y%m%d%H%M')
        
        params = {
            'tm': tm,
            'stn': self.stn_id,
            'help': 0,
            'authKey': self.auth_key
        }
        
        try:
            response = requests.get(self.base_url, params=params, timeout=10)
            if response.status_code == 200:
                lines = response.text.splitlines()
                
                # 해당 관측소의 데이터만 필터링
                for line in lines:
                    parsed = self.parse_line(line)
                    if parsed and parsed['stn_id'] == str(self.stn_id):
                        return parsed
            return None
        except Exception as e:
            print(f"Error fetching {tm}: {e}")
            return None
    
    def collect_all_data(self, save_interval=24*30):  # 한 달마다 저장
        """전체 기간의 데이터를 수집"""
        print(f"데이터 수집 시작: {self.start_date} ~ {self.end_date}")
        print(f"관측소 ID: {self.stn_id}")
        
        # 시간 단위로 순회
        current_dt = self.start_date
        data_list = []
        count = 0
        
        # 총 시간 개수 계산
        total_hours = int((self.end_date - self.start_date).total_seconds() / 3600) + 1
        
        pbar = tqdm(total=total_hours, desc='Data Collection')
        
        while current_dt <= self.end_date:
            data = self.fetch_data_for_datetime(current_dt)
            
            if data:
                data_list.append(data)
            
            count += 1
            
            # 주기적으로 저장 (메모리 관리)
            if count % save_interval == 0:
                self._save_intermediate(data_list)
                data_list = []
            
            # API 과부하 방지 (초당 10회 정도로 제한)
            time.sleep(0.1)
            
            current_dt += timedelta(hours=1)
            pbar.update(1)
        
        pbar.close()
        
        # 남은 데이터 저장
        if data_list:
            self._save_intermediate(data_list)
        
        # 모든 중간 파일 병합
        self._merge_all_files()
        
        print(f"\n✓ 데이터 수집 완료!")
        
    def _save_intermediate(self, data_list):
        """중간 데이터 저장"""
        if not data_list:
            return
            
        df = pd.DataFrame(data_list)
        df = df.sort_values('datetime')
        
        filename = f'intermediate_{self.stn_id}_{df["datetime"].iloc[0].strftime("%Y%m%d")}.csv'
        filepath = os.path.join(self.output_dir, filename)
        
        df.to_csv(filepath, index=False)
        print(f"\nIntermediate save: {filename} ({len(df)} records)")
    
    def _merge_all_files(self):
        """모든 중간 파일을 하나로 병합"""
        print("\n중간 파일 병합 중...")
        
        # 중간 파일 찾기
        intermediate_files = [f for f in os.listdir(self.output_dir) 
                             if f.startswith(f'intermediate_{self.stn_id}_')]
        
        if not intermediate_files:
            print("병합할 중간 파일이 없습니다.")
            return
        
        # 모든 파일 읽어서 병합
        df_list = []
        for filename in sorted(intermediate_files):
            filepath = os.path.join(self.output_dir, filename)
            df_temp = pd.read_csv(filepath, parse_dates=['datetime'])
            df_list.append(df_temp)
        
        # 병합 및 정렬
        df_final = pd.concat(df_list, ignore_index=True)
        df_final = df_final.sort_values('datetime')
        df_final = df_final.drop_duplicates(subset=['datetime'])
        
        # 최종 파일 저장
        station_names = {
            108: 'seoul', 159: 'busan', 105: 'gangneung', 
            127: 'chungju', 136: 'andong', 156: 'gwangju',
            168: 'yeosu', 184: 'jeju'
        }
        station_name = station_names.get(int(self.stn_id), f'stn{self.stn_id}')
        
        final_filename = f'{station_name}_hourly.csv'
        final_filepath = os.path.join(self.output_dir, final_filename)
        
        df_final.to_csv(final_filepath, index=False)
        print(f"✓ 최종 파일 저장: {final_filename}")
        print(f"  - 총 레코드 수: {len(df_final)}")
        print(f"  - 기간: {df_final['datetime'].min()} ~ {df_final['datetime'].max()}")
        print(f"  - 결측값: {df_final['precip_mm'].isna().sum()} / {len(df_final)}")
        
        # 중간 파일 삭제
        for filename in intermediate_files:
            filepath = os.path.join(self.output_dir, filename)
            os.remove(filepath)
        print(f"✓ 중간 파일 {len(intermediate_files)}개 삭제 완료")


# ==============================================================================
# 실행 예시
# ==============================================================================
if __name__ == "__main__":
    
    # 설정
    AUTH_KEY = "fyJfARSHQiSiXwEUh9IkzQ"  # 실제 인증키로 교체
    
    # 관측소 코드
    STATIONS = {
        'seoul': 108,
    }
    
    # 수집 기간
    START_DATE = datetime(2001, 1, 1, 0, 0)
    END_DATE = datetime(2005, 1, 1, 23, 0)
    
    # 한 개 관측소만 테스트
    station_name = 'seoul'
    stn_id = STATIONS[station_name]
    

    
    # 데이터 수집
    collector = KMAHourlyDataCollector(
        auth_key=AUTH_KEY,
        stn_id=stn_id,
        start_date=START_DATE,
        end_date=END_DATE
    )
    
    collector.collect_all_data()
    


기상청 시간 단위 강수 데이터 수집
관측소: seoul (ID: 108)
기간: 2001-01-01 00:00:00 ~ 2005-01-01 23:00:00
예상 데이터 수: 35088 시간

데이터 수집 시작: 2001-01-01 00:00:00 ~ 2005-01-01 23:00:00
관측소 ID: 108




Error fetching 200101060600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101060700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101071000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101071100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101112300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101120000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101171400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101230000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101230200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101230800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200101230900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010101.csv (709 records)




Error fetching 200102112300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102200100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102200200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102200300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102200400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102201100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102201400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102201500: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102201600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102201700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102202100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102202200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102210000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102250900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102251300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200102270100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010131.csv (704 records)




Error fetching 200103280400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103280500: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103280600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103280700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103281100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103281200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103281600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103310800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103311000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103311100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103311200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103312100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200103312300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)

Intermediate save: intermediate_108_20010302.csv (707 records)




Error fetching 200104010000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104010100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104091700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104112000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104121400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104121600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104142000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104142200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200104151000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010401.csv (711 records)




Error fetching 200105162100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200105162200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200105162300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200105170000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200105170200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200105191700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010501.csv (714 records)




Error fetching 200106101000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106101100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106190200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106230600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010531.csv (716 records)




Error fetching 200106300000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106301700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106301800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200106301900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010630.csv (716 records)




Error fetching 200107301600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108020700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108060600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108061300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108110900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108131700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108131800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108141200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108141600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108141700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200108172300: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010730.csv (709 records)





Intermediate save: intermediate_108_20010829.csv (720 records)




Error fetching 200110161500: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110162000: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110171800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110210100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110221600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110221700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110240400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110240700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20010928.csv (712 records)




Error fetching 200110290400: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200110290600: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20011028.csv (718 records)




Error fetching 200112022200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200112030100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200112030200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200112250700: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200112250800: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)





Intermediate save: intermediate_108_20011127.csv (715 records)




Error fetching 200201021900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200201022100: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200201090900: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)




Error fetching 200201091200: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Read timed out. (read timeout=10)


Data Collection:   0%|          | 4/210384 [1:34:06<82500:56:40, 1411.75s/it]


KeyboardInterrupt: 