## 공공데이터포털 API 연동

In [2]:
import numpy as np
import pandas as pd
import requests
from datetime import datetime, timedelta
import os
import time
import calendar

In [None]:
# API settings
# NOTE(review): a service key committed in a notebook is a leaked credential.
# Prefer exporting DATA_GO_KR_SERVICE_KEY; the literal below is kept only as a
# fallback so existing runs keep working, and should be rotated/removed.
SERVICE_KEY = os.environ.get(
    "DATA_GO_KR_SERVICE_KEY",
    "a46bbaf106e41963c3883db630366f91e4960f1a08c831475517864811d806f4",
)
BASE_URL = "https://apis.data.go.kr/B552115/PvAmountByLocHr/getPvAmountByLocHr"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

# Date range: every day of the chosen month
year = 2024
month = 2
last_day = calendar.monthrange(year, month)[1]

start_date = datetime(year, month, 1)
end_date = datetime(year, month, last_day)
num_of_rows = 1000


def _fetch_body(trade_ymd, page_no):
    """Request one page for one trade date and return the response body.

    Args:
        trade_ymd: Trade date formatted as YYYYMMDD.
        page_no: 1-based page number to request.

    Returns:
        The ``response.body`` mapping of the JSON payload, or ``None`` when the
        request fails, the HTTP status is not 200, the payload is not JSON, or
        the payload has no ``response.body`` mapping. The reason is printed
        before ``None`` is returned.
    """
    params = {
        "serviceKey": SERVICE_KEY,
        "pageNo": page_no,
        "numOfRows": num_of_rows,
        "dataType": "json",
        "tradeYmd": trade_ymd,
    }

    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"요청 실패: {trade_ymd}, 페이지: {page_no}, 원인: {exc}")
        return None

    if response.status_code != 200:
        print(f"에러 발생: {trade_ymd}, 상태코드: {response.status_code}")
        print(f"응답: {response.text[:200]}")
        return None

    try:
        data = response.json()
    except ValueError:  # JSON decode errors raised by requests subclass ValueError
        print(f"JSON 파싱 에러: {trade_ymd}")
        print(f"응답: {response.text[:200]}")
        return None

    try:
        body = data['response']['body']
    except (KeyError, TypeError):
        body = None
    if not isinstance(body, dict):
        print(f"응답 형식 에러: {trade_ymd}")
        print(f"응답: {response.text[:200]}")
        return None
    return body


def _extract_items(body):
    """Return the records under ``body['items']['item']`` as a list.

    A missing or empty ``items`` value yields an empty list, and a single
    record delivered as a bare dict is wrapped in a list.
    NOTE(review): these shapes are handled defensively; confirm against the
    API's actual responses for empty and single-row days.
    """
    items = body.get('items')
    if isinstance(items, dict):
        items = items.get('item')
    if not items:
        return []
    if isinstance(items, dict):
        return [items]
    return list(items)


# Collect all data
all_items = []
current_date = start_date

while current_date <= end_date:
    trade_ymd = current_date.strftime("%Y%m%d")

    # The first page supplies totalCount and is reused as page 1 of the data,
    # so it is not requested a second time.
    body = _fetch_body(trade_ymd, 1)
    if body is None:
        break

    try:
        total_count = int(body['totalCount'])
    except (KeyError, TypeError, ValueError):
        print(f"totalCount 확인 실패: {trade_ymd}")
        break

    # Ceiling division: number of pages needed to hold total_count rows
    total_pages = (total_count + num_of_rows - 1) // num_of_rows

    # Buffer the day's rows so a failure mid-day never leaves a partial day
    # in all_items.
    day_items = []
    day_failed = False
    for page in range(1, total_pages + 1):
        if page > 1:
            time.sleep(0.1)  # wait 0.1 s between page requests
            body = _fetch_body(trade_ymd, page)
            if body is None:
                day_failed = True
                break
        day_items.extend(_extract_items(body))

    if day_failed:
        break

    all_items.extend(day_items)
    print(f"{trade_ymd} 완료: {total_count}개 데이터")
    current_date += timedelta(days=1)
    time.sleep(0.2)  # wait 0.2 s between dates

# current_date only passes end_date when every day was collected
if current_date <= end_date:
    print(f"수집 중단: {current_date.strftime('%Y%m%d')} 이후 데이터가 누락되었습니다")

# Build the DataFrame
df = pd.DataFrame(all_items)
print(f"\n총 {len(df)}개 데이터 수집 완료")
df.head()

In [None]:
# Drop the rn column, then save the collected month as CSV.
# The file name is derived from the collected year/month so the file can no
# longer be labeled with a different month than the data it holds.
SAVE_NAME = f"data/{year}/{year}_{month:02d}.csv"

if df.empty:
    # Nothing was collected; do not overwrite an existing file with an empty one.
    print("저장할 데이터가 없습니다")
else:
    # errors='ignore' keeps the save working if the rn column is absent
    df_save = df.drop(columns=['rn'], errors='ignore')
    os.makedirs(os.path.dirname(SAVE_NAME), exist_ok=True)
    df_save.to_csv(SAVE_NAME, index=False, encoding='utf-8-sig')
    print(f"{SAVE_NAME} 저장 완료")

## 데이터 전처리

In [4]:
# Load one previously saved monthly CSV and display its summary statistics.
PATH = "data/2023/2023_01.csv"

df = pd.read_csv(filepath_or_buffer=PATH)
df.describe()

Unnamed: 0,tradeNo,tradeYmd,amgo
count,12648.0,12648.0,12648.0
mean,12.5,20230120.0,46.668986
std,6.92246,8.944626,121.384425
min,1.0,20230100.0,0.0
25%,6.75,20230110.0,0.00107
50%,12.5,20230120.0,2.092274
75%,18.25,20230120.0,27.768443
max,24.0,20230130.0,1490.953099
