In [None]:
import os
import requests
import pandas as pd
import time
from tqdm import tqdm
from datetime import datetime, timedelta
import ssl
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from dotenv import load_dotenv
import json
import xml.etree.ElementTree as ET  # 추가: XML 파싱

# python-dotenv: load key=value pairs from a local .env file into the process environment,
# so that DO_API_KEY can be read with os.getenv() further down.
load_dotenv()

In [None]:
# Environment check: print the installed TensorFlow version and list the GPU devices it can see
# (the last expression is shown as the notebook cell's output).
# NOTE(review): none of the other visible cells use TensorFlow.
import tensorflow as tf
print(tf.__version__)
tf.config.list_physical_devices('GPU')

In [None]:
# SSL and warning settings
# Silence the InsecureRequestWarning raised for the verify=False requests made by the collection cells.
warnings.filterwarnings('ignore', category=InsecureRequestWarning)
# NOTE(review): this swaps the stdlib's default HTTPS context factory for the unverified one
# (both are private ssl hooks), i.e. certificate checks are disabled process-wide for code that
# relies on that default. BASE_URL is plain http, so confirm this override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# API settings
# Endpoint queried by the collection loop.
BASE_URL = 'http://apis.data.go.kr/B552845/katSale/trades'
# Service key taken from the DO_API_KEY environment variable (None when it is not set).
API_KEY = os.environ.get('DO_API_KEY')

In [None]:
# Load the wholesale-market code table (cp949-encoded CSV).
# The collection loop unpacks every row as (market code, market name), so the file must have exactly two columns.
# NOTE(review): the first row is consumed as a header here, while the retry cell reads the same
# file with header=None — confirm which of the two matches the actual file.
df_market = pd.read_csv('도매시장_코드.csv', encoding='cp949')

In [None]:
# Item codes: item name -> 4-character code string (the collection loop splits it into two 2-character halves).
# NOTE(review): the original entry carried a bare name tag ("Yonggon"), presumably its owner.
ITEM_CODES = {"배추": "1001"}

# Collection window as YYYY-MM-DD strings, both ends inclusive (adjust the dates once testing is done).
start_date, end_date = '2018-01-03', '2020-12-31'
start_dt = datetime.fromisoformat(start_date)
end_dt = datetime.fromisoformat(end_date)
total_days = (end_dt - start_dt).days + 1  # +1 because the end date itself is collected

# Miscellaneous settings
FAIL_LOG = []    # one dict per failed request, appended by the collection loop
max_retries = 3  # upper bound on attempts per market/date in the collection loop

In [None]:
# Prepare the output directories used by the collection loop (no error if they already exist).
for directory in ("logs", "data"):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Main collection loop.
#
# For every item in ITEM_CODES, every day in [start_dt, end_dt] and every market in df_market,
# page through the trades endpoint and accumulate the returned records. Each item's records are
# saved to one CSV under data/; every market/date that did not finish cleanly, or returned no
# records, is appended to FAIL_LOG and written to data/유통공사_fail_log.csv at the end.

PAGE_SIZE = 100  # rows requested per call (numOfRows); also drives the last-page check


def _extract_items(body):
    """Return the trade records found in a response ``body`` dict as a list.

    Handles ``items`` missing or falsy (no records), ``items.item`` given as a list of
    records, and ``items.item`` given as a single record dict. Anything else yields [].
    """
    container = body.get('items') or {}
    records = container.get('item', []) if isinstance(container, dict) else container
    if isinstance(records, dict):
        return [records]
    return records if isinstance(records, list) else []


def _dump_failure(log_prefix, response, error):
    """Write debugging files for one failed attempt under ``log_prefix``.

    With a response: its body goes to ``<prefix>.<ext>`` (ext picked from the content) and the
    status/message/content type go to ``<prefix>_info.txt``. Without one (``response`` is None,
    e.g. the request itself raised) only the info file is written.
    """
    if response is None:
        with open(f"{log_prefix}_info.txt", "w", encoding="utf-8") as f:
            f.write(f"[응답 없음]\n[오류 메시지] {str(error)}\n")
        return

    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type or response.text.strip().startswith("<"):
        ext = "html"
    elif "application/json" in content_type:
        ext = "json"
    else:
        ext = "txt"

    with open(f"{log_prefix}.{ext}", "w", encoding="utf-8") as f:
        f.write(response.text)

    with open(f"{log_prefix}_info.txt", "w", encoding="utf-8") as f:
        f.write(f"[에러 상태 코드] {response.status_code}\n")
        f.write(f"[오류 메시지] {str(error)}\n")
        f.write(f"[응답 Content-Type] {content_type}\n")


for item_name, code in tqdm(ITEM_CODES.items(), desc="전체 품목 진행"):
    # The item code is sent as two separate filters: first two characters and the remainder.
    LARGE = code[:2]
    MID = code[2:]
    data_list = []

    print(f"\n{item_name} 수집 시작: {start_date} ~ {end_date}")
    print("진행률: ", end='')

    current_dt = start_dt
    count = 0

    while current_dt <= end_dt:
        date_str = current_dt.strftime('%Y-%m-%d')
        for mcode, market_name in df_market.values:
            retry_count = 0
            market_success = False  # at least one record was collected for this market/date
            completed = False       # pagination ran to its end without an error
            fail_reason = None      # reset per market/date so a stale reason is never logged
            page_no = 1             # kept across attempts: a retry resumes at the page that failed

            while retry_count < max_retries and not completed:
                response = None
                try:
                    while True:
                        params = {
                            'serviceKey': API_KEY,
                            'pageNo': page_no,
                            'numOfRows': PAGE_SIZE,
                            'cond[trd_clcln_ymd::EQ]': date_str,
                            'cond[whsl_mrkt_cd::EQ]': mcode,
                            'cond[gds_lclsf_cd::EQ]': LARGE,
                            'cond[gds_mclsf_cd::EQ]': MID
                        }

                        # Cleared first so the handler below never logs an earlier page's response
                        # when the request itself raises (timeout, connection error).
                        response = None
                        response = requests.get(BASE_URL, params=params, verify=False, timeout=10)
                        if response.status_code != 200:
                            # Routed through the except path so HTTP errors are logged,
                            # backed off and retried like every other failure.
                            raise RuntimeError(f"HTTP {response.status_code}: {response.text[:100]}")

                        json_data = response.json()
                        body = json_data.get('response', {}).get('body', {})
                        items = _extract_items(body)
                        total_count = int(body.get('totalCount') or 0)

                        if not items:
                            break

                        data_list.extend(items)
                        market_success = True

                        if page_no * PAGE_SIZE >= total_count:
                            break
                        page_no += 1
                        time.sleep(0.3)

                    completed = True

                except Exception as e:  # broad on purpose: any failure goes through the retry/log path
                    retry_count += 1
                    fail_reason = f"Exception: {str(e)}"
                    print(f"\n실패: {item_name} | {market_name} | {date_str} | 재시도 {retry_count}회 - {fail_reason}")

                    _dump_failure(f"logs/failed_{item_name}_{mcode}_{date_str}_try{retry_count}", response, e)

                    # Linear backoff; pointless once no attempt is left.
                    if retry_count < max_retries:
                        time.sleep(2 * retry_count)

            # Record the market/date when it did not finish (possibly with partial data already
            # collected) or when it finished without returning any record.
            if not completed:
                reason = fail_reason or 'Unknown'
            elif not market_success:
                reason = 'No data returned'
            else:
                reason = None

            if reason is not None:
                FAIL_LOG.append({
                    "item": item_name,
                    "market": market_name,
                    "mcode": mcode,
                    "date": date_str,
                    "reason": reason
                })

        # Progress output: a percentage every 10th day, a dot otherwise.
        count += 1
        if count % 10 == 0:
            percent = int((count / total_days) * 100)
            print(f"{percent}%", end='', flush=True)
        else:
            print('.', end='', flush=True)

        current_dt += timedelta(days=1)
        time.sleep(0.3)

    print(f"\n{item_name} 완료: {len(data_list):,}건")

    if data_list:
        df = pd.DataFrame(data_list)
        filename = f"data/유통공사_도매시장_{item_name}_{start_date.replace('-', '')}-{end_date.replace('-', '')}.csv"
        df.to_csv(filename, encoding='cp949', index=False)
        print(f"저장됨: {filename}")
    else:
        print(f"{item_name}: 수집된 데이터 없음")

if FAIL_LOG:
    df_fail = pd.DataFrame(FAIL_LOG)
    df_fail.to_csv('data/유통공사_fail_log.csv', index=False, encoding='cp949')
    print(f"\n실패 요청 {len(FAIL_LOG)}건 기록됨: data/유통공사_fail_log.csv")
else:
    print("\n모든 수집 성공, 실패 없음!")

In [None]:
import os
import pandas as pd
import requests
import time
from datetime import datetime
from tqdm import tqdm
import ssl
import warnings
import xml.etree.ElementTree as ET
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from dotenv import load_dotenv

# Setup for the retry pass: environment, constants, output folders and input tables.
load_dotenv()
warnings.filterwarnings('ignore', category=InsecureRequestWarning)
# NOTE(review): disables certificate checks for the stdlib's default HTTPS context (private ssl hooks).
ssl._create_default_https_context = ssl._create_unverified_context

API_KEY = os.getenv("DO_API_KEY")
BASE_URL = 'http://apis.data.go.kr/B552845/katSale/trades'
ITEM_CODES = {"배추": "1001"}
max_retries = 2

os.makedirs("logs", exist_ok=True)
os.makedirs("data", exist_ok=True)
os.makedirs("success", exist_ok=True)

# Market table: column 0 = market code (compared as str below), column 1 = market name.
# NOTE(review): read with header=None here but with a header row in the collection cell — confirm.
df_market = pd.read_csv("도매시장_코드.csv", encoding="cp949", header=None)
df_market[0] = df_market[0].astype(str)

# The collection cell writes its fail log to data/유통공사_fail_log.csv. The original path
# (current directory) is still preferred when present; otherwise fall back to that location.
fail_log_csv = "유통공사_fail_log.csv"
if not os.path.exists(fail_log_csv):
    fail_log_csv = "data/유통공사_fail_log.csv"
fail_df = pd.read_csv(fail_log_csv, encoding="cp949")

# Cast the code to str before de-duplicating so equal codes of different types collapse into one.
fail_pairs = fail_df[['mcode', 'date']].astype({'mcode': str}).drop_duplicates()

# Retry pass over the (market code, date) pairs listed in the fail log.
#
# Each pair is paged through again. Collected records are saved to one CSV per item under data/;
# a marker file is written to success/ for every pair that finished (with or without trades) and
# to logs/ for every pair that did not. Only real records go into the CSV: pairs without trades
# are tracked by their "_empty" marker file, no placeholder row is added.

PAGE_SIZE = 100  # rows requested per call (numOfRows); also drives the last-page check


def _detect_api_error(preview):
    """Return a failure reason when the response preview looks like an API error, else None.

    NOTE(review): this is a plain keyword match on the first characters of the body, so a valid
    payload that happens to contain "ERROR" would be classified as a failure as well.
    """
    upper = preview.upper()
    if "LIMITED_" in preview:
        return "API 호출 제한 (LIMITED_ 응답)"
    if "SERVICE ERROR" in preview:
        return "서비스 오류 (SERVICE ERROR 응답)"
    if "ERROR" in upper:
        return "기타 오류 포함 (ERROR 키워드 포함)"
    if "TOO MANY REQUESTS" in upper:
        return "요청 과다로 인한 제한 (Too Many Requests)"
    return None


def _parse_trades(response, content_type):
    """Return ``(records, total_count)`` parsed from a JSON or XML response.

    ``records`` is always a list of dicts; a single JSON record given as a dict is wrapped in a
    list. Raises ValueError when the body is neither JSON nor XML.
    """
    if "application/json" in content_type:
        body = response.json().get("response", {}).get("body", {})
        container = body.get("items") or {}
        records = container.get("item", []) if isinstance(container, dict) else container
        if isinstance(records, dict):
            records = [records]
        elif not isinstance(records, list):
            records = []
        return records, int(body.get("totalCount") or 0)

    if "application/xml" in content_type or response.text.strip().startswith("<"):
        root = ET.fromstring(response.text)
        total_count_el = root.find(".//totalCount")
        total_count = int(total_count_el.text or 0) if total_count_el is not None else 0
        records = [{el.tag: el.text for el in item} for item in root.findall(".//item")]
        return records, total_count

    raise ValueError(f"알 수 없는 응답 형식: {content_type}")


for item_name, code in ITEM_CODES.items():
    # The item code is sent as two separate filters: first two characters and the remainder.
    LARGE = code[:2]
    MID = code[2:]
    data_list = []

    print(f"\n실패 항목 재시도 시작: {item_name}")
    for _, row in tqdm(fail_pairs.iterrows(), total=len(fail_pairs), desc="재시도 진행"):
        mcode = str(row['mcode'])
        date_str = row['date']

        # Skip codes that are not present in the market table.
        market_name_row = df_market[df_market[0] == mcode]
        if market_name_row.empty:
            print(f"시장 코드 {mcode} 누락 - 스킵")
            continue
        market_name = market_name_row.values[0][1]

        retry_count = 0
        market_success = False  # True only once pagination for this pair has finished
        page_no = 1             # kept across attempts: a retry resumes at the failed page,
                                # so pages already collected are never appended twice
        pair_rows = 0           # records collected for this pair

        while retry_count < max_retries and not market_success:
            response = None
            try:
                while True:
                    print(f"요청 시도: {item_name} | 시장코드: {mcode} | 날짜: {date_str} | 페이지: {page_no} | 재시도: {retry_count + 1}")

                    params = {
                        'serviceKey': API_KEY,
                        'pageNo': page_no,
                        'numOfRows': PAGE_SIZE,
                        'cond[trd_clcln_ymd::EQ]': date_str,
                        'cond[whsl_mrkt_cd::EQ]': mcode,
                        'cond[gds_lclsf_cd::EQ]': LARGE,
                        'cond[gds_mclsf_cd::EQ]': MID
                    }

                    # Cleared first so the handler below never logs an earlier response
                    # when the request itself raises.
                    response = None
                    response = requests.get(BASE_URL, params=params, verify=False, timeout=10)
                    content_type = response.headers.get("Content-Type", "")
                    print(f"응답 Content-Type: {content_type}, 응답 길이: {len(response.text)}")

                    time.sleep(2.0)  # fixed pause after every request

                    response_preview = response.text[:500].strip()
                    fail_reason = _detect_api_error(response_preview)

                    if fail_reason:
                        print(f"{fail_reason} - 재시도 대기 중 (2분)")

                        log_prefix = f"logs/retry_failed_{item_name}_{mcode}_{date_str}"
                        with open(f"{log_prefix}.html", "w", encoding="utf-8") as f:
                            f.write(response.text)
                        with open(f"{log_prefix}_info.txt", "w", encoding="utf-8") as f:
                            f.write(f"[오류] {fail_reason}\n{response_preview}")
                        retry_count += 1
                        if retry_count >= max_retries:
                            print(f"최대 재시도 {max_retries}회 초과 - 중단")
                            break
                        time.sleep(120)
                        continue  # same page again

                    items, total_count = _parse_trades(response, content_type)

                    if not items:
                        # Nothing (more) to collect for this pair.
                        market_success = True
                        break

                    data_list.extend(items)
                    pair_rows += len(items)

                    if page_no * PAGE_SIZE >= total_count:
                        print(f"마지막 페이지 도달 (totalCount: {total_count})")
                        market_success = True
                        break

                    if page_no > 10:
                        # Safety stop kept from the original: anything beyond this page is not fetched.
                        print("페이지 10 초과 - 무한 루프 방지를 위해 중단")
                        market_success = True
                        break

                    page_no += 1
                    time.sleep(1.0)

            except Exception as e:  # broad on purpose: any failure goes through the retry/log path
                retry_count += 1
                print(f"예외 발생: {e} (재시도 {retry_count}/{max_retries})")
                fail_log_prefix = f"logs/retry_failed_{item_name}_{mcode}_{date_str}_try{retry_count}"
                if response is not None:
                    with open(f"{fail_log_prefix}.txt", "w", encoding="utf-8") as f:
                        f.write(response.text)
                with open(f"{fail_log_prefix}_info.txt", "w", encoding="utf-8") as f:
                    f.write(f"[예외] {str(e)}\n")
                if retry_count >= max_retries:
                    print(f"최대 재시도 {max_retries}회 초과 - 중단")
                    break
                time.sleep(2 * retry_count)

        # One marker file per pair, written only after the outcome is known.
        if market_success and pair_rows:
            success_log_path = f"success/retry_success_{item_name}_{mcode}_{date_str}.txt"
            with open(success_log_path, "w", encoding="utf-8") as f:
                f.write(f"{datetime.now()} - {item_name} {mcode} {date_str} 데이터 수집 성공\n")
        elif market_success:
            print("거래 데이터 없음 - 성공 처리")
            success_log_path = f"success/retry_success_{item_name}_{mcode}_{date_str}_empty.txt"
            with open(success_log_path, "w", encoding="utf-8") as f:
                f.write(f"{datetime.now()} - {item_name} {mcode} {date_str} 거래 없음 (빈 응답)\n")
        else:
            fail_log_path = f"logs/retry_failed_{item_name}_{mcode}_{date_str}.txt"
            with open(fail_log_path, "w", encoding="utf-8") as f:
                f.write(f"{datetime.now()} - {item_name} {mcode} {date_str} 데이터 수집 실패\n")

    if data_list:
        df = pd.DataFrame(data_list)
        filename = f"data/유통공사_retry_{item_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, encoding='cp949', index=False)
        print(f"저장 완료: {filename}")
    else:
        print(f"{item_name}: 재시도에서도 데이터 없음")

In [None]:
# Format conversion: bring a retry output file into the column layout of the template CSV.

import pandas as pd
import ast

# File paths
retry_success_path = "유통공사_retry_success_20250708_084350.csv"
template_path = "유통공사_도매시장_배추_20180103-20241231.csv"
output_path = "유통공사_retry_success_형식변환_20250708.csv"

# Load the CSV files (cp949 because they contain Korean text)
df_retry_success = pd.read_csv(retry_success_path, encoding='cp949')
df_template = pd.read_csv(template_path, encoding='cp949')

if "0" in df_retry_success.columns:
    # Single-column layout: every cell is either the placeholder 0 or the text of a record dict.
    raw_values = df_retry_success["0"]
    # Compare as text so the placeholder is dropped even when pandas parsed the column as numbers,
    # and skip blank cells, which literal_eval cannot parse.
    is_record = raw_values.notna() & (raw_values.astype(str).str.strip() != "0")
    df_filtered = raw_values[is_record].astype(str).to_frame(name="raw")

    # Turn each record's text back into a Python dict (literal_eval only accepts literals)
    df_parsed = df_filtered["raw"].apply(ast.literal_eval)
    df_data = pd.DataFrame(df_parsed.tolist())
else:
    # Already one column per field: nothing to parse.
    df_data = df_retry_success.copy()

# Match the column order of the template (columns missing from the data come out empty)
df_data = df_data.reindex(columns=df_template.columns)

# Save
df_data.to_csv(output_path, index=False, encoding='cp949')