In [12]:
import yaml
import requests
import pandas as pd


# Read the local settings file; the handle is closed as soon as parsing is done.
with open('.env.yaml') as env_file:
    config = yaml.safe_load(env_file)

# Pull out the two settings used below: the data-lake bucket name and the
# Seoul Open API key, following the nesting of the YAML document.
s3_bucket = config['s3']['buckets']['datalake']['bucket_name']
api_key = config['api']['seoul']['api_key']

In [None]:
# --- Request parameters -------------------------------------------------
data_format = 'json'
target_year = 2024
cgg_cd = 11560  # district code -- 11680: Gangnam, 11560: Yeongdeungpo
cgg_name = '영등포구'

# --- Endpoint -----------------------------------------------------------
openapi_url = 'http://openapi.seoul.go.kr:8088'
data_name = 'tbLnOpendataRtmsV'

# Common URL prefix: <host>/<api key>/<response format>/<dataset name>.
# The per-request path segments are appended to this later.
main_url = (
    f'{openapi_url}/{api_key}'
    f'/{data_format}/{data_name}'
)

In [None]:
# Download every row for each target year, one bounded index range per request.
#
# A successful response carries the dataset name (`data_name`) as its top-level
# key with the records under 'row'. A failed response has only a top-level
# 'RESULT' entry holding 'CODE' and 'MESSAGE'.
target_years = [2020, 2021, 2022, 2023, 2024]
max_pages = 10000  # highest row index requested per year (a row bound, not a page count)
page_size = 1000  # the API serves at most 1,000 rows per call
no_data_code = 'INFO-200'  # RESULT code returned when the requested range is empty
request_timeout = 30  # seconds; without it a stalled connection hangs the cell forever

rows = []
for target_year in target_years:
    print(f"[YEAR] {target_year}")
    for first_page in range(1, max_pages + 1, page_size):
        last_page = min(first_page + page_size - 1, max_pages)
        requested_cnt = last_page - first_page + 1
        print(f"[PAGE RANGE] {first_page} ~ {last_page}")

        sub_url = f'{first_page}/{last_page}/{target_year}/{cgg_cd}/{cgg_name}'
        response = requests.get(f'{main_url}/{sub_url}', timeout=request_timeout)
        response.raise_for_status()
        api_output = response.json()

        if data_name not in api_output:
            # Failure payload: report it, then either stop this year cleanly
            # (the range is simply past the last row) or abort loudly.
            result = api_output.get('RESULT', {})
            print(result.get('MESSAGE'))
            if result.get('CODE') == no_data_code:
                # Happens when the row count is an exact multiple of page_size,
                # or when the year has no rows at all.
                print(f"[END] Total row cnt: {first_page - 1}")
                break
            raise RuntimeError(
                f"Seoul Open API request failed for {sub_url}: {result}"
            )

        result_rows = api_output[data_name]['row']
        rows.extend(result_rows)

        if len(result_rows) < requested_cnt:  # short page: nothing left for this year
            # first_page is 1-based, so the rows before this page number first_page - 1.
            print(f"[END] Total row cnt: {first_page - 1 + len(result_rows)}")
            break
    else:
        # Every range came back full, so rows past max_pages may have been skipped.
        print(f"[WARN] {target_year}: reached max_pages={max_pages}; data may be truncated")

In [None]:
# Lookup table: Korean district (gu) name -> lowercase romanized name.
# Built from (korean, romanized) pairs; insertion order follows the list.
seoul_cgg_mapping = dict([
    ("강남구", "gangnam"),
    ("강동구", "gangdong"),
    ("강북구", "gangbuk"),
    ("강서구", "gangseo"),
    ("관악구", "gwanak"),
    ("광진구", "gwangjin"),
    ("구로구", "guro"),
    ("금천구", "geumcheon"),
    ("노원구", "nowon"),
    ("도봉구", "dobong"),
    ("동대문구", "dongdaemun"),
    ("동작구", "dongjak"),
    ("마포구", "mapo"),
    ("서대문구", "seodaemun"),
    ("서초구", "seocho"),
    ("성동구", "seongdong"),
    ("성북구", "seongbuk"),
    ("송파구", "songpa"),
    ("양천구", "yangcheon"),
    ("영등포구", "yeongdeungpo"),
    ("용산구", "yongsan"),
    ("은평구", "eunpyeong"),
    ("종로구", "jongno"),
    ("중구", "jung"),
    ("중랑구", "jungnang"),
])

In [None]:
# One DataFrame from all collected row dicts (each dict key becomes a column).
df = pd.DataFrame(rows)

# Replace the column labels with their lowercase form.
df = df.set_axis(df.columns.str.lower(), axis='columns')

In [None]:
# Add 'cgg' by looking each 'cgg_nm' value up in seoul_cgg_mapping (names
# missing from the mapping yield None), then rename 'rcpt_yr' to 'year'.
df = (
    df
    .assign(cgg=df['cgg_nm'].map(seoul_cgg_mapping.get))
    .rename(columns={'rcpt_yr': 'year'})
)

In [None]:
# Write the frame to the data-lake bucket as snappy-compressed parquet,
# partitioned by the 'year' and 'cgg' columns.
# NOTE(review): re-running this cell may add files next to the ones already
# under this prefix rather than replacing them -- confirm before re-running.
df.to_parquet(
    f's3://{s3_bucket}/real_estate/real_estate_sales_seoul',
    partition_cols=['year', 'cgg'],
    compression='snappy',
    engine='pyarrow',
)

### 성공 시:
{'tbLnOpendataRtmsV': {'list_total_count': 772,
  'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'},
  'row': [{}]
}}

### 실패 시:
{'RESULT': {'CODE': 'INFO-200', 'MESSAGE': '해당하는 데이터가 없습니다.'}}

### 제한:
1. 한 번 호출 시 최대 1,000건까지만 조회 가능