###  OPEN DART API 활용

In [2]:
from pprint import pprint
import os
import zipfile
import io
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import json
from lxml import etree

In [4]:
# API KEY 불러오기
with open('./api_keys.json', 'r') as file:
    data = json.load(file)
dart_key = data[0].get('opendart_api_key')

In [None]:
dart_key

In [6]:
# CORPCODE.XML 다운로드
PATH = f"https://opendart.fss.or.kr/api/corpCode.xml?crtfc_key={dart_key}"
response = requests.get(PATH)
if response.status_code == 200:
    zipfile.ZipFile(io.BytesIO(response.content)).extractall('./')


In [7]:
# CORPCODE.xml을 기반으로 한 pd.DataFrame 만들기
tree = ET.parse('./CORPCODE.xml')
root = tree.getroot()
corp_items = []
for corp_data in root.iter('list'):
    item = {
        "corp_code": (corp_data.findtext('corp_code') or '').strip(),
        "corp_name": (corp_data.findtext('corp_name') or '').strip(),
        "corp_eng_name": (corp_data.findtext('corp_eng_name') or '').strip(),
        "stock_code": (corp_data.findtext('stock_code') or '').strip(),
        "modify_date": (corp_data.findtext('modify_date') or '').strip(),
    }
    corp_items.append(item)

df = pd.DataFrame(corp_items)
df

Unnamed: 0,corp_code,corp_name,corp_eng_name,stock_code,modify_date
0,00434003,다코,Daco corporation,,20170630
1,00430964,굿앤엘에스,"Good & LS Co.,Ltd.",,20170630
2,00388953,크레디피아제이십오차유동화전문회사,Credipia 25th Asset Securitization Specialty L...,,20170630
3,00179984,연방건설산업,youn bao,,20170630
4,00420143,브룩스피알아이오토메이션잉크,"BROOKS-PRI Automation, Inc.",,20170630
...,...,...,...,...,...
113115,01913127,디지탈테크,"digitalltek co.,ltd.",,20250423
113116,01913491,피유란,"PYURAN Co., Ltd.",,20250423
113117,01947212,하나모터제일차,Hana Mortor 1st Corp.,,20250820
113118,00579342,영광,"YOUNGKWANG Co., Ltd",,20250820


### 활용할 API 내 보고서 양식 목록
1. 정기 보고서
2. 분기 보고서


In [8]:
BASE_URL = "https://opendart.fss.or.kr/api"

In [9]:
# 공시서류원본파일 확인용
url = BASE_URL+'/document.xml'
payload = {'crtfc_key': dart_key, 'rcept_no':20190401004781}
response = requests.get(url, params=payload)
response.status_code

200

In [10]:
zipfile.ZipFile(io.BytesIO(response.content)).extractall('./corp_reports/')

In [11]:
def fetch_documents(api_key, rcp_no):
    url = BASE_URL+'/document.xml'
    params = {'crtfc_key': api_key, 'rcept_no':rcp_no}
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print('Bad Response')
        raise Exception(f'API 요청 실패: 상태코드 {response.status_code}')
    else:
        return response.content
    
def extract_report(zip_content, rcp_no):
    try:
        with zipfile.ZipFile(io.BytesIO(zip_content)) as zf:
            audit_fnames = [info.filename for info in zf.infolist() if f"{rcp_no}_00760.xml" in info.filename]
            if not audit_fnames:
                raise ValueError("감사보고서 파일을 찾을 수 없습니다.")
            xml_data = zf.read(audit_fnames[0])
            return xml_data
    except  zipfile.BadZipFile:
        raise ValueError("ZIP파일이 손상되었거나 유효하지 않습니다.")
    
def parse_html_from_xml(xml_data):
    try:
        xml_text = xml_data.decode('euc-kr')
    except UnicodeDecodeError:
        xml_text = xml_data.decode('utf-8')
    parser = etree.HTMLParser()
    root = etree.fromstring(xml_text.encode('utf-8'), parser)
    return root

api_key = dart_key
rcp_no = '20190401004781'

zip_content = fetch_documents(api_key, rcp_no)
xml_data = extract_report(zip_content, rcp_no)
root = parse_html_from_xml(xml_data)


In [None]:
root[0]

'body'

### 사업보고서 주요정보 추출

In [13]:
# 

In [28]:
import os, io, re, zipfile, datetime as dt
from typing import Optional, Tuple, List, Dict

import requests
import pandas as pd
from lxml import etree, html  # 선택: 대표 파일 텍스트까지 보고 싶을 때

BASE = "https://opendart.fss.or.kr/api"

# ---- 유틸 ---------------------------------------------------
def _decode_kr(data: bytes) -> str:
    for enc in ("cp949", "euc-kr", "utf-8"):
        try: return data.decode(enc)
        except UnicodeDecodeError: continue
    return data.decode("utf-8", errors="ignore")

def _is_business_report(report_nm: str) -> bool:
    name = str(report_nm)
    return ("사업보고서" in name) and ("정정" not in name)

def get_corp_code_from_df(corp_df: pd.DataFrame, corp_name: str) -> Optional[str]:
    hit = corp_df.loc[corp_df["corp_name"] == corp_name, "corp_code"]
    return None if hit.empty else str(hit.iloc[0])

# ---- 1) 최신 사업보고서 rcept_no 찾기 ------------------------
def find_latest_business_report_rcept_no(api_key: str, corp_code: str,
                                         days_back: int = 900) -> Tuple[str, Dict]:
    url = f"{BASE}/list.json"
    end_de = dt.date.today().strftime("%Y%m%d")
    bgn_de = (dt.date.today() - dt.timedelta(days=days_back)).strftime("%Y%m%d")
    page_no, page_count = 1, 100
    best = None

    while True:
        params = {
            "crtfc_key": api_key, "corp_code": corp_code,
            "bgn_de": bgn_de, "end_de": end_de,
            "page_no": page_no, "page_count": page_count
        }
        r = requests.get(url, params=params, timeout=60)
        r.raise_for_status()
        js = r.json()
        if js.get("status") != "000": break

        items = [it for it in js.get("list", []) if _is_business_report(it.get("report_nm",""))]
        for it in items:
            if (best is None) or (it["rcept_dt"] > best["rcept_dt"]):
                best = it

        if len(js.get("list", [])) < page_count: break
        page_no += 1

    if not best:
        raise ValueError("최근 기간 내 사업보고서가 없습니다.")
    return best["rcept_no"], best

# ---- 2) 원문 ZIP(document.xml) 받기 --------------------------
def download_document_zip(api_key: str, rcept_no: str) -> bytes:
    url = f"{BASE}/document.xml"
    params = {"crtfc_key": api_key, "rcept_no": rcept_no}
    r = requests.get(url, params=params, timeout=90)
    r.raise_for_status()
    return r.content

def list_zip_entries(zip_bytes: bytes) -> List[str]:
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        return [i.filename for i in zf.infolist()]

def choose_main_doc_name(names: List[str], rcept_no: str) -> Optional[str]:
    """대표 본문 선택(경험칙): rcept_no로 시작 & (html|xml) → ‘사업보고서/본문’ 키워드 → 그 외 html/xml/pdf/hwp"""
    pri_ext = ["html", "htm", "xml", "pdf", "hwp"]
    def ext_rank(n: str) -> int:
        m = re.search(r"\.([a-z0-9]+)$", n.lower()); ext = (m.group(1) if m else "").lower()
        return pri_ext.index(ext) if ext in pri_ext else 999

    cands = [n for n in names if n.lower().startswith(rcept_no.lower()) and re.search(r"\.(html?|xml)$", n, re.I)]
    if not cands:
        cands = [n for n in names if re.search(r"(사업보고서|본문)", n) and re.search(r"\.(html?|xml)$", n, re.I)]
    if not cands:
        cands = [n for n in names if re.search(r"\.(html?|xml|pdf|hwp)$", n, re.I)]
    return None if not cands else sorted(cands, key=ext_rank)[0]

def extract_entry(zip_bytes: bytes, entry_name: str) -> bytes:
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        return zf.read(entry_name)

def parse_xml_or_html(data: bytes):
    txt = _decode_kr(data)
    try:
        root = etree.fromstring(txt.encode("utf-8"), etree.XMLParser(recover=True))
        return "xml", root, txt
    except etree.XMLSyntaxError:
        root = html.fromstring(txt)
        return "html", root, txt

# ---- 3) 엔드투엔드 (corp_df 이용) ----------------------------
def fetch_latest_business_report_with_df(api_key: str, corp_df: pd.DataFrame, corp_name: str,
                                         save_zip_path: Optional[str] = None):
    corp_code = get_corp_code_from_df(corp_df, corp_name)
    if not corp_code:
        raise ValueError(f"corp_df에서 기업명을 찾을 수 없습니다: {corp_name}")

    rcept_no, meta = find_latest_business_report_rcept_no(api_key, corp_code)
    zip_bytes = download_document_zip(api_key, rcept_no)
    if save_zip_path:
        with open(save_zip_path, "wb") as f:
            f.write(zip_bytes)

    names = list_zip_entries(zip_bytes)
    main_name = choose_main_doc_name(names, rcept_no)
    main_kind = main_text_sample = None
    if main_name:
        data = extract_entry(zip_bytes, main_name)
        main_kind, _, main_text = parse_xml_or_html(data)
        main_text_sample = main_text[:1000]

    return {
        "corp_name": corp_name,
        "corp_code": corp_code,
        "rcept_no": rcept_no,
        "report_nm": meta.get("report_nm"),
        "rcept_dt": meta.get("rcept_dt"),
        "zip_entries": names,
        "main_entry": main_name,          # 대표 본문 추정 파일명
        "main_kind": main_kind,           # 'xml' or 'html' (없을 수도 있음)
        "main_text_sample": main_text_sample,
    }

# ---- 사용 예 -------------------------------------------------
# corp_df = ...  # 이미 보유한 CORPCODE.xml 파싱 DF (컬럼: corp_code, corp_name, ...)
# DART_API_KEY = os.environ["DART_API_KEY"]
# info = fetch_latest_business_report_with_df(DART_API_KEY, corp_df, "삼성전자",
#                                             save_zip_path="삼성전자_최신_사업보고서.zip")
# print(info)


In [29]:
corp_df = df
DART_API_KEY = dart_key
info = fetch_latest_business_report_with_df(DART_API_KEY, corp_df, "삼성전자",
                                            save_zip_path="삼성전자_최신_사업보고서.zip")
print(info)

{'corp_name': '삼성전자', 'corp_code': '00126380', 'rcept_no': '20250311001085', 'report_nm': '사업보고서 (2024.12)', 'rcept_dt': '20250311', 'zip_entries': ['20250311001085_00761.xml', '20250311001085_00760.xml', '20250311001085.xml'], 'main_entry': '20250311001085_00761.xml', 'main_kind': 'xml', 'main_text_sample': '<?xml version="1.0" encoding="utf-8"?>\r\n\r\n\r\n<DOCUMENT xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="dart4.xsd">\n\n<DOCUMENT-NAME ACODE="00761">연결감사보고서</DOCUMENT-NAME>\n<FORMULA-VERSION ADATE="20230116">6.0</FORMULA-VERSION>\n<COMPANY-NAME AREGCIK="00126380" AACCOUNTTYPE="A">삼성전자(주)</COMPANY-NAME>\n\n<SUMMARY>\n<EXTRACTION ACODE="AUDIT_CIK" AFEATURE="BOTH">00260116</EXTRACTION>\n<EXTRACTION ACODE="SUPV_OPIN" AFEATURE="BOTH">100000000000</EXTRACTION>\n<EXTRACTION ACODE="TOT_ASSETS" AFEATURE="BOTH">514531948</EXTRACTION>\n<EXTRACTION ACODE="TOT_DEBTS" AFEATURE="BOTH">112339878</EXTRACTION>\n<EXTRACTION ACODE="TOT_SALES" AFEATURE="BOTH">30