사업보고서 모든 내용을 Markdown 파일로 만드는 스크립트

In [1]:
import os
import re
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import markdownify
import OpenDartReader
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Read the OpenDART API key from the process environment. A missing variable
# raises KeyError here; load_dotenv() is called earlier in the notebook,
# presumably so the key can be supplied through a local .env file.
api_key = os.environ["OPENDART_API_KEY"]
# Shared client object; the cells below call dart.list() and dart.sub_docs().
dart = OpenDartReader(api_key=api_key)

In [3]:
def get_webpage_content(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of converting an error page into report content.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def modify_html_tags(soup, selector, tag_name):
    """Rename every element matched by a CSS selector, in place.

    Each matching element gets ``tag_name`` as its new tag name, and its
    content is replaced with the element's own plain text (as returned by
    ``get_text()``).

    Args:
        soup: Parsed document exposing ``select()``.
        selector: CSS selector identifying the elements to rename.
        tag_name: New tag name to assign, e.g. ``"h1"``.

    Returns:
        The same ``soup`` object that was passed in, after modification.
    """
    for element in soup.select(selector):
        plain_text = element.get_text()
        element.name = tag_name
        element.string = plain_text
    return soup


def save_webpage_as_markdown(url):
    """Fetch *url* and return its content converted to Markdown text.

    Despite the name, nothing is written to disk here: the Markdown string
    is returned and the caller decides where to store it.

    Args:
        url: Address of the page to convert.

    Returns:
        The page rendered as Markdown with ATX-style (``#``) headings.
    """
    soup = get_webpage_content(url)

    # Rename styled paragraphs (picked by CSS selector) to heading tags so
    # that they are emitted as Markdown headings.
    soup = modify_html_tags(soup, "p.cover-title", "h1")
    soup = modify_html_tags(soup, "p.section-1", "h1")
    soup = modify_html_tags(soup, "p.section-2", "h2")

    # soup.body is None when the document has no <body> element; str(None)
    # would then put the literal text "None" into the output, so fall back
    # to converting the whole parsed tree.
    root = soup if soup.body is None else soup.body
    return markdownify.markdownify(str(root), heading_style="ATX")


def make_markdown_report(report_row):
    """Write all selected pages of one filing into a single Markdown file.

    The filing's sub-documents are listed via ``dart.sub_docs``; each
    remaining page is downloaded, converted to Markdown, and the results are
    concatenated in listing order into
    ``.files/__사업보고서/<file_name>``.

    Args:
        report_row: Row-like object providing ``"rcept_no"`` (the filing's
            receipt number) and ``"file_name"`` (the output file name).
    """
    rcept_no = report_row["rcept_no"]
    file_name = report_row["file_name"]
    file_name = f".files/__사업보고서/{file_name}"

    df_pages = dart.sub_docs(rcept_no)
    # Skip pages whose title starts with a digit or with "【".
    # NOTE(review): presumably these are sub-sections / attachments whose
    # text is already covered by a parent page -- confirm.
    df_pages = df_pages[~df_pages["title"].str.match(r"^\d")]
    df_pages = df_pages[~df_pages["title"].str.startswith("【")]

    parts = []
    for url in df_pages["url"].tolist():
        # Pause before every request to avoid hitting the server too fast.
        time.sleep(0.3)
        parts.append(save_webpage_as_markdown(url))

    # Make the function usable on its own: ensure the target directory exists.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent the Korean text on some systems.
    with open(file_name, "w", encoding="utf-8") as f:
        f.write("".join(parts))


def create_folder(path):
    """Create directory *path*, including missing parents, if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok=True avoids the race between checking for the directory and
    # creating it that a separate os.path.exists() test would have.
    os.makedirs(path, exist_ok=True)

In [4]:
# Stock code of the company whose reports are exported.
ticker = "217190"
# NOTE(review): kind="A" presumably restricts the listing to periodic
# disclosures -- confirm against the OpenDartReader documentation.
df_reports = dart.list(ticker, kind="A")
# Keep only business reports ("사업보고서"). .copy() makes the filtered frame
# independent so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df_reports = df_reports[df_reports["report_nm"].str.contains("사업보고서")].copy()

# Pull the four-digit year out of a "(YYYY.MM)" suffix in the report name.
# expand=False returns a Series rather than a one-column DataFrame.
df_reports["year"] = df_reports["report_nm"].str.extract(
    r"\((\d{4})\.\d{2}\)", expand=False
)
# Rows without a recognizable year would yield a NaN file name (written to
# disk as "nan"), so leave them out.
df_reports = df_reports.dropna(subset=["year"])
df_reports = df_reports[["year", "corp_name", "stock_code", "rcept_no"]].copy()
df_reports["file_name"] = (
    df_reports["stock_code"]
    + "_"
    + df_reports["corp_name"]
    + "_사업보고서_"
    + df_reports["year"]
    + ".md"
)

create_folder(".files/__사업보고서")

# Export each report in turn and print its file name once it is written.
for _, row in df_reports.iterrows():
    make_markdown_report(row)
    print(f"{row['file_name']} saved")

217190_제너셈_사업보고서_2023.md saved
217190_제너셈_사업보고서_2022.md saved
217190_제너셈_사업보고서_2021.md saved
217190_제너셈_사업보고서_2020.md saved
217190_제너셈_사업보고서_2019.md saved
217190_제너셈_사업보고서_2018.md saved
217190_제너셈_사업보고서_2017.md saved
217190_제너셈_사업보고서_2016.md saved
217190_제너셈_사업보고서_2015.md saved
