In [None]:
# ! pip install python-dotenv
# ! pip install opendartreader

### 전체 값 설정

In [24]:
# Open DART endpoint queried by document() to fetch a filing's document archive.
REPORT_URL = 'https://opendart.fss.or.kr/api/document.xml'

# REPORT_PATH holds the downloaded archives and the XML extracted from them;
# REPORT_TEXT_PATH receives the plain text extracted from each report.
REPORT_PATH = '../../data/origin/dart/report/'
REPORT_TEXT_PATH = '../../data/preprocessed/report_text/'

# Per-step error log; {function_name} is filled in by save_error().
ERROR_FILEPATH = '../../data/error/{function_name}_report_error.csv'

# ORIGIN_ENCODING: first encoding tried when reading an extracted report XML.
# MY_ENCODING: encoding used for this project's own CSV and text files.
ORIGIN_ENCODING = 'euc-kr'
MY_ENCODING = 'cp949'

# ORIGIN_REPORT_FILENAME: member name looked up inside a downloaded archive.
# MY_REPORT_FILENAME: on-disk naming scheme used by this project.
ORIGIN_REPORT_FILENAME = '{report_code}.{extension}'
MY_REPORT_FILENAME = '{name}_{report_code}_report.{extension}'
# Inverse of MY_REPORT_FILENAME: recovers name and report_code from a filename.
# NOTE: the name group excludes '_', so a company name containing an
# underscore will not round-trip through this pattern.
REPORT_FILENAME_PATTERN = r'(?P<name>[^_]+)_(?P<report_code>\d+)_report\..*'

In [9]:
import pandas as pd
import os
from typing import *

In [10]:
def get_name_and_report_code(filename: str) -> Tuple[str, str]:
    """Split a project report filename into company name and report code.

    Args:
        filename: A filename following ``MY_REPORT_FILENAME``.

    Returns:
        ``(name, report_code)`` as captured by ``REPORT_FILENAME_PATTERN``.

    Raises:
        ValueError: If ``filename`` does not follow the naming scheme.
    """
    # Imported here because the surrounding cells only import `re` later on.
    import re

    match = re.search(REPORT_FILENAME_PATTERN, filename)
    if match is None:
        # Without this check the failure surfaces as an AttributeError on
        # None, which says nothing about which file was at fault.
        raise ValueError(
            'unexpected report filename: {!r}'.format(filename)
        )
    return match.group('name'), match.group('report_code')

In [28]:
def save_error(error: List, function_name: str):
    """Write the failures collected by one pipeline step to a CSV file.

    Each failure is joined to its company row in ``corp_df`` by company
    name, and the result is written to ``ERROR_FILEPATH`` using
    ``MY_ENCODING``.

    Args:
        error: ``(company_name, exception)`` pairs collected by a step.
        function_name: Step label substituted into ``ERROR_FILEPATH``.
            A function object is tolerated; its ``__name__`` is used.
    """
    # A function object would otherwise be formatted as '<function ...>'.
    label = getattr(function_name, '__name__', function_name)

    failures = pd.DataFrame({
        '회사명': [name for name, _ in error],
        'error': [str(e) for _, e in error],
    })

    # Join on the company name rather than on row position: the errors
    # arrive in collection order while corp_df rows have their own order.
    # A left join keeps failures whose name is not present in corp_df.
    error_df = failures.merge(corp_df, on='회사명', how='left')
    # Company columns first, error message last; headers are preserved.
    error_df = error_df[[*corp_df.columns, 'error']]

    filepath = ERROR_FILEPATH.format(function_name=label)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    error_df.to_csv(filepath, encoding=MY_ENCODING)

### 사업보고서 다운로드

In [3]:
from dotenv import load_dotenv
import os
import OpenDartReader 

# Load environment settings first so the API key lookup below can see them.
load_dotenv()

# API_KEY is None when DART_API_KEY is not set in the environment.
API_KEY = os.getenv('DART_API_KEY')

# Shared client used by download_report() to list filings.
dart = OpenDartReader(API_KEY)

In [22]:
import pandas as pd
# Company master table. Every column is read as object so that stock codes
# such as '004250' keep their leading zeros.
corp_df = pd.read_csv(
    '../../data/preprocessed/mani_corp.csv',
    dtype=object,
    encoding=MY_ENCODING,
)
corp_df

Unnamed: 0,회사명,종목코드,주요제품,상장일,결산월,대표자명,홈페이지,지역,업종대분류,업종중분류,업종소분류
0,탈로스,434190,군용 리튬이온 이차전지,2022-10-24,12월,채재호,http://www.talos.or.kr,경기도,제조업,전기장비 제조업,일차전지 및 축전지 제조업
1,플라즈맵,405000,플라즈마 멸균기 및 표면처리기기,2022-10-21,12월,임유봉,http://plasmapp.com/,대전광역시,제조업,"의료, 정밀, 광학 기기 및 시계 제조업",의료용 기기 제조업
2,샤페론,378800,합성신약 및 항체치료제신약 기술제품 및 기술이전,2022-10-19,12월,"성승용, 이명세",http://shaperon.com,서울특별시,제조업,의료용 물질 및 의약품 제조업,기초 의약 물질 및 생물학적 제제 제조업
3,비스토스,419540,환자 및 태아감시장치,2022-10-18,12월,이후정,http://www.bistos.co.kr/korean,경기도,제조업,"의료, 정밀, 광학 기기 및 시계 제조업",의료용 기기 제조업
4,탑머티리얼,360070,"이차전지 시스템 엔지니어링, 전극소재, 양극소재",2022-10-18,12월,노환진,http://www.topmaterial.co.kr,경기도,제조업,전기장비 제조업,일차전지 및 축전지 제조업
...,...,...,...,...,...,...,...,...,...,...,...
1535,엔피씨,004250,"산업용기프라스틱제품(플라스틱 파렛트,시트 파렛트) 제조,판매",1969-09-08,12월,최병민,http://www.npc.co.kr,경기도,제조업,고무 및 플라스틱제품 제조업,플라스틱 제품 제조업
1536,대한전선,001440,"전력선,통신케이블,적산계기,스텐레스압연제품,광케이블,초고압선,알루미늄 제조,도매/전기공사",1968-12-27,12월,나형균,http://www.taihan.com,경기도,제조업,전기장비 제조업,절연선 및 케이블 제조업
1537,대한제당,001790,"제당,가축용 배합사료,설탕,기능성감미료(자일로올리고당),외식산 제조,도소매/부동산 임대",1968-12-27,12월,강승우,http://www.ts.co.kr,인천광역시,제조업,식료품 제조업,"곡물 가공품, 전분 및 전분제품 제조업"
1538,전방,000950,"면사,면혼방사,마혼방사,화섬사,염색사,자수사,소모사,면직물,면혼방직물,화섬직물,특수...",1968-10-21,12월,조덕현,http://www.chonbang.co.kr,서울특별시,제조업,섬유제품 제조업; 의복 제외,방적 및 가공사 제조업


In [43]:
from typing import *
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def document(code: str, timeout: float = 60) -> bytes:
    """Download the raw document payload of one filing from ``REPORT_URL``.

    Args:
        code: Receipt number of the filing (sent as ``rcept_no``).
        timeout: Seconds to wait for the connection and for each read
            before giving up, so a stalled server cannot hang the run.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Retry failed connection attempts with exponential backoff.
    retry = Retry(connect=5, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    params = {
        'crtfc_key': API_KEY,
        'rcept_no': code,
    }

    # The context manager closes the session's connection pool on exit.
    with Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        res = session.get(REPORT_URL, params=params, timeout=timeout)
        # Do not hand an HTTP error page back as if it were the document.
        res.raise_for_status()
        return res.content

def download_report(name: str, code: str, error: List[Tuple[str, Exception]]) -> bool:
    """Download one company's business report archive into ``REPORT_PATH``.

    Lists the company's filings since 2022-01-01, takes the first one whose
    title contains '사업보고서', and saves its document as a zip file named
    after ``MY_REPORT_FILENAME``.

    Args:
        name: Company name, used in the output filename and the error log.
        code: Company code passed to ``dart.list``.
        error: Output list; ``(name, exception)`` is appended on failure.

    Returns:
        True if the archive was written, False if any step failed.
    """
    try:
        report_list = dart.list(code, start='20220101')

        # na=False keeps rows without a title out of the mask instead of
        # letting NaN break the boolean indexing below.
        mask = report_list['report_nm'].str.contains('사업보고서', na=False)
        matches = report_list[mask]
        if matches.empty:
            # State the reason explicitly instead of an IndexError on [0].
            raise LookupError('no 사업보고서 found since 20220101')

        # First matching row of the listing (presumably the most recent
        # filing; verify against the ordering returned by dart.list).
        report_code = matches['rcept_no'].iloc[0]

        report = document(report_code)

        filepath = os.path.join(
            REPORT_PATH,
            MY_REPORT_FILENAME.format(
                name=name, report_code=report_code, extension='zip'
            )
        )

        with open(filepath, 'wb') as f:
            f.write(report)

        return True

    except Exception as e:
        # Best-effort batch step: record the failure and let the caller
        # continue with the next company.
        error.append((name, e))
        return False

In [11]:
# Download the business report of every company; failures are collected in
# `error` instead of aborting the run.
error = []
for name, code in zip(corp_df['회사명'], corp_df['종목코드']):
    download_report(name, code, error)

# Pass the step name as a string: formatting the function object itself
# would put '<function download_report at 0x...>' into the file name.
save_error(error, 'download_report')

### 보고서 추출 및 이름 변경

In [36]:
import os
import re
import zipfile
from typing import *


    
def extract_report(name: str, report_code: str) -> Tuple[str, str]:
    """Extract the report XML from a downloaded archive into ``REPORT_PATH``.

    Args:
        name: Company name used in the archive's filename.
        report_code: Report code used in the archive's filename and as the
            stem of the XML member inside it.

    Returns:
        ``(name, report_code)``, unchanged.

    Raises:
        zipfile.BadZipFile: If the downloaded file is not a valid zip.
        KeyError: If the archive has no ``<report_code>.xml`` member.
    """
    report_name = MY_REPORT_FILENAME.format(
        name=name, report_code=report_code, extension='zip'
    )
    zip_filename = os.path.join(REPORT_PATH, report_name)
    xml_filename = ORIGIN_REPORT_FILENAME.format(
        report_code=report_code, extension='xml'
    )

    # Close the archive handle even when extraction fails.
    with zipfile.ZipFile(zip_filename) as archive:
        archive.extract(xml_filename, REPORT_PATH)

    return name, report_code

def rename_report(name: str, report_code: str):
    """Rename an extracted report XML to the project's naming scheme.

    Moves ``<report_code>.xml`` to ``<name>_<report_code>_report.xml``
    inside ``REPORT_PATH``.

    Args:
        name: Company name to embed in the new filename.
        report_code: Report code identifying the extracted XML.

    Raises:
        FileNotFoundError: If the extracted XML does not exist.
    """
    origin_report_name = ORIGIN_REPORT_FILENAME.format(
        report_code=report_code, extension='xml'
    )
    my_report_name = MY_REPORT_FILENAME.format(
        name=name, report_code=report_code, extension='xml'
    )
    in_path = os.path.join(REPORT_PATH, origin_report_name)
    out_path = os.path.join(REPORT_PATH, my_report_name)

    # os.replace overwrites an existing target on every platform, so the
    # step can be re-run; os.rename raises on Windows in that case.
    os.replace(in_path, out_path)

In [45]:
from zipfile import BadZipFile

# Every downloaded archive in the report directory.
filenames = [
    entry for entry in os.listdir(REPORT_PATH) if entry.endswith('.zip')
]

# Unzip each archive and give the extracted XML its project filename.
# Invalid archives and archives without the expected XML member are logged
# and skipped.
error2 = []
for filename in filenames:
    try:
        name, report_code = get_name_and_report_code(filename)
        extract_report(name, report_code)
        rename_report(name, report_code)
    except (BadZipFile, KeyError) as e:
        error2.append((name, e))

save_error(error2, 'extract_and_rename')


### 사업보고서 내 내용 추출

In [33]:
import os
import re

from bs4 import BeautifulSoup


def load_report(name: str, report_code: str) -> str:
    """Read an extracted report XML from ``REPORT_PATH`` as text.

    The file is decoded with ``ORIGIN_ENCODING`` first; if that raises a
    ``UnicodeDecodeError`` it is read again as UTF-8.

    Args:
        name: Company name used in the report's filename.
        report_code: Report code used in the report's filename.

    Returns:
        The full file content.
    """
    xml_name = MY_REPORT_FILENAME.format(
        name=name, report_code=report_code,
        extension='xml'
    )
    filepath = os.path.join(REPORT_PATH, xml_name)

    try:
        with open(filepath, 'r', encoding=ORIGIN_ENCODING) as f:
            return f.read()
    except UnicodeDecodeError:
        # Second attempt for files that are not in the primary encoding.
        with open(filepath, 'r', encoding='utf8') as f:
            return f.read()
        

def get_report_text(report: str) -> str:
    """Return the text of the 'II. 사업의 내용' section of a report.

    Finds the ``<title>`` element whose text matches the section heading,
    collects every text node under that title's parent element, and
    collapses all runs of whitespace into single spaces.

    Args:
        report: Full report markup as text.

    Returns:
        The section text on a single line.

    Raises:
        ValueError: If the report has no matching section title.
    """
    soup = BeautifulSoup(report, 'lxml')

    # NOTE: the '.' in the pattern is a regex wildcard, so any single
    # character is accepted between 'II' and the space.
    title = soup.find('title', string=re.compile('II. 사업의 내용'))
    if title is None:
        # Fail with the actual reason instead of an AttributeError on None.
        raise ValueError("section 'II. 사업의 내용' not found in report")

    section = title.parent
    doc_text = ' '.join(section.find_all(string=True))
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', doc_text)

def save_report_text(name: str, report_code: str, report_text: str):
    """Write one report's extracted text to ``REPORT_TEXT_PATH``.

    Args:
        name: Company name used in the output filename.
        report_code: Report code used in the output filename.
        report_text: Text to store; it is encoded with ``MY_ENCODING``.
    """
    txt_name = MY_REPORT_FILENAME.format(
        name=name,
        report_code=report_code,
        extension='txt'
    )
    target = os.path.join(REPORT_TEXT_PATH, txt_name)

    with open(target, 'w', encoding=MY_ENCODING) as out:
        out.write(report_text)


In [35]:
# Every extracted report XML in the report directory.
filenames = [
    filename for filename in os.listdir(REPORT_PATH)
    if filename.endswith('.xml')
]

# Extracted section text keyed by company name.
report_texts = {}

error3 = []

for filename in filenames:
    # Fallback label: if the filename cannot be parsed, the failure is
    # logged under the filename rather than under the company left over
    # from the previous iteration.
    name = filename
    try:
        name, report_code = get_name_and_report_code(filename)
        report = load_report(name, report_code)
        report_text = get_report_text(report)
        report_texts[name] = report_text
        save_report_text(name, report_code, report_text)
    except Exception as e:
        # Best-effort batch step: record the failure and continue.
        error3.append((name, e))

save_error(error3, 'extract_contents')

