In [4]:
import pandas as pd

# Load the preprocessed list of d002 disclosures (2015-01-01 .. 2022-11-11).
# The file is cp949-encoded; dtype=object stops pandas from inferring numeric
# types, so every column is kept as read from the CSV.
report_list = pd.read_csv(
    '../data/preprocessed/20150101-20221111_d002_list.csv',
    encoding='cp949',
    dtype=object,
)
# Display (rows, columns) as the cell output.
report_list.shape

(53597, 6)

In [5]:
# Show column names, non-null counts and dtypes of the loaded list.
report_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53597 entries, 0 to 53596
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   고유번호    53597 non-null  object
 1   회사명     53597 non-null  object
 2   종목코드    53597 non-null  object
 3   공시명     53597 non-null  object
 4   공시번호    53597 non-null  object
 5   공시일     53597 non-null  object
dtypes: object(6)
memory usage: 2.5+ MB


In [6]:
# Preview the first rows to sanity-check the parsed columns.
report_list.head()

Unnamed: 0,고유번호,회사명,종목코드,공시명,공시번호,공시일
0,365387,AJ네트웍스,95570,임원ㆍ주요주주특정증권등소유상황보고서,20220404002789,20220404
1,365387,AJ네트웍스,95570,임원ㆍ주요주주특정증권등소유상황보고서,20220404002755,20220404
2,365387,AJ네트웍스,95570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000248,20200717
3,365387,AJ네트웍스,95570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000224,20200717
4,365387,AJ네트웍스,95570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000193,20200717


### Download data

In [7]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

API_KEY = os.environ.get('DART_API_KEY')

# Fail fast: with a missing key the request parameter would be None, requests
# would silently drop it from the query string, and every download would be
# sent unauthenticated.
if not API_KEY:
    raise RuntimeError(
        'DART_API_KEY is not set; define it in .env or in the environment '
        'before running the download cells.'
    )

In [8]:
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared HTTP session for all downloads, with the same retrying adapter
# mounted for both https:// and http:// URLs.
session = Session()
# connect=5 / backoff_factor=0.5 are passed straight to urllib3's Retry
# (connection-error retry budget and delay factor between attempts).
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)
session.mount('http://', adapter)

# Endpoint queried by download_report with `crtfc_key` and `rcept_no`.
URL = 'https://opendart.fss.or.kr/api/document.xml'

# Directory the downloaded archives are written to and extracted into.
# NOTE: there is no trailing path separator, so build paths with os.path.join.
FILEPATH = '../data/origin/dart/reports/d002/kospi'

# RAW_FILENAME: name of the extracted file that rename_report looks for.
# MY_FILENAME: project naming scheme used for both the .zip and the renamed .xml.
RAW_FILENAME = '{rcept_no}.{extension}'
MY_FILENAME = '{corp_name}_d002_{rcept_no}.{extension}'

In [11]:
def download_report(corp_name:str, rcept_no: str) -> bool:
    """Download one disclosure archive and save it under FILEPATH.

    The file is stored as ``{corp_name}_d002_{rcept_no}.zip`` (MY_FILENAME).

    Args:
        corp_name: Company name, used only to build the output file name.
        rcept_no: Disclosure receipt number sent to the API as ``rcept_no``.

    Returns:
        True when a ZIP payload was received and written to disk.
        False when the server answered with something that is not a ZIP
        archive; nothing is written in that case.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            timeouts, or a non-2xx HTTP status.
        OSError: If the output file cannot be written.
    """
    params = {
        'crtfc_key': API_KEY,
        'rcept_no': rcept_no
    }

    # A timeout keeps a stalled connection from hanging the whole batch.
    res = session.get(URL, params=params, timeout=30)
    res.raise_for_status()

    # Every ZIP archive starts with the bytes "PK". Anything else is
    # presumably an API error message (e.g. invalid key, rate limit, unknown
    # receipt number - confirm against the OpenDART guide) and must not be
    # saved with a .zip extension, or the extraction step will choke on it.
    if not res.content.startswith(b'PK'):
        return False

    os.makedirs(FILEPATH, exist_ok=True)
    filepath = os.path.join(
        FILEPATH,
        MY_FILENAME.format(corp_name=corp_name, rcept_no=rcept_no, extension='zip')
    )
    with open(filepath, 'wb') as f:
        f.write(res.content)

    return True

In [13]:

# Positional slice of report_list handled in this run; adjust to resume a batch.
DOWNLOAD_START, DOWNLOAD_STOP = 999, 15000

for i, report in report_list[DOWNLOAD_START:DOWNLOAD_STOP].iterrows():
    try:
        corp_name = report['회사명']
        rcept_no = report['공시번호']
        # download_report signals a failed download with a falsy return value.
        if not download_report(corp_name, rcept_no):
            print(i, f'no archive saved for rcept_no={rcept_no}')
    except Exception as e:
        # The request URL, and therefore the exception text, contains the API
        # key; redact it so it never lands in the saved notebook output.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, '<REDACTED>')
        # `i` is already the row's index label in report_list, so no offset
        # is added; restart the slice from this value to resume.
        print(i, message)
        break

999 HTTPSConnectionPool(host='opendart.fss.or.kr', port=443): Max retries exceeded with url: /api/document.xml?crtfc_key=<REDACTED>&rcept_no=20210407002345 (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


## Extract data

In [57]:
import os
import re
import zipfile
from typing import *

# Matches file names produced by MY_FILENAME with the 'zip' extension.
# The name group is greedy so company names containing '_' are kept whole, the
# dot is escaped, and the pattern is applied to the entire file name.
MY_PATTERN = r'(.+)_d002_(\d+)\.zip'

def get_name_and_report_code(filename: str) -> Tuple[str, str]:
    """Split an archive file name into company name and report code.

    Args:
        filename: Base name such as ``'<corp_name>_d002_<rcept_no>.zip'``.

    Returns:
        ``(name, report_code)`` as strings.

    Raises:
        ValueError: If ``filename`` does not follow the expected scheme.
    """
    match = re.fullmatch(MY_PATTERN, filename)
    if match is None:
        raise ValueError(f'unexpected archive file name: {filename!r}')
    name, report_code = match.groups()
    return name, report_code
    
def extract_report(name: str, report_code: str)-> Tuple[str, str]:
    """Extract a downloaded report archive into FILEPATH.

    Args:
        name: Company name used in the archive's file name.
        report_code: Receipt number used in the archive's file name.

    Returns:
        The same ``(name, report_code)`` pair, so calls can be chained.

    Raises:
        FileNotFoundError: If the archive does not exist.
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
    """
    # os.path.join inserts the separator that FILEPATH itself does not end with.
    archive_path = os.path.join(
        FILEPATH,
        MY_FILENAME.format(corp_name=name, rcept_no=report_code, extension='zip')
    )
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(FILEPATH)

    return name, report_code

def rename_report(name: str, report_code: str):
    """Rename an extracted report to the project's naming scheme.

    Renames ``{report_code}.xml`` (RAW_FILENAME) inside FILEPATH to
    ``{name}_d002_{report_code}.xml`` (MY_FILENAME).

    Args:
        name: Company name to embed in the new file name.
        report_code: Receipt number identifying the extracted file.

    Raises:
        OSError: If the extracted file is missing or cannot be renamed.
    """
    # os.path.join inserts the separator that FILEPATH itself does not end with.
    source = os.path.join(
        FILEPATH,
        RAW_FILENAME.format(rcept_no=report_code, extension='xml')
    )
    target = os.path.join(
        FILEPATH,
        MY_FILENAME.format(corp_name=name, rcept_no=report_code, extension='xml')
    )
    os.rename(source, target)

In [58]:
# Every downloaded archive currently present in FILEPATH.
filenames = [filename for filename in os.listdir(FILEPATH) if filename.endswith('.zip')]
# (name, report_code) of each archive that failed; both are None when the file
# name itself could not be parsed.
error = []
# (filename, exception) for the same failures, kept so the cause can be inspected.
error_details = []

for filename in filenames:
    # Reset per file so a failure never reports the previous file's values.
    name = report_code = None
    try:
        name, report_code = get_name_and_report_code(filename)
        extract_report(name, report_code)
        rename_report(name, report_code)
    except Exception as e:
        # Best effort: record the failure and carry on with the next archive.
        error.append((name, report_code))
        error_details.append((filename, e))

NameError: name 'bs' is not defined