## 고유번호 목록 생성

In [1]:
import pandas as pd

# KRX listing export (cp949-encoded). The ticker column is pinned to str so
# zero-padded codes such as '095570' are never re-inferred as integers, which
# would strip the padding and break the string-keyed merge with DART stock codes.
corp_df = pd.read_csv('../data/origin/krx/data_3825_20221112.csv',
                 encoding='cp949',
                 dtype={'종목코드': str})
corp_df

Unnamed: 0,종목코드,종목명,종가,대비,등락률,시가,고가,저가,거래량,거래대금,시가총액,상장주식수
0,095570,AJ네트웍스,6260,70,1.13,6220,6400,6220,62587,393191300,293107566700,46822295
1,006840,AK홀딩스,14700,700,5.00,14250,14800,14100,29049,419615200,194739146700,13247561
2,027410,BGF,3515,70,2.03,3510,3590,3475,137111,481378470,336444520365,95716791
3,282330,BGF리테일,177500,-500,-0.28,180000,180000,176500,50762,9029550000,3067893315000,17283906
4,138930,BNK금융지주,6690,70,1.06,6750,6760,6650,1155689,7735615500,2180506795740,325935246
...,...,...,...,...,...,...,...,...,...,...,...,...
936,005010,휴스틸,6120,450,7.94,5840,6600,5680,7787722,48641893930,239831019000,39188075
937,000540,흥국화재,2840,5,0.18,2895,2895,2830,62758,178524150,182449111800,64242645
938,000547,흥국화재2우B,20000,400,2.04,19800,20150,19750,249,4934600,3072000000,153600
939,000545,흥국화재우,6150,40,0.65,6230,6230,6150,3249,19647990,4723200000,768000


In [3]:
# DART corp-code registry. Every field is read as object dtype, which keeps
# zero-padded identifiers (corp_code, stock_code) as text.
code_df = pd.read_xml(
    '../data/preprocessed/CORPCODE.xml',
    dtype=object,
)
code_df

Unnamed: 0,corp_code,corp_name,stock_code,modify_date
0,00434003,다코,,20170630
1,00434456,일산약품,,20170630
2,00430964,굿앤엘에스,,20170630
3,00432403,한라판지,,20170630
4,00388953,크레디피아제이십오차유동화전문회사,,20170630
...,...,...,...,...
96369,01695133,리뉴메디칼,,20221024
96370,00263238,케이피텍,,20221025
96371,00154152,태준제약,,20221025
96372,01305434,포커스미디어코리아,,20221025


In [20]:
# Look up each KRX ticker's DART corp_code, discard tickers with no DART match,
# then relabel and keep only the four identifying columns.
df = (
    corp_df
    .merge(code_df, how='left', left_on='종목코드', right_on='stock_code')
    .dropna(subset=['corp_code'])
    .rename(columns={'corp_code': '고유번호', '상장주식수': '마지막 상장주식수'})
    .loc[:, ['종목명', '종목코드', '고유번호', '마지막 상장주식수']]
)

df

Unnamed: 0,종목명,종목코드,고유번호,마지막 상장주식수
0,AJ네트웍스,095570,00365387,46822295
1,AK홀딩스,006840,00125080,13247561
2,BGF,027410,00219097,95716791
3,BGF리테일,282330,01263022,17283906
4,BNK금융지주,138930,00858364,325935246
...,...,...,...,...
934,휴니드,005870,00111421,14116015
935,휴비스,079980,00362238,34500000
936,휴스틸,005010,00156488,39188075
937,흥국화재,000540,00103176,64242645


In [21]:
# Persist the ticker -> corp_code list (cp949-encoded, without the index column).
df.to_csv(
    '../data/preprocessed/20221111_corp_list.csv',
    index=False,
    encoding='cp949',
)

## 20150101~20221111 공시 데이터 수집

In [22]:
import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast when the key is absent: requests omits query parameters whose value
# is None, so a missing key would otherwise only surface later as opaque API
# errors or an unexpectedly empty result set.
API_KEY = os.environ.get('DART_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "DART_API_KEY is not set; export it or add it to the .env file before running this notebook"
    )

In [23]:
from typing import *
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Shared HTTP session: one adapter, mounted for both schemes, that retries
# connection-level failures up to 5 times with a 0.5 backoff factor.
session = Session()
adapter = HTTPAdapter(max_retries=Retry(connect=5, backoff_factor=0.5))
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [24]:
# OpenDART disclosure-search endpoint.
URL = 'https://opendart.fss.or.kr/api/list.json'

# Query parameters shared by every request; the per-company fields
# (corp_code, page_no, page_count) are merged in by the fetch functions.
PRAMS = {
    'crtfc_key': API_KEY,
    'bgn_de' : '20150101',
    # End of the collection window. Kept equal to the 20150101~20221111 range
    # named in the section title and in the output CSV filename so that a
    # re-run reproduces the same window instead of drifting to a later date.
    'end_de' : '20221111',
    # Detail type D002 alone selects the report kind; the broader
    # 'pblntf_ty' filter is intentionally not sent.
    'pblntf_detail_ty': 'D002'
}

In [41]:
def get_report_list_total_page(corp_code:str) -> int:
    """Return how many 100-row pages of disclosures exist for one company.

    Sends a single probe request (page 1, one row) built from the shared
    ``PRAMS`` and reads ``total_count`` from the JSON reply.

    Args:
        corp_code: DART corp_code of the company to query.

    Returns:
        ``ceil(total_count / 100)``, or 0 when the reply carries no usable
        ``total_count`` (for example when the API reports no data).

    Warns:
        RuntimeWarning: when the reply has a ``status`` other than '000'
            (success) or '013' (no data, per the OpenDART guide). The function
            still returns 0 in that case, but the failure is no longer silent.
    """
    import warnings

    page_size = 100  # rows per page used when the full list is fetched

    # page_count=1 keeps the probe response small; only total_count is needed.
    params = dict(PRAMS, corp_code=corp_code, page_no=1, page_count=1)

    res = session.get(URL, params=params)
    payload = res.json()

    status = payload.get('status') if isinstance(payload, dict) else None
    if status is not None and str(status) not in ('000', '013'):
        # Bad key, rate limit, maintenance, etc. would otherwise be
        # indistinguishable from "this company has no reports".
        warnings.warn(
            f"DART list.json returned status {status!r} for corp_code "
            f"{corp_code}: {payload.get('message')}",
            RuntimeWarning,
            stacklevel=2,
        )

    try:
        total_count = int(payload['total_count'])
    except (KeyError, TypeError, ValueError):
        # No usable count in the reply: treat as zero pages.
        return 0

    # Ceiling division without floats.
    return -(-total_count // page_size)

In [42]:
from typing import *

def get_report_list(corp_code:str, total_page:int) -> List:
    """Fetch pages 1..total_page of a company's disclosures and flatten them.

    Each page is requested with the shared ``PRAMS`` plus ``corp_code``,
    ``page_no`` and a page size of 100; the ``'list'`` entry of every JSON
    reply is concatenated in page order.

    Args:
        corp_code: DART corp_code of the company to query.
        total_page: Number of pages to request; 0 yields an empty list.

    Returns:
        A single list holding the items of every fetched page.
    """
    # Lazy generator: each page is requested only when the flattening below
    # reaches it, so requests are still issued one at a time in page order.
    page_items = (
        session.get(
            URL,
            params={
                **PRAMS,
                'corp_code': corp_code,
                'page_no': page_no,
                'page_count': 100,
            },
        ).json()['list']
        for page_no in range(1, total_page + 1)
    )

    return [item for items in page_items for item in items]

In [43]:
# For every company, ask how many pages exist and then pull all of them,
# accumulating the raw disclosure records into one flat list.
report_list = []
for corp_code in df['고유번호']:
    report_list.extend(
        get_report_list(corp_code, get_report_list_total_page(corp_code))
    )

len(report_list)

53597

In [47]:
# Tabulate the raw records, relabel the API field names with Korean column
# headers, and keep only those six columns in a fixed order.
report_list_df = (
    pd.DataFrame(report_list)
    .rename(columns={
        'corp_code': '고유번호',
        'corp_name': '회사명',
        'stock_code': '종목코드',
        'report_nm': '공시명',
        'rcept_no': '공시번호',
        'rcept_dt': '공시일',
    })
    .loc[:, ['고유번호', '회사명', '종목코드', '공시명', '공시번호', '공시일']]
)
report_list_df

Unnamed: 0,고유번호,회사명,종목코드,공시명,공시번호,공시일
0,00365387,AJ네트웍스,095570,임원ㆍ주요주주특정증권등소유상황보고서,20220404002789,20220404
1,00365387,AJ네트웍스,095570,임원ㆍ주요주주특정증권등소유상황보고서,20220404002755,20220404
2,00365387,AJ네트웍스,095570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000248,20200717
3,00365387,AJ네트웍스,095570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000224,20200717
4,00365387,AJ네트웍스,095570,임원ㆍ주요주주특정증권등소유상황보고서,20200717000193,20200717
...,...,...,...,...,...,...
53592,00167208,흥아해운,003280,임원ㆍ주요주주특정증권등소유상황보고서,20160504001190,20160504
53593,00167208,흥아해운,003280,임원ㆍ주요주주특정증권등소유상황보고서,20160504001175,20160504
53594,00167208,흥아해운,003280,임원ㆍ주요주주특정증권등소유상황보고서,20160504001165,20160504
53595,00167208,흥아해운,003280,임원ㆍ주요주주특정증권등소유상황보고서,20160504001144,20160504


In [48]:
# Number of distinct company names among the collected disclosures
# (dropna=False so a missing name would still count as one value).
report_list_df['회사명'].nunique(dropna=False)

803

In [49]:
# Persist the collected D002 disclosure list (cp949-encoded, without the index column).
report_list_df.to_csv(
    '../data/preprocessed/20150101-20221111_d002_list.csv',
    encoding='cp949',
    index=False,
)