## Import

In [1]:
from tqdm import tqdm

import streamlit as st
import pandas as pd
import numpy as np
import OpenDartReader
import warnings
import dart_fss
import time
import re, os

warnings.filterwarnings('ignore')

In [20]:
f = open("requirements.txt", 'w')
f.write('streamlit==1.20.0\n')
f.write("streamlit_option_menu==0.3.2\n")
f.write("pandas==1.3.4\n")
f.write("numpy==1.21.6\n")
f.write("OpenDartReader==0.2.1\n")
f.write("dart_fss==0.4.2\n")
f.close()

In [6]:
data_path = os.getcwd() + "\\datasets\\"

if not os.path.isdir(data_path):
    os.mkdir(data_path)
    
# load data
def read_xlsx(name):
    import xlwings as xw
    instance = xw.App(visible=False)
    xlsx_data = xw.Book(name).sheets[0]
    df = xlsx_data.range('A1').options(pd.DataFrame, index = False, expand = 'table').value
    instance.quit()
    instance.kill()
    return df

from glob import glob

for file in glob(data_path + "*.xlsx"):
    temp_df = read_xlsx(file)
    temp_df.to_csv(data_path + file.split("\\")[-1].replace(".xlsx", "") + "-사업보고서.csv", index = False, encoding = "CP949")

* functions

In [2]:
def get_data(dart, code, year = 2022, quarter = "사업보고서"):
    select_cols = ['corp_cls', 'corp_code', 'corp_name', 'inv_prm', 'frst_acqs_de', 'invstmnt_purps', 'frst_acqs_amount', 'trmend_blce_qy', 'trmend_blce_qota_rt', 'trmend_blce_acntbk_amount']
    change_cols = ['법인구분', '고유번호', '회사명', '법인명', '최초취득일자', '출자목적', '최초취득금액', '기말잔액수량', '기말잔액지분율', '기말잔액장부가액']
    change_cls = {"Y":"유가", "K":"코스닥", "N":"코넥스", "E":"기타"}
    
    change_dict = {"1분기보고서": 11013, "반기보고서": 11012, "3분기보고서": 11014, "사업보고서":11011}
    r_code = change_dict[quarter]
    
    invst_df = dart.report(code, '타법인출자', year, r_code)
    
    if invst_df.shape[0] == 0:
        return invst_df
    else:
        invst_df = invst_df.loc[:, select_cols]
        invst_df.corp_cls = invst_df.corp_cls.map(change_cls)
        invst_df.columns = change_cols
    
        return invst_df

* API Setting

In [3]:
api_key = '1b39652cef07f626c9d37375edf582ee51b1407f'
dart = OpenDartReader(api_key)
dart_fss.set_api_key(api_key=api_key)

'1b39652cef07f626c9d37375edf582ee51b1407f'

* get total corp list

In [4]:
corp_dict = dart_fss.api.filings.get_corp_code()
corp_df = pd.DataFrame(corp_dict)
corp_df = corp_df.loc[corp_df.stock_code.notnull()]
corp_df.index = [x for x in range(corp_df.shape[0])]

Output()

In [5]:
corp_df

Unnamed: 0,corp_code,corp_name,stock_code,modify_date
0,00260985,한빛네트,036720,20170630
1,00264529,엔플렉스,040130,20170630
2,00358545,동서정보기술,055000,20170630
3,00231567,애드모바일,032600,20170630
4,00247939,씨모스,037600,20170630
...,...,...,...,...
3563,00413417,우리손에프앤지,073560,20230403
3564,00440712,어반리튬,073570,20230403
3565,00483735,해성옵틱스,076610,20230403
3566,00516246,알에프세미,096610,20230403


In [8]:
year_list = [x for x in range(2015, 2023)]; year_list
#year_list = [x for x in range(2018, 2023)]; year_list

[2018, 2019, 2020, 2021, 2022]

In [9]:
change_dict = {"1분기보고서": 11013, "반기보고서": 11012, "3분기보고서": 11014, "사업보고서": 11011}

* 전체 연도

In [None]:
r_code = '사업보고서'

In [None]:
cnt = 0
t_cnt = 0
pass_list = []

for year in year_list:
    start_time = time.time()

    for code in corp_df.corp_code.unique():
        try:
            temp_df = get_data(dart, code, year, r_code)

            if temp_df.shape[0] == 0:
                pass_list.append(code)
                time.sleep(0.6)
                t_cnt += 1
                continue

            elif (temp_df.shape[0] != 0) & (cnt == 0):
                output_df = get_data(dart, code, year, r_code)
            else:
                output_df = output_df.append(get_data(dart, code, year, r_code))

            cnt += 1
            t_cnt += 1
            time.sleep(0.6)

        except:
            time.sleep(20)
            temp_df = get_data(dart, code, year, r_code)

            if temp_df.shape[0] == 0:
                pass_list.append(code)
                time.sleep(0.6)
                t_cnt += 1
                continue

            elif (temp_df.shape[0] != 0) & (cnt == 0):
                output_df = get_data(dart, code, year, r_code)
            else:
                output_df = output_df.append(get_data(dart, code, year, r_code))

            cnt += 1
            t_cnt += 1
            time.sleep(0.6)
            
    save_df = output_df.loc[(output_df['출자목적'] == '단순투자') & (output_df['법인명'].isin(list(corp_df.corp_name.unique())))]
    save_df.to_excel("ECM_타법인출자-단순투자-{}.xlsx".format(year), index = False, encoding = 'CP949')

In [None]:
output_df.drop_duplicates().loc[(output_df['출자목적'] == '단순투자') & (output_df['법인명'].isin(list(corp_df.corp_name.unique())))].to_excel("ECM_타법인출자-단순투자-2017.xlsx", index = False, encoding = 'CP949')

In [None]:
end_time = time.time()
print(end_time - start_time)

In [None]:
save_df = output_df.loc[(output_df['출자목적'] == '단순투자') & (output_df['법인명'].isin(list(corp_df.corp_name.unique())))]
save_df.to_excel("ECM_타법인출자-단순투자-total.xlsx", index = False, encoding = 'CP949')

* 특정 연도

In [67]:
year = 2015
r_code = '1분기보고서'

In [None]:
start_time = time.time()
cnt = 0
t_cnt = 0
pass_list = []

for code in corp_df.corp_code.unique():
    try:
        temp_df = get_data(dart, code, year, r_code)

        if temp_df.shape[0] == 0:
            pass_list.append(code)
            time.sleep(0.6)
            t_cnt += 1
            continue

        elif (temp_df.shape[0] != 0) & (cnt == 0):
            output_df = get_data(dart, code, year, r_code)
        else:
            output_df = output_df.append(get_data(dart, code, year, r_code))

        cnt += 1
        t_cnt += 1
        time.sleep(0.6)
        
    except:
        time.sleep(10)
        temp_df = get_data(dart, code, year, r_code)

        if temp_df.shape[0] == 0:
            pass_list.append(code)
            time.sleep(0.6)
            t_cnt += 1
            continue

        elif (temp_df.shape[0] != 0) & (cnt == 0):
            output_df = get_data(dart, code, year, r_code)
        else:
            output_df = output_df.append(get_data(dart, code, year, r_code))

        cnt += 1
        t_cnt += 1
        time.sleep(0.6)

In [None]:
end_time = time.time()
print(end_time - start_time)

In [65]:
output_df.loc[(output_df['출자목적'] == '단순투자') & (output_df['법인명'].isin(list(corp_df.corp_name.unique())))].to_excel("ECM_타법인출자-단순투자-2021.xlsx", index = False, encoding = 'CP949')

In [32]:
# 소요 시간
corp_df.corp_code.unique().shape[0] * 1.25 /3600

1.2368055555555555

In [71]:
df = pd.read_excel("ECM_타법인출자-단순투자-2021.xlsx")

In [75]:
corp_df.shape[0] - 3500

62

In [None]:
a = 0
for code in corp_df.corp_code.unique()[3500:]:
    a += 1
    print(a)
    

In [72]:
df

Unnamed: 0,법인구분,고유번호,회사명,법인명,최초취득일자,출자목적,최초취득금액,기말잔액수량,기말잔액지분율,기말잔액장부가액
0,코스닥,132992,성우하이텍,현대제철,-,단순투자,34380000000,433808,0.33,17786000000
1,코스닥,132992,성우하이텍,KNN,-,단순투자,8662000000,7355570,5.55,10188000000
2,유가,1234297,미원에스씨,KPX케미칼,2020.10.07,단순투자,28000000,32777,0.68,1757000000
3,유가,1234297,미원에스씨,롯데정밀화학,2020.11.03,단순투자,24000000,0,0.00,0
4,유가,1234297,미원에스씨,대덕,2020.11.16,단순투자,58000000,323454,0.95,2607000000
...,...,...,...,...,...,...,...,...,...,...
395,코스닥,925587,위드텍,에이치엠씨제5호스팩,2021.05.12,단순투자,235856000,86000,1.68,178450000
396,코스닥,925587,위드텍,DB금융스팩9호,2021.05.12,단순투자,213721000,80000,1.65,167600000
397,코스닥,440712,어반리튬,LG전자,2021.02.24,단순투자,171000000,1000,0.0006,138000000
398,코스닥,440712,어반리튬,바이오니아,2021.09.27,단순투자,158000000,2000,0.0077,97000000
