In [1]:
from bs4 import BeautifulSoup
import lxml

In [44]:
import os
import pandas as pd
df = pd.DataFrame()

PATH = '../data/origin/dart/reports/d002/'

filenames = [filename for filename in os.listdir(PATH) if filename[-4:] == '.xml']
filenames

['한화투자증권_d002_20210104000399.xml',
 '한화손해보험_d002_20210104000358.xml',
 '유나이티드_d002_20210104000482.xml',
 '루트락_d002_20210104000497.xml',
 '한화투자증권_d002_20210104000371.xml',
 '세방_d002_20210104000372.xml',
 '한화손해보험_d002_20210104000367.xml',
 '에프앤리퍼블릭_d002_20210104000395.xml',
 '새론오토모티브_d002_20210104000423.xml',
 '디앤씨미디어_d002_20210104000441.xml',
 '서울반도체_d002_20210104000048.xml',
 '유바이오로직스_d002_20210104000377.xml',
 '한화투자증권_d002_20210104000416.xml',
 '케이티_d002_20210104000221.xml',
 '헥토파이낸셜_d002_20210104000446.xml',
 '이엔코퍼레이션_d002_20210104000387.xml',
 'SK텔레콤_d002_20210104000151.xml',
 '유나이티드_d002_20210104000485.xml',
 '유바이오로직스_d002_20210104000366.xml',
 '한화투자증권_d002_20210104000411.xml',
 '루트락_d002_20210104000487.xml',
 '이엔코퍼레이션_d002_20210104000391.xml',
 '유안타제7호스팩_d002_20210104000468.xml',
 '루트락_d002_20210104000492.xml',
 '에프앤리퍼블릭_d002_20210104000386.xml',
 '한화투자증권_d002_20210104000404.xml',
 'KG스틸_d002_20210104000189.xml',
 '삼성증권_d002_20210104000388.xml',
 '코웨이_d002_20210104000271.xml',
 '마

In [209]:
from typing import *

def load_report(filename:str) -> BeautifulSoup:
    with open(PATH + filename, 'rb') as f:
        a = f.read()
        bs = BeautifulSoup(a, 'html.parser')
    
    return bs

def get_company_info(bs: BeautifulSoup) -> List :
    tr = bs.find('td', string='회 사 명').parent.parent
    
    # 회사명
    name = tr.find('td', string='회 사 명').next_sibling.next_sibling.text

    # 법인구분
    market = tr.find('td', string='법인구분').next_sibling.next_sibling.text
    market = market.replace('상장법인','')

    # 종목코드
    stock_code = tr.find('td', string='회사코드 ').next_sibling.next_sibling.text

    # 발생주식 총수
    total_stock = tr.find('td', string='발행주식 총수').next_sibling.next_sibling.text
    total_stock = int(total_stock.replace(r',', '').strip())

    return [name, market, stock_code, total_stock]

def get_reporter_info(bs: BeautifulSoup)->List:
    tr = bs.find('td', string='보고구분').parent

    # 보고구분
    report_type = tr.find('td', string='보고구분').next_sibling.next_sibling.text
    
    # 보고자 구분
    reporter_type = tr.find('td', string='보고자 구분').next_sibling.next_sibling.text
    
    tr = bs.find('td', string='성명(명칭)').parent
    
    # 보고자 명칭
    reporter_name = tr.find('td', string='한     글').next_sibling.next_sibling.text
    
    tr = bs.find('td', string='발행회사와의 관계').parent
    
    # 임원
    executive = tr.find('td', string='임원(등기여부)').next_sibling.next_sibling.text
    
    # 직위명
    position = tr.find('td', string='직위명').next_sibling.next_sibling.text
    
    tr = tr.next_sibling.next_sibling.next_sibling.next_sibling
    
    # 주요주주
    # try:
    stock_holder_type = tr.find('td', string='주요주주').next_sibling.next_sibling.text
    # except:
        # stock_holder_type = '-'
    
    return [report_type, reporter_type, reporter_name, executive, position, stock_holder_type]

def get_report_contents(bs: BeautifulSoup) -> List:
    body = bs.find('th', string='보고사유').parent.parent.next_sibling.next_sibling
    
    trs = body.find_all('tr')
    contents = []
    for tr in trs[:-1]:
        tus = tr.find_all('tu')
        
        # 변동사유
        reason = tus[0].text
        
        # 변동일
        change_date = tus[1].text
        
        # 특정증권등의 종류
        stock_type = tus[2].text
        
        tes = body.find_all('te')
        
        # 변동전
        try:
            prev = tes[0].text
            prev = int(prev.replace(',', '').strip())
        except:
            prev = 0
        
        # 증감
        change = tes[1].text
        change = int(change.replace(',', '').strip())
        
        # 변동 후
        try:
            after = tes[2].text
            after = int(after.replace(',', '').strip())
        except:
            after = 0
        
        contents.append([reason, change_date, stock_type, prev, change, after])
        
    return contents
        
    

In [210]:
import re

data = []
for filename in filenames:
    name, date = re.search(r'([^_]+)_d002_(\d{8})', filename).groups()
    bs = load_report(filename)
    company_info = get_company_info(bs)
    reporter_info = get_reporter_info(bs)
    contents = get_report_contents(bs)
    
    for content in contents:
        data.append(company_info + reporter_info + content + [name, date])
        
    # break

df = pd.DataFrame(data, columns=['회사명1', '법인구분', '종목코드', '발행주식 총수', '보고구분', '보고자 구분', '보고자 명칭', '임원', '직위명', '주요주주', '변동사유', '변동일', '특정증권등의 종류', '변동전', '증감', '변동후', '회사명2', '보고일자'])
df.shape



(183, 18)

In [211]:
df.to_csv('../data/preprocessed/20210104_D002.csv', encoding='cp949', index=False)

In [170]:
with open('../data/origin/dart/reports/d002/고려시멘트_d002_20210104000267.xml', 'rb') as f:
        a = f.read()
        bs = BeautifulSoup(a, 'html.parser')



#### 회사 개요

In [171]:
tr = bs.find('td', string='회 사 명').parent.parent

In [172]:
# 회사명
tr.find('td', string='회 사 명').next_sibling.next_sibling.text

'주식회사 고려시멘트'

In [173]:
# 법인구분
market = tr.find('td', string='법인구분').next_sibling.next_sibling.text
market.replace('상장법인','')

'코스닥'

In [174]:
# 종목코드
tr.find('td', string='회사코드 ').next_sibling.next_sibling.text

'198440'

In [175]:
# 발생주식 총수
total = tr.find('td', string='발행주식 총수').next_sibling.next_sibling.text
int(total.replace(r',', '').strip())

31979960

#### 보고자 개요

In [189]:
tr = bs.find('td', string='보고구분').parent

In [190]:
# 보고구분
tr.find('td', string='보고구분').next_sibling.next_sibling.text

'신규'

In [191]:
# 보고자 구분
tr.find('td', string='보고자 구분').next_sibling.next_sibling.text

'국내법인'

In [192]:
tr = bs.find('td', string='성명(명칭)').parent

In [193]:
# 보고자 명칭
tr.find('td', string='한     글').next_sibling.next_sibling.text

'주식회사 미래'

In [199]:
tr = bs.find('td', string='발행회사와의 관계').parent

In [200]:
# 임원여부
tr.find('td', string='임원(등기여부)').next_sibling.next_sibling.text

'-'

In [201]:
# 직위명
tr.find('td', string='직위명').next_sibling.next_sibling.text

'-'

In [202]:
# tr = tr.next_sibling
# 주요주주
tr.find('td', string='주요주주')#.next_sibling.next_sibling.text

In [206]:
tr.next_sibling.next_sibling.next_sibling.next_sibling

<tr acopy="Y" adelete="Y">
<td align="CENTER" aupdatecont="N" height="30" valign="MIDDLE" width="115">주요주주</td>
<tu align="CENTER" aunit="MAIN_SH" aunitvalue="1" colspan="3" height="30" valign="MIDDLE" width="352">10%이상주주</tu>
</tr>

#### 보고 항목

In [113]:
body = bs.find('th', string='보고사유').parent.parent.next_sibling.next_sibling

In [122]:
trs = body.find_all('tr')
tus = trs[0].find_all('tu')

In [129]:
# 변동사유
tus[0].text

'합병(+)'

In [130]:
# 변동일
tus[1].text

'2020.12.31'

In [131]:
# 특정증권등의 종류
tus[2].text

'보통주'

In [132]:
tes = body.find_all('te')

In [133]:
# 변동전
tes[0].text

'- '

In [134]:
# 증감
tes[1].text

'8,471,733 '

In [135]:
# 변동 후
tes[2].text

'8,471,733 '

In [136]:
# 취득/처분 단가
tes[3].text

'- '

In [137]:
# 비고
tes[4].text

'-'