In [1]:
from bs4 import BeautifulSoup
import lxml

In [2]:
import os
import pandas as pd
df = pd.DataFrame()

PATH = '../data/origin/dart/reports/d002/'

# Keep only the XML reports whose file name starts with '삼성전자'.
filenames = [
    entry
    for entry in os.listdir(PATH)
    if entry.endswith('.xml') and entry.startswith('삼성전자')
]
filenames

['삼성전자_d002_20200204000152.xml',
 '삼성전자_d002_20180529000327.xml',
 '삼성전자_d002_20200330000200.xml',
 '삼성전자_d002_20220316000044.xml',
 '삼성전자_d002_20220107000644.xml',
 '삼성전자_d002_20201005000249.xml',
 '삼성전자_d002_20200402001611.xml',
 '삼성전자_d002_20171228000737.xml',
 '삼성전자_d002_20210208000100.xml',
 '삼성전자_d002_20210427000593.xml',
 '삼성전자_d002_20180509003300.xml',
 '삼성전자_d002_20181212000050.xml',
 '삼성전자_d002_20201223000225.xml',
 '삼성전자_d002_20200511000505.xml',
 '삼성전자_d002_20180608000547.xml',
 '삼성전자_d002_20180611000424.xml',
 '삼성전자_d002_20221018000007.xml',
 '삼성전자_d002_20180611000425.xml',
 '삼성전자_d002_20190211000454.xml',
 '삼성전자_d002_20220929000523.xml',
 '삼성전자_d002_20201124000227.xml',
 '삼성전자_d002_20160826000142.xml',
 '삼성전자_d002_20220302000086.xml',
 '삼성전자_d002_20220511000241.xml',
 '삼성전자_d002_20211216000388.xml',
 '삼성전자_d002_20170308000067.xml',
 '삼성전자_d002_20180611000035.xml',
 '삼성전자_d002_20210203000461.xml',
 '삼성전자_d002_20210820000057.xml',
 '삼성전자_d002_20180516000208.xml',
 '삼성전자_d00

In [5]:
from typing import *

def load_report(filename:str) -> BeautifulSoup:
    """Read one report file and parse it with BeautifulSoup.

    Args:
        filename: File name appended directly to the module-level ``PATH`` prefix.

    Returns:
        The document parsed with the ``'html.parser'`` backend.
    """
    with open(PATH + filename, 'rb') as report_file:
        raw_bytes = report_file.read()

    # The bytes are fully in memory, so parsing does not need the open handle.
    return BeautifulSoup(raw_bytes, 'html.parser')

def get_company_info(bs: BeautifulSoup) -> List :
    """Extract the issuing company's details from a parsed report.

    Each value is read from the cell two siblings after its label ``<td>``.

    Args:
        bs: Parsed report document.

    Returns:
        ``[name, market, stock_code, total_stock]``. ``total_stock`` is an ``int``;
        the others are strings. ``market`` has the substring '상장법인' removed.

    Raises:
        AttributeError: If a label cell (or its value sibling) is missing. For a
            missing label the message names the label that was not found.
        ValueError: If the issued-share count is not a comma-grouped integer.
    """
    def require_cell(scope, label):
        # find() returns None for an absent label; fail with the label's name
        # instead of an anonymous "'NoneType' object has no attribute ..." error.
        cell = scope.find('td', string=label)
        if cell is None:
            raise AttributeError('report has no <td> labelled {!r}'.format(label))
        return cell

    def value_of(scope, label):
        return require_cell(scope, label).next_sibling.next_sibling.text

    # Two levels above the company-name label; all other labels are looked up
    # inside this container.
    table = require_cell(bs, '회 사 명').parent.parent

    # Company name
    name = value_of(table, '회 사 명')

    # Corporation class, with the '상장법인' suffix text stripped out
    market = value_of(table, '법인구분').replace('상장법인', '')

    # Stock code (the label text carries a trailing space in the report)
    stock_code = value_of(table, '회사코드 ')

    # Total number of issued shares, e.g. '50,642,595' -> 50642595
    total_stock = int(value_of(table, '발행주식 총수').replace(',', '').strip())

    return [name, market, stock_code, total_stock]

def get_reporter_info(bs: BeautifulSoup)->List:
    """Extract the reporter's details from a parsed report.

    Each value is read from the cell two siblings after its label ``<td>``.

    Args:
        bs: Parsed report document.

    Returns:
        ``[report_type, reporter_type, reporter_name, executive, position,
        stock_holder_type]``, all strings.

    Raises:
        AttributeError: If a label cell (or an expected sibling) is missing. For a
            missing label the message names the label that was not found.
    """
    def require_cell(scope, label):
        # find() returns None for an absent label; fail with the label's name
        # instead of an anonymous "'NoneType' object has no attribute ..." error.
        cell = scope.find('td', string=label)
        if cell is None:
            raise AttributeError('report has no <td> labelled {!r}'.format(label))
        return cell

    def value_of(scope, label):
        return require_cell(scope, label).next_sibling.next_sibling.text

    report_row = require_cell(bs, '보고구분').parent

    # Report type
    report_type = value_of(report_row, '보고구분')

    # Reporter category
    reporter_type = value_of(report_row, '보고자 구분')

    name_row = require_cell(bs, '성명(명칭)').parent

    # Reporter name (Korean)
    reporter_name = value_of(name_row, '한     글')

    relation_row = require_cell(bs, '발행회사와의 관계').parent

    # Executive (registered or not)
    executive = value_of(relation_row, '임원(등기여부)')

    # Position title
    position = value_of(relation_row, '직위명')

    # The major-shareholder cell is not inside the relation row; it is reached by
    # stepping four siblings forward from it.
    holder_row = relation_row.next_sibling.next_sibling.next_sibling.next_sibling

    # Major shareholder; a report without this cell raises rather than defaulting.
    stock_holder_type = value_of(holder_row, '주요주주')

    return [report_type, reporter_type, reporter_name, executive, position, stock_holder_type]

def _parse_count(cells, index: int) -> int:
    """Return ``cells[index].text`` as an int, or 0 when it cannot be read.

    Thousands separators and surrounding whitespace are removed first. A missing
    cell (IndexError) or non-numeric text such as '- ' (ValueError) yields 0.
    """
    try:
        return int(cells[index].text.replace(',', '').strip())
    except (IndexError, ValueError):
        return 0


def get_report_contents(bs: BeautifulSoup) -> List:
    """Extract the per-row change details from a parsed report.

    Args:
        bs: Parsed report document.

    Returns:
        One ``[reason, change_date, stock_type, prev, change, after]`` list per
        row. The first three entries are strings; the last three are ints, with
        0 substituted when the cell is missing or not a number.

    Raises:
        AttributeError: If the '보고사유' header cell or an expected
            parent/sibling is missing.
        IndexError: If a row has fewer than three ``tu`` cells.
    """
    body = bs.find('th', string='보고사유').parent.parent.next_sibling.next_sibling

    contents = []
    # The last row is deliberately skipped (presumably a summary row — confirm).
    for tr in body.find_all('tr')[:-1]:
        tus = tr.find_all('tu')

        # Reason for the change
        reason = tus[0].text

        # Date of the change
        change_date = tus[1].text

        # Type of security
        stock_type = tus[2].text

        tes = tr.find_all('te')

        # Holdings before the change, the change itself, and holdings after it
        prev = _parse_count(tes, 0)
        change = _parse_count(tes, 1)
        after = _parse_count(tes, 2)

        contents.append([reason, change_date, stock_type, prev, change, after])

    return contents
        
    

In [7]:
import re

# Flatten every report into one row per reported change. A report that fails to
# parse is printed together with its error and skipped.
data = []
for filename in filenames:
    try:
        # File names look like '<company>_d002_<yyyymmdd...>.xml'.
        name, date = re.search(r'([^_]+)_d002_(\d{8})', filename).groups()
        bs = load_report(filename)
        company_info = get_company_info(bs)
        reporter_info = get_reporter_info(bs)
        contents = get_report_contents(bs)

        for content in contents:
            data.append(company_info + reporter_info + content + [name, date])
    except Exception as e:
        print(filename, e)

df = pd.DataFrame(
    data,
    columns=[
        # company details
        '회사명1', '법인구분', '종목코드', '발행주식 총수',
        # reporter details
        '보고구분', '보고자 구분', '보고자 명칭', '임원', '직위명', '주요주주',
        # per-row change details
        '변동사유', '변동일', '특정증권등의 종류', '변동전', '증감', '변동후',
        # fields taken from the file name
        '회사명2', '보고일자',
    ],
)
df.shape



삼성전자_d002_20220802000246.xml 'NoneType' object has no attribute 'parent'


(2156, 18)

In [8]:
# Write the collected rows to CSV in cp949 encoding, without the index column.
# Rows are written in collection order: no sort is applied.
df.to_csv('../data/preprocessed/20150101-20221024_삼성전자_d002.csv', encoding='cp949', index=False)

## Sample

In [3]:
# Load a single sample report to explore its structure cell by cell.
with open('../data/origin/dart/reports/d002/네오리진_d002_20210104000195.xml', 'rb') as f:
    a = f.read()

bs = BeautifulSoup(a, 'html.parser')

#### 회사 개요

In [4]:
tr = bs.find('td', string='회 사 명').parent.parent

In [5]:
# 회사명
tr.find('td', string='회 사 명').next_sibling.next_sibling.text

'주식회사 코닉글로리'

In [6]:
# 법인구분
market = tr.find('td', string='법인구분').next_sibling.next_sibling.text
market.replace('상장법인','')

'코스닥'

In [7]:
# 종목코드
tr.find('td', string='회사코드 ').next_sibling.next_sibling.text

'094860'

In [8]:
# Total number of issued shares (발행주식 총수): drop thousands separators and
# surrounding whitespace before converting to int.
total = tr.find('td', string='발행주식 총수').next_sibling.next_sibling.text
int(total.replace(r',', '').strip())

50642595

#### 보고자 개요

In [9]:
tr = bs.find('td', string='보고구분').parent

In [10]:
# 보고구분
tr.find('td', string='보고구분').next_sibling.next_sibling.text

'변동'

In [11]:
# 보고자 구분
tr.find('td', string='보고자 구분').next_sibling.next_sibling.text

'외국법인'

In [12]:
tr = bs.find('td', string='성명(명칭)').parent

In [13]:
# 보고자 명칭
tr.find('td', string='한     글').next_sibling.next_sibling.text

'조이프렌즈 피티이 엘티디'

In [14]:
tr = bs.find('td', string='발행회사와의 관계').parent

In [15]:
# 임원여부
tr.find('td', string='임원(등기여부)').next_sibling.next_sibling.text

'-'

In [16]:
# 직위명
tr.find('td', string='직위명').next_sibling.next_sibling.text

'-'

In [17]:
# Major shareholder (주요주주): this lookup shows no output because the cell is not
# inside the '발행회사와의 관계' row currently held in `tr`; it is in a later sibling row.
# tr = tr.next_sibling
tr.find('td', string='주요주주')#.next_sibling.next_sibling.text

In [18]:
tr.next_sibling.next_sibling.next_sibling.next_sibling

<tr acopy="Y" adelete="Y">
<td align="CENTER" aupdatecont="N" height="30" valign="MIDDLE" width="115">주요주주</td>
<tu align="CENTER" aunit="MAIN_SH" aunitvalue="2" colspan="3" height="30" valign="MIDDLE" width="352">사실상지배주주</tu>
</tr>

#### 보고 항목

In [19]:
body = bs.find('th', string='보고사유').parent.parent.next_sibling.next_sibling

In [122]:
trs = body.find_all('tr')
tus = trs[0].find_all('tu')

In [129]:
# 변동사유
tus[0].text

'합병(+)'

In [130]:
# 변동일
tus[1].text

'2020.12.31'

In [131]:
# 특정증권등의 종류
tus[2].text

'보통주'

In [132]:
# NOTE(review): this collects `te` cells from the whole `body`, not from a single
# row, so the indexes used below address the first cells in document order.
tes = body.find_all('te')

In [133]:
# 변동전
tes[0].text

'- '

In [134]:
# 증감
tes[1].text

'8,471,733 '

In [135]:
# 변동 후
tes[2].text

'8,471,733 '

In [136]:
# 취득/처분 단가
tes[3].text

'- '

In [137]:
# 비고
tes[4].text

'-'

#### 보고 항목

In [21]:
body = bs.find('th', string='보고사유').parent.parent.next_sibling.next_sibling

In [22]:
trs = body.find_all('tr')
tr = trs[1]
tus = tr.find_all('tu')

In [24]:
# 변동사유
tus[0].text

'장내매수(+)'

In [25]:
# 변동일
tus[1].text

'2020년 12월 28일'

In [26]:
# 특정증권등의 종류
tus[2].text

'보통주'

In [28]:
tes = tr.find_all('te')

In [29]:
# 변동전
tes[0].text

'2,538,740'

In [30]:
# 증감
tes[1].text

'130,256'

In [31]:
# 변동 후
tes[2].text

'2,668,996'

In [32]:
# 취득/처분 단가
tes[3].text

'2,685'

In [33]:
# 비고
tes[4].text

'-'

In [None]:
# NOTE(review): unfinished scratch cell — '' is not one of the column names given
# to `df` when it was built, so this lookup would raise KeyError if executed.
df['']