### 0. Modules

In [2]:
# import modules
import os
import pandas as pd
import sys
sys.path.append('../')
from util.get_coord_vw import GeoCoderVworld
import constant as c

### 1. Corporation Info

In [2]:
# Corporation code procurement - location, industry code
df_corp_code = pd.read_csv('../asset/preprocess/etcs/corp_code_valid.csv', header= 0, index_col=0, dtype= {'corp_code': str, 'corp_name': str, 'stock_code': str, 'valid': bool})
df_corp_code = df_corp_code[df_corp_code['valid'] == True]

# Corporation info(crawled from API) data concat
corp_info_crawled_path = '../asset/preprocess/industry/'
corp_files = os.listdir(corp_info_crawled_path)
dfs = []
dtype_dict = {
    'cc': str,
    'est_dt': str,
    'induty_code': str,
    'address': str,
    'index': int
}
for filename in corp_files:
    df = pd.read_csv(os.path.join(corp_info_crawled_path, filename), header=0, dtype=dtype_dict)
    dfs.append(df)
df_corp_info = pd.concat(dfs, axis=0, ignore_index=True) 
df_corp_info = df_corp_info.drop('Unnamed: 0', axis=1)
df_corp_info = df_corp_info.sort_values(by='index', ascending= True)

# concat two dfs
df_corp = pd.merge(df_corp_info, df_corp_code, how='outer', left_on= 'cc', right_on= 'corp_code')
to_erase_cols = ['valid', 'index', 'modify_date']
df_corp = df_corp.drop(columns=to_erase_cols)

### 2. FSS Financial Data

In [4]:
# https://opendart.fss.or.kr/disclosureinfo/fnltt/dwld/main.do
# 2021 사업보고서(202203 발표) -> 2022에 적용 
# 2020 사업보고서(202103 발표) -> 2021에 적용 
# 2019 사업보고서(202003 발표) -> 2020에 적용 

df1path = '/Volumes/T7/asset/corpis/2021_4Q_PL_20230509040149/2021_KOSDAQ_consolidated.txt'
df2path = '/Volumes/T7/asset/corpis/2021_4Q_PL_20230509040149/2021_KOSPI_consolidated.txt'
df1 = pd.read_csv(df1path, encoding='cp949', sep = '\t')
df1 = df1.filter(regex='^(?!Unnamed)')
df1 = df1[df1['항목코드'] == 'dart_OperatingIncomeLoss']
rename_cols = {
    '당기': '2022',
    '전기': '2021',
    '전전기': '2020'
}
df1 = df1.rename(columns=rename_cols)
interested_cols = ['종목코드', '회사명', '시장구분', '업종', '업종명', '항목명', '2022', '2021', '2020']
df1 = df1[interested_cols]

df2 = pd.read_csv(df2path, encoding='cp949', sep = '\t')
df2 = df2.filter(regex='^(?!Unnamed)')
df2 = df2[df2['항목코드'] == 'dart_OperatingIncomeLoss']
df2 = df2.rename(columns=rename_cols)
interested_cols = ['종목코드', '회사명', '시장구분', '업종', '업종명', '항목명', '2022', '2021', '2020']
df2 = df2[interested_cols]

df_is = pd.concat([df1, df2], axis = 0)
df_is['종목코드'] = df_is['종목코드'].str.replace(r'\[|\]', '')

# concat df_corp and df_is
df_corpis = pd.merge(df_is, df_corp, how = 'inner', left_on = '종목코드', right_on ='stock_code')

# Getting gu data in interest
df_corpis[['rlgnm', 'gu', 'other']] = df_corpis['address'].str.split(' ', n=2, expand=True)
df_corpis = df_corpis.drop(columns='other')
int_gu_nm = c.CBD_NMS # ['강남구', '서초구', '중구', '종로구', '영등포구']
cond = df_corpis['gu'].isin(int_gu_nm)
df_corpis = df_corpis[cond]

# Class declaration
gcw = GeoCoderVworld()
xs = []
ys = []
for i in range(len(df_corpis['address'])):
# for i in range(10):
    addr = df_corpis.iloc[i].address
    x, y = gcw.AddrToCoord(addr)
    xs.append(x)
    ys.append(y)

df_corpis['x'] = xs 
df_corpis['y'] = ys
df_corpis.to_csv('../asset/preprocess/features/df_corpis.csv')

  df_is['종목코드'] = df_is['종목코드'].str.replace(r'\[|\]', '')


In [3]:
# Read df_corpis
df_corpis = pd.read_csv('../asset/preprocess/features/df_corpis.csv', index_col = 0)