# Financial named entities

## Company names

In [1]:
import requests
import tarfile
import os
import math

import pandas as pd
import dart_fss as dart
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import display
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, use_memory_fs=True)

import sys
sys.path.append('../../src/')
from utils import load_korean_companies, download, load_mecab_ko_dic_vocabulary

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Loading the list of Korean companies

In [2]:
# Read the OpenDART API key from a local file (kept out of the notebook itself).
# Strip surrounding whitespace so a trailing newline in the key file is not
# passed on as part of the key.
with open('../../src/opendart_api_key.txt', 'r') as f:
    dart_api_key = f.read().strip()

# Company listing built by the project helper from the OpenDART API key.
korean_companies = load_korean_companies(dart_api_key)

Output()

Output()

In [3]:
# Display the company table loaded in the previous cell.
korean_companies

Unnamed: 0,corp_code,corp_name,stock_code,modify_date
0,00434003,다코,,20170630
1,00434456,일산약품,,20170630
2,00430964,굿앤엘에스,,20170630
3,00432403,한라판지,,20170630
4,00388953,크레디피아제이십오차유동화전문회사,,20170630
...,...,...,...,...
97041,00151571,청림실업,,20221114
97042,01143889,에이치엠지하우징,,20221114
97043,01359578,성남대장피에프브이,,20221114
97044,01002944,스마트에프앤디,,20221114


In [4]:
# korean_companies.to_csv('korean_companies.csv', index=False)

### Initializing a user dictionary of Korean company names for mecab-ko

- Downloading and extracting the default vocabulary of mecab-ko

In [5]:
# Source archive of the default mecab-ko dictionary and where to store it locally.
mecab_ko_dic_url = 'https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz'
mecab_ko_dic_path = './mecab-ko-dic-2.1.1-20180720.tar.gz'

download(
    url=mecab_ko_dic_url,
    filepath=mecab_ko_dic_path
)

# Extract into the current working directory. The context manager closes the
# archive's file handle once extraction finishes (the bare
# `tarfile.open(...).extractall()` form left it open).
# NOTE(review): the archive comes from a remote URL; on Python versions that
# support it, consider passing an extraction filter to `extractall`.
with tarfile.open(mecab_ko_dic_path) as mecab_ko_dic_archive:
    mecab_ko_dic_archive.extractall()

- Loading the default vocabulary of mecab-ko

In [6]:
# Load the default vocabulary from the directory extracted from the mecab-ko-dic archive.
default_dictionary = load_mecab_ko_dic_vocabulary('./mecab-ko-dic-2.1.1-20180720/')

In [7]:
# Display the default mecab-ko vocabulary table.
default_dictionary

Unnamed: 0,표층형,left-ID,right-ID,비용,품사,의미 부류,종성 유무,읽기,타입,첫번째 품사,마지막 품사,표현,소스
0,가가호호,735,2649,3337,MAG,성분부사|양태부사,F,가가호호,*,*,*,*,MAG.csv
1,가각히,726,2633,4017,MAG,*,F,가각히,*,*,*,*,MAG.csv
2,가강히,726,2633,4017,MAG,*,F,가강히,*,*,*,*,MAG.csv
3,가공스레,726,2633,4017,MAG,*,F,가공스레,*,*,*,*,MAG.csv
4,가관스레,726,2633,4017,MAG,*,F,가관스레,*,*,*,*,MAG.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...
779516,쿠노,2,3,2124,EC,*,F,쿠노,*,*,*,*,EC.csv
779517,티,2,3,3500,EC,*,F,티,*,*,*,*,EC.csv
779518,하고,2,3,3327,EC,*,F,하고,*,*,*,*,EC.csv
779519,히,2,3,5168,EC,*,F,히,*,*,*,*,EC.csv


- Initializing a DataFrame representing the user dictionary

In [8]:
def create_user_dictionary(new_vocabulary, current_dictionary):
    """Create user-dictionary rows for `new_vocabulary` based on `current_dictionary`.

    Every word is turned into one row tagged as a proper noun ('NNP') with the
    semantic class '회사':

    - If the word already exists as a surface form ('표층형') in
      `current_dictionary`, its lowest-cost entry is copied, its cost is
      lowered by 50, and the type / first-POS / last-POS / expression fields
      are reset to '*'. Every other field of the copied entry (including
      extra columns such as '소스', when present) is kept as is.
    - Otherwise a new row is built with empty left/right IDs and a default
      cost: the integer mean cost of the cheapest existing entries whose
      surface form has the same length, or the overall mean cost of
      `current_dictionary` when no entry of that length exists.

    Parameters
    ----------
    new_vocabulary : pandas.Series
        Words (strings) to add to the user dictionary.
    current_dictionary : pandas.DataFrame
        Dictionary currently in use; must provide at least the '표층형' and
        '비용' columns.

    Returns
    -------
    pandas.DataFrame
        One row per word, indexed like `new_vocabulary`.
    """
    columns = ['표층형', 'left-ID', 'right-ID', '비용', '품사', '의미 부류', '종성 유무', '읽기', '타입', '첫번째 품사', '마지막 품사', '표현']

    # Amount subtracted from the cost of an existing homonym.
    # NOTE(review): presumably so the company reading wins over the existing
    # entry during tokenization — confirm the intended magnitude.
    homonym_cost_discount = 50

    # Cheapest entry per surface form. The stable sort keeps the original row
    # order among equal costs, i.e. the same row a per-word `argmin` would pick.
    cheapest_entries = (
        current_dictionary
        .sort_values(['비용'], kind='mergesort')
        .drop_duplicates(subset=['표층형'], keep='first')
    )
    # Same rows, indexed by surface form, so each word is resolved by an index
    # lookup instead of a full scan of the dictionary.
    cheapest_by_surface = cheapest_entries.copy()
    cheapest_by_surface.index = cheapest_by_surface['표층형'].values

    def init_default_costs():
        # Default cost per word length occurring in the new vocabulary.
        costs_by_length = dict()
        entry_lengths = cheapest_entries['표층형'].str.len()
        overall_mean_cost = current_dictionary['비용'].mean(skipna=True)
        for length in new_vocabulary.str.len().unique():
            same_length = (entry_lengths == length)
            if same_length.any():
                mean_cost = cheapest_entries.loc[same_length, '비용'].mean(skipna=True)
            else:
                mean_cost = overall_mean_cost
            costs_by_length[length] = int(mean_cost)
        return costs_by_length

    def has_coda(char):
        # 'T' if the character is a Hangul syllable with a jong-seong (종성), else 'F'.
        # 44032..55203 is the precomposed Hangul syllable range; each syllable's
        # offset modulo 28 is its final-consonant index, 0 meaning "none".
        code_point = ord(char)
        if not (44032 <= code_point <= 55203):
            return 'F'
        return 'F' if (code_point - 44032) % 28 == 0 else 'T'

    def initialize_row(x):
        if x in cheapest_by_surface.index:
            # Known surface form: reuse its cheapest entry, retagged as a company name.
            y = cheapest_by_surface.loc[x].copy()
            y['비용'] = y['비용'] - homonym_cost_discount
            y['품사'] = 'NNP'
            y['의미 부류'] = '회사'
            y['타입'] = '*'
            y['첫번째 품사'] = '*'
            y['마지막 품사'] = '*'
            y['표현'] = '*'
            return y

        # Unknown surface form: build a fresh row with a length-based default cost.
        return pd.Series(
            [
                x,
                '',
                '',
                default_costs[len(x)],
                'NNP',
                '회사',
                has_coda(x[-1]),
                x,
                '*',
                '*',
                '*',
                '*',
            ],
            index=columns
        )

    # Costs are derived from the dictionary passed in, not from a notebook global.
    default_costs = init_default_costs()

    # Use pandarallel when it has been initialized; otherwise fall back to the
    # plain (serial) pandas apply so the function still works.
    apply_rows = getattr(new_vocabulary, 'parallel_apply', new_vocabulary.apply)
    return apply_rows(initialize_row)

In [9]:
# One user-dictionary row per company name, derived from the default mecab-ko vocabulary.
user_dictionary = create_user_dictionary(new_vocabulary=korean_companies['corp_name'], current_dictionary=default_dictionary)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3033), Label(value='0 / 3033'))), …

In [10]:
# Display the generated user dictionary.
user_dictionary

Unnamed: 0,표층형,left-ID,right-ID,비용,품사,의미 부류,종성 유무,읽기,타입,첫번째 품사,마지막 품사,표현,소스
0,다코,1788,3549,5425,NNP,회사,F,다코,*,*,*,*,Person.csv
1,일산약품,,,3209,NNP,회사,T,일산약품,*,*,*,*,
2,굿앤엘에스,,,3579,NNP,회사,F,굿앤엘에스,*,*,*,*,
3,한라판지,,,3209,NNP,회사,F,한라판지,*,*,*,*,
4,크레디피아제이십오차유동화전문회사,,,3615,NNP,회사,F,크레디피아제이십오차유동화전문회사,*,*,*,*,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97041,청림실업,,,3209,NNP,회사,T,청림실업,*,*,*,*,
97042,에이치엠지하우징,,,4025,NNP,회사,T,에이치엠지하우징,*,*,*,*,
97043,성남대장피에프브이,,,4089,NNP,회사,F,성남대장피에프브이,*,*,*,*,
97044,스마트에프앤디,,,3988,NNP,회사,F,스마트에프앤디,*,*,*,*,
