In [2]:
'''
dictionary sample
꼭지	NNP:2	NNG:12
'''
import re

# Input dictionary files read by get_word_dic
# (presumably the komoran word dictionary and the Sejong-corpus dictionary -- confirm).
f1_s, f2_s = 'dic.word', 'sejong.dic'

# Only entries carrying one of the tags below are stored: the Sejong corpus
# contains some tags that komoran does not have.
tags = [
    'MAJ', 'ETN', 'JKC', 'EP', 'JKQ', 'EC', 'SO', 'SE',
    'JKO', 'JX', 'NNB', 'JKG', 'SS', 'IC', 'SF', 'NP',
    'VA', 'EF', 'VCN', 'ETM', 'XSA', 'JKS', 'JC', 'JKV',
    'NNG', 'XPN', 'XSN', 'VCP', 'VX', 'VV', 'NR', 'XR',
    'MM', 'XSV', 'NNP', 'MAG', 'SW', 'NA', 'SP', 'JKB',
]

def get_word_dic(f1_s, dic, ix = 0, allowed_tags = None, encoding = 'utf-8'):
    '''
    Accumulate (word, tag) counts from a dictionary file into `dic`.

    Each line is split on whitespace: the first field is the word and every
    following field is a `TAG:count` pair (a word may carry several tags,
    e.g. `NNP:2  NNG:12`). Blank lines contribute nothing.

    Parameters
    ----------
    f1_s : path of the dictionary file to read.
    dic : dict updated in place; keys are `word + '\\t' + tag`, values are
        summed counts (the same key may occur on more than one line).
    ix : starting value of the running counter.
    allowed_tags : container of tags to keep. When None, the module-level
        `tags` is looked up at call time, as before.
    encoding : text encoding used to read the file. It is passed explicitly
        so that decoding the Hangul data does not depend on the platform's
        default locale encoding.

    Returns
    -------
    int : `ix` plus the number of accepted `TAG:count` fields.

    Raises
    ------
    ValueError : if a field does not have exactly one ':' or its count is
        not an integer (only checked for fields whose tag is accepted).
    '''
    # Resolve the filter once per call; the global stays the default so existing
    # callers see no change.
    allowed = tags if allowed_tags is None else allowed_tags
    with open(f1_s, encoding=encoding) as f1:
        for line in f1:
            elms = line.split()
            for field in elms[1:]:
                morph_tag, count = field.split(':')
                if morph_tag not in allowed:
                    continue
                name = elms[0] + '\t' + morph_tag
                # Some keys are duplicated across lines, so add rather than assign.
                dic[name] = dic.get(name, 0) + int(count)
                ix += 1
    return ix

def get_duplicate_words(dic1, dic2):
    '''
    Merge two count dictionaries and report the entries they share.

    Parameters
    ----------
    dic1, dic2 : dicts mapping a key to a numeric count. Neither is modified.

    Returns
    -------
    (dic_union, dic_intersection)
        dic_union : every key of either input; a key present in both maps
            to the sum of its two counts. Keys keep dic1's order, followed
            by the keys that only dic2 has.
        dic_intersection : only the keys present in both inputs, mapped to
            the same summed count. The original returned this dict without
            ever filling it.
    '''
    dic_union = dict(dic1)
    for key, count in dic2.items():
        dic_union[key] = dic_union.get(key, 0) + count

    # Iterate dic1 (not a set) so the result order is deterministic.
    dic_intersection = {key: dic_union[key] for key in dic1 if key in dic2}

    return dic_union, dic_intersection

In [9]:
# Fresh accumulators for each run of this cell; dic3 is not used by the visible cells.
dic1 = {}
dic2 = {}
dic3 = {}

# get_word_dic fills the dict in place and returns the number of accepted (word, tag) fields.
ix1 = get_word_dic(f1_s, dic1, 0)
ix2 = get_word_dic(f2_s, dic2, 0)
print(ix1, ix2)

68150 173429


In [10]:
# Collect the distinct tags that actually occur in dic1.
# NOTE: this rebinds the module-level `tags` that get_word_dic reads as its filter.
tags = set()
for key1 in dic1:
    word, tag = key1.split('\t')  # keys have the form 'word\ttag'
    tags.add(tag)
tags = list(tags)
print(tags, len(tags))

['SO', 'JC', 'MM', 'NNG', 'JKQ', 'ETM', 'VA', 'NNB', 'NNP', 'XPN', 'JKC', 'NP', 'XSV', 'VV', 'MAG', 'EC', 'VX', 'SW', 'ETN', 'XR', 'NR', 'SS', 'SF', 'XSN', 'NA', 'VCN', 'SP', 'EF', 'JX', 'SE', 'JKV', 'VCP', 'XSA', 'JKB', 'IC', 'EP', 'JKG', 'JKO', 'JKS', 'MAJ'] 40


In [11]:
def remove_non_korean(dic1):
    '''
    Return a new dict holding only the entries whose word is pure Hangul.

    Use this when only Hangul words should be extracted. Most non-Hangul
    strings were already dropped when morpheme tags were filtered during the
    initial Sejong-corpus word extraction.

    Keys must have the form 'word\\ttag'; an entry is kept when the word part
    contains no character outside the range 가-힣. `dic1` is not modified.
    '''
    hangul_only = {}
    for entry, count in dic1.items():
        word, _tag = entry.split('\t')
        if re.search("[^가-힣]", word):
            # At least one non-Hangul character: skip this entry.
            continue
        hangul_only[entry] = count
    return hangul_only

# Rebind dic2 (loaded from f2_s, 'sejong.dic') to its Hangul-only subset.
# Re-running is harmless: filtering an already filtered dict returns the same entries.
dic2 = remove_non_korean(dic2)

In [12]:
# Merge the two dictionaries, summing the counts of keys present in both.
# (An earlier ratio-adjusted variant that went through evaluate_ratio /
# adjust_count was disabled with ratio fixed to 1; the plain merge is used.)
dic_union_adjust, dic_intersection = get_duplicate_words(dic1, dic2)
keys = dic_union_adjust.keys()

# Preview the first ten merged entries as 'word<TAB>tag count'.
for key1 in list(keys)[:10]:
    print(key1, dic_union_adjust[key1])

꼬지	NNG 13
탄로	NNG 30
미심쩍	VA 19
편마비	NNG 2
쿠폰	NNG 16
대중음악고	NNG 2
경창헌	NNP 1
구롓길	NNG 2
국제선	NNG 11
고전학	NNG 3


In [13]:
def save_dic(dic_adjust):
    '''
    Write `dic_adjust` to three files in the current working directory:
    a pickle ('word.pk'), a tag:count text dictionary ('dic2.word') and a
    one-tag-per-word user dictionary ('user_raw.dic').
    '''
    pickle_path, text_path, user_path = 'word.pk', 'dic2.word', 'user_raw.dic'
    save_dic2pk(pickle_path, dic_adjust)
    save_dic2txt(text_path, dic_adjust)
    save_dic2user(user_path, dic_adjust)
    
def save_dic2pk(f1_s, dic_adjust):
    '''Pickle `dic_adjust` into the file `f1_s` (opened in binary write mode).'''
    import pickle
    with open(f1_s, 'wb') as out_file:
        pickle.dump(dic_adjust, out_file)
        
# Write the dictionary as text; a word that has several morpheme tags gets all of them on one line
def save_dic2txt(f1_s, dic_adjust, encoding = 'utf-8'):
    '''
    Write `dic_adjust` as a text dictionary, one word per line.

    Line format: `word<TAB>TAG:count[<TAB>TAG:count ...]`. Words that have
    more than one tag are written first, followed by the single-tag words;
    within each group the order is that of first appearance in `dic_adjust`.

    Parameters
    ----------
    f1_s : output path (overwritten).
    dic_adjust : dict whose keys have the form 'word\\ttag' and whose values
        are counts.
    encoding : text encoding of the output file. It is passed explicitly so
        that writing Hangul does not depend on the platform's default locale
        encoding.
    '''
    # Group the tags of each word, preserving first-appearance order.
    tags_by_word = {}
    for key in dic_adjust:
        name, tag = key.split('\t')
        tags_by_word.setdefault(name, []).append(tag)

    def format_line(name, word_tags):
        # 'word', 'TAG:count', ... joined by tabs and terminated by a newline.
        fields = [tag + ':' + str(dic_adjust[name + '\t' + tag]) for tag in word_tags]
        return '\t'.join([name] + fields) + '\n'

    with open(f1_s, 'w', encoding=encoding) as f1:
        # Two passes keep the original layout: multi-tag words, then single-tag words.
        for want_multi in (True, False):
            for name, word_tags in tags_by_word.items():
                if (len(word_tags) > 1) == want_multi:
                    f1.write(format_line(name, word_tags))
            
# Write a user dictionary that keeps only the highest-count tag of each word
def save_dic2user(f1_s, dic_adjust, encoding = 'utf-8'):
    '''
    Write a `word<TAB>tag` file with exactly one tag per word.

    When a word has several tags, only the tag with the highest count is
    written; on a tie the tag seen first in `dic_adjust` wins. Words are
    written in order of first appearance.

    Parameters
    ----------
    f1_s : output path (overwritten).
    dic_adjust : dict whose keys have the form 'word\\ttag' and whose values
        are counts.
    encoding : text encoding of the output file. It is passed explicitly so
        that writing Hangul does not depend on the platform's default locale
        encoding.
    '''
    best = {}  # word -> (tag, count) of the best candidate seen so far
    for key, count in dic_adjust.items():
        name, tag = key.split('\t')
        # Strict '>' keeps the earlier tag when counts are equal.
        if name not in best or count > best[name][1]:
            best[name] = (tag, count)

    with open(f1_s, 'w', encoding=encoding) as f1:
        for name, (tag, _count) in best.items():
            f1.write(name + '\t' + tag + '\n')

In [14]:
# Persist the merged dictionary; save_dic writes word.pk, dic2.word and user_raw.dic.
save_dic(dic_union_adjust)

In [15]:
# Number of 'word\ttag' entries in dic2 (rebound to the Hangul-only subset in an earlier cell).
len(dic2)

171547