In [2]:
import pandas as pd
import re
import numpy as np


# Helper that loads a jieba dictionary file into a pandas DataFrame.
# The pattern splits one line into: word, optional " <digits>" frequency,
# optional " <lowercase letters>" tag.
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)


def get_dict(file_path):
    '''Read a jieba dictionary file into a pandas DataFrame.

    Each non-empty line is split with a regular expression into three
    fields: word, frequency and tag. Words may themselves contain spaces;
    frequency and tag are optional and are returned as None when absent.

    Parameter:
        - file_path: string. The path of jieba dictionary file.
    return: pandas data frame with one row per dictionary entry and three
        positional columns (word, freq, tag). Frequencies are kept as the
        strings found in the file.
    Raises whatever open() raises for an unreadable path, and
    UnicodeDecodeError if a line is not valid UTF-8.
    '''
    dict_list = []
    # The file is read as bytes and decoded line by line; the context manager
    # guarantees the handle is closed even if decoding fails part-way.
    with open(file_path, 'rb') as f:
        for ln in f:
            # Strip surrounding whitespace, decode, then drop a leading BOM.
            line = ln.strip().decode('utf-8').lstrip('\ufeff')
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            # The optional groups capture their leading space; remove it.
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            dict_list.append([word, freq, tag])
    return pd.DataFrame(dict_list)
# https://segmentfault.com/q/1010000016011808

In [3]:
# Load the jieba main dictionary into a pandas DataFrame.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory.
dict_path = "anaconda/lib/python3.5/site-packages/jieba/dict_20181221.txt"
dict_pd = get_dict(dict_path)
# Label the three parsed fields so later cells can refer to them by name.
dict_pd.columns = ["word", "freq", "tag"]

In [10]:
# Load the new-word file. Its format differs from batch to batch, so it has
# to be tidied by hand into a one-column DataFrame first,
# for example:
pd.DataFrame(["發熱衣","三花","..."])

Unnamed: 0,0
0,發熱衣
1,三花
2,...


In [19]:
# Read the hand-prepared, header-less new-word file.
# dtype=str keeps every entry as text (purely numeric words are not converted
# to numbers), and keep_default_na=False stops pandas from turning words such
# as "NA" or "null" into NaN.
append_dict_path = "Downloads/append_dict_0521.csv"
append_dict = pd.read_csv(
    append_dict_path, header=None, dtype=str, keep_default_na=False
)

In [22]:
# Once the new-word file has been tidied, this cell prepares it for merging:
# lower-case the words, drop duplicates, add a frequency column and an empty
# tag column, and name the three columns word / freq / tag.
#
# The word column is selected by position, so the cell can be re-run after
# the columns have already been renamed.
new_words = append_dict.iloc[:, 0].str.lower()
# Discard missing or empty entries (NaN compares False), then de-duplicate
# AFTER lower-casing so case variants of the same word collapse into one row.
new_words = new_words[new_words.str.len() > 0].drop_duplicates(keep="last")
# The frequency assigned to a new word is its length in characters.
append_dict = new_words.to_frame(name="word").assign(
    freq=new_words.str.len(),
    tag="",
)[["word", "freq", "tag"]]

In [23]:
# Stack the new words on top of the main dictionary (the order matters for the
# de-duplication below). pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
dict_ = pd.concat([append_dict, dict_pd])
# Drop duplicate words, keeping only the last occurrence, i.e. the entry that
# comes from the main dictionary.
dict_ = dict_.drop_duplicates(subset="word", keep="last")
# Replace missing values with empty strings so they are not written out as the
# literal text 'None' when the frame is converted to a numpy array.
dict_ = dict_.fillna("")

In [24]:
# Convert to a numpy array and save it as a space-separated txt file.
# pd.to_csv is not used because, with a space separator, any word containing
# a space gets wrapped in double quotes,
# e.g. "LOUIS VUITTON" 19 .
output_path = "anaconda/lib/python3.5/site-packages/jieba/dict_20190521.txt"
# Write UTF-8 explicitly: get_dict decodes dictionary files as UTF-8, so the
# output must not depend on the platform's default text encoding.
np.savetxt(output_path, dict_.values, delimiter=" ", fmt="%s", encoding="utf-8")

In [17]:
# output_path = "Downloads/nd.csv"
# np.savetxt(output_path, append_dict.iloc[:,0].values, delimiter=" ", fmt="%s")