In [1]:
import pandas as pd

In [2]:
# Load the raw search-keyword table and print its schema (column names,
# non-null counts, dtypes and memory usage) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="./sample_data/searchkeyword.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188359 entries, 0 to 188358
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   searchkeyword  188359 non-null  object
 1   count          188359 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.9+ MB


In [3]:
import re


# Code-point ranges that count as Chinese characters. The legacy range
# \u4e00-\u9fa5 stops short of the end of the CJK Unified Ideographs block and
# skips the extension blocks entirely, which drops valid characters.
_CJK_RANGES = (
    r'\u3400-\u4dbf'            # CJK Unified Ideographs Extension A
    r'\u4e00-\u9fff'            # CJK Unified Ideographs (whole block)
    r'\U00020000-\U0002a6df'    # CJK Unified Ideographs Extension B
)
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_CJK_RUN_RE = re.compile('[^' + _CJK_RANGES + ']+')


def clean_text(text: str) -> str:
    """Reduce a raw search keyword to Chinese characters separated by commas.

    Processing steps:
      1. Every HTML-like tag (``<...>``, non-greedy, single line) is replaced
         by a comma, so text inside the tag itself is discarded.
      2. Every run of characters that are not Chinese ideographs (Latin
         letters, digits, punctuation, whitespace, commas, ...) is replaced by
         a single comma.

    Leading and trailing commas are kept, so a keyword that contains no
    Chinese character at all becomes ``","`` rather than an empty string.

    Args:
        text: The raw keyword.

    Returns:
        The cleaned keyword; an empty input yields an empty string.
    """
    # Remove HTML tags first so that Chinese text inside a tag's attributes
    # does not leak into the result.
    without_tags = _HTML_TAG_RE.sub(',', text)
    # Replacing each whole non-Chinese run with one comma is equivalent to
    # replacing character by character and then collapsing repeated commas.
    return _NON_CJK_RUN_RE.sub(',', without_tags)

In [4]:
# Normalise every keyword in place with clean_text. The transformation is
# idempotent, so re-running this cell leaves the column unchanged.
df["searchkeyword"] = df["searchkeyword"].map(clean_text)

In [5]:
# Preview the first 20 cleaned rows.
df.head(20)

Unnamed: 0,searchkeyword,count
0,",白泥碳酸潔面泡",1
1,",潔廁劑原味",1
2,",劇",1
3,",喇叭牌,正露丸,粒",1
4,",歐萊雅",1
5,",蘭州歸脾丸",1
6,",香港嶺南萬應筋健貼,科研榮譽出品,片包",1
7,",菲滋寶咀嚼片",1
8,",阿葵亞瞬效水光髪膜",1
9,",益生菌",4


In [8]:
import hanlp
# Display the registry of pretrained multi-task models (identifier -> download
# URL) so that one can be picked for the load step.
hanlp.pretrained.mtl.ALL
# Disabled alternative: load a model from the `tok` registry instead of a
# multi-task model.
# tok = hanlp.load(hanlp.pretrained.tok.MSR_TOK_ELECTRA_BASE_CRF)

{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',
 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',
 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',
 'CLOSE_TOK_POS_NER_SRL_UDEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20220626_175100.zip',
 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',
 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',
 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_S

In [9]:
# Load the chosen pretrained multi-task model and bind it to `HanLP`, the
# callable used by the tokenization cells. The log output of this cell shows
# the archive being downloaded and decompressed.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)

Downloading https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip to C:\Users\victo\AppData\Roaming\hanlp\mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip
Decompressing C:\Users\victo\AppData\Roaming\hanlp\mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip to C:\Users\victo\AppData\Roaming\hanlp\mtl
Downloading https://file.hankcs.com/corpus/char_table.json.zip to C:\Users\victo\AppData\Roaming\hanlp\thirdparty\file.hankcs.com\corpus/char_table.json.zip
Decompressing C:\Users\victo\AppData\Roaming\hanlp\thirdparty\file.hankcs.com\corpus/char_table.json.zip to C:\Users\victo\AppData\Roaming\hanlp\thirdparty\file.hankcs.com\corpus
                                   

In [10]:
# Smoke-test the model on one cleaned keyword, running only the `tok` task,
# and pretty-print the result.
HanLP(',香港嶺南萬應筋健貼,科研榮譽出品,片包', tasks='tok').pretty_print()

In [12]:
# Tokenize each cleaned keyword with the `tok` task only and store the object
# returned by the model (the display below shows a mapping with a 'tok/fine'
# token list) in a new column.
# NOTE(review): this invokes the model once per row; the HanLP docs describe
# passing a list of sentences for batched inference - worth confirming and
# adopting if this cell is too slow on the full table.
df["tokenized"] = df["searchkeyword"].map(
    lambda keyword: HanLP(keyword, tasks="tok")
)

In [13]:
# Display the frame to inspect the new `tokenized` column next to the cleaned
# keywords and their counts.
df

Unnamed: 0,searchkeyword,count,tokenized
0,",白泥碳酸潔面泡",1,"{'tok/fine': [',', '白', '泥', '碳酸', '潔面', '泡']}"
1,",潔廁劑原味",1,"{'tok/fine': [',', '潔廁劑', '原味']}"
2,",劇",1,"{'tok/fine': [',', '劇']}"
3,",喇叭牌,正露丸,粒",1,"{'tok/fine': [',', '喇叭牌', ',', '正露丸', ',', '粒']}"
4,",歐萊雅",1,"{'tok/fine': [',', '歐萊雅']}"
...,...,...,...
188354,",芝孢子",8,"{'tok/fine': [',', '芝', '孢子']}"
188355,",芝抱子",4,"{'tok/fine': [',', '芝', '抱', '子']}"
188356,",芝苞子",3,"{'tok/fine': [',', '芝苞', '子']}"
188357,",燒鵝",1,"{'tok/fine': [',', '燒', '鵝']}"


In [14]:
from pathlib import Path

# Persist the tokenized table. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories and would fail
# on a fresh checkout. The path and the CSV contents are unchanged.
output_path = Path("./n_gram_processed/tokenized_by_hanlp.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)