In [19]:
import pandas as pd

In [20]:
df = pd.read_csv("./sample_data/searchkeyword.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188359 entries, 0 to 188358
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   searchkeyword  188359 non-null  object
 1   count          188359 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.9+ MB


In [21]:
df.iloc[:20]

Unnamed: 0,searchkeyword,count
0,\t GATSBY 白泥碳酸潔面泡,1
1,\t clorox 潔廁劑原味,1
2,\t 劇,1
3,\t 喇叭牌 正露丸 100粒,1
4,\t 歐萊雅,1
5,\t 蘭州歸脾丸,1
6,\t 香港嶺南萬應筋健貼【科研榮譽出品】3片包,1
7,\t菲滋寶咀嚼片,1
8,\t阿葵亞瞬效水光髪膜,1
9,\n益生菌,4


In [22]:
import re


def clean_text(text:str) -> str:
    # 去除 HTML 標籤
    text = re.sub(r'<.*?>', ',', text)
    # 去除非字母字符以及英文
    text = re.sub(r'[^\u4e00-\u9fa5]', ',', text)
    text = re.sub(r",+", ",", text)

    return text

In [23]:
df["searchkeyword"] = df["searchkeyword"].apply(clean_text)

In [24]:
df.iloc[:20]

Unnamed: 0,searchkeyword,count
0,",白泥碳酸潔面泡",1
1,",潔廁劑原味",1
2,",劇",1
3,",喇叭牌,正露丸,粒",1
4,",歐萊雅",1
5,",蘭州歸脾丸",1
6,",香港嶺南萬應筋健貼,科研榮譽出品,片包",1
7,",菲滋寶咀嚼片",1
8,",阿葵亞瞬效水光髪膜",1
9,",益生菌",4


In [25]:
def do_n_gram(doc: str, n: int=2) -> list[str]:
    return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not ("," in doc[i: i+n] or "的" in doc[i: i+n])]

In [26]:
for i in range(1, 6):
    df[f"{i}_gram"] = df["searchkeyword"].apply(lambda x: do_n_gram(x, i))

In [27]:
df.iloc[:20]

Unnamed: 0,searchkeyword,count,1_gram,2_gram,3_gram,4_gram,5_gram
0,",白泥碳酸潔面泡",1,"[白, 泥, 碳, 酸, 潔, 面, 泡]","[白泥, 泥碳, 碳酸, 酸潔, 潔面, 面泡]","[白泥碳, 泥碳酸, 碳酸潔, 酸潔面, 潔面泡]","[白泥碳酸, 泥碳酸潔, 碳酸潔面, 酸潔面泡]","[白泥碳酸潔, 泥碳酸潔面, 碳酸潔面泡]"
1,",潔廁劑原味",1,"[潔, 廁, 劑, 原, 味]","[潔廁, 廁劑, 劑原, 原味]","[潔廁劑, 廁劑原, 劑原味]","[潔廁劑原, 廁劑原味]",[潔廁劑原味]
2,",劇",1,[劇],[],[],[],[]
3,",喇叭牌,正露丸,粒",1,"[喇, 叭, 牌, 正, 露, 丸, 粒]","[喇叭, 叭牌, 正露, 露丸]","[喇叭牌, 正露丸]",[],[]
4,",歐萊雅",1,"[歐, 萊, 雅]","[歐萊, 萊雅]",[歐萊雅],[],[]
5,",蘭州歸脾丸",1,"[蘭, 州, 歸, 脾, 丸]","[蘭州, 州歸, 歸脾, 脾丸]","[蘭州歸, 州歸脾, 歸脾丸]","[蘭州歸脾, 州歸脾丸]",[蘭州歸脾丸]
6,",香港嶺南萬應筋健貼,科研榮譽出品,片包",1,"[香, 港, 嶺, 南, 萬, 應, 筋, 健, 貼, 科, 研, 榮, 譽, 出, 品, ...","[香港, 港嶺, 嶺南, 南萬, 萬應, 應筋, 筋健, 健貼, 科研, 研榮, 榮譽, 譽...","[香港嶺, 港嶺南, 嶺南萬, 南萬應, 萬應筋, 應筋健, 筋健貼, 科研榮, 研榮譽, ...","[香港嶺南, 港嶺南萬, 嶺南萬應, 南萬應筋, 萬應筋健, 應筋健貼, 科研榮譽, 研榮譽...","[香港嶺南萬, 港嶺南萬應, 嶺南萬應筋, 南萬應筋健, 萬應筋健貼, 科研榮譽出, 研榮譽出品]"
7,",菲滋寶咀嚼片",1,"[菲, 滋, 寶, 咀, 嚼, 片]","[菲滋, 滋寶, 寶咀, 咀嚼, 嚼片]","[菲滋寶, 滋寶咀, 寶咀嚼, 咀嚼片]","[菲滋寶咀, 滋寶咀嚼, 寶咀嚼片]","[菲滋寶咀嚼, 滋寶咀嚼片]"
8,",阿葵亞瞬效水光髪膜",1,"[阿, 葵, 亞, 瞬, 效, 水, 光, 髪, 膜]","[阿葵, 葵亞, 亞瞬, 瞬效, 效水, 水光, 光髪, 髪膜]","[阿葵亞, 葵亞瞬, 亞瞬效, 瞬效水, 效水光, 水光髪, 光髪膜]","[阿葵亞瞬, 葵亞瞬效, 亞瞬效水, 瞬效水光, 效水光髪, 水光髪膜]","[阿葵亞瞬效, 葵亞瞬效水, 亞瞬效水光, 瞬效水光髪, 效水光髪膜]"
9,",益生菌",4,"[益, 生, 菌]","[益生, 生菌]",[益生菌],[],[]


In [28]:
unigram_df = df[["1_gram", "count"]].explode("1_gram").groupby("1_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
unigram_df.to_csv("./n_gram_processed/1_gram.csv", index=False)
unigram_df

Unnamed: 0,1_gram,count
0,水,101199
1,牙,100323
2,巾,88083
3,生,85063
4,膏,82376
...,...,...
5721,敪,1
5722,齙,1
5723,齚,1
5724,齜,1


In [29]:
bigram_df = df[["2_gram", "count"]].explode("2_gram").groupby("2_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
bigram_df.to_csv("./n_gram_processed/2_gram.csv", index=False)
bigram_df

Unnamed: 0,2_gram,count
0,口罩,52030
1,紙巾,49981
2,牙膏,40901
3,濕紙,33358
4,防曬,30397
...,...,...
95190,一智,1
95191,一宿,1
95192,一小,1
95193,奶酸,1


In [30]:
trigram_df = df[["3_gram", "count"]].explode("3_gram").groupby("3_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
trigram_df.to_csv("./n_gram_processed/3_gram.csv", index=False)
trigram_df

Unnamed: 0,3_gram,count
0,濕紙巾,32201
1,益生菌,25382
2,維他命,22148
3,衛生巾,20403
4,洗頭水,17253
...,...,...
143709,龟鹿二,1
143710,砂布紗,1
143711,砂平胃,1
143712,砂正胃,1


In [31]:
fourgram_df = df[["4_gram", "count"]].explode("4_gram").groupby("4_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
fourgram_df.to_csv("./n_gram_processed/4_gram.csv", index=False)
fourgram_df

Unnamed: 0,4_gram,count
0,曼秀雷敦,8187
1,美素佳兒,6999
2,個人護理,6753
3,電動牙刷,5637
4,幸福傷風,5406
...,...,...
130216,龍角散龍,1
130217,龍角護髪,1
130218,龍造型芝,1
130219,龍隆巴斯,1


In [32]:
fivegram_df = df[["5_gram", "count"]].explode("5_gram").groupby("5_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
fivegram_df.to_csv("./n_gram_processed/5_gram.csv", index=False)
fivegram_df

Unnamed: 0,5_gram,count
0,舒特膚透亮,4396
1,京都念慈菴,3669
2,幸福傷風素,2916
3,家品及飲食,2672
4,高露潔牙膏,2508
...,...,...
100450,瑰緻柔沐浴,1
100451,瑰胎盤素膠,1
100452,瑰胶原小熊,1
100453,瑰胺基酸潔,1


In [67]:
def find_matches(term, ngram, n):
    def matches_position(term, text):
        pattern = re.compile(term)

        position = pattern.search(text)
        if position:
            return position.span()[0]
        return None
    temp_df = ngram[ngram.iloc[:, 1] > 70].copy()
    temp_df["matches"] = temp_df.iloc[:, 0].map(lambda x: matches_position(term, x))  # 100 → 12.8s
    return {row.iloc[0]: [row.iloc[1], row.iloc[2]] for _, row in temp_df[~(temp_df["matches"].isna())].iterrows()}  #  if row.iloc[2] == n-1

In [68]:
compare_df = unigram_df.copy()

In [69]:
compare_df["compare_to_2gram"] = compare_df["1_gram"].apply(lambda x: find_matches(x, bigram_df, 2))
compare_df["compare_to_3gram"] = compare_df["1_gram"].apply(lambda x: find_matches(x, trigram_df, 3))
compare_df["compare_to_4gram"] = compare_df["1_gram"].apply(lambda x: find_matches(x, fourgram_df, 4))
compare_df["compare_to_5gram"] = compare_df["1_gram"].apply(lambda x: find_matches(x, fivegram_df, 5))

In [70]:
compare_df

Unnamed: 0,1_gram,count,compare_to_2gram,compare_to_3gram,compare_to_4gram,compare_to_5gram
0,水,101199,"{'頭水': [17353, 1.0], '口水': [10471, 1.0], '藥水':...","{'洗頭水': [17253, 2.0], '漱口水': [8673, 2.0], '眼藥水...","{'生理鹽水': [1735, 3.0], '濾水器水': [1212, 1.0], '水器...","{'濾水器水機': [1210, 1.0], '形眼鏡藥水': [1008, 4.0], '..."
1,牙,100323,"{'牙膏': [40901, 0.0], '牙刷': [22549, 0.0], '牙線':...","{'動牙刷': [5912, 1.0], '電動牙': [5859, 2.0], '牙線棒'...","{'電動牙刷': [5637, 2.0], '高露潔牙': [3417, 3.0], '露潔...","{'高露潔牙膏': [2508, 3.0], '舒適達牙膏': [1476, 3.0], '..."
2,巾,88083,"{'紙巾': [49981, 1.0], '生巾': [23547, 1.0], '濕巾':...","{'濕紙巾': [32201, 2.0], '衛生巾': [20403, 2.0], '衞生...","{'兒濕紙巾': [2563, 3.0], '體衛生巾': [2461, 3.0], '兒柔...","{'嬰兒濕紙巾': [2503, 4.0], '液體衛生巾': [2460, 4.0], '..."
3,生,85063,"{'益生': [27901, 1.0], '衛生': [25857, 1.0], '生菌':...","{'益生菌': [25382, 1.0], '衛生巾': [20403, 1.0], '余仁...","{'液體衛生': [2683, 3.0], '體衛生巾': [2461, 2.0], '生理...","{'液體衛生巾': [2460, 3.0], '念慈菴養生': [1414, 4.0], '..."
4,膏,82376,"{'牙膏': [40901, 1.0], '唇膏': [11582, 1.0], '藥膏':...","{'潤唇膏': [6897, 2.0], '潔牙膏': [2772, 2.0], '枇杷膏'...","{'露潔牙膏': [2508, 3.0], '黑人牙膏': [1477, 3.0], '適達...","{'高露潔牙膏': [2508, 4.0], '舒適達牙膏': [1476, 4.0], '..."
...,...,...,...,...,...,...
5721,敪,1,{},{},{},{}
5722,齙,1,{},{},{},{}
5723,齚,1,{},{},{},{}
5724,齜,1,{},{},{},{}


In [71]:
unigram_df[unigram_df["1_gram"].str.contains("秀")]

Unnamed: 0,1_gram,count
127,秀,12757


In [72]:
bigram_df[bigram_df["2_gram"].str.contains("秀敦")]

Unnamed: 0,2_gram,count
31214,秀敦,6
