In [3]:
import pandas as pd

In [4]:
# Load the raw search-keyword table and print its schema summary.
keyword_csv_path = "./sample_data/searchkeyword.csv"
df = pd.read_csv(keyword_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188359 entries, 0 to 188358
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   searchkeyword  188359 non-null  object
 1   count          188359 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.9+ MB


In [5]:
# Preview the first 20 rows of the raw data.
df.head(20)

Unnamed: 0,searchkeyword,count
0,\t GATSBY 白泥碳酸潔面泡,1
1,\t clorox 潔廁劑原味,1
2,\t 劇,1
3,\t 喇叭牌 正露丸 100粒,1
4,\t 歐萊雅,1
5,\t 蘭州歸脾丸,1
6,\t 香港嶺南萬應筋健貼【科研榮譽出品】3片包,1
7,\t菲滋寶咀嚼片,1
8,\t阿葵亞瞬效水光髪膜,1
9,\n益生菌,4


In [6]:
import re


def clean_text(text:str) -> str:
    """Reduce a raw keyword to its CJK character runs separated by single commas.

    The substitutions are applied in order, each replacing its match with ",":
      1. anything that looks like an HTML tag (``<...>``),
      2. every single character outside the range U+4E00–U+9FA5,
      3. every run of consecutive commas (collapsed into one).

    Leading/trailing separators are kept, e.g. ``"\\t 劇"`` becomes ``",劇"``.
    """
    separator = ','
    for pattern in (r'<.*?>', r'[^\u4e00-\u9fa5]', r",+"):
        text = re.sub(pattern, separator, text)
    return text

In [7]:
# Normalise every keyword in place with clean_text (element-wise).
df["searchkeyword"] = df["searchkeyword"].map(clean_text)

In [8]:
# Preview the first 20 rows after cleaning.
df.head(20)

Unnamed: 0,searchkeyword,count
0,",白泥碳酸潔面泡",1
1,",潔廁劑原味",1
2,",劇",1
3,",喇叭牌,正露丸,粒",1
4,",歐萊雅",1
5,",蘭州歸脾丸",1
6,",香港嶺南萬應筋健貼,科研榮譽出品,片包",1
7,",菲滋寶咀嚼片",1
8,",阿葵亞瞬效水光髪膜",1
9,",益生菌",4


In [9]:
def do_n_gram(doc: str, n: int=2) -> list[str]:
    """Return the length-`n` character windows of `doc`, in left-to-right order.

    Windows that contain the separator "," or the character "的" are skipped.
    An empty list is returned when `doc` is shorter than `n`.
    """
    grams = []
    for start in range(len(doc) - (n-1)):
        window = doc[start: start+n]
        if "," not in window and "的" not in window:
            grams.append(window)
    return grams

In [10]:
# Add one list-valued column per n-gram size: "1_gram" ... "5_gram".
for n in range(1, 6):
    df[f"{n}_gram"] = df["searchkeyword"].apply(do_n_gram, n=n)

In [11]:
# Preview the first 20 rows with the new n-gram columns.
df.head(20)

Unnamed: 0,searchkeyword,count,1_gram,2_gram,3_gram,4_gram,5_gram
0,",白泥碳酸潔面泡",1,"[白, 泥, 碳, 酸, 潔, 面, 泡]","[白泥, 泥碳, 碳酸, 酸潔, 潔面, 面泡]","[白泥碳, 泥碳酸, 碳酸潔, 酸潔面, 潔面泡]","[白泥碳酸, 泥碳酸潔, 碳酸潔面, 酸潔面泡]","[白泥碳酸潔, 泥碳酸潔面, 碳酸潔面泡]"
1,",潔廁劑原味",1,"[潔, 廁, 劑, 原, 味]","[潔廁, 廁劑, 劑原, 原味]","[潔廁劑, 廁劑原, 劑原味]","[潔廁劑原, 廁劑原味]",[潔廁劑原味]
2,",劇",1,[劇],[],[],[],[]
3,",喇叭牌,正露丸,粒",1,"[喇, 叭, 牌, 正, 露, 丸, 粒]","[喇叭, 叭牌, 正露, 露丸]","[喇叭牌, 正露丸]",[],[]
4,",歐萊雅",1,"[歐, 萊, 雅]","[歐萊, 萊雅]",[歐萊雅],[],[]
5,",蘭州歸脾丸",1,"[蘭, 州, 歸, 脾, 丸]","[蘭州, 州歸, 歸脾, 脾丸]","[蘭州歸, 州歸脾, 歸脾丸]","[蘭州歸脾, 州歸脾丸]",[蘭州歸脾丸]
6,",香港嶺南萬應筋健貼,科研榮譽出品,片包",1,"[香, 港, 嶺, 南, 萬, 應, 筋, 健, 貼, 科, 研, 榮, 譽, 出, 品, ...","[香港, 港嶺, 嶺南, 南萬, 萬應, 應筋, 筋健, 健貼, 科研, 研榮, 榮譽, 譽...","[香港嶺, 港嶺南, 嶺南萬, 南萬應, 萬應筋, 應筋健, 筋健貼, 科研榮, 研榮譽, ...","[香港嶺南, 港嶺南萬, 嶺南萬應, 南萬應筋, 萬應筋健, 應筋健貼, 科研榮譽, 研榮譽...","[香港嶺南萬, 港嶺南萬應, 嶺南萬應筋, 南萬應筋健, 萬應筋健貼, 科研榮譽出, 研榮譽出品]"
7,",菲滋寶咀嚼片",1,"[菲, 滋, 寶, 咀, 嚼, 片]","[菲滋, 滋寶, 寶咀, 咀嚼, 嚼片]","[菲滋寶, 滋寶咀, 寶咀嚼, 咀嚼片]","[菲滋寶咀, 滋寶咀嚼, 寶咀嚼片]","[菲滋寶咀嚼, 滋寶咀嚼片]"
8,",阿葵亞瞬效水光髪膜",1,"[阿, 葵, 亞, 瞬, 效, 水, 光, 髪, 膜]","[阿葵, 葵亞, 亞瞬, 瞬效, 效水, 水光, 光髪, 髪膜]","[阿葵亞, 葵亞瞬, 亞瞬效, 瞬效水, 效水光, 水光髪, 光髪膜]","[阿葵亞瞬, 葵亞瞬效, 亞瞬效水, 瞬效水光, 效水光髪, 水光髪膜]","[阿葵亞瞬效, 葵亞瞬效水, 亞瞬效水光, 瞬效水光髪, 效水光髪膜]"
9,",益生菌",4,"[益, 生, 菌]","[益生, 生菌]",[益生菌],[],[]


In [12]:
# Total search count per unigram, most frequent first; also written to CSV.
unigram_df = (
    df[["1_gram", "count"]]
    .explode("1_gram")                      # one row per (keyword, unigram)
    .groupby("1_gram", as_index=False)
    .agg({"count": "sum"})                  # sum the keyword counts per unigram
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
unigram_df.to_csv("./n_gram_processed/1_gram.csv", index=False)
unigram_df

Unnamed: 0,1_gram,count
0,水,101199
1,牙,100323
2,巾,88083
3,生,85063
4,膏,82376
...,...,...
5721,敪,1
5722,齙,1
5723,齚,1
5724,齜,1


In [13]:
# Total search count per bigram, most frequent first; also written to CSV.
bigram_df = (
    df[["2_gram", "count"]]
    .explode("2_gram")                      # one row per (keyword, bigram)
    .groupby("2_gram", as_index=False)
    .agg({"count": "sum"})                  # sum the keyword counts per bigram
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
bigram_df.to_csv("./n_gram_processed/2_gram.csv", index=False)
bigram_df

Unnamed: 0,2_gram,count
0,口罩,52030
1,紙巾,49981
2,牙膏,40901
3,濕紙,33358
4,防曬,30397
...,...,...
95190,一智,1
95191,一宿,1
95192,一小,1
95193,奶酸,1


In [14]:
# Total search count per trigram, most frequent first; also written to CSV.
trigram_df = (
    df[["3_gram", "count"]]
    .explode("3_gram")                      # one row per (keyword, trigram)
    .groupby("3_gram", as_index=False)
    .agg({"count": "sum"})                  # sum the keyword counts per trigram
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
trigram_df.to_csv("./n_gram_processed/3_gram.csv", index=False)
trigram_df

Unnamed: 0,3_gram,count
0,濕紙巾,32201
1,益生菌,25382
2,維他命,22148
3,衛生巾,20403
4,洗頭水,17253
...,...,...
143709,龟鹿二,1
143710,砂布紗,1
143711,砂平胃,1
143712,砂正胃,1


In [15]:
# Total search count per 4-gram, most frequent first; also written to CSV.
fourgram_df = (
    df[["4_gram", "count"]]
    .explode("4_gram")                      # one row per (keyword, 4-gram)
    .groupby("4_gram", as_index=False)
    .agg({"count": "sum"})                  # sum the keyword counts per 4-gram
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
fourgram_df.to_csv("./n_gram_processed/4_gram.csv", index=False)
fourgram_df

Unnamed: 0,4_gram,count
0,曼秀雷敦,8187
1,美素佳兒,6999
2,個人護理,6753
3,電動牙刷,5637
4,幸福傷風,5406
...,...,...
130216,龍角散龍,1
130217,龍角護髪,1
130218,龍造型芝,1
130219,龍隆巴斯,1


In [16]:
# Total search count per 5-gram, most frequent first; also written to CSV.
fivegram_df = (
    df[["5_gram", "count"]]
    .explode("5_gram")                      # one row per (keyword, 5-gram)
    .groupby("5_gram", as_index=False)
    .agg({"count": "sum"})                  # sum the keyword counts per 5-gram
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
fivegram_df.to_csv("./n_gram_processed/5_gram.csv", index=False)
fivegram_df

Unnamed: 0,5_gram,count
0,舒特膚透亮,4396
1,京都念慈菴,3669
2,幸福傷風素,2916
3,家品及飲食,2672
4,高露潔牙膏,2508
...,...,...
100450,瑰緻柔沐浴,1
100451,瑰胎盤素膠,1
100452,瑰胶原小熊,1
100453,瑰胺基酸潔,1


In [57]:
from difflib import SequenceMatcher


def similar(a, b):
    """Return the difflib SequenceMatcher similarity ratio of `a` and `b`.

    The ratio is a float between 0.0 (nothing in common) and 1.0 (identical).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()


def correct_word(word, dictionary):
    """Find the entry of `dictionary` most similar to `word`.

    Returns the pair ``(similarity, entry)`` for the best-scoring entry; on ties
    the earliest entry wins. No similarity threshold is applied, so the caller
    always gets the closest entry even when the score is low. Raises ValueError
    if `dictionary` is empty.
    """
    scored = ((similar(word, candidate), candidate) for candidate in dictionary)
    return max(scored, key=lambda pair: pair[0])

In [58]:
# Load the n-gram summary and keep only the rows whose `retain` flag is True.
product_list = pd.read_csv("./sample_data/summary_sorted.csv").loc[lambda frame: frame["retain"] == True]
product_list

Unnamed: 0,n_gram,retain
0,肌膚,True
1,配方,True
2,保濕,True
3,使用,True
4,精華,True
...,...,...
570,均勻膚色,True
571,初生嬰兒,True
572,方便攜帶,True
573,有效去除,True


In [59]:
# Trigrams searched more than 300 times, each paired with its closest product_list entry.
known_terms = product_list["n_gram"]
find_similar_df = trigram_df[trigram_df["count"] > 300].copy()

# Each cell holds a one-element list containing the (similarity, entry) pair from correct_word.
find_similar_df["similar"] = find_similar_df["3_gram"].map(lambda gram: [correct_word(gram, known_terms)])

In [61]:
# Inspect rows 60-89 (by position) of the similarity results.
find_similar_df.iloc[60:90]

Unnamed: 0,3_gram,count,similar
60,洗衣液,3388,"[(0.8, 洗衣)]"
61,學習褲,3369,"[(0.8, 學習)]"
62,黄道益,3316,"[(0.4, 益生)]"
63,原蛋白,3289,"[(0.8571428571428571, 膠原蛋白)]"
64,李施德,3141,"[(0.0, 肌膚)]"
65,傷風素,3089,"[(0.4, 色素)]"
66,施德林,3087,"[(0.0, 肌膚)]"
67,葡萄糖,3033,"[(1.0, 葡萄糖)]"
68,蛋白粉,3029,"[(0.8, 蛋白)]"
69,化痰素,3026,"[(0.4, 色素)]"


In [None]:
# Scratch notes; this cell has no execution count in the notebook.
text = "洗衣液"

# NOTE(review): presumably meant as a list of n-gram forms related to `text`
# (its two sub-bigrams, and longer strings that end or start with it) — confirm.
# As written, Python joins these adjacent string literals into ONE string.
"洗衣"  "衣液"  "xx洗衣液"  "洗衣液xx"

### 計算句子出現的機率 (Bigram Model)

In [83]:
def find_matches(term, ngram, min_count=70):
    """Find the frequent n-grams that contain `term` as a literal substring.

    Parameters
    ----------
    term : str
        Text to look for. It is matched literally, so characters that are
        special in regular expressions ("+", "(", "." ...) are safe.
    ngram : pandas.DataFrame
        Frame whose first column holds the n-gram text and whose second column
        holds its count (columns are addressed by position, not by name).
    min_count : int, default 70
        Only n-grams whose count is strictly greater than this are considered.

    Returns
    -------
    dict
        ``{ngram_text: [count, position]}`` where `position` is the index of the
        first occurrence of `term` inside `ngram_text`. Empty when nothing matches.
    """
    def first_position(text):
        # Non-string cells (e.g. NaN) can never contain the term.
        if not isinstance(text, str):
            return None
        found_at = text.find(term)
        return found_at if found_at >= 0 else None

    frequent = ngram[ngram.iloc[:, 1] > min_count]
    # None marks "no match"; pandas turns it into a missing value in the result.
    positions = frequent.iloc[:, 0].map(first_position)
    found = positions.notna()
    matched = frequent.loc[found]
    return {
        text: [count, position]
        for text, count, position in zip(matched.iloc[:, 0], matched.iloc[:, 1], positions[found])
    }

In [85]:
input_text = "有基嬰兒奶粉"  # another sample query: 曼秀雷吞 (presumably a misspelling of 曼秀雷敦)

# One row per character: the character itself and the bigram that starts at it.
# The last character starts no bigram, so the bigram list is padded with None.
# NOTE: do_n_gram drops windows containing "," or "的", so the two lists only
# have matching lengths when input_text contains neither.
input_unigram = do_n_gram(input_text, n=1)
input_bigram = [*do_n_gram(input_text, n=2), None]

autocorrect_dict = dict(unigram=input_unigram, bigram=input_bigram)

autocorrect_df = pd.DataFrame(autocorrect_dict)
autocorrect_df

Unnamed: 0,unigram,bigram
0,有,有基
1,基,基嬰
2,嬰,嬰兒
3,兒,兒奶
4,奶,奶粉
5,粉,


In [87]:
# Corpus counts for every unigram / bigram of the input; 0 when the n-gram was
# never seen (this includes the None padding in the last bigram row).
# Dict lookups replace a full-column boolean scan per token. The n-gram keys are
# unique because unigram_df / bigram_df come out of a groupby.
unigram_lookup = dict(zip(unigram_df["1_gram"], unigram_df["count"]))
bigram_lookup = dict(zip(bigram_df["2_gram"], bigram_df["count"]))
autocorrect_df["unigram_counts"] = autocorrect_df["unigram"].map(lambda gram: unigram_lookup.get(gram, 0))
autocorrect_df["bigram_counts"] = autocorrect_df["bigram"].map(lambda gram: bigram_lookup.get(gram, 0))

# prob = count(bigram) / count(its first character).
# An unseen first character would give 0/0 = NaN, and NaN never compares as
# "< threshold"; treat it as probability 0 so such rows can still be flagged.
autocorrect_df["prob"] = (
    autocorrect_df["bigram_counts"] / autocorrect_df["unigram_counts"]
).where(autocorrect_df["unigram_counts"] > 0, 0.0)
autocorrect_df

Unnamed: 0,unigram,bigram,unigram_counts,bigram_counts,prob
0,有,有基,2785,0,0.0
1,基,基嬰,1390,0,0.0
2,嬰,嬰兒,17947,13967,0.778236
3,兒,兒奶,39593,1436,0.036269
4,奶,奶粉,33415,20652,0.618046
5,粉,,39982,0,0.0


In [88]:
threshold = 0.00017

# Flag rows whose bigram probability is below the threshold, skipping the row
# whose bigram is missing (the padded last row). For each flagged character,
# collect the frequent bigrams that contain it.
needs_correction = (autocorrect_df["prob"] < threshold) & autocorrect_df["bigram"].notna()
autocorrect_df["compare"] = autocorrect_df.loc[needs_correction, "unigram"].apply(lambda char: find_matches(char, bigram_df))
autocorrect_df

Unnamed: 0,unigram,bigram,unigram_counts,bigram_counts,prob,compare
0,有,有基,2785,0,0.0,"{'有機': [1883, 0.0], '有色': [278, 0.0], '有机': [1..."
1,基,基嬰,1390,0,0.0,"{'氨基': [807, 1.0], '基酸': [767, 0.0], '胺基': [21..."
2,嬰,嬰兒,17947,13967,0.778236,
3,兒,兒奶,39593,1436,0.036269,
4,奶,奶粉,33415,20652,0.618046,
5,粉,,39982,0,0.0,


In [89]:
def _as_items(cell):
    """Turn a dict into its items view; leave anything else (e.g. NaN) untouched."""
    return cell.items() if isinstance(cell, dict) else cell


def _pair_key(cell):
    """First element of an exploded (bigram, [count, position]) tuple; passthrough otherwise."""
    return cell[0] if isinstance(cell, tuple) else cell


def _pair_count(cell):
    """Count stored in an exploded (bigram, [count, position]) tuple; passthrough otherwise."""
    return cell[1][0] if isinstance(cell, tuple) else cell


# Make each candidate dict explodable as (bigram, [count, position]) pairs.
autocorrect_df["compare"] = autocorrect_df["compare"].map(_as_items)

# One row per (input character, candidate bigram); rows without candidates stay as NaN.
expand_df = autocorrect_df[["unigram", "compare"]].explode("compare")
expand_df["related_bigram"] = expand_df["compare"].map(_pair_key)
expand_df["related_bigram_count"] = expand_df["compare"].map(_pair_count)
expand_df

Unnamed: 0,unigram,compare,related_bigram,related_bigram_count
0,有,"(有機, [1883, 0.0])",有機,1883.0
0,有,"(有色, [278, 0.0])",有色,278.0
0,有,"(有机, [164, 0.0])",有机,164.0
0,有,"(濃有, [142, 1.0])",濃有,142.0
0,有,"(兒有, [96, 1.0])",兒有,96.0
0,有,"(擁有, [94, 1.0])",擁有,94.0
0,有,"(家有, [71, 1.0])",家有,71.0
1,基,"(氨基, [807, 1.0])",氨基,807.0
1,基,"(基酸, [767, 0.0])",基酸,767.0
1,基,"(胺基, [210, 1.0])",胺基,210.0


In [90]:
import pypinyin

In [91]:
# Bigrams of the rows that received correction candidates, joined into one pinyin string.
has_candidates = autocorrect_df["compare"].notna()
bigram_need_correct = autocorrect_df.loc[has_candidates, "bigram"].values
text_need_correct = "".join(pypinyin.lazy_pinyin(bigram_need_correct))
print(text_need_correct)

youjijiying


In [92]:
def _to_pinyin(value):
    """Join the lazy_pinyin syllables of a string; non-strings (NaN) pass through unchanged."""
    if not isinstance(value, str):
        return value
    return "".join(pypinyin.lazy_pinyin(value))


# Pinyin spelling of every candidate bigram.
expand_df["pinyin"] = expand_df["related_bigram"].map(_to_pinyin)
expand_df

Unnamed: 0,unigram,compare,related_bigram,related_bigram_count,pinyin
0,有,"(有機, [1883, 0.0])",有機,1883.0,youji
0,有,"(有色, [278, 0.0])",有色,278.0,youse
0,有,"(有机, [164, 0.0])",有机,164.0,youji
0,有,"(濃有, [142, 1.0])",濃有,142.0,nongyou
0,有,"(兒有, [96, 1.0])",兒有,96.0,eryou
0,有,"(擁有, [94, 1.0])",擁有,94.0,yongyou
0,有,"(家有, [71, 1.0])",家有,71.0,jiayou
1,基,"(氨基, [807, 1.0])",氨基,807.0,anji
1,基,"(基酸, [767, 0.0])",基酸,767.0,jisuan
1,基,"(胺基, [210, 1.0])",胺基,210.0,anji


In [93]:
import difflib

# Candidate pinyin strings that difflib considers close to the pinyin of the flagged input.
pinyin_candidates = expand_df["pinyin"].dropna()
close_matches = difflib.get_close_matches(text_need_correct, pinyin_candidates)

# Mark, then show, the candidate rows whose pinyin is among the close matches.
expand_df["in_close_matches"] = expand_df["pinyin"].map(lambda value: isinstance(value, str) and value in close_matches)
expand_df[expand_df["in_close_matches"]]

Unnamed: 0,unigram,compare,related_bigram,related_bigram_count,pinyin,in_close_matches
0,有,"(有機, [1883, 0.0])",有機,1883.0,youji,True
0,有,"(有机, [164, 0.0])",有机,164.0,youji,True
1,基,"(基因, [207, 0.0])",基因,207.0,jiyin,True
