In [21]:
import pandas as pd

In [22]:
# Load the raw search-keyword table (columns: searchkeyword, count) and
# print its schema / null counts as a quick sanity check.
df = pd.read_csv("./sample_data/searchkeyword.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188359 entries, 0 to 188358
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   searchkeyword  188359 non-null  object
 1   count          188359 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.9+ MB


In [23]:
# df["searchkeyword"] = df["searchkeyword"].str.replace("\t", "").str.replace("\n", "").str.replace(" ", "").str.replace("-", "").str.replace("‘", "").str.strip()
# df

In [24]:
import re


def clean_text(text:str) -> str:
    """Return ``text`` with every character outside U+4E00..U+9FA5 removed.

    Whitespace, punctuation, digits, Latin letters and any other character
    that is not in that range are dropped; the remaining characters keep
    their original order.
    """
    # Splitting on each unwanted character and gluing the pieces back
    # together leaves only the characters inside the range.
    kept_pieces = re.split(r'[^\u4e00-\u9fa5]', text)
    return ''.join(kept_pieces)

In [25]:
# Strip everything except the characters kept by clean_text from each keyword.
df["searchkeyword"] = df["searchkeyword"].map(clean_text)
df

Unnamed: 0,searchkeyword,count
0,白泥碳酸潔面泡,1
1,潔廁劑原味,1
2,劇,1
3,喇叭牌正露丸粒,1
4,歐萊雅,1
...,...,...
188354,芝孢子,8
188355,芝抱子,4
188356,芝苞子,3
188357,燒鵝,1


In [26]:
# Keep only keywords whose cleaned length is 3, 4 or 5 characters.
filtered_df = df.loc[df["searchkeyword"].str.len().between(3, 5)]
filtered_df

Unnamed: 0,searchkeyword,count
1,潔廁劑原味,1
4,歐萊雅,1
5,蘭州歸脾丸,1
9,益生菌,4
14,日本命力,1
...,...,...
188343,生骨骼,1
188349,適達牙膏,1
188354,芝孢子,8
188355,芝抱子,4


In [27]:
# Keywords that became identical after cleaning are merged by summing their
# counts; the result is ranked from the highest to the lowest total.
filtered_df = (
    filtered_df
    .groupby("searchkeyword", as_index=False)
    .agg({"count": "sum"})
    .sort_values(by="count", ascending=False)
    .reset_index(drop=True)
)

In [28]:
# Plain Python lists of the ranked keywords and of their summed counts.
all_keywords, all_count = (
    filtered_df[column].tolist() for column in ("searchkeyword", "count")
)

In [29]:
# filtered_df['compare'] = filtered_df['searchkeyword'].agg(lambda x: all_keywords)
# filtered_df

In [30]:
def generate_regex_patterns(keyword):
    """Build one single-wildcard regex pattern per character of ``keyword``.

    Pattern ``i`` is ``keyword`` with the character at index ``i`` replaced
    by ``.`` (any character). The characters that are kept are escaped with
    ``re.escape`` so that regex metacharacters in the keyword are matched
    literally instead of changing the meaning of the pattern (or making it
    invalid).

    Args:
        keyword: The string to derive the patterns from.

    Returns:
        A list with ``len(keyword)`` pattern strings; empty for ``""``.
    """
    # Escape once per character so each pattern can be assembled by slicing.
    escaped_chars = [re.escape(char) for char in keyword]
    return [
        # Replace the character at this position with the '.' wildcard.
        "".join(escaped_chars[:i] + ["."] + escaped_chars[i + 1:])
        for i in range(len(escaped_chars))
    ]

In [31]:
# Attach, for every keyword, its list of single-wildcard patterns.
filtered_df["regex_pattern"] = filtered_df["searchkeyword"].map(generate_regex_patterns)
filtered_df

Unnamed: 0,searchkeyword,count,regex_pattern
0,濕紙巾,22859,"[.紙巾, 濕.巾, 濕紙.]"
1,益生菌,18134,"[.生菌, 益.菌, 益生.]"
2,衛生巾,14198,"[.生巾, 衛.巾, 衛生.]"
3,維他命,14080,"[.他命, 維.命, 維他.]"
4,洗頭水,11657,"[.頭水, 洗.水, 洗頭.]"
...,...,...,...
68660,一口唇,1,"[.口唇, 一.唇, 一口.]"
68661,一口清熱,1,"[.口清熱, 一.清熱, 一口.熱, 一口清.]"
68662,一口降堂,1,"[.口降堂, 一.降堂, 一口.堂, 一口降.]"
68663,一土大尸,1,"[.土大尸, 一.大尸, 一土.尸, 一土大.]"


In [32]:
# def need_correct(searchkeyword, keyword_list):
#     def comparision(searchkeyword, keyword):
#         if searchkeyword == keyword:
#             return keyword
#     need_correct_word = keyword_list.apply(lambda x: comparision(searchkeyword, x))
#     return [row for row in need_correct_word[~(need_correct_word.isna())].T.items() if row]

In [33]:
def search_with_regex_patterns(keyword, lst):
    """Find the words in ``lst`` that differ from ``keyword`` in exactly one character.

    This gives the same matches as testing every word against the
    single-wildcard patterns of ``keyword`` (``.bc``, ``a.c``, ``ab.`` for
    ``abc``), but characters are compared directly instead of through
    compiled regular expressions. Keywords containing regex metacharacters
    are therefore handled literally, and ``lst`` is scanned once per call
    rather than once per character position.

    Args:
        keyword: The reference keyword.
        lst: Iterable of candidate words (strings).

    Returns:
        A list of the matching words, grouped by the position at which they
        differ from ``keyword`` (position 0 first) and, inside each group, in
        the order they occur in ``lst``. ``None`` when there is no match.
    """
    target_len = len(keyword)
    # One bucket per wildcard position keeps the pattern-by-pattern ordering
    # of the results.
    buckets = [[] for _ in range(target_len)]
    for word in lst:
        # Only same-length words other than the keyword itself are candidates.
        if len(word) != target_len or word == keyword:
            continue
        diff_positions = [i for i in range(target_len) if word[i] != keyword[i]]
        if len(diff_positions) == 1:
            buckets[diff_positions[0]].append(word)
    results = [word for bucket in buckets for word in bucket]
    return results if results else None

In [36]:
# For every keyword, collect the same-length keywords found by
# search_with_regex_patterns, then persist the whole table.
filtered_df["need_correct"] = filtered_df["searchkeyword"].apply(
    lambda keyword: search_with_regex_patterns(keyword, all_keywords)
)
filtered_df.to_csv("./n_gram_processed/need_correct.csv", index=False)

In [37]:
# Preview the first 20 rows (the highest-count keywords).
filtered_df.head(20)

Unnamed: 0,searchkeyword,count,regex_pattern,need_correct
0,濕紙巾,22859,"[.紙巾, 濕.巾, 濕紙.]","[竹紙巾, 乾紙巾, 面紙巾, 手紙巾, 涼紙巾, 潔紙巾, 座紙巾, 抹紙巾, 鼻紙巾, ..."
1,益生菌,18134,"[.生菌, 益.菌, 益生.]","[億生菌, 溢生菌, 醫生菌, 乳生菌, 維生菌, 盒生菌, 盆生菌, 鱼生菌, 易生菌, ..."
2,衛生巾,14198,"[.生巾, 衛.巾, 衛生.]","[衞生巾, 卫生巾, 維生巾, 護生巾, 強生巾, 街生巾, 偉生巾, 徫生巾, 衙生巾, ..."
3,維他命,14080,"[.他命, 維.命, 維他.]","[维他命, 錐他命, 為他命, 源他命, 誰他命, 唯他命, 鐘他命, 維它命, 維化命, ..."
4,洗頭水,11657,"[.頭水, 洗.水, 洗頭.]","[冼頭水, 清頭水, 黑頭水, 淘頭水, 港頭水, 新頭水, 诜頭水, 添頭水, 洗髮水, ..."
5,高露潔,10165,"[.露潔, 高.潔, 高露.]","[牙露潔, 滴露潔, 膏露潔, 追露潔, 商露潔, 咁露潔, 高路潔, 高絲潔, 高樂潔, ..."
6,必理痛,9569,"[.理痛, 必.痛, 必理.]","[比理痛, 心理痛, 勿理痛, 散理痛, 速理痛, 不理痛, 病理痛, 生理痛, 理理痛, ..."
7,防脫髮,9020,"[.脫髮, 防.髮, 防脫.]","[呂脫髮, 放脫髮, 防掉髮, 防斷髮, 防脱髮, 防白髮, 防甩髮, 防脫髪, 防脫色, ..."
8,沐浴露,8100,"[.浴露, 沐.露, 沐浴.]","[淋浴露, 沫浴露, 洗浴露, 沭浴露, 浴浴露, 木浴露, 滌浴露, 泍浴露, 沐髮露, ..."
9,樂而雅,7996,"[.而雅, 樂.雅, 樂而.]","[乐而雅, 槳而雅, 楽而雅, 優而雅, 幾而雅, 潔而雅, 安而雅, 嫻而雅, 好而雅, ..."


In [47]:
# One row per (keyword, candidate) pair; keywords with no candidates are dropped.
expand_df = (
    filtered_df
    .loc[filtered_df["need_correct"].notna(), ["searchkeyword", "count", "need_correct"]]
    .explode("need_correct")
    .reset_index(drop=True)
)
expand_df

Unnamed: 0,searchkeyword,count,need_correct
0,濕紙巾,22859,竹紙巾
1,濕紙巾,22859,乾紙巾
2,濕紙巾,22859,面紙巾
3,濕紙巾,22859,手紙巾
4,濕紙巾,22859,涼紙巾
...,...,...,...
273501,一分钟焗油,1,一分鈡焗油
273502,一口唇,1,一口清
273503,一口唇,1,一口棧
273504,一口唇,1,一口廿


In [48]:
# Lookup table of keyword -> count, keyed under the name "need_correct" so it
# can be joined on that column.
filtered_count_df = filtered_df[["searchkeyword", "count"]].rename(
    columns={"searchkeyword": "need_correct"}
)

In [50]:
# Attach each candidate's own count; pandas suffixes the two count columns
# as count_x (keyword) and count_y (candidate).
expand_df = pd.merge(expand_df, filtered_count_df, on="need_correct", how="inner")
expand_df

Unnamed: 0,searchkeyword,count_x,need_correct,count_y
0,濕紙巾,22859,竹紙巾,149
1,濕紙巾,22859,乾紙巾,17
2,濕紙巾,22859,面紙巾,16
3,濕紙巾,22859,手紙巾,11
4,濕紙巾,22859,涼紙巾,9
...,...,...,...,...
273501,一分钟焗油,1,一分鈡焗油,1
273502,一口唇,1,一口清,6
273503,一口唇,1,一口棧,3
273504,一口唇,1,一口廿,2


In [55]:
# Rows whose keyword starts with "濕紙巾" (prefix match, so longer keywords
# beginning with it are included as well).
expand_df.loc[expand_df["searchkeyword"].str.startswith("濕紙巾")]

Unnamed: 0,searchkeyword,count_x,need_correct,count_y
0,濕紙巾,22859,竹紙巾,149
1,濕紙巾,22859,乾紙巾,17
2,濕紙巾,22859,面紙巾,16
3,濕紙巾,22859,手紙巾,11
4,濕紙巾,22859,涼紙巾,9
...,...,...,...,...
182700,濕紙巾包,1,濕紙巾臉,1
182701,濕紙巾包,1,濕紙巾水,1
182702,濕紙巾包,1,濕紙巾猴,1
182703,濕紙巾包,1,濕紙巾口,1
