In [1]:
import pandas as pd

In [2]:
sample_df = pd.read_csv("./sample_data/product_name_sample_data.csv")
sample_df.head()

Unnamed: 0,product_name,description
0,草本花香洗髮露 600毫升,揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽，氛芳花香，令你彷如置身大...
1,Voost 運動水樽,VOOST MUG
2,完美遮瑕筆306 (1.5ml),質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight 使用能提亮妝容<BR><BR>獨...
3,高效防脫增生洗髮液 150毫升,ANTI HAIR LOSS SHAMP
4,ISOTONIC 運動水溶片青檸檬味十片裝,幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水份與電解質，促進神經肌肉傳導; 有助維持肌...


In [3]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32372 entries, 0 to 32371
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  32361 non-null  object
 1   description   31501 non-null  object
dtypes: object(2)
memory usage: 505.9+ KB


In [4]:
sample_df = sample_df.fillna("*")

In [5]:
product_name_df = sample_df[["product_name"]]
description_df = sample_df[["description"]]

product_name_df.columns = ["doc"]
description_df.columns = ["doc"]

long_df = pd.concat([product_name_df, description_df])
long_df.head()

Unnamed: 0,doc
0,草本花香洗髮露 600毫升
1,Voost 運動水樽
2,完美遮瑕筆306 (1.5ml)
3,高效防脫增生洗髮液 150毫升
4,ISOTONIC 運動水溶片青檸檬味十片裝


In [6]:
long_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64744 entries, 0 to 32371
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doc     64744 non-null  object
dtypes: object(1)
memory usage: 1011.6+ KB


In [7]:
import re


def clean_text(text:str) -> str:
    # 去除 HTML 標籤
    text = re.sub(r'<.*?>', ',', text)
    # 去除非字母字符以及英文
    text = re.sub(r'[^\u4e00-\u9fa5]', ',', text)
    text = re.sub(r",+", ",", text)

    return text

In [8]:
long_df["doc"] = long_df["doc"].apply(clean_text)
long_df.iloc[:20]

Unnamed: 0,doc
0,"草本花香洗髮露,毫升"
1,",運動水樽"
2,"完美遮瑕筆,"
3,"高效防脫增生洗髮液,毫升"
4,",運動水溶片青檸檬味十片裝"
5,強生便利貼公主膠布
6,"牛油果滋養修護晚霜,毫升"
7,",麥蘆卡蜂蜜,克,"
8,"光采豐蜜唇釉,"
9,",金盞花面霜"


In [9]:
# def do_n_gram(doc: str, n: int=2) -> list[str]:
#     pattern = re.compile(",")
#     return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not (bool(pattern.search(doc[i: i+n])))]


def do_n_gram(doc: str, n: int=2) -> list[str]:
    return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not ("," in doc[i: i+n] or "的" in doc[i: i+n])]

In [10]:
long_df["count"] = 1

In [11]:
for i in range(2, 6):
    long_df[f"{i}_gram"] = long_df["doc"].apply(lambda x: do_n_gram(x, i))

In [12]:
long_df.iloc[:20]

Unnamed: 0,doc,count,2_gram,3_gram,4_gram,5_gram
0,"草本花香洗髮露,毫升",1,"[草本, 本花, 花香, 香洗, 洗髮, 髮露, 毫升]","[草本花, 本花香, 花香洗, 香洗髮, 洗髮露]","[草本花香, 本花香洗, 花香洗髮, 香洗髮露]","[草本花香洗, 本花香洗髮, 花香洗髮露]"
1,",運動水樽",1,"[運動, 動水, 水樽]","[運動水, 動水樽]",[運動水樽],[]
2,"完美遮瑕筆,",1,"[完美, 美遮, 遮瑕, 瑕筆]","[完美遮, 美遮瑕, 遮瑕筆]","[完美遮瑕, 美遮瑕筆]",[完美遮瑕筆]
3,"高效防脫增生洗髮液,毫升",1,"[高效, 效防, 防脫, 脫增, 增生, 生洗, 洗髮, 髮液, 毫升]","[高效防, 效防脫, 防脫增, 脫增生, 增生洗, 生洗髮, 洗髮液]","[高效防脫, 效防脫增, 防脫增生, 脫增生洗, 增生洗髮, 生洗髮液]","[高效防脫增, 效防脫增生, 防脫增生洗, 脫增生洗髮, 增生洗髮液]"
4,",運動水溶片青檸檬味十片裝",1,"[運動, 動水, 水溶, 溶片, 片青, 青檸, 檸檬, 檬味, 味十, 十片, 片裝]","[運動水, 動水溶, 水溶片, 溶片青, 片青檸, 青檸檬, 檸檬味, 檬味十, 味十片, ...","[運動水溶, 動水溶片, 水溶片青, 溶片青檸, 片青檸檬, 青檸檬味, 檸檬味十, 檬味十...","[運動水溶片, 動水溶片青, 水溶片青檸, 溶片青檸檬, 片青檸檬味, 青檸檬味十, 檸檬味..."
5,強生便利貼公主膠布,1,"[強生, 生便, 便利, 利貼, 貼公, 公主, 主膠, 膠布]","[強生便, 生便利, 便利貼, 利貼公, 貼公主, 公主膠, 主膠布]","[強生便利, 生便利貼, 便利貼公, 利貼公主, 貼公主膠, 公主膠布]","[強生便利貼, 生便利貼公, 便利貼公主, 利貼公主膠, 貼公主膠布]"
6,"牛油果滋養修護晚霜,毫升",1,"[牛油, 油果, 果滋, 滋養, 養修, 修護, 護晚, 晚霜, 毫升]","[牛油果, 油果滋, 果滋養, 滋養修, 養修護, 修護晚, 護晚霜]","[牛油果滋, 油果滋養, 果滋養修, 滋養修護, 養修護晚, 修護晚霜]","[牛油果滋養, 油果滋養修, 果滋養修護, 滋養修護晚, 養修護晚霜]"
7,",麥蘆卡蜂蜜,克,",1,"[麥蘆, 蘆卡, 卡蜂, 蜂蜜]","[麥蘆卡, 蘆卡蜂, 卡蜂蜜]","[麥蘆卡蜂, 蘆卡蜂蜜]",[麥蘆卡蜂蜜]
8,"光采豐蜜唇釉,",1,"[光采, 采豐, 豐蜜, 蜜唇, 唇釉]","[光采豐, 采豐蜜, 豐蜜唇, 蜜唇釉]","[光采豐蜜, 采豐蜜唇, 豐蜜唇釉]","[光采豐蜜唇, 采豐蜜唇釉]"
9,",金盞花面霜",1,"[金盞, 盞花, 花面, 面霜]","[金盞花, 盞花面, 花面霜]","[金盞花面, 盞花面霜]",[金盞花面霜]


In [13]:
bigram_df = long_df[["2_gram", "count"]].explode("2_gram").groupby("2_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# bigram_df = bigram_df[~bigram_df["2_gram"].str.contains("#")].reset_index(drop=True)
bigram_df.to_csv("./n_gram_processed/2_gram.csv", index=False)
bigram_df

Unnamed: 0,2_gram,count
0,肌膚,15119
1,配方,6583
2,保濕,5935
3,使用,5884
4,精華,5195
...,...,...
146902,疊機,1
146903,疊熨,1
146904,疊牆,1
146905,售靈,1


In [14]:
trigram_df = long_df[["3_gram", "count"]].explode("3_gram").groupby("3_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# trigram_df = trigram_df[~trigram_df["3_gram"].str.contains("#")].reset_index(drop=True)
trigram_df.to_csv("./n_gram_processed/3_gram.csv", index=False)
trigram_df

Unnamed: 0,3_gram,count
0,維他命,2812
1,屈臣氏,1828
2,防腐劑,1193
3,型美甲,1179
4,美甲片,1124
...,...,...
322141,煲湯火,1
322142,煲湯煮,1
322143,煲湯調,1
322144,煲身和,1


In [15]:
fourgram_df = long_df[["4_gram", "count"]].explode("4_gram").groupby("4_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fourgram_df = fourgram_df[~fourgram_df["4_gram"].str.contains("#")].reset_index(drop=True)
fourgram_df.to_csv("./n_gram_processed/4_gram.csv", index=False)
fourgram_df

Unnamed: 0,4_gram,count
0,透明質酸,1111
1,型美甲片,1040
2,膠原蛋白,824
3,指甲品牌,747
4,全新造型,746
...,...,...
381678,易患異位,1
381679,易感受到,1
381680,易懷孕即,1
381681,易懷孕測,1


In [16]:
fivegram_df = long_df[["5_gram", "count"]].explode("5_gram").groupby("5_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fivegram_df = fivegram_df[~fivegram_df["5_gram"].str.contains("#")].reset_index(drop=True)
fivegram_df.to_csv("./n_gram_processed/5_gram.csv", index=False)
fivegram_df

Unnamed: 0,5_gram,count
0,甲品牌全新,744
1,指甲品牌全,744
2,全新造型美,744
3,新造型美甲,744
4,牌全新造型,744
...,...,...
369641,方是以薑為,1
369642,方是低劑量,1
369643,方是專為吞,1
369644,方是專為男,1


In [17]:
# def find_matches(ngram1, ngram2):
#     return ngram2[(ngram2.iloc[:, 1] > 1) & (ngram2.iloc[:, 0].str.contains(ngram1))].iloc[:, 0].to_list()

In [18]:
def find_matches(term, ngram):
    def matches_position(term, text):
        pattern = re.compile(term)

        position = pattern.search(text)
        if position:
            return position.span()[0]
        return None
    temp_df = ngram[ngram.iloc[:, 1] > 70].copy()
    temp_df["matches"] = temp_df.iloc[:, 0].map(lambda x: matches_position(term, x))  # 100 → 12.8s
    return {row.iloc[0]: [row.iloc[1], row.iloc[2]] for _, row in temp_df[~(temp_df["matches"].isna())].iterrows()}

In [19]:
import numpy as np
def find_matches_np(term, ngram):  # 100 -> 12.7s
    array = ngram.iloc[:, 0][ngram.iloc[:, 1] > 1].to_list()

    matches = np.char.find(array, term) != -1

    return matches.sum()

In [20]:
count_threshold_2_gram = 300

bi_tri_df = bigram_df[bigram_df["count"] > count_threshold_2_gram].copy()
bi_tri_df

Unnamed: 0,2_gram,count
0,肌膚,15119
1,配方,6583
2,保濕,5935
3,使用,5884
4,精華,5195
...,...,...
670,控制,303
671,絲絨,303
672,品質,302
673,黃金,301


In [21]:
bi_tri_df["compare"] = bi_tri_df["2_gram"].apply(lambda x: find_matches(x, trigram_df))

In [22]:
bi_tri_df

Unnamed: 0,2_gram,count,compare
0,肌膚,15119,"{'令肌膚': [916, 1.0], '為肌膚': [767, 1.0], '讓肌膚': ..."
1,配方,6583,"{'配方奶': [323, 0.0], '水配方': [165, 1.0], '配方含': ..."
2,保濕,5935,"{'效保濕': [560, 1.0], '保濕成': [441, 0.0], '潤保濕': ..."
3,使用,5884,"{'續使用': [243, 1.0], '使用方': [227, 0.0], '使用後': ..."
4,精華,5195,"{'精華液': [395, 0.0], '取精華': [290, 1.0], '濕精華': ..."
...,...,...,...
670,控制,303,{}
671,絲絨,303,"{'絲絨霧': [79, 0.0]}"
672,品質,302,"{'高品質': [104, 1.0]}"
673,黃金,301,{}


In [23]:
bi_tri_df["group_count"] = bi_tri_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else x)
bi_tri_df["total_related_3_gram"] = bi_tri_df["compare"].apply(lambda x: np.array(list(x.values()))[:, 0].sum() if isinstance(x, dict) and len(x.values()) > 0 else x)
bi_tri_df["compare"] = bi_tri_df["compare"].map(lambda x: x.items() if isinstance(x, dict) else x)
# bi_tri_df[~(bi_tri_df["compare"].isna())]
bi_tri_df_expand = bi_tri_df.explode("compare")  # .groupby("2_gram", as_index=False)  # .agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
bi_tri_df_expand["3_gram"] = bi_tri_df_expand["compare"].map(lambda x: x[0] if isinstance(x, tuple) else x)
bi_tri_df_expand["3_gram_count"] = bi_tri_df_expand["compare"].map(lambda x: x[1][0] if isinstance(x, tuple) else x)
bi_tri_df_expand["3_gram_position"] = bi_tri_df_expand["compare"].map(lambda x: x[1][1] if isinstance(x, tuple) else x)
bi_tri_df_expand.reset_index(drop=True, inplace=True)

In [24]:
bi_tri_df_expand["prob_of_related_3_gram"] = bi_tri_df_expand["3_gram_count"] / bi_tri_df_expand["total_related_3_gram"]
bi_tri_df_expand

Unnamed: 0,2_gram,count,compare,group_count,total_related_3_gram,3_gram,3_gram_count,3_gram_position,prob_of_related_3_gram
0,肌膚,15119,"(令肌膚, [916, 1.0])",90,16791.0,令肌膚,916.0,1.0,0.054553
1,肌膚,15119,"(為肌膚, [767, 1.0])",90,16791.0,為肌膚,767.0,1.0,0.045679
2,肌膚,15119,"(讓肌膚, [580, 1.0])",90,16791.0,讓肌膚,580.0,1.0,0.034542
3,肌膚,15119,"(護肌膚, [562, 1.0])",90,16791.0,護肌膚,562.0,1.0,0.03347
4,肌膚,15119,"(持肌膚, [493, 1.0])",90,16791.0,持肌膚,493.0,1.0,0.029361
...,...,...,...,...,...,...,...,...,...
1644,絲絨,303,"(絲絨霧, [79, 0.0])",1,79.0,絲絨霧,79.0,0.0,1.0
1645,品質,302,"(高品質, [104, 1.0])",1,104.0,高品質,104.0,1.0,1.0
1646,黃金,301,,0,{},,,,
1647,淨肌,301,"(淨肌膚, [165, 0.0])",2,311.0,淨肌膚,165.0,0.0,0.530547


In [25]:
bi_tri_df_expand["keep_2_gram"] = bi_tri_df_expand["3_gram_count"] / bi_tri_df_expand["count"] <= 0.7
bi_tri_df_expand["keep_3_gram"] = (bi_tri_df_expand["prob_of_related_3_gram"] > 0.11) & ((bi_tri_df_expand["3_gram_count"] > 200) | (bi_tri_df_expand["group_count"] > 140))

In [26]:
bi_tri_df_expand

Unnamed: 0,2_gram,count,compare,group_count,total_related_3_gram,3_gram,3_gram_count,3_gram_position,prob_of_related_3_gram,keep_2_gram,keep_3_gram
0,肌膚,15119,"(令肌膚, [916, 1.0])",90,16791.0,令肌膚,916.0,1.0,0.054553,True,False
1,肌膚,15119,"(為肌膚, [767, 1.0])",90,16791.0,為肌膚,767.0,1.0,0.045679,True,False
2,肌膚,15119,"(讓肌膚, [580, 1.0])",90,16791.0,讓肌膚,580.0,1.0,0.034542,True,False
3,肌膚,15119,"(護肌膚, [562, 1.0])",90,16791.0,護肌膚,562.0,1.0,0.03347,True,False
4,肌膚,15119,"(持肌膚, [493, 1.0])",90,16791.0,持肌膚,493.0,1.0,0.029361,True,False
...,...,...,...,...,...,...,...,...,...,...,...
1644,絲絨,303,"(絲絨霧, [79, 0.0])",1,79.0,絲絨霧,79.0,0.0,1.0,True,False
1645,品質,302,"(高品質, [104, 1.0])",1,104.0,高品質,104.0,1.0,1.0,True,False
1646,黃金,301,,0,{},,,,,False,False
1647,淨肌,301,"(淨肌膚, [165, 0.0])",2,311.0,淨肌膚,165.0,0.0,0.530547,True,False


In [27]:
bi_tri_df_expand["3_gram_dig_down"] = bi_tri_df_expand["3_gram"][bi_tri_df_expand["keep_3_gram"] == True].apply(lambda x: find_matches(x, fourgram_df))

In [28]:
bi_tri_df_expand[bi_tri_df_expand["keep_3_gram"] == True]

Unnamed: 0,2_gram,count,compare,group_count,total_related_3_gram,3_gram,3_gram_count,3_gram_position,prob_of_related_3_gram,keep_2_gram,keep_3_gram,3_gram_dig_down
115,保濕,5935,"(效保濕, [560, 1.0])",30,4830.0,效保濕,560.0,1.0,0.115942,True,True,"{'長效保濕': [221, 1.0], '高效保濕': [148, 1.0], '強效保濕..."
164,精華,5195,"(精華液, [395, 0.0])",19,2810.0,精華液,395.0,0.0,0.140569,True,True,{}
183,天然,4916,"(天然成, [356, 0.0])",21,2933.0,天然成,356.0,0.0,0.121377,True,True,"{'天然成分': [202, 0.0], '天然成份': [154, 0.0]}"
184,天然,4916,"(含天然, [326, 1.0])",21,2933.0,含天然,326.0,1.0,0.111149,True,True,"{'蘊含天然': [113, 1.0]}"
204,有效,4755,"(能有效, [578, 1.0])",28,3726.0,能有效,578.0,1.0,0.155126,True,True,{}
...,...,...,...,...,...,...,...,...,...,...,...,...
1617,高露,313,"(高露潔, [309, 0.0])",1,309.0,高露潔,309.0,0.0,1.0,False,True,{}
1621,性成,312,"(活性成, [257, 1.0])",3,569.0,活性成,257.0,1.0,0.45167,False,True,"{'活性成分': [183, 0.0], '活性成份': [74, 0.0]}"
1622,性成,312,"(性成分, [228, 0.0])",3,569.0,性成分,228.0,0.0,0.400703,False,True,"{'活性成分': [183, 1.0]}"
1627,機認,311,"(有機認, [311, 1.0])",2,603.0,有機認,311.0,1.0,0.515755,False,True,"{'有機認證': [292, 0.0]}"


In [29]:
test_df = bi_tri_df_expand[bi_tri_df_expand["keep_3_gram"] == True].copy()

In [30]:
test_df["4_gram_max_count"] = test_df["3_gram_dig_down"].apply(lambda x: list(x.values())[0][0] if len(x.values()) > 0 else 0)
test_df

Unnamed: 0,2_gram,count,compare,group_count,total_related_3_gram,3_gram,3_gram_count,3_gram_position,prob_of_related_3_gram,keep_2_gram,keep_3_gram,3_gram_dig_down,4_gram_max_count
115,保濕,5935,"(效保濕, [560, 1.0])",30,4830.0,效保濕,560.0,1.0,0.115942,True,True,"{'長效保濕': [221, 1.0], '高效保濕': [148, 1.0], '強效保濕...",221
164,精華,5195,"(精華液, [395, 0.0])",19,2810.0,精華液,395.0,0.0,0.140569,True,True,{},0
183,天然,4916,"(天然成, [356, 0.0])",21,2933.0,天然成,356.0,0.0,0.121377,True,True,"{'天然成分': [202, 0.0], '天然成份': [154, 0.0]}",202
184,天然,4916,"(含天然, [326, 1.0])",21,2933.0,含天然,326.0,1.0,0.111149,True,True,"{'蘊含天然': [113, 1.0]}",113
204,有效,4755,"(能有效, [578, 1.0])",28,3726.0,能有效,578.0,1.0,0.155126,True,True,{},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1617,高露,313,"(高露潔, [309, 0.0])",1,309.0,高露潔,309.0,0.0,1.0,False,True,{},0
1621,性成,312,"(活性成, [257, 1.0])",3,569.0,活性成,257.0,1.0,0.45167,False,True,"{'活性成分': [183, 0.0], '活性成份': [74, 0.0]}",183
1622,性成,312,"(性成分, [228, 0.0])",3,569.0,性成分,228.0,0.0,0.400703,False,True,"{'活性成分': [183, 1.0]}",183
1627,機認,311,"(有機認, [311, 1.0])",2,603.0,有機認,311.0,1.0,0.515755,False,True,"{'有機認證': [292, 0.0]}",292


In [31]:
test_df["keep_3_gram_after_dig_down"] = test_df["4_gram_max_count"] / test_df["3_gram_count"] <= 0.7
test_df[["2_gram", "3_gram", "keep_3_gram", "keep_3_gram_after_dig_down"]]

Unnamed: 0,2_gram,3_gram,keep_3_gram,keep_3_gram_after_dig_down
115,保濕,效保濕,True,True
164,精華,精華液,True,True
183,天然,天然成,True,True
184,天然,含天然,True,True
204,有效,能有效,True,True
...,...,...,...,...
1617,高露,高露潔,True,True
1621,性成,活性成,True,False
1622,性成,性成分,True,False
1627,機認,有機認,True,False


In [32]:
bi_tri_df_expand.loc[test_df.index, "keep_3_gram"] = test_df["keep_3_gram_after_dig_down"]

In [33]:
bi_tri_df_expand[bi_tri_df_expand["keep_3_gram"] == True]

Unnamed: 0,2_gram,count,compare,group_count,total_related_3_gram,3_gram,3_gram_count,3_gram_position,prob_of_related_3_gram,keep_2_gram,keep_3_gram,3_gram_dig_down
115,保濕,5935,"(效保濕, [560, 1.0])",30,4830.0,效保濕,560.0,1.0,0.115942,True,True,"{'長效保濕': [221, 1.0], '高效保濕': [148, 1.0], '強效保濕..."
164,精華,5195,"(精華液, [395, 0.0])",19,2810.0,精華液,395.0,0.0,0.140569,True,True,{}
183,天然,4916,"(天然成, [356, 0.0])",21,2933.0,天然成,356.0,0.0,0.121377,True,True,"{'天然成分': [202, 0.0], '天然成份': [154, 0.0]}"
184,天然,4916,"(含天然, [326, 1.0])",21,2933.0,含天然,326.0,1.0,0.111149,True,True,"{'蘊含天然': [113, 1.0]}"
204,有效,4755,"(能有效, [578, 1.0])",28,3726.0,能有效,578.0,1.0,0.155126,True,True,{}
...,...,...,...,...,...,...,...,...,...,...,...,...
1573,潤保,327,"(潤保濕, [291, 0.0])",2,472.0,潤保濕,291.0,0.0,0.616525,False,True,"{'滋潤保濕': [146, 1.0]}"
1588,妝水,322,"(化妝水, [224, 1.0])",2,314.0,化妝水,224.0,1.0,0.713376,True,True,{}
1597,面乳,319,"(潔面乳, [273, 1.0])",1,273.0,潔面乳,273.0,1.0,1.0,False,True,{}
1608,露潔,315,"(高露潔, [309, 1.0])",1,309.0,高露潔,309.0,1.0,1.0,False,True,{}


In [34]:
retain_2_gram = bi_tri_df_expand[["2_gram", "keep_2_gram"]]
retain_3_gram = bi_tri_df_expand[["3_gram", "keep_3_gram"]]

retain_2_gram.columns = ["n_gram", "retain"]
retain_3_gram.columns = ["n_gram", "retain"]

result_2vs3_df = pd.concat([retain_2_gram, retain_3_gram])

In [35]:
result_2vs3_df.drop_duplicates(inplace=True, subset=["n_gram"])
result_2vs3_df.reset_index(drop=True, inplace=True)
result_2vs3_df.to_csv("./n_gram_processed/result_2vs3_df.csv", index=False)
result_2vs3_df

Unnamed: 0,n_gram,retain
0,肌膚,True
1,配方,True
2,保濕,True
3,使用,True
4,精華,True
...,...,...
1957,盒裝面,False
1958,膚免受,False
1959,花籽油,False
1960,絲絨霧,False


In [36]:
count_threshold_3_gram = 150

tri_four_df = trigram_df[trigram_df["count"] > count_threshold_3_gram].copy()
tri_four_df

Unnamed: 0,3_gram,count
0,維他命,2812
1,屈臣氏,1828
2,防腐劑,1193
3,型美甲,1179
4,美甲片,1124
...,...,...
384,有效去,152
385,膽固醇,151
386,用後肌,151
387,膚帶來,151


In [37]:
tri_four_df["compare"] = tri_four_df["3_gram"].apply(lambda x: find_matches(x, fourgram_df))

In [38]:
tri_four_df

Unnamed: 0,3_gram,count,compare
0,維他命,2812,"{'及維他命': [293, 1.0], '含維他命': [210, 1.0], '種維他命..."
1,屈臣氏,1828,"{'屈臣氏骨': [253, 0.0]}"
2,防腐劑,1193,"{'類防腐劑': [179, 1.0], '含防腐劑': [176, 1.0], '加防腐劑..."
3,型美甲,1179,"{'型美甲片': [1040, 0.0], '造型美甲': [744, 1.0], '薄型美..."
4,美甲片,1124,"{'型美甲片': [1040, 1.0]}"
...,...,...,...
384,有效去,152,"{'有效去除': [134, 0.0]}"
385,膽固醇,151,{}
386,用後肌,151,"{'用後肌膚': [146, 0.0]}"
387,膚帶來,151,"{'肌膚帶來': [143, 1.0]}"


In [39]:
tri_four_df["group_count"] = tri_four_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else x)
tri_four_df["total_related_4_gram"] = tri_four_df["compare"].apply(lambda x: np.array(list(x.values()))[:, 0].sum() if isinstance(x, dict) and len(x.values()) > 0 else 0)
tri_four_df["compare"] = tri_four_df["compare"].map(lambda x: x.items() if isinstance(x, dict) else x)
# tri_four_df[~(tri_four_df["compare"].isna())]
tri_four_df_expand = tri_four_df.explode("compare")  # .groupby("2_gram", as_index=False)  # .agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
tri_four_df_expand["4_gram"] = tri_four_df_expand["compare"].map(lambda x: x[0] if isinstance(x, tuple) else x)
tri_four_df_expand["4_gram_count"] = tri_four_df_expand["compare"].map(lambda x: x[1][0] if isinstance(x, tuple) else x)
tri_four_df_expand["4_gram_position"] = tri_four_df_expand["compare"].map(lambda x: x[1][1] if isinstance(x, tuple) else x)
tri_four_df_expand.reset_index(drop=True, inplace=True)

In [40]:
tri_four_df_expand["prob_of_related_4_gram"] = tri_four_df_expand["4_gram_count"] / tri_four_df_expand["total_related_4_gram"]
tri_four_df_expand

Unnamed: 0,3_gram,count,compare,group_count,total_related_4_gram,4_gram,4_gram_count,4_gram_position,prob_of_related_4_gram
0,維他命,2812,"(及維他命, [293, 1.0])",8,1191.0,及維他命,293.0,1.0,0.246012
1,維他命,2812,"(含維他命, [210, 1.0])",8,1191.0,含維他命,210.0,1.0,0.176322
2,維他命,2812,"(種維他命, [168, 1.0])",8,1191.0,種維他命,168.0,1.0,0.141058
3,維他命,2812,"(維他命及, [137, 0.0])",8,1191.0,維他命及,137.0,0.0,0.115029
4,維他命,2812,"(富維他命, [115, 1.0])",8,1191.0,富維他命,115.0,1.0,0.096558
...,...,...,...,...,...,...,...,...,...
503,有效去,152,"(有效去除, [134, 0.0])",1,134.0,有效去除,134.0,0.0,1.000000
504,膽固醇,151,,0,0.0,,,,
505,用後肌,151,"(用後肌膚, [146, 0.0])",1,146.0,用後肌膚,146.0,0.0,1.000000
506,膚帶來,151,"(肌膚帶來, [143, 1.0])",1,143.0,肌膚帶來,143.0,1.0,1.000000


In [41]:
tri_four_df_expand["keep_3_gram"] = (tri_four_df_expand["4_gram_count"] / tri_four_df_expand["count"] <= 0.85) & (tri_four_df_expand["group_count"] > 20)
tri_four_df_expand["keep_4_gram"] = (tri_four_df_expand["prob_of_related_4_gram"] > 0.15) & ((tri_four_df_expand["4_gram_count"] > 110) | (tri_four_df_expand["group_count"] > 50))

In [42]:
tri_four_df_expand

Unnamed: 0,3_gram,count,compare,group_count,total_related_4_gram,4_gram,4_gram_count,4_gram_position,prob_of_related_4_gram,keep_3_gram,keep_4_gram
0,維他命,2812,"(及維他命, [293, 1.0])",8,1191.0,及維他命,293.0,1.0,0.246012,False,True
1,維他命,2812,"(含維他命, [210, 1.0])",8,1191.0,含維他命,210.0,1.0,0.176322,False,True
2,維他命,2812,"(種維他命, [168, 1.0])",8,1191.0,種維他命,168.0,1.0,0.141058,False,False
3,維他命,2812,"(維他命及, [137, 0.0])",8,1191.0,維他命及,137.0,0.0,0.115029,False,False
4,維他命,2812,"(富維他命, [115, 1.0])",8,1191.0,富維他命,115.0,1.0,0.096558,False,False
...,...,...,...,...,...,...,...,...,...,...,...
503,有效去,152,"(有效去除, [134, 0.0])",1,134.0,有效去除,134.0,0.0,1.000000,False,True
504,膽固醇,151,,0,0.0,,,,,False,False
505,用後肌,151,"(用後肌膚, [146, 0.0])",1,146.0,用後肌膚,146.0,0.0,1.000000,False,True
506,膚帶來,151,"(肌膚帶來, [143, 1.0])",1,143.0,肌膚帶來,143.0,1.0,1.000000,False,True


In [43]:
tri_four_df_expand.dropna(inplace=True)

In [44]:
tri_four_df_expand["4_gram_dig_down"] = tri_four_df_expand["4_gram"][tri_four_df_expand["keep_4_gram"] == True].apply(lambda x: find_matches(x, fivegram_df))

In [45]:
tri_four_df_expand[tri_four_df_expand["keep_4_gram"] == True]

Unnamed: 0,3_gram,count,compare,group_count,total_related_4_gram,4_gram,4_gram_count,4_gram_position,prob_of_related_4_gram,keep_3_gram,keep_4_gram,4_gram_dig_down
0,維他命,2812,"(及維他命, [293, 1.0])",8,1191.0,及維他命,293.0,1.0,0.246012,False,True,{}
1,維他命,2812,"(含維他命, [210, 1.0])",8,1191.0,含維他命,210.0,1.0,0.176322,False,True,"{'蘊含維他命': [80, 1.0]}"
8,屈臣氏,1828,"(屈臣氏骨, [253, 0.0])",1,253.0,屈臣氏骨,253.0,0.0,1.000000,False,True,"{'屈臣氏骨膠': [252, 0.0]}"
9,防腐劑,1193,"(類防腐劑, [179, 1.0])",5,681.0,類防腐劑,179.0,1.0,0.262849,False,True,{}
10,防腐劑,1193,"(含防腐劑, [176, 1.0])",5,681.0,含防腐劑,176.0,1.0,0.258443,False,True,"{'不含防腐劑': [176, 1.0]}"
...,...,...,...,...,...,...,...,...,...,...,...,...
499,別適合,152,"(特別適合, [152, 1.0])",1,152.0,特別適合,152.0,1.0,1.000000,False,True,{}
503,有效去,152,"(有效去除, [134, 0.0])",1,134.0,有效去除,134.0,0.0,1.000000,False,True,{}
505,用後肌,151,"(用後肌膚, [146, 0.0])",1,146.0,用後肌膚,146.0,0.0,1.000000,False,True,{}
506,膚帶來,151,"(肌膚帶來, [143, 1.0])",1,143.0,肌膚帶來,143.0,1.0,1.000000,False,True,"{'為肌膚帶來': [118, 1.0]}"


In [46]:
test_df = tri_four_df_expand[tri_four_df_expand["keep_4_gram"] == True].copy()

In [47]:
test_df["5_gram_max_count"] = test_df["4_gram_dig_down"].apply(lambda x: list(x.values())[0][0] if len(x.values()) > 0 else 0)
test_df

Unnamed: 0,3_gram,count,compare,group_count,total_related_4_gram,4_gram,4_gram_count,4_gram_position,prob_of_related_4_gram,keep_3_gram,keep_4_gram,4_gram_dig_down,5_gram_max_count
0,維他命,2812,"(及維他命, [293, 1.0])",8,1191.0,及維他命,293.0,1.0,0.246012,False,True,{},0
1,維他命,2812,"(含維他命, [210, 1.0])",8,1191.0,含維他命,210.0,1.0,0.176322,False,True,"{'蘊含維他命': [80, 1.0]}",80
8,屈臣氏,1828,"(屈臣氏骨, [253, 0.0])",1,253.0,屈臣氏骨,253.0,0.0,1.000000,False,True,"{'屈臣氏骨膠': [252, 0.0]}",252
9,防腐劑,1193,"(類防腐劑, [179, 1.0])",5,681.0,類防腐劑,179.0,1.0,0.262849,False,True,{},0
10,防腐劑,1193,"(含防腐劑, [176, 1.0])",5,681.0,含防腐劑,176.0,1.0,0.258443,False,True,"{'不含防腐劑': [176, 1.0]}",176
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,別適合,152,"(特別適合, [152, 1.0])",1,152.0,特別適合,152.0,1.0,1.000000,False,True,{},0
503,有效去,152,"(有效去除, [134, 0.0])",1,134.0,有效去除,134.0,0.0,1.000000,False,True,{},0
505,用後肌,151,"(用後肌膚, [146, 0.0])",1,146.0,用後肌膚,146.0,0.0,1.000000,False,True,{},0
506,膚帶來,151,"(肌膚帶來, [143, 1.0])",1,143.0,肌膚帶來,143.0,1.0,1.000000,False,True,"{'為肌膚帶來': [118, 1.0]}",118


In [48]:
test_df["keep_4_gram_after_dig_down"] = test_df["5_gram_max_count"] / test_df["4_gram_count"] <= 0.5
test_df[["3_gram", "4_gram", "keep_4_gram", "keep_4_gram_after_dig_down"]]

Unnamed: 0,3_gram,4_gram,keep_4_gram,keep_4_gram_after_dig_down
0,維他命,及維他命,True,True
1,維他命,含維他命,True,True
8,屈臣氏,屈臣氏骨,True,False
9,防腐劑,類防腐劑,True,True
10,防腐劑,含防腐劑,True,False
...,...,...,...,...
499,別適合,特別適合,True,True
503,有效去,有效去除,True,True
505,用後肌,用後肌膚,True,True
506,膚帶來,肌膚帶來,True,False


In [49]:
tri_four_df_expand.loc[test_df.index, "keep_4_gram"] = test_df["keep_4_gram_after_dig_down"]

In [50]:
tri_four_df_expand[tri_four_df_expand["keep_4_gram"] == True]

Unnamed: 0,3_gram,count,compare,group_count,total_related_4_gram,4_gram,4_gram_count,4_gram_position,prob_of_related_4_gram,keep_3_gram,keep_4_gram,4_gram_dig_down
0,維他命,2812,"(及維他命, [293, 1.0])",8,1191.0,及維他命,293.0,1.0,0.246012,False,True,{}
1,維他命,2812,"(含維他命, [210, 1.0])",8,1191.0,含維他命,210.0,1.0,0.176322,False,True,"{'蘊含維他命': [80, 1.0]}"
9,防腐劑,1193,"(類防腐劑, [179, 1.0])",5,681.0,類防腐劑,179.0,1.0,0.262849,False,True,{}
19,透明質,1116,"(透明質酸, [1111, 0.0])",3,1288.0,透明質酸,1111.0,0.0,0.862578,False,True,"{'透明質酸及': [126, 0.0], '含透明質酸': [98, 1.0], '及透明..."
22,明質酸,1113,"(透明質酸, [1111, 1.0])",2,1237.0,透明質酸,1111.0,1.0,0.898141,False,True,"{'透明質酸及': [126, 0.0], '含透明質酸': [98, 1.0], '及透明..."
...,...,...,...,...,...,...,...,...,...,...,...,...
498,高效保,153,"(高效保濕, [148, 0.0])",1,148.0,高效保濕,148.0,0.0,1.000000,False,True,{}
499,別適合,152,"(特別適合, [152, 1.0])",1,152.0,特別適合,152.0,1.0,1.000000,False,True,{}
503,有效去,152,"(有效去除, [134, 0.0])",1,134.0,有效去除,134.0,0.0,1.000000,False,True,{}
505,用後肌,151,"(用後肌膚, [146, 0.0])",1,146.0,用後肌膚,146.0,0.0,1.000000,False,True,{}


In [51]:
retain_3_gram = tri_four_df_expand[["3_gram", "keep_3_gram"]]
retain_4_gram = tri_four_df_expand[["4_gram", "keep_4_gram"]]

retain_3_gram.columns = ["n_gram", "retain"]
retain_4_gram.columns = ["n_gram", "retain"]

result_3vs4_df = pd.concat([retain_3_gram, retain_4_gram])

In [52]:
result_3vs4_df.drop_duplicates(inplace=True, subset=["n_gram"])
result_3vs4_df.reset_index(drop=True, inplace=True)
result_3vs4_df.to_csv("./n_gram_processed/result_2vs3_df.csv", index=False)
result_3vs4_df

Unnamed: 0,n_gram,retain
0,維他命,False
1,屈臣氏,False
2,防腐劑,False
3,型美甲,False
4,美甲片,False
...,...,...
553,持久不脫,False
554,呵護寶寶,False
555,有效去除,True
556,肌膚帶來,False


In [53]:
summary_df = pd.concat([result_2vs3_df, result_3vs4_df])
summary_df.drop_duplicates(inplace=True, subset=["n_gram"])
summary_df.to_csv("./n_gram_processed/summary.csv", index=False)

In [54]:
summary_df

Unnamed: 0,n_gram,retain
0,肌膚,True
1,配方,True
2,保濕,True
3,使用,True
4,精華,True
...,...,...
553,持久不脫,False
554,呵護寶寶,False
555,有效去除,True
556,肌膚帶來,False
