In [1]:
import pandas as pd

In [2]:
sample_df = pd.read_csv("./sample_data/product_name_sample_data.csv")
sample_df.head()

Unnamed: 0,product_name,description
0,草本花香洗髮露 600毫升,揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽，氛芳花香，令你彷如置身大...
1,Voost 運動水樽,VOOST MUG
2,完美遮瑕筆306 (1.5ml),質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight 使用能提亮妝容<BR><BR>獨...
3,高效防脫增生洗髮液 150毫升,ANTI HAIR LOSS SHAMP
4,ISOTONIC 運動水溶片青檸檬味十片裝,幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水份與電解質，促進神經肌肉傳導; 有助維持肌...


In [3]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32372 entries, 0 to 32371
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  32361 non-null  object
 1   description   31501 non-null  object
dtypes: object(2)
memory usage: 505.9+ KB


In [4]:
sample_df = sample_df.fillna("*")

In [5]:
product_name_df = sample_df[["product_name"]]
description_df = sample_df[["description"]]

product_name_df.columns = ["doc"]
description_df.columns = ["doc"]

long_df = pd.concat([product_name_df, description_df])
long_df.head()

Unnamed: 0,doc
0,草本花香洗髮露 600毫升
1,Voost 運動水樽
2,完美遮瑕筆306 (1.5ml)
3,高效防脫增生洗髮液 150毫升
4,ISOTONIC 運動水溶片青檸檬味十片裝


In [6]:
long_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64744 entries, 0 to 32371
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doc     64744 non-null  object
dtypes: object(1)
memory usage: 1011.6+ KB


In [7]:
import re


def clean_text(text:str) -> str:
    # 去除 HTML 標籤
    text = re.sub(r'<.*?>', ',', text)
    # 去除非字母字符以及英文
    text = re.sub(r'[^\u4e00-\u9fa5]', ',', text)
    text = re.sub(r",+", ",", text)

    return text

In [8]:
long_df["doc"] = long_df["doc"].apply(clean_text)
long_df.iloc[:20]

Unnamed: 0,doc
0,"草本花香洗髮露,毫升"
1,",運動水樽"
2,"完美遮瑕筆,"
3,"高效防脫增生洗髮液,毫升"
4,",運動水溶片青檸檬味十片裝"
5,強生便利貼公主膠布
6,"牛油果滋養修護晚霜,毫升"
7,",麥蘆卡蜂蜜,克,"
8,"光采豐蜜唇釉,"
9,",金盞花面霜"


In [9]:
# def do_n_gram(doc: str, n: int=2) -> list[str]:
#     pattern = re.compile(",")
#     return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not (bool(pattern.search(doc[i: i+n])))]


def do_n_gram(doc: str, n: int=2) -> list[str]:
    return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not ("," in doc[i: i+n])]

In [10]:
long_df["count"] = 1

In [11]:
# long_df.drop_duplicates(subset=['doc'], inplace=True)  # 64744 → 42071

In [12]:
for i in range(2, 6):
    long_df[f"{i}_gram"] = long_df["doc"].apply(lambda x: do_n_gram(x, i))

In [13]:
long_df.iloc[:20]

Unnamed: 0,doc,count,2_gram,3_gram,4_gram,5_gram
0,"草本花香洗髮露,毫升",1,"[草本, 本花, 花香, 香洗, 洗髮, 髮露, 毫升]","[草本花, 本花香, 花香洗, 香洗髮, 洗髮露]","[草本花香, 本花香洗, 花香洗髮, 香洗髮露]","[草本花香洗, 本花香洗髮, 花香洗髮露]"
1,",運動水樽",1,"[運動, 動水, 水樽]","[運動水, 動水樽]",[運動水樽],[]
2,"完美遮瑕筆,",1,"[完美, 美遮, 遮瑕, 瑕筆]","[完美遮, 美遮瑕, 遮瑕筆]","[完美遮瑕, 美遮瑕筆]",[完美遮瑕筆]
3,"高效防脫增生洗髮液,毫升",1,"[高效, 效防, 防脫, 脫增, 增生, 生洗, 洗髮, 髮液, 毫升]","[高效防, 效防脫, 防脫增, 脫增生, 增生洗, 生洗髮, 洗髮液]","[高效防脫, 效防脫增, 防脫增生, 脫增生洗, 增生洗髮, 生洗髮液]","[高效防脫增, 效防脫增生, 防脫增生洗, 脫增生洗髮, 增生洗髮液]"
4,",運動水溶片青檸檬味十片裝",1,"[運動, 動水, 水溶, 溶片, 片青, 青檸, 檸檬, 檬味, 味十, 十片, 片裝]","[運動水, 動水溶, 水溶片, 溶片青, 片青檸, 青檸檬, 檸檬味, 檬味十, 味十片, ...","[運動水溶, 動水溶片, 水溶片青, 溶片青檸, 片青檸檬, 青檸檬味, 檸檬味十, 檬味十...","[運動水溶片, 動水溶片青, 水溶片青檸, 溶片青檸檬, 片青檸檬味, 青檸檬味十, 檸檬味..."
5,強生便利貼公主膠布,1,"[強生, 生便, 便利, 利貼, 貼公, 公主, 主膠, 膠布]","[強生便, 生便利, 便利貼, 利貼公, 貼公主, 公主膠, 主膠布]","[強生便利, 生便利貼, 便利貼公, 利貼公主, 貼公主膠, 公主膠布]","[強生便利貼, 生便利貼公, 便利貼公主, 利貼公主膠, 貼公主膠布]"
6,"牛油果滋養修護晚霜,毫升",1,"[牛油, 油果, 果滋, 滋養, 養修, 修護, 護晚, 晚霜, 毫升]","[牛油果, 油果滋, 果滋養, 滋養修, 養修護, 修護晚, 護晚霜]","[牛油果滋, 油果滋養, 果滋養修, 滋養修護, 養修護晚, 修護晚霜]","[牛油果滋養, 油果滋養修, 果滋養修護, 滋養修護晚, 養修護晚霜]"
7,",麥蘆卡蜂蜜,克,",1,"[麥蘆, 蘆卡, 卡蜂, 蜂蜜]","[麥蘆卡, 蘆卡蜂, 卡蜂蜜]","[麥蘆卡蜂, 蘆卡蜂蜜]",[麥蘆卡蜂蜜]
8,"光采豐蜜唇釉,",1,"[光采, 采豐, 豐蜜, 蜜唇, 唇釉]","[光采豐, 采豐蜜, 豐蜜唇, 蜜唇釉]","[光采豐蜜, 采豐蜜唇, 豐蜜唇釉]","[光采豐蜜唇, 采豐蜜唇釉]"
9,",金盞花面霜",1,"[金盞, 盞花, 花面, 面霜]","[金盞花, 盞花面, 花面霜]","[金盞花面, 盞花面霜]",[金盞花面霜]


In [14]:
bigram_df = long_df[["2_gram", "count"]].explode("2_gram").groupby("2_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# bigram_df = bigram_df[~bigram_df["2_gram"].str.contains("#")].reset_index(drop=True)
bigram_df

Unnamed: 0,2_gram,count
0,肌膚,15119
1,配方,6583
2,保濕,5935
3,使用,5884
4,精華,5195
...,...,...
149680,种天,1
149681,种抗,1
149682,种有,1
149683,种水,1


In [15]:
trigram_df = long_df[["3_gram", "count"]].explode("3_gram").groupby("3_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# trigram_df = trigram_df[~trigram_df["3_gram"].str.contains("#")].reset_index(drop=True)
trigram_df

Unnamed: 0,3_gram,count
0,維他命,2812
1,屈臣氏,1828
2,防腐劑,1193
3,型美甲,1179
4,美甲片,1124
...,...,...
343566,杯杯麵,1
343567,杯柔滑,1
343568,杯榨果,1
343569,杯水的,1


In [16]:
fourgram_df = long_df[["4_gram", "count"]].explode("4_gram").groupby("4_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fourgram_df = fourgram_df[~fourgram_df["4_gram"].str.contains("#")].reset_index(drop=True)
fourgram_df

Unnamed: 0,4_gram,count
0,透明質酸,1111
1,型美甲片,1040
2,膠原蛋白,824
3,指甲品牌,747
4,全新造型,746
...,...,...
427003,士賦顏黃,1
427004,獨家特級,1
427005,士豆澱粉,1
427006,士護膚產,1


In [17]:
fivegram_df = long_df[["5_gram", "count"]].explode("5_gram").groupby("5_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fivegram_df = fivegram_df[~fivegram_df["5_gram"].str.contains("#")].reset_index(drop=True)
fivegram_df

Unnamed: 0,5_gram,count
0,甲品牌全新,744
1,全新造型美,744
2,指甲品牌全,744
3,牌全新造型,744
4,品牌全新造,744
...,...,...
427764,旅途中快速,1
427765,旅途中使用,1
427766,旅行電水壺,1
427767,旅行野餐籃,1


In [18]:
# def find_matches(ngram1, ngram2):
#     return ngram2[(ngram2.iloc[:, 1] > 1) & (ngram2.iloc[:, 0].str.contains(ngram1))].iloc[:, 0].to_list()

In [19]:
def find_matches(term, ngram):
    pattern = re.compile(term)
    filtered_df = ngram[ngram.iloc[:, 1] > 1]
    matches = filtered_df[filtered_df.iloc[:, 0].apply(lambda x: bool(pattern.search(x)))]  # 100 -> 7.7s
    # matches = filtered_df[filtered_df.iloc[:, 0].str.contains(term)]  # 100 -> 8.9s
    return {row.iloc[0]: row.iloc[1] for _, row in matches.iterrows()}

In [20]:
import numpy as np
def find_matches_np(term, ngram):  # 100 -> 12.7s
    array = ngram.iloc[:, 0][ngram.iloc[:, 1] > 1].to_list()

    matches = np.char.find(array, term) != -1

    return matches.sum()

In [21]:
count_threshold_2_gram = 300
bi_tri_df = bigram_df[bigram_df["count"] > count_threshold_2_gram].copy()
print(bi_tri_df.shape)

bi_tri_df["compare"] = bi_tri_df["2_gram"].apply(lambda x: find_matches(x, trigram_df))


(698, 2)


In [22]:
bi_tri_df["group_count"] = bi_tri_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else 0)

bi_tri_df["3_gram_counts"] = bi_tri_df["compare"].apply(lambda x: sum(x.values()))

bi_tri_df["2_gram / 3_gram"] = bi_tri_df["count"] / (bi_tri_df["3_gram_counts"] + 1)

# bi_tri_df.to_csv("./n_gram_processed/bi_tri_df.csv")
bi_tri_df.iloc[:20]

Unnamed: 0,2_gram,count,compare,group_count,3_gram_counts,2_gram / 3_gram
0,肌膚,15119,"{'令肌膚': 916, '為肌膚': 767, '肌膚的': 656, '的肌膚': 62...",649,25336,0.596716
1,配方,6583,"{'配方奶': 323, '水配方': 165, '配方含': 164, '的配方': 15...",485,7984,0.824421
2,保濕,5935,"{'效保濕': 560, '保濕成': 441, '潤保濕': 291, '保濕精': 27...",404,9342,0.635235
3,使用,5884,"{'續使用': 243, '使用方': 227, '使用後': 227, '使用時': 17...",487,7408,0.794169
4,精華,5195,"{'精華液': 395, '取精華': 290, '濕精華': 272, '精華及': 25...",442,7198,0.721628
5,天然,4916,"{'的天然': 482, '天然成': 356, '含天然': 326, '天然植': 24...",437,7910,0.621413
6,有效,4755,"{'能有效': 578, '有效舒': 180, '有效減': 180, '有效清': 17...",295,6812,0.69793
7,成分,4227,"{'濕成分': 329, '性成分': 228, '然成分': 205, '養成分': 19...",340,5320,0.7944
8,皮膚,4149,"{'皮膚科': 599, '過皮膚': 488, '的皮膚': 288, '皮膚測': 22...",377,7161,0.579307
9,產品,4016,"{'本產品': 423, '的產品': 271, '產品包': 266, '此產品': 20...",342,5690,0.705676


In [23]:
bi_tri_df.describe()

Unnamed: 0,count,group_count,3_gram_counts,2_gram / 3_gram
count,698.0,698.0,698.0,698.0
mean,808.419771,92.901146,1165.67192,1.217893
std,911.543207,72.496853,1393.748235,11.966802
min,301.0,0.0,0.0,0.499197
25%,387.75,48.0,564.25,0.610962
50%,530.0,79.0,766.0,0.688646
75%,873.5,116.75,1309.75,0.779948
max,15119.0,649.0,25336.0,316.0


In [24]:
bi_tri_df.sort_values("2_gram / 3_gram", ascending=False)

Unnamed: 0,2_gram,count,compare,group_count,3_gram_counts,2_gram / 3_gram
662,度數,316,{},0,0,316.000000
10,毫升,3612,"{'毫升沸': 38, '毫升孖': 17, '毫升補': 15, '毫升水': 12, '...",31,189,19.010526
125,片裝,1099,"{'單片裝': 16, '十片裝': 14, '大片裝': 8, '五片裝': 6, '片裝...",15,72,15.054795
607,支裝,340,"{'兩支裝': 19, '三支裝': 10, '支裝加': 7, '支裝送': 7, '立支...",11,64,5.230769
286,毫克,611,"{'毫克膽': 18, '毫克的': 18, '零毫克': 18, '毫克亞': 13, '...",29,160,3.795031
...,...,...,...,...,...,...
226,銷指,741,"{'銷指甲': 741, '熱銷指': 741}",2,1482,0.499663
364,家直,513,"{'商家直': 513, '家直送': 513}",2,1026,0.499513
382,濕成,498,"{'保濕成': 441, '濕成分': 329, '濕成份': 169, '補濕成': 57}",4,996,0.499498
383,內送,497,"{'天內送': 497, '內送到': 490, '內送貨': 7}",3,994,0.499497


In [25]:
bi_tri_df["keep_2_gram"] = (bi_tri_df["group_count"] >= 30) & (bi_tri_df["2_gram / 3_gram"] >= 0.55)
bi_tri_df[(bi_tri_df["keep_2_gram"] == True)]

Unnamed: 0,2_gram,count,compare,group_count,3_gram_counts,2_gram / 3_gram,keep_2_gram
0,肌膚,15119,"{'令肌膚': 916, '為肌膚': 767, '肌膚的': 656, '的肌膚': 62...",649,25336,0.596716,True
1,配方,6583,"{'配方奶': 323, '水配方': 165, '配方含': 164, '的配方': 15...",485,7984,0.824421,True
2,保濕,5935,"{'效保濕': 560, '保濕成': 441, '潤保濕': 291, '保濕精': 27...",404,9342,0.635235,True
3,使用,5884,"{'續使用': 243, '使用方': 227, '使用後': 227, '使用時': 17...",487,7408,0.794169,True
4,精華,5195,"{'精華液': 395, '取精華': 290, '濕精華': 272, '精華及': 25...",442,7198,0.721628,True
...,...,...,...,...,...,...,...
692,控制,303,"{'控制自': 31, '助控制': 30, '控制異': 23, '易控制': 19, '...",79,452,0.668874,True
693,絲絨,303,"{'絲絨霧': 79, '絲絨唇': 53, '絲絨質': 41, '體絲絨': 31, '...",37,504,0.600000,True
694,品質,302,"{'高品質': 104, '品質的': 31, '品質保': 29, '產品質': 28, ...",61,477,0.631799,True
695,質的,302,"{'優質的': 102, '白質的': 54, '物質的': 47, '品質的': 31, ...",67,540,0.558226,True


In [26]:
count_threshold_3_gram = 150
tri_four_df = trigram_df[trigram_df["count"] > count_threshold_3_gram].copy()
print(tri_four_df.shape)

tri_four_df["compare"] = tri_four_df["3_gram"].apply(lambda x: find_matches(x, fourgram_df))

(415, 2)


In [27]:
tri_four_df["group_count"] = tri_four_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else 0)

tri_four_df["4_gram_counts"] = tri_four_df["compare"].apply(lambda x: sum(x.values()))

tri_four_df["3_gram / 4_gram"] = tri_four_df["count"] / (tri_four_df["4_gram_counts"] + 1)

# tri_four_df.to_csv("./n_gram_processed/tri_four_df.csv")
tri_four_df.iloc[:20]

Unnamed: 0,3_gram,count,compare,group_count,4_gram_counts,3_gram / 4_gram
0,維他命,2812,"{'及維他命': 293, '含維他命': 210, '種維他命': 168, '維他命及'...",141,2310,1.216789
1,屈臣氏,1828,"{'屈臣氏骨': 253, '屈臣氏蒸': 58, '屈臣氏燕': 57, '送屈臣氏': ...",186,1773,1.03044
2,防腐劑,1193,"{'類防腐劑': 179, '含防腐劑': 176, '加防腐劑': 127, '及防腐劑'...",28,964,1.236269
3,型美甲,1179,"{'型美甲片': 1040, '造型美甲': 744, '薄型美甲': 435, '型美甲貼...",4,2358,0.499788
4,美甲片,1124,"{'型美甲片': 1040, '繪美甲片': 47, '療美甲片': 36, '美甲片閃':...",53,1279,0.878125
5,透明質,1116,"{'透明質酸': 1111, '含透明質': 98, '及透明質': 79, '子透明質':...",61,1856,0.600969
6,明質酸,1113,"{'透明質酸': 1111, '明質酸及': 126, '明質酸鈉': 48, '明質酸保'...",54,1677,0.66329
7,無添加,1098,"{'無添加糖': 92, '無添加螢': 88, '無添加防': 83, '大無添加': 7...",72,1017,1.078585
8,抗氧化,1000,"{'抗氧化劑': 139, '效抗氧化': 96, '的抗氧化': 94, '抗氧化功': ...",107,1478,0.676133
9,令肌膚,916,"{'令肌膚更': 58, '令肌膚回': 57, '令肌膚保': 53, '令肌膚重': 4...",90,993,0.921529


In [28]:
tri_four_df.describe()

Unnamed: 0,count,group_count,4_gram_counts,3_gram / 4_gram
count,415.0,415.0,415.0,415.0
mean,299.337349,28.653012,425.072289,1.172326
std,237.810457,26.078478,330.671597,8.461265
min,151.0,0.0,0.0,0.498584
25%,176.0,11.0,240.0,0.575742
50%,209.0,22.0,317.0,0.712963
75%,315.5,38.5,465.0,0.869126
max,2812.0,186.0,2358.0,173.0


In [29]:
tri_four_df.sort_values("3_gram / 4_gram", ascending=False)

Unnamed: 0,3_gram,count,compare,group_count,4_gram_counts,3_gram / 4_gram
318,原產地,173,{},0,0,173.000000
153,無香料,252,"{'無香料嬰': 10, '無香料無': 9, '氏無香料': 8, '無香料配': 7, ...",8,45,5.478261
135,現金券,268,"{'子現金券': 30, '現金券或': 14, '現金券及': 14, '現金券於': 1...",16,105,2.528302
171,甲效果,237,{'美甲效果': 126},1,126,1.866142
76,礦物油,403,"{'礦物油及': 62, '無礦物油': 53, '含礦物油': 34, '及礦物油': 3...",15,230,1.744589
...,...,...,...,...,...,...
126,療薄型,276,"{'療薄型美': 276, '光療薄型': 276}",2,552,0.499096
154,氏骨膠,252,"{'臣氏骨膠': 252, '氏骨膠原': 252}",2,504,0.499010
164,膠指甲,243,"{'凝膠指甲': 243, '膠指甲油': 155, '膠指甲般': 74, '膠指甲顏':...",4,486,0.498973
307,洛哥堅,177,"{'洛哥堅果': 177, '摩洛哥堅': 168, '犘洛哥堅': 9}",3,354,0.498592


In [30]:
tri_four_df["keep_3_gram"] = (tri_four_df["group_count"] >= 25) & (tri_four_df["3_gram / 4_gram"] >= 0.55)
tri_four_df[(tri_four_df["keep_3_gram"] == True)]

Unnamed: 0,3_gram,count,compare,group_count,4_gram_counts,3_gram / 4_gram,keep_3_gram
0,維他命,2812,"{'及維他命': 293, '含維他命': 210, '種維他命': 168, '維他命及'...",141,2310,1.216789,True
1,屈臣氏,1828,"{'屈臣氏骨': 253, '屈臣氏蒸': 58, '屈臣氏燕': 57, '送屈臣氏': ...",186,1773,1.030440,True
2,防腐劑,1193,"{'類防腐劑': 179, '含防腐劑': 176, '加防腐劑': 127, '及防腐劑'...",28,964,1.236269,True
4,美甲片,1124,"{'型美甲片': 1040, '繪美甲片': 47, '療美甲片': 36, '美甲片閃':...",53,1279,0.878125,True
5,透明質,1116,"{'透明質酸': 1111, '含透明質': 98, '及透明質': 79, '子透明質':...",61,1856,0.600969,True
...,...,...,...,...,...,...,...
401,玫瑰花,154,"{'玫瑰花香': 54, '玫瑰花瓣': 16, '的玫瑰花': 15, '淡玫瑰花': 1...",27,215,0.712963,True
402,造成的,153,"{'造成的傷': 42, '膚造成的': 22, '所造成的': 20, '而造成的': 1...",45,258,0.590734,True
404,衛生巾,153,"{'體衛生巾': 62, '黏衛生巾': 9, '薄衛生巾': 9, '衛生巾乾': 8, ...",33,191,0.796875,True
408,潤手霜,152,"{'護潤手霜': 22, '果潤手霜': 13, '油潤手霜': 9, '瑰潤手霜': 8,...",30,130,1.160305,True


In [31]:
count_threshold_4_gram = 75
four_five_df = fourgram_df[fourgram_df["count"] > count_threshold_4_gram].copy()
print(four_five_df.shape)

four_five_df["compare"] = four_five_df["4_gram"].apply(lambda x: find_matches(x, fivegram_df))

(480, 2)


In [32]:
four_five_df["group_count"] = four_five_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else 0)

four_five_df["5_gram_counts"] = four_five_df["compare"].apply(lambda x: sum(x.values()))

four_five_df["4_gram / 5_gram"] = four_five_df["count"] / (four_five_df["5_gram_counts"] + 1)

# four_five_df.to_csv("./n_gram_processed/four_five_df.csv")
four_five_df.iloc[:20]

Unnamed: 0,4_gram,count,compare,group_count,5_gram_counts,4_gram / 5_gram
0,透明質酸,1111,"{'透明質酸及': 126, '含透明質酸': 98, '及透明質酸': 79, '子透明質...",110,1303,0.851994
1,型美甲片,1040,"{'造型美甲片': 605, '薄型美甲片': 435, '型美甲片閃': 7, '型美甲片...",36,1139,0.912281
2,膠原蛋白,824,"{'進膠原蛋白': 68, '解膠原蛋白': 61, '的膠原蛋白': 59, '膠原蛋白及...",109,1020,0.807052
3,指甲品牌,747,"{'指甲品牌全': 744, '銷指甲品牌': 741, '销指甲品牌': 6}",3,1491,0.50067
4,全新造型,746,"{'全新造型美': 744, '牌全新造型': 744, '拓全新造型': 2, '全新造型...",4,1492,0.499665
5,品牌全新,744,"{'甲品牌全新': 744, '品牌全新造': 744}",2,1488,0.499664
6,新造型美,744,"{'全新造型美': 744, '新造型美甲': 744}",2,1488,0.499664
7,造型美甲,744,"{'新造型美甲': 744, '造型美甲片': 605, '造型美甲貼': 139}",3,1488,0.499664
8,甲品牌全,744,"{'甲品牌全新': 744, '指甲品牌全': 744}",2,1488,0.499664
9,牌全新造,744,"{'牌全新造型': 744, '品牌全新造': 744}",2,1488,0.499664


In [33]:
four_five_df.describe()

Unnamed: 0,count,group_count,5_gram_counts,4_gram / 5_gram
count,480.0,480.0,480.0,480.0
mean,149.754167,14.3375,195.610417,2.95505
std,135.822884,12.500466,240.369115,25.245536
min,76.0,0.0,0.0,0.496732
25%,86.0,5.0,95.0,0.636003
50%,106.0,12.0,133.5,0.845757
75%,147.0,20.0,192.25,1.009334
max,1111.0,110.0,1492.0,513.0


In [34]:
four_five_df.sort_values("4_gram / 5_gram", ascending=False)

Unnamed: 0,4_gram,count,compare,group_count,5_gram_counts,4_gram / 5_gram
15,商家直送,513,{},0,0,513.000000
165,聯絡電話,127,{},0,0,127.000000
178,或者點擊,124,{},0,0,124.000000
247,每件數量,105,{},0,0,105.000000
80,類防腐劑,179,{'酯類防腐劑': 3},1,3,44.750000
...,...,...,...,...,...,...
377,科專家測,84,"{'科專家測試': 84, '膚科專家測': 81, '眼科專家測': 3}",3,168,0.497041
445,露防菌保,79,"{'露防菌保護': 79, '滴露防菌保': 79}",2,158,0.496855
437,滴露防菌,79,"{'滴露防菌保': 79, '的滴露防菌': 69, '是滴露防菌': 10}",3,158,0.496855
436,經動物測,79,"{'經動物測試': 79, '不經動物測': 72, '有經動物測': 5, '未經動物測'...",4,158,0.496855


In [35]:
four_five_df["keep_4_gram"] = (four_five_df["group_count"] >= 6) & (four_five_df["4_gram / 5_gram"] >= 0.55)
four_five_df[~(four_five_df["keep_4_gram"] == True)]

Unnamed: 0,4_gram,count,compare,group_count,5_gram_counts,4_gram / 5_gram,keep_4_gram
3,指甲品牌,747,"{'指甲品牌全': 744, '銷指甲品牌': 741, '销指甲品牌': 6}",3,1491,0.500670,False
4,全新造型,746,"{'全新造型美': 744, '牌全新造型': 744, '拓全新造型': 2, '全新造型...",4,1492,0.499665,False
5,品牌全新,744,"{'甲品牌全新': 744, '品牌全新造': 744}",2,1488,0.499664,False
6,新造型美,744,"{'全新造型美': 744, '新造型美甲': 744}",2,1488,0.499664,False
7,造型美甲,744,"{'新造型美甲': 744, '造型美甲片': 605, '造型美甲貼': 139}",3,1488,0.499664,False
...,...,...,...,...,...,...,...
461,兒濕紙巾,77,"{'嬰兒濕紙巾': 75, '兒濕紙巾使': 4, '幼兒濕紙巾': 2}",3,81,0.939024,False
466,系列產品,77,"{'系列產品比': 31, '有系列產品': 27, '根系列產品': 9, '密系列產品'...",21,140,0.546099,False
467,管理委員,77,"{'林管理委員': 76, '管理委員會': 50, '管理委員認': 27}",3,153,0.500000,False
473,森林管理,76,"{'森林管理委': 76, '經森林管理': 39}",2,115,0.655172,False


In [36]:
general_col = ["n-gram", "keep"]

bigram_result = bi_tri_df[["2_gram", "keep_2_gram"]].copy()
bigram_result.columns = general_col

trigram_result = tri_four_df[["3_gram", "keep_3_gram"]].copy()
trigram_result.columns = general_col

fourgram_result = four_five_df[["4_gram", "keep_4_gram"]].copy()
fourgram_result.columns = general_col

final_df = pd.concat([bigram_result, trigram_result, fourgram_result])
final_df.to_csv("./n_gram_processed/final_df.csv")
final_df.iloc[:20]

Unnamed: 0,n-gram,keep
0,肌膚,True
1,配方,True
2,保濕,True
3,使用,True
4,精華,True
5,天然,True
6,有效,True
7,成分,True
8,皮膚,True
9,產品,True
