In [1]:
import pandas as pd

In [2]:
sample_df = pd.read_csv("./sample_data/product_name_sample_data.csv")
sample_df.head()

Unnamed: 0,product_name,description
0,草本花香洗髮露 600毫升,揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽，氛芳花香，令你彷如置身大...
1,Voost 運動水樽,VOOST MUG
2,完美遮瑕筆306 (1.5ml),質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight 使用能提亮妝容<BR><BR>獨...
3,高效防脫增生洗髮液 150毫升,ANTI HAIR LOSS SHAMP
4,ISOTONIC 運動水溶片青檸檬味十片裝,幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水份與電解質，促進神經肌肉傳導; 有助維持肌...


In [3]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32372 entries, 0 to 32371
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  32361 non-null  object
 1   description   31501 non-null  object
dtypes: object(2)
memory usage: 505.9+ KB


In [4]:
sample_df = sample_df.fillna("*")

In [5]:
product_name_df = sample_df[["product_name"]]
description_df = sample_df[["description"]]

product_name_df.columns = ["doc"]
description_df.columns = ["doc"]

long_df = pd.concat([product_name_df, description_df])
long_df.head()

Unnamed: 0,doc
0,草本花香洗髮露 600毫升
1,Voost 運動水樽
2,完美遮瑕筆306 (1.5ml)
3,高效防脫增生洗髮液 150毫升
4,ISOTONIC 運動水溶片青檸檬味十片裝


In [6]:
long_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64744 entries, 0 to 32371
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doc     64744 non-null  object
dtypes: object(1)
memory usage: 1011.6+ KB


In [7]:
import re


def clean_text(text:str) -> str:
    # 去除 HTML 標籤
    text = re.sub(r'<.*?>', ',', text)
    # 去除非字母字符以及英文
    text = re.sub(r'[^\u4e00-\u9fa5]', ',', text)
    text = re.sub(r",+", ",", text)

    return text

In [8]:
long_df["doc"] = long_df["doc"].apply(clean_text)
long_df.iloc[:20]

Unnamed: 0,doc
0,"草本花香洗髮露,毫升"
1,",運動水樽"
2,"完美遮瑕筆,"
3,"高效防脫增生洗髮液,毫升"
4,",運動水溶片青檸檬味十片裝"
5,強生便利貼公主膠布
6,"牛油果滋養修護晚霜,毫升"
7,",麥蘆卡蜂蜜,克,"
8,"光采豐蜜唇釉,"
9,",金盞花面霜"


In [9]:
# def do_n_gram(doc: str, n: int=2) -> list[str]:
#     pattern = re.compile(",")
#     return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not (bool(pattern.search(doc[i: i+n])))]


def do_n_gram(doc: str, n: int=2) -> list[str]:
    return [doc[i: i+n] for i in range(len(doc) - (n-1)) if not ("," in doc[i: i+n])]

In [10]:
colunm_name = ["bigram"]
long_df["count"] = 1

In [11]:
long_df.shape

(64744, 2)

In [12]:
long_df.drop_duplicates(subset=['doc'], inplace=True)  # 64744 → 42071

In [13]:
long_df.shape

(42071, 2)

In [14]:
for i in range(2, 6):
    long_df[f"{i}_gram"] = long_df["doc"].apply(lambda x: do_n_gram(x, i))

In [15]:
long_df.iloc[:20]

Unnamed: 0,doc,count,2_gram,3_gram,4_gram,5_gram
0,"草本花香洗髮露,毫升",1,"[草本, 本花, 花香, 香洗, 洗髮, 髮露, 毫升]","[草本花, 本花香, 花香洗, 香洗髮, 洗髮露]","[草本花香, 本花香洗, 花香洗髮, 香洗髮露]","[草本花香洗, 本花香洗髮, 花香洗髮露]"
1,",運動水樽",1,"[運動, 動水, 水樽]","[運動水, 動水樽]",[運動水樽],[]
2,"完美遮瑕筆,",1,"[完美, 美遮, 遮瑕, 瑕筆]","[完美遮, 美遮瑕, 遮瑕筆]","[完美遮瑕, 美遮瑕筆]",[完美遮瑕筆]
3,"高效防脫增生洗髮液,毫升",1,"[高效, 效防, 防脫, 脫增, 增生, 生洗, 洗髮, 髮液, 毫升]","[高效防, 效防脫, 防脫增, 脫增生, 增生洗, 生洗髮, 洗髮液]","[高效防脫, 效防脫增, 防脫增生, 脫增生洗, 增生洗髮, 生洗髮液]","[高效防脫增, 效防脫增生, 防脫增生洗, 脫增生洗髮, 增生洗髮液]"
4,",運動水溶片青檸檬味十片裝",1,"[運動, 動水, 水溶, 溶片, 片青, 青檸, 檸檬, 檬味, 味十, 十片, 片裝]","[運動水, 動水溶, 水溶片, 溶片青, 片青檸, 青檸檬, 檸檬味, 檬味十, 味十片, ...","[運動水溶, 動水溶片, 水溶片青, 溶片青檸, 片青檸檬, 青檸檬味, 檸檬味十, 檬味十...","[運動水溶片, 動水溶片青, 水溶片青檸, 溶片青檸檬, 片青檸檬味, 青檸檬味十, 檸檬味..."
5,強生便利貼公主膠布,1,"[強生, 生便, 便利, 利貼, 貼公, 公主, 主膠, 膠布]","[強生便, 生便利, 便利貼, 利貼公, 貼公主, 公主膠, 主膠布]","[強生便利, 生便利貼, 便利貼公, 利貼公主, 貼公主膠, 公主膠布]","[強生便利貼, 生便利貼公, 便利貼公主, 利貼公主膠, 貼公主膠布]"
6,"牛油果滋養修護晚霜,毫升",1,"[牛油, 油果, 果滋, 滋養, 養修, 修護, 護晚, 晚霜, 毫升]","[牛油果, 油果滋, 果滋養, 滋養修, 養修護, 修護晚, 護晚霜]","[牛油果滋, 油果滋養, 果滋養修, 滋養修護, 養修護晚, 修護晚霜]","[牛油果滋養, 油果滋養修, 果滋養修護, 滋養修護晚, 養修護晚霜]"
7,",麥蘆卡蜂蜜,克,",1,"[麥蘆, 蘆卡, 卡蜂, 蜂蜜]","[麥蘆卡, 蘆卡蜂, 卡蜂蜜]","[麥蘆卡蜂, 蘆卡蜂蜜]",[麥蘆卡蜂蜜]
8,"光采豐蜜唇釉,",1,"[光采, 采豐, 豐蜜, 蜜唇, 唇釉]","[光采豐, 采豐蜜, 豐蜜唇, 蜜唇釉]","[光采豐蜜, 采豐蜜唇, 豐蜜唇釉]","[光采豐蜜唇, 采豐蜜唇釉]"
9,",金盞花面霜",1,"[金盞, 盞花, 花面, 面霜]","[金盞花, 盞花面, 花面霜]","[金盞花面, 盞花面霜]",[金盞花面霜]


In [16]:
# def compute_document_frequency(target, doc):
#     return doc.iloc[:, 0].str.contains(target).value_counts()[1]

In [17]:
# import math
# def compute_idf(df_counts, total_docs):
#     return math.log(total_docs / (1 + df_counts))

In [18]:
# total_doc = len(long_df)

In [19]:
# bigram_df["df_count"] = bigram_df["2_gram"].apply(lambda x: compute_document_frequency(x, long_df))
# bigram_df["idf_score"] = bigram_df["df_count"].apply(lambda x: compute_idf(x, total_doc))
# bigram_df["tf-idf"] = bigram_df["df_count"] * bigram_df["idf_score"]
# bigram_df.to_csv("./n_gram_processed/bigram_df_tfidf.csv")

In [20]:
bigram_df = long_df[["2_gram", "count"]].explode("2_gram").groupby("2_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# bigram_df = bigram_df[~bigram_df["2_gram"].str.contains("#")].reset_index(drop=True)
bigram_df

Unnamed: 0,2_gram,count
0,肌膚,12065
1,保濕,4962
2,配方,4809
3,使用,4367
4,精華,4299
...,...,...
149680,洗修,1
149681,洗便,1
149682,洗你,1
149683,洗低,1


In [21]:
trigram_df = long_df[["3_gram", "count"]].explode("3_gram").groupby("3_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# trigram_df = trigram_df[~trigram_df["3_gram"].str.contains("#")].reset_index(drop=True)
trigram_df

Unnamed: 0,3_gram,count
0,維他命,2362
1,屈臣氏,1457
2,透明質,924
3,明質酸,922
4,防腐劑,914
...,...,...
343566,明鏡盒,1
343567,明長效,1
343568,明雙梳,1
343569,明雙眼,1


In [22]:
fourgram_df = long_df[["4_gram", "count"]].explode("4_gram").groupby("4_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fourgram_df = fourgram_df[~fourgram_df["4_gram"].str.contains("#")].reset_index(drop=True)
fourgram_df

Unnamed: 0,4_gram,count
0,透明質酸,920
1,膠原蛋白,715
2,個工作天,470
3,商家直送,462
4,送到府上,459
...,...,...
427003,效殺死引,1
427004,效殺死及,1
427005,效歷久如,1
427006,效止腹瀉,1


In [23]:
fivegram_df = long_df[["5_gram", "count"]].explode("5_gram").groupby("5_gram", as_index=False).agg({"count": "sum"}).sort_values("count", ascending=False).reset_index(drop=True)
# fivegram_df = fivegram_df[~fivegram_df["5_gram"].str.contains("#")].reset_index(drop=True)
fivegram_df

Unnamed: 0,5_gram,count
0,個工作天內,448
1,工作天內送,447
2,天內送到府,442
3,內送到府上,442
4,作天內送到,442
...,...,...
427764,提高身體素,1
427765,提高身體的,1
427766,提高身體應,1
427767,提高身體元,1


In [24]:
# def find_matches(ngram1, ngram2):
#     return ngram2[(ngram2.iloc[:, 1] > 1) & (ngram2.iloc[:, 0].str.contains(ngram1))].iloc[:, 0].to_list()

In [25]:
def find_matches(term, ngram):
    pattern = re.compile(term)
    filtered_df = ngram[ngram.iloc[:, 1] > 1]
    matches = filtered_df[filtered_df.iloc[:, 0].apply(lambda x: bool(pattern.search(x)))]  # 100 -> 5.9s
    # matches = filtered_df[filtered_df.iloc[:, 0].str.contains(ngram1)]  # 100 -> 6.7s
    return len(matches)  # {row.iloc[0]: row.iloc[1] for _, row in matches.iterrows()}

In [26]:
import numpy as np
def find_matches_np(term, ngram):  # 100 -> 12.7s
    array = ngram.iloc[:, 0][ngram.iloc[:, 1] > 1].to_list()

    matches = np.char.find(array, term) != -1

    return matches.sum()

In [27]:
bi_tri_df = bigram_df.copy()

bi_tri_df["compare"] = bi_tri_df["2_gram"][bi_tri_df.iloc[:, 1] > 1].apply(lambda x: find_matches(x, trigram_df))

bi_tri_df.to_csv("./n_gram_processed/bi_tri_df.csv")
bi_tri_df

Unnamed: 0,2_gram,count,compare
0,肌膚,12065,600.0
1,保濕,4962,386.0
2,配方,4809,443.0
3,使用,4367,437.0
4,精華,4299,421.0
...,...,...,...
149680,洗修,1,
149681,洗便,1,
149682,洗你,1,
149683,洗低,1,


In [28]:
bi_tri_df["group_count"] = bi_tri_df["compare"].apply(lambda x: len(x.items()) if isinstance(x, dict) else 0)
# bi_tri_df["keep_2_gram"] = bi_tri_df["group_count"].apply(lambda x: True if x > 100 else False)
bi_tri_df

Unnamed: 0,2_gram,count,compare,group_count
0,肌膚,12065,600.0,0
1,保濕,4962,386.0,0
2,配方,4809,443.0,0
3,使用,4367,437.0,0
4,精華,4299,421.0,0
...,...,...,...,...
149680,洗修,1,,0
149681,洗便,1,,0
149682,洗你,1,,0
149683,洗低,1,,0


In [29]:
tri_four_df = trigram_df.copy()

tri_four_df["compare"] = tri_four_df["3_gram"][tri_four_df.iloc[:, 1] > 1].apply(lambda x: find_matches(x, fourgram_df))

bi_tri_df.to_csv("./n_gram_processed/tri_four_df.csv")

tri_four_df

Unnamed: 0,3_gram,count,compare
0,維他命,2362,128.0
1,屈臣氏,1457,179.0
2,透明質,924,54.0
3,明質酸,922,51.0
4,防腐劑,914,25.0
...,...,...,...
343566,明鏡盒,1,
343567,明長效,1,
343568,明雙梳,1,
343569,明雙眼,1,


In [30]:
tri_four_df["group_count"] = tri_four_df["compare"].apply(lambda x: len(x) if isinstance(x, list) else 0)
tri_four_df["keep_3_gram"] = tri_four_df["group_count"].apply(lambda x: True if x>60 else False)

tri_four_df.head(20)

Unnamed: 0,3_gram,count,compare,group_count,keep_3_gram
0,維他命,2362,128.0,0,False
1,屈臣氏,1457,179.0,0,False
2,透明質,924,54.0,0,False
3,明質酸,922,51.0,0,False
4,防腐劑,914,25.0,0,False
5,抗氧化,874,100.0,0,False
6,無添加,765,64.0,0,False
7,原蛋白,717,51.0,0,False
8,膠原蛋,715,54.0,0,False
9,令肌膚,703,82.0,0,False


In [31]:
four_five_df = fourgram_df.copy()

four_five_df["compare"] = four_five_df["4_gram"][four_five_df.iloc[:, 1] > 1].apply(lambda x: find_matches(x, fivegram_df))

four_five_df.to_csv("./n_gram_processed/four_five_df.csv")

four_five_df

Unnamed: 0,4_gram,count,compare
0,透明質酸,920,101.0
1,膠原蛋白,715,101.0
2,個工作天,470,5.0
3,商家直送,462,0.0
4,送到府上,459,2.0
...,...,...,...
427003,效殺死引,1,
427004,效殺死及,1,
427005,效歷久如,1,
427006,效止腹瀉,1,
