In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("./sample_data/product_name_sample_data.csv")
df.head()

Unnamed: 0,product_name,description
0,草本花香洗髮露 600毫升,揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽，氛芳花香，令你彷如置身大...
1,Voost 運動水樽,VOOST MUG
2,完美遮瑕筆306 (1.5ml),質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight 使用能提亮妝容<BR><BR>獨...
3,高效防脫增生洗髮液 150毫升,ANTI HAIR LOSS SHAMP
4,ISOTONIC 運動水溶片青檸檬味十片裝,幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水份與電解質，促進神經肌肉傳導; 有助維持肌...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32372 entries, 0 to 32371
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  32361 non-null  object
 1   description   31501 non-null  object
dtypes: object(2)
memory usage: 505.9+ KB


In [6]:
df = df.fillna("*")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32372 entries, 0 to 32371
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  32372 non-null  object
 1   description   32372 non-null  object
dtypes: object(2)
memory usage: 505.9+ KB


In [8]:
df["combine"] = df["product_name"] + df["description"]
df.head()

Unnamed: 0,product_name,description,combine
0,草本花香洗髮露 600毫升,揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽，氛芳花香，令你彷如置身大...,草本花香洗髮露 600毫升揉合100%日本培植的有機草本植物，令髮絲彷如重生，令變得柔滑清爽...
1,Voost 運動水樽,VOOST MUG,Voost 運動水樽VOOST MUG
2,完美遮瑕筆306 (1.5ml),質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight 使用能提亮妝容<BR><BR>獨...,完美遮瑕筆306 (1.5ml)質地柔亮潤澤，遮瑕的同時去除暗沉，作為 highlight ...
3,高效防脫增生洗髮液 150毫升,ANTI HAIR LOSS SHAMP,高效防脫增生洗髮液 150毫升ANTI HAIR LOSS SHAMP
4,ISOTONIC 運動水溶片青檸檬味十片裝,幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水份與電解質，促進神經肌肉傳導; 有助維持肌...,ISOTONIC 運動水溶片青檸檬味十片裝幫助人體代謝碳水化合物、脂肪和蛋白質; 快速補充水...


In [9]:
import re


def clean_text(text:str) -> str:
    # 去除 HTML 標籤
    text = re.sub(r'<.*?>', '', text)
    # 去除非字母字符
    text = re.sub(r'[^a-zA-Z\u4e00-\u9fa5]', ' ', text)
    # 轉換為小寫（僅針對英文）
    text = text.lower()

    return text

In [10]:
df["clean_text"] = df["combine"].apply(clean_text)
df["clean_text"].head()

0    草本花香洗髮露    毫升揉合    日本培植的有機草本植物 令髮絲彷如重生 令變得柔滑清爽...
1                                  voost 運動水樽voost mug
2    完美遮瑕筆        ml 質地柔亮潤澤 遮瑕的同時去除暗沉 作為 highlight ...
3                  高效防脫增生洗髮液    毫升anti hair loss shamp
4    isotonic 運動水溶片青檸檬味十片裝幫助人體代謝碳水化合物 脂肪和蛋白質  快速補充水...
Name: clean_text, dtype: object

In [11]:
def split_chinese_english(text):
    chinese_part = re.findall(r'[\u4e00-\u9fa5]+', text)
    english_part = re.findall(r'[a-zA-Z]+', text)
    return ' '.join(chinese_part), ' '.join(english_part)

In [12]:
df['chinese_part'], df['english_part'] = zip(*df['clean_text'].apply(split_chinese_english))
df["chinese_part"] = df["chinese_part"].apply(lambda x: re.sub(" ", "", x))
df[["chinese_part", "english_part"]].head()

Unnamed: 0,chinese_part,english_part
0,草本花香洗髮露毫升揉合日本培植的有機草本植物令髮絲彷如重生令變得柔滑清爽氛芳花香令你彷如置身...,
1,運動水樽,voost voost mug
2,完美遮瑕筆質地柔亮潤澤遮瑕的同時去除暗沉作為使用能提亮妝容獨有的斜面刷頭設計取代手指貼合面部...,ml highlight
3,高效防脫增生洗髮液毫升,anti hair loss shamp
4,運動水溶片青檸檬味十片裝幫助人體代謝碳水化合物脂肪和蛋白質快速補充水份與電解質促進神經肌肉傳...,isotonic stevia


In [13]:
def do_n_gram_chinese(doc: str, n: int=2) -> list[tuple[str, int]]:
    text = doc

    if doc:
        freq = {}
        for i in range(len(text) - (n-1)):
            n_gram = "".join(text[i:i+n]).lower()
            freq[n_gram] = freq.get(n_gram, 0) + 1

        freq = sorted(freq.items(), key=lambda word_count: word_count[1], reverse=True)

        return freq

    return [("", 0)]

In [14]:
def do_n_gram_english(doc: str, n: int=2) -> list[tuple[str, int]]:
    text = doc.split()

    if doc:
        freq = {}
        for i in range(len(text) - (n-1)):
            n_gram = " ".join(text[i:i+n]).lower()
            freq[n_gram] = freq.get(n_gram, 0) + 1

        freq = sorted(freq.items(), key=lambda word_count: word_count[1], reverse=True)

        return freq

    return [("", 0)]

In [15]:
all_chinese_text = df["chinese_part"].sum()

In [16]:
metrics = []

for i in range(2, 5):
    print(f"Now doing {i}_gram")

    chinese_n_gram = do_n_gram_chinese(all_chinese_text, n=i)

    result = pd.DataFrame(chinese_n_gram, columns=[f"{i}_gram", "freq"])
    result.to_csv(f"./n_gram_processed/{i}_gram.csv", encoding="utf-8-sig", index=False)

    uninqe_count = result.shape[0]
    total_count = result["freq"].sum()
    rare_ngrams = result['freq'][result['freq'] == 1].count()

    coverage = uninqe_count / total_count
    sparsity = rare_ngrams / total_count
    print(f"total uniqe {i}_gram: {uninqe_count}")
    print(f"coverage of {i}_gram: {coverage:.5f}")
    print(f"sparsity of {i}_gram: {sparsity:.5f}")

    metrics.append((i, uninqe_count, total_count, coverage, sparsity))

    print(result.head())
    print("\n")

Now doing 2_gram
total uniqe 2_gram: 215133
coverage of 2_gram: 0.10063
sparsity of 2_gram: 0.04011
  2_gram   freq
0     肌膚  15144
1     配方   6587
2     保濕   5941
3     使用   5885
4     精華   5198


Now doing 3_gram
total uniqe 3_gram: 569396
coverage of 3_gram: 0.26633
sparsity of 3_gram: 0.14011
  3_gram  freq
0    維他命  2816
1    屈臣氏  1830
2    防腐劑  1194
3    型美甲  1179
4    美甲片  1124


Now doing 4_gram
total uniqe 4_gram: 856792
coverage of 4_gram: 0.40076
sparsity of 4_gram: 0.24592
  4_gram  freq
0   透明質酸  1112
1   型美甲片  1040
2   膠原蛋白   824
3   指甲品牌   747
4   全新造型   746




In [17]:
metrics_df = pd.DataFrame(metrics, columns=["n", "uninqe_count", "total_count", "coverage", "sparsity"])
metrics_df

Unnamed: 0,n,uninqe_count,total_count,coverage,sparsity
0,2,215133,2137942,0.100626,0.040107
1,3,569396,2137941,0.266329,0.140113
2,4,856792,2137940,0.400756,0.245924


In [18]:
bigram_df = pd.read_csv("./n_gram_processed/2_gram.csv", encoding="utf-8-sig")
bigram_df.rename(columns={"2_gram": "ngram"}, inplace=True)
bigram_df.head()

Unnamed: 0,ngram,freq
0,肌膚,15144
1,配方,6587
2,保濕,5941
3,使用,5885
4,精華,5198


In [19]:
trigram_df = pd.read_csv("./n_gram_processed/3_gram.csv", encoding="utf-8-sig")
trigram_df.rename(columns={"3_gram": "ngram"}, inplace=True)
trigram_df.head()

Unnamed: 0,ngram,freq
0,維他命,2816
1,屈臣氏,1830
2,防腐劑,1194
3,型美甲,1179
4,美甲片,1124


In [20]:
four_gram_df = pd.read_csv("./n_gram_processed/4_gram.csv", encoding="utf-8-sig")
four_gram_df.rename(columns={"4_gram": "ngram"}, inplace=True)
four_gram_df.head()

Unnamed: 0,ngram,freq
0,透明質酸,1112
1,型美甲片,1040
2,膠原蛋白,824
3,指甲品牌,747
4,全新造型,746


In [21]:
# 添加 n-gram 類型
bigram_df["type"] = "bi"
trigram_df["type"] = "tri"
four_gram_df["type"] = "four"

# 合併所有 n-gram DataFrame
n_grams_df = pd.concat([bigram_df, trigram_df, four_gram_df])

# 計算總詞數和每個 n-gram 的出現概率
total_ngrams = n_grams_df['freq'].sum()  # 6413823
n_grams_df['prob'] = n_grams_df['freq'] / total_ngrams

In [22]:
n_grams_df.head()

Unnamed: 0,ngram,freq,type,prob
0,肌膚,15144,bi,0.002361
1,配方,6587,bi,0.001027
2,保濕,5941,bi,0.000926
3,使用,5885,bi,0.000918
4,精華,5198,bi,0.00081


In [23]:
from collections import Counter

# 計算 n-gram 中詞的詞頻
word_counter = Counter()
for ngram in n_grams_df['ngram']:
    for word in ngram:
        # words = ngram.split()
        word_counter.update(word)

# 計算總詞數
total_words = sum(word_counter.values())  # 1641321
print(total_words)

# 計算每個詞的概率
word_probs = {word: count / total_words for word, count in word_counter.items()}

5565622


In [24]:
import math


# 計算 n-gram 之間的互信息值
def calculate_mi_vectorized(ngram_series1, ngram_series2, prob_series1, prob_series2):
    def compute_common_word_prob(ngram1, ngram2):
        common_words = set(list(ngram1)).intersection(list(ngram2))
        if not common_words:
            return 0
        return math.prod([word_probs[word] for word in common_words])
    
    common_word_probs = ngram_series1.apply(lambda x: ngram_series2.apply(lambda y: compute_common_word_prob(x, y)))
    mi_matrix = prob_series1.values[:, None] * prob_series2.values / (common_word_probs + 1e-10)
    mi_matrix = mi_matrix.map(lambda x: math.log(x, 2) if x > 0 else 0)
    
    return mi_matrix

In [25]:
n_grams_df.head()

Unnamed: 0,ngram,freq,type,prob
0,肌膚,15144,bi,0.002361
1,配方,6587,bi,0.001027
2,保濕,5941,bi,0.000926
3,使用,5885,bi,0.000918
4,精華,5198,bi,0.00081


In [62]:
# 計算 bi-gram 和 tri-gram 之間的互信息
mi_bi_tri = calculate_mi_vectorized(n_grams_df.query("type == 'bi' and freq > 1")["ngram"][:100], 
                                    n_grams_df.query("type == 'tri' and freq > 1")["ngram"][:100], 
                                    n_grams_df.query("type == 'bi' and freq > 1")["prob"][:100], 
                                    n_grams_df.query("type == 'tri' and freq > 1")["prob"][:100])

mi_bi_tri.to_csv("./n_gram_processed/mi_bi_tri.csv", encoding="utf-8-sig", index=False)

mi_bi_tri.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,13.339665,12.717861,12.101821,12.083581,12.01466,12.004355,12.001767,11.980896,11.897042,-6.729361,...,10.40771,10.395933,10.391986,-8.081264,10.376088,10.360014,10.360014,10.355967,10.347839,-8.117586
1,12.138612,11.516808,10.900768,10.882529,10.813607,10.803302,10.800714,10.779843,10.695989,10.53093,...,9.206658,9.19488,9.190933,9.179026,9.175036,9.158961,9.158961,9.154914,9.146786,9.142705
2,11.989696,11.367893,10.751852,10.733613,10.664691,10.654386,10.651798,10.630927,10.547073,10.382014,...,9.057742,9.045965,9.042017,9.030111,9.02612,9.010045,9.010045,9.005998,8.99787,8.993789
3,11.976033,11.354229,10.738189,10.719949,10.651028,10.640723,10.638135,10.617264,10.53341,10.36835,...,9.044078,9.032301,9.028354,9.016447,9.012456,8.996382,8.996382,8.992335,8.984207,8.980126
4,11.796947,11.175144,10.559103,10.540864,10.471942,10.461637,10.459049,10.438178,10.354324,10.189265,...,8.864993,8.853215,8.849268,8.837362,8.833371,8.817296,8.817296,8.813249,8.805121,8.80104


In [63]:
# 將矩陣轉換為長格式
mi_bi_tri_long = mi_bi_tri.stack().reset_index()
mi_bi_tri_long.columns = ['short_index', 'long_index', 'mi_value']

mi_bi_tri_long.head()

Unnamed: 0,short_index,long_index,mi_value
0,0,0,13.339665
1,0,1,12.717861
2,0,2,12.101821
3,0,3,12.083581
4,0,4,12.01466


In [64]:
# 根據索引對應到原始的 bi-gram 和 tri-gram
mi_bi_tri_long['short_index'] = mi_bi_tri_long['short_index'].map(bigram_df['ngram'])
mi_bi_tri_long['long_index'] = mi_bi_tri_long['long_index'].map(trigram_df['ngram'])

In [65]:
mi_bi_tri_long.head(25)

Unnamed: 0,short_index,long_index,mi_value
0,肌膚,維他命,13.339665
1,肌膚,屈臣氏,12.717861
2,肌膚,防腐劑,12.101821
3,肌膚,型美甲,12.083581
4,肌膚,美甲片,12.01466
5,肌膚,透明質,12.004355
6,肌膚,明質酸,12.001767
7,肌膚,無添加,11.980896
8,肌膚,抗氧化,11.897042
9,肌膚,令肌膚,-6.729361


In [76]:
# 計算 tri-gram 和 four-gram 之間的互信息
mi_tri_four = calculate_mi_vectorized(n_grams_df.query("type == 'tri' and freq > 1")["ngram"][:100], 
                                      n_grams_df.query("type == 'four' and freq > 1")["ngram"][:100], 
                                      n_grams_df.query("type == 'tri' and freq > 1")["prob"][:100], 
                                      n_grams_df.query("type == 'four' and freq > 1")["prob"][:100])

mi_tri_four.to_csv("./n_gram_processed/mi_tri_four.csv", encoding="utf-8-sig", index=False)

mi_tri_four.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,9.572147,9.475574,9.139707,8.998171,8.996238,8.994303,8.992365,8.992365,8.992365,8.992365,...,6.992365,6.984588,6.976768,6.968906,6.968906,6.961001,6.961001,6.953052,6.945059,6.945059
1,8.950344,8.85377,8.517903,8.376367,8.374434,8.372499,8.370561,8.370561,8.370561,8.370561,...,6.370561,6.362784,6.354965,6.347103,6.347103,6.339197,6.339197,6.331249,6.323256,6.323256
2,8.334303,8.23773,7.901862,7.760326,7.758394,7.756458,7.754521,7.754521,7.754521,7.754521,...,5.754521,5.746743,5.738924,5.731062,5.731062,5.723156,5.723156,5.715208,5.707215,5.707215
3,8.316064,1.836389,7.883623,-15.994451,-16.246409,7.738219,-16.000257,7.736282,-8.112198,1.35318,...,5.736282,5.728504,5.720685,5.712823,5.712823,5.704917,5.704917,5.696969,5.688976,5.688976
4,8.247142,0.874431,7.814702,-16.063373,7.671233,7.669298,-16.069179,7.66736,-17.413813,-7.931099,...,5.66736,5.659583,5.651763,5.643901,5.643901,5.635996,-19.251655,5.628047,5.620054,5.620054


In [77]:
# 將矩陣轉換為長格式
mi_tri_four_long = mi_tri_four.stack().reset_index()
mi_tri_four_long.columns = ['short_index', 'long_index', 'mi_value']

mi_tri_four_long.head()

Unnamed: 0,short_index,long_index,mi_value
0,0,0,9.572147
1,0,1,9.475574
2,0,2,9.139707
3,0,3,8.998171
4,0,4,8.996238


In [78]:
# 根據索引對應到原始的 bi-gram 和 tri-gram
mi_tri_four_long['short_index'] = mi_tri_four_long['short_index'].map(trigram_df['ngram'])
mi_tri_four_long['long_index'] = mi_tri_four_long['long_index'].map(four_gram_df['ngram'])

In [79]:
mi_tri_four_long.head(25)

Unnamed: 0,short_index,long_index,mi_value
0,維他命,透明質酸,9.572147
1,維他命,型美甲片,9.475574
2,維他命,膠原蛋白,9.139707
3,維他命,指甲品牌,8.998171
4,維他命,全新造型,8.996238
5,維他命,品牌全新,8.994303
6,維他命,甲品牌全,8.992365
7,維他命,牌全新造,8.992365
8,維他命,新造型美,8.992365
9,維他命,造型美甲,8.992365


In [84]:
final_df = pd.concat([mi_bi_tri_long, mi_tri_four_long])
final_df.shape

(20000, 3)

In [85]:
# 設置互信息值的閾值
min_mi = 5

final_df["keep"] = final_df["mi_value"].map(lambda x: x > min_mi)
final_df[3556:3586]

Unnamed: 0,short_index,long_index,mi_value,keep
3556,柔軟,的天然,7.942339,True
3557,柔軟,內送到,7.942339,True
3558,柔軟,過皮膚,7.939391,True
3559,柔軟,粉底液,7.927542,True
3560,柔軟,氨基酸,7.882218,True
3561,柔軟,提升肌,7.85745,True
3562,柔軟,肌膚保,7.85745,True
3563,柔軟,升肌膚,7.844905,True
3564,柔軟,通過皮,7.835424,True
3565,柔軟,使肌膚,7.813056,True


In [86]:
final_df.to_csv("./n_gram_processed/final.csv", encoding="utf-8-sig", index=False)

In [None]:
# n_grams_df['ngram'][(n_grams_df["type"] == "bi") & (n_grams_df["freq"] > 1)]