In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# # download packages

# !pip3 install bertopic
# !pip3 install hdbscan
# !pip3 install pandas

In [2]:
import logging

# 設定 logging
logging.basicConfig(
    filename='bert.log',
    filemode='a',
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO
)

## BERTpopic

- 留言：需要先過濾表情符號、網頁原始碼、ckip斷詞
- 逐字稿：清理時間代碼、ckip斷詞

In [3]:
import pandas as pd
import ast
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 載入停用詞
with open('stopwords_zh-tw.txt', 'r', encoding='utf-8') as f:
# with open('/content/drive/MyDrive/comments/stopwords_zh-tw.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()
    stopwords = [word.strip() for word in stopwords]
    print(stopwords)

['\ufeff,', '?', '、', '。', '“', '”', '《', '》', '！', '，', '：', '；', '？', '人民', '末##末', '啊', '阿', '哎', '哎呀', '哎喲', '唉', '我', '我們', '按', '按照', '依照', '吧', '吧噠', '把', '罷了', '被', '本', '本著', '比', '比方', '比如', '鄙人', '彼', '彼此', '邊', '別', '別的', '別說', '並', '並且', '不比', '不成', '不單', '不但', '不獨', '不管', '不光', '不過', '不僅', '不拘', '不論', '不怕', '不然', '不如', '不特', '不惟', '不問', '不只', '朝', '朝著', '趁', '趁著', '乘', '沖', '除', '除此之外', '除非', '除了', '此', '此間', '此外', '從', '從而', '打', '待', '但', '但是', '當', '當著', '到', '得', '的', '的話', '等', '等等', '地', '第', '叮咚', '對', '對於', '多', '多少', '而', '而況', '而且', '而是', '而外', '而言', '而已', '爾後', '反過來', '反過來說', '反之', '非但', '非徒', '否則', '嘎', '嘎登', '該', '趕', '個', '各', '各個', '各位', '各種', '各自', '給', '根據', '跟', '故', '故此', '固然', '關於', '管', '歸', '果然', '果真', '過', '哈', '哈哈', '呵', '和', '何', '何處', '何況', '何時', '嘿', '哼', '哼唷', '呼哧', '乎', '嘩', '還是', '還有', '換句話說', '換言之', '或', '或是', '或者', '極了', '及', '及其', '及至', '即', '即便', '即或', '即令', '即若', '即使', '幾', '幾時', '己', '既', '既然', '既是', '繼而', '加之', '假如', '假若', '假使', '鑒於', 

In [5]:
# 清理停用詞
def clean_tokens(tokens):
    return [t for t in tokens if t.strip() not in stopwords]

In [6]:
### i = 0 ~ 35

# num_list = [0, 1, 7, 14, 21, 22, 25, 31, 33, 34, 35]

i = 7

comment_df = pd.read_csv(f"hello_comments/spam_result/v{i}_self_training_results.csv", encoding='utf-8')
print(">>> 總共有", len(comment_df), "筆留言")
# comment_df = pd.read_csv(f"/content/drive/MyDrive/Hello_comment/for_bert/video_{i}_ckip.csv", encoding='utf-8')
# print(f"==> 正在處理 video_{i}, 總共有{len(comment_df)} 筆留言")
# logging.info(f"==> 正在處理 video_{i}, 總共有{len(comment_df)} 筆留言")

try:
    data = comment_df[['video_title', 'cleaned_text', 'ws', 'published_at', 'author_name', 'like_count', 'comment_type', 'predicted_label']]
    data = data.dropna().reset_index(drop=True)

    # 2. 清理斷詞欄位
    data['ws'] = data['ws'].apply(lambda x: ast.literal_eval(x))
    data['ws_cleaned'] = data['ws'].apply(clean_tokens)
    data['ws_cleaned'] = data['ws_cleaned'].apply(lambda x: ' '.join(x))

    # 3. 過濾空白
    data = data[data['ws_cleaned'].str.strip() != ""].reset_index(drop=True)
    print(">>> 清理後的留言數量:", len(data))
    # print(f"[INFO]video_{i} 清理後的留言數量: {len(data)}")
    # logging.info(f"[INFO]video_{i} 清理後的留言數量: {len(data)}")

    # 檢查是否有空的資料
    if data.empty:
        print(">>> 資料為空，跳過")
        # print(f"[Warnning]video_{i} 全部為空，跳過")
        # logging.info(f"[Warnning]video_{i} 全部為空，跳過")

except Exception as e:
    print(">>> 發生錯誤:", e)
    # print(f"[Error]video_{i} 發生錯誤: {e}")
    # logging.info(f"[Error]video_{i} 發生錯誤: {e}")
    raise e

>>> 總共有 2135 筆留言
>>> 清理後的留言數量: 2127


# BERTopic Set Up

In [7]:
# bertopic setup
vectorizer_model = CountVectorizer(tokenizer=lambda x: x.split(" "))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean',
                        cluster_selection_method='eom', prediction_data=True, min_samples=10)


In [8]:

# result_dir = Path("/content/drive/MyDrive/Hello_comment/bert_result_0515")
result_dir = Path("hello_comments/bert_result_0606")
result_dir.mkdir(parents=True, exist_ok=True)

try:
    # 4. 執行 BERTopic
    # print(f"==> video_{i} 開始執行 BERTopic")
    # logging.info(f"==> video_{i} 開始執行 BERTopic")

    docs = data['ws_cleaned'].tolist()
    title = data['video_title'].iloc[0]
    video_prefix = f"video_{i}"
    # video_prefix = f"new_video_0_test"
    assignment_file = result_dir / f"{video_prefix}_topic_assignments.csv"
    keyword_file = result_dir / f"{video_prefix}_topic_keywords.csv"

    if assignment_file.exists() and keyword_file.exists():
        print(f"[Warnning]{video_prefix} ({title})的留言已經處理過，跳過")

    topic_model = BERTopic(
        language="chinese (traditional)",
        embedding_model="all-MiniLM-L6-v2",
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(docs)

    doc_info = topic_model.get_document_info(docs)
    doc_info['original_text'] = data['cleaned_text'].values
    doc_info['published_at'] = data['published_at'].values
    doc_info['author_name'] = data['author_name'].values
    doc_info['like_count'] = data['like_count'].values
    doc_info['comment_type'] = data['comment_type'].values
    doc_info['video_title'] = title
    doc_info['spam_tag'] = data['predicted_label'].values
    doc_info.to_csv(assignment_file, index=False, encoding="utf-8")

    topic_info = topic_model.get_topic_info()
    topic_info.to_csv(keyword_file, index=False, encoding="utf-8")

    print(f"==> BERTopic 結果存到：{video_prefix}")
except Exception as e:
    print(">>> 處理 video 發生錯誤:", e)
    # print(f"[Error]處理 video_{i}發生錯誤: {e}")

2025-06-06 10:51:58,617 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 67/67 [00:08<00:00,  7.63it/s]
2025-06-06 10:52:14,402 - BERTopic - Embedding - Completed ✓
2025-06-06 10:52:14,403 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-06 10:52:25,477 - BERTopic - Dimensionality - Completed ✓
2025-06-06 10:52:25,478 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-06 10:52:25,901 - BERTopic - Cluster - Completed ✓
2025-06-06 10:52:25,908 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-06 10:52:25,980 - BERTopic - Representation - Completed ✓


==> BERTopic 結果存到：video_7
