In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# download packages

!pip3 install bertopic
!pip3 install hdbscan
!pip3 install pandas

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try brew install
[31m   [0m xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a Python library that isn't in Homebrew,
[31m   [0m use a virtual environment:
[31m   [0m 
[31m   [0m python3 -m venv path/to/venv
[31m   [0m source path/to/venv/bin/activate
[31m   [0m python3 -m pip install xyz
[31m   [0m 
[31m   [0m If you wish to install a Python application that isn't in Homebrew,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. You can install pipx with
[31m   [0m 
[31m   [0m brew install pipx
[31m   [0m 
[31m   [0m You may restore the old behavior of pip by passing
[31m   [0m the '--break-system-packages' flag to pip, or by adding
[31m   [0m 'break-system-packag

In [None]:
import logging

# Configure logging: append timestamped records at INFO level and above to bert.log.
# The file is opened explicitly as UTF-8 because the messages logged by this
# notebook contain Chinese text and the platform default encoding is not
# guaranteed to be able to represent it (`encoding` requires Python 3.9+).
logging.basicConfig(
    filename='bert.log',
    filemode='a',
    encoding='utf-8',
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO
)

## BERTopic

- 留言：需要先過濾表情符號、網頁原始碼、ckip斷詞
- 逐字稿：清理時間代碼、ckip斷詞

In [None]:
import pandas as pd
import ast
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from pathlib import Path

In [None]:
# Load the stop-word list: one entry per line of the file, with leading and
# trailing whitespace removed from every entry.
# Alternative location (disabled): open('stopwords_zh-tw.txt', 'r', encoding='utf-8')
with open('/content/drive/MyDrive/comments/stopwords_zh-tw.txt', 'r', encoding='utf-8') as f:
    stopwords = [entry.strip() for entry in f.read().splitlines()]

# Show the loaded entries for a visual sanity check.
print(stopwords)

In [None]:
# Remove stop words from a token list.
def clean_tokens(tokens, stop_words=None):
    """Return the tokens whose whitespace-stripped form is not a stop word.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional iterable of stop words to filter against. When
            omitted, the module-level ``stopwords`` list is used.

    Returns:
        A new list holding the surviving tokens, unmodified (they are stripped
        only for the comparison) and in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the stop-word list for every token.
    blocked = set(stop_words)
    return [t for t in tokens if t.strip() not in blocked]

In [None]:
### Index of the video to process; i ranges over 0 ~ 35.

i = 0
comment_df = pd.read_csv(f"/content/drive/MyDrive/Hello_comment/for_bert/video_{i}_ckip.csv", encoding='utf-8')
print(f"==> 正在處理 video_{i}, 總共有{len(comment_df)} 筆留言")
logging.info(f"==> 正在處理 video_{i}, 總共有{len(comment_df)} 筆留言")

try:
    # 1. Keep only the columns used downstream and drop rows with any missing value.
    data = comment_df[['video_title', 'cleaned_text', 'ws', 'published_at', 'author_name', 'like_count', 'comment_type']]
    data = data.dropna().reset_index(drop=True)

    # 2. Clean the word-segmentation column: parse each cell from its Python
    #    literal text into a token list, drop stop words, then join the
    #    remaining tokens with single spaces.
    data['ws'] = data['ws'].apply(ast.literal_eval)
    data['ws_cleaned'] = data['ws'].apply(clean_tokens)
    data['ws_cleaned'] = data['ws_cleaned'].apply(' '.join)

    # 3. Drop rows whose cleaned text is empty or whitespace only.
    data = data[data['ws_cleaned'].str.strip() != ""].reset_index(drop=True)
    print(f"[INFO]video_{i} 清理後的留言數量: {len(data)}")
    logging.info(f"[INFO]video_{i} 清理後的留言數量: {len(data)}")

    # Report when nothing is left after cleaning, at WARNING severity so the
    # record can be filtered by level in the log file.
    if data.empty:
        print(f"[Warnning]video_{i} 全部為空，跳過")
        logging.warning(f"[Warnning]video_{i} 全部為空，跳過")

except Exception as e:
    print(f"[Error]video_{i} 發生錯誤: {e}")
    # logging.exception records at ERROR level and appends the traceback.
    logging.exception(f"[Error]video_{i} 發生錯誤: {e}")
    # Bare raise re-raises the active exception without adding this frame again.
    raise

In [None]:
# BERTopic component setup.
# NOTE(review): in the part of the notebook visible here, only vectorizer_model
# is passed to BERTopic; confirm whether the other three components are meant
# to be used as well.


def _split_on_space(text):
    """Tokenize a document by splitting it on single space characters."""
    return text.split(" ")


vectorizer_model = CountVectorizer(tokenizer=_split_on_space)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [None]:

# Directory that receives the per-video BERTopic result CSVs; created if missing.
result_dir = Path("/content/drive/MyDrive/Hello_comment/bert_result_0515")
result_dir.mkdir(parents=True, exist_ok=True)

try:
    video_prefix = f"video_{i}"
    assignment_file = result_dir / f"{video_prefix}_topic_assignments.csv"
    keyword_file = result_dir / f"{video_prefix}_topic_keywords.csv"

    if data.empty:
        # Nothing to model; reading the title with .iloc[0] would raise IndexError.
        print(f"[Warnning]video_{i} 全部為空，跳過")
        logging.warning(f"[Warnning]video_{i} 全部為空，跳過")
    else:
        title = data['video_title'].iloc[0]

        if assignment_file.exists() and keyword_file.exists():
            # Both outputs already exist: actually skip instead of refitting the
            # model and overwriting them. Delete the files to force a re-run.
            print(f"[Warnning]{video_prefix} ({title})的留言已經處理過，跳過")
            logging.warning(f"[Warnning]{video_prefix} ({title})的留言已經處理過，跳過")
        else:
            # 4. Run BERTopic
            print(f"==> video_{i} 開始執行 BERTopic")
            logging.info(f"==> video_{i} 開始執行 BERTopic")

            docs = data['ws_cleaned'].tolist()

            # NOTE(review): per the BERTopic docs, `language` is not used when an
            # explicit `embedding_model` is given - confirm.
            # NOTE(review): ctfidf_model, representation_model and hdbscan_model
            # are defined in the setup cell but not passed here - confirm intent.
            topic_model = BERTopic(
                language="chinese (traditional)",
                embedding_model="distiluse-base-multilingual-cased-v1",
                vectorizer_model=vectorizer_model,
                calculate_probabilities=True,
                verbose=True
            )
            topics, probs = topic_model.fit_transform(docs)

            # Per-document topic assignments, enriched with the original comment
            # metadata. `.values` copies by position; docs was built from the
            # same rows of `data`, so the rows line up.
            doc_info = topic_model.get_document_info(docs)
            doc_info['original_text'] = data['cleaned_text'].values
            doc_info['published_at'] = data['published_at'].values
            doc_info['author_name'] = data['author_name'].values
            doc_info['like_count'] = data['like_count'].values
            doc_info['comment_type'] = data['comment_type'].values
            doc_info['video_title'] = title
            doc_info.to_csv(assignment_file, index=False, encoding="utf-8")

            # Per-topic summary table.
            topic_info = topic_model.get_topic_info()
            topic_info.to_csv(keyword_file, index=False, encoding="utf-8")

            print(f"==> BERTopic 結果存到：{video_prefix}")
            logging.info(f"==> BERTopic 結果存到：{video_prefix}")
except Exception as e:
    print(f"[Error]處理 video_{i}發生錯誤: {e}")
    # Record the failure with its traceback and propagate it, matching the
    # error handling of the data-loading cell, instead of silently continuing.
    logging.exception(f"[Error]處理 video_{i}發生錯誤: {e}")
    raise