In [1]:
# 步驟 1: 安裝所有必要的函式庫
# 在 Colab 環境中，這會解決 hdbscan 的編譯問題

print("--- 正在安裝必要的函式庫 (hdbscan, BERTopic, sBERT) ---")
!pip install pandas sentence-transformers hdbscan umap-learn bertopic
# 確保安裝 SpaCy 英文模型
!python -m spacy download en_core_web_sm

print("\n安裝完成。請確保您已將 'processed_SM_data_lemmatized.csv' 檔案上傳到 Colab 檔案面板中。")

--- 正在安裝必要的函式庫 (hdbscan, BERTopic, sBERT) ---
Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by select

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import os

# --- File path and column settings ---
# NOTE: in Colab the bare file name is usually enough as a path.
file_path = "processed_SM_data_lemmatized.csv"
english_col = 'english_lemmas'
korean_col = 'korean_tokens'

# Name of the CSV written at the end of the analysis
output_df_path = "processed_SM_data_topics_multilingual.csv"

# --- Step 1: load and merge the data ---
# Failures are reported by raising instead of calling exit(): inside an IPython
# kernel exit() asks the kernel to shut down rather than stopping the cell at
# that line, so the statements after it would still run against missing or
# half-built variables. Raising stops the cell immediately and keeps the
# traceback visible.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"致命錯誤：找不到檔案 '{file_path}'。請確保檔案已上傳到 Colab 環境。"
    ) from err

# Both text columns must be present before they can be concatenated.
missing_cols = [col for col in (english_col, korean_col) if col not in df.columns]
if missing_cols:
    raise ValueError(
        f"錯誤：CSV 檔案中缺少必要的欄位: {', '.join(missing_cols)}。"
        f"現有欄位: {df.columns.tolist()}"
    )

print("正在合併英文和韓文歌詞到 'full_lyrics' 欄位...")
# Join the two columns with a single space; fillna("") turns missing values
# into empty strings so the concatenation never yields NaN.
df['full_lyrics'] = (
    df[english_col].fillna("").astype(str)
    + " "
    + df[korean_col].fillna("").astype(str)
)

# Documents to analyse: every row whose merged text is not blank.
# Rows where both columns were empty collapse to " " and are dropped here.
docs = [doc for doc in df['full_lyrics'].tolist() if isinstance(doc, str) and doc.strip()]

# Refuse to continue with fewer than 10 usable documents.
if len(docs) < 10:
    raise ValueError("錯誤: 有效歌詞數量不足，請檢查您的數據。")

print(f"成功處理 {len(docs)} 條歌詞數據。")

  $\max \{ \mathrm{core}_k(a), \mathrm{core}_k(b), \frac{1}{\alpha}\, d(a,b) \}$.


正在合併英文和韓文歌詞到 'full_lyrics' 欄位...
成功處理 3115 條歌詞數據。


In [3]:
# --- Step 2: choose the multilingual model and embed the documents ---

# Multilingual Sentence Transformer model used to embed the lyrics
model_name = "paraphrase-multilingual-mpnet-base-v2"
print(f"正在載入多語言模型: {model_name}...")

# Load the model (downloaded automatically on first use) and encode every
# document into an embedding vector.
try:
    embedding_model = SentenceTransformer(model_name)

    print("正在生成跨語言歌詞向量...這可能需要一些時間。")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)

except Exception as e:
    print(f"載入或生成向量時發生錯誤: {e}")
    # Re-raise instead of calling exit(): in an IPython kernel exit() does not
    # stop the cell at this line, so the clustering code below would still run
    # without a model or embeddings.
    raise

# --- Step 3: BERTopic topic modelling and clustering ---

print("\n正在進行 BERTopic 主題分群 (聚類)...")

# Configure the BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    # NOTE(review): per the BERTopic docs `language` is not used when an
    # explicit embedding_model is passed — confirm; kept to document intent.
    language="multilingual",
    top_n_words=10,
    # "auto" merges similar topics after the initial clustering
    # (automatic topic reduction) rather than fixing a topic count up front.
    nr_topics="auto"
)

# Fit on the precomputed embeddings and get one topic id per document
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# --- Step 4: analysis and result export ---

# Per-document information, including the assigned topic id
topic_info_df = topic_model.get_document_info(docs)

# Write the topic ids back onto the original DataFrame. `docs` was built from
# the non-blank rows of 'full_lyrics' in row order, so the same filter applied
# as a boolean mask selects exactly the analysed rows, in the same order.
analyzed_mask = df['full_lyrics'].map(
    lambda doc: isinstance(doc, str) and bool(doc.strip())
).astype(bool)
if int(analyzed_mask.sum()) != len(topic_info_df):
    raise ValueError("錯誤：主題結果數量與參與分析的歌詞數量不一致，請重新執行前一個儲存格。")

df['topic_id'] = -2  # -2 marks rows that did not take part in the analysis
df.loc[analyzed_mask, 'topic_id'] = topic_info_df['Topic'].to_numpy()
df['topic_id'] = df['topic_id'].astype(int)

# Topic overview: computed once and reused for the per-topic counts below
topic_overview = topic_model.get_topic_info()
print("\n--- 主題總覽 ---")
print(topic_overview)

# Keywords of every topic (English and Korean terms mixed)
topic_counts = topic_overview.set_index('Topic')['Count']
print("\n--- 各主題關鍵詞 (英/韓文) ---")
for topic_num in topic_model.get_topics():
    if topic_num == -1:
        # -1 is the noise/outlier bucket and is not listed as a topic
        continue
    count = topic_counts.loc[topic_num]
    keywords = topic_model.get_topic(topic_num)
    print(f"主題 {topic_num} ({count} 首歌):")
    print(keywords)


# Save the DataFrame, now carrying topic ids, as CSV
df.to_csv(output_df_path, index=False, encoding='utf-8')
print(f"\n跨語言主題分析完成。結果已儲存至 '{output_df_path}' (請在 Colab 檔案面板下載)")

# Save the fitted BERTopic model (optional)
topic_model.save("multilingual_lyrics_topic_model")
print("BERTopic 模型已儲存。")

正在載入多語言模型: paraphrase-multilingual-mpnet-base-v2...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

正在生成跨語言歌詞向量...這可能需要一些時間。


Batches:   0%|          | 0/98 [00:00<?, ?it/s]


正在進行 BERTopic 主題分群 (聚類)...

--- 主題總覽 ---
    Topic  Count                             Name  \
0      -1    755                   -1_보다_사랑_않다_같다   
1       0   2047                 0_love_보다_get_사랑   
2       1     70                1_nae_nal_su_neol   
3       2     35                 2_nae_eun_eul_eo   
4       3     34                 3_ng_que_de_amor   
5       4     28         4_christmas_snow_ta_time   
6       5     25              5_uh_걸리버_super_안녕하다   
7       6     20             6_forever_사랑_love_그대   
8       7     20  7_dream_odyssey_dystopia_mirage   
9       8     20              8_mind_세상_great_만들다   
10      9     17              9_talk_오빠_lingo_해보다   
11     10     17              10_run_runnin_째깍_go   
12     11     15                  11_ddu_du_사랑_ru   
13     12     12                12_neun_ji_ee_geu   

                                       Representation  \
0        [보다, 사랑, 않다, 같다, love, 나르다, 버리다, 내다, 순간, 시간]   
1   [love, 보다, get, 사랑, 않다, want, come, 같다, 그대, 




跨語言主題分析完成。結果已儲存至 'processed_SM_data_topics_multilingual.csv' (請在 Colab 檔案面板下載)
BERTopic 模型已儲存。
