In [10]:
import sys
import os

# Add the parent directory (assumed to be the project root) to sys.path so the `src` imports below resolve
sys.path.append(os.path.abspath(".."))

from src.data_loader import load_comments
from src.chunking import chunk_comments
from src.llm_extraction_v2 import extract_insights_from_chunk
from src.aggregation import aggregate_insights_with_clustering

# Load the comments CSV and split it into chunks (chunk_size=500) for the extraction step
comments = load_comments("../data/bts_comments.csv")
chunks = chunk_comments(comments, chunk_size=500)

print("Chunking is done")

# Concurrent extraction
from src.parsing import robust_parse
import concurrent.futures

# Accumulators filled by the executor loop: one result dict per processed chunk,
# plus the indices of chunks whose LLM output could not be parsed
group_results = []
failed = []

def process_chunk(i_chunk):
    """Extract and parse insights for a single chunk.

    Args:
        i_chunk: an ``(index, chunk)`` pair, as produced by ``enumerate(chunks)``.

    Returns:
        A ``(index, insights, is_failed)`` tuple. ``insights`` is the parsed dict
        when parsing succeeds. When ``robust_parse`` returns ``None`` the raw LLM
        output is written to ``debug_raw_chunk_{index}.txt`` for inspection and
        ``insights`` is a placeholder with an empty list per category, with
        ``is_failed`` set to ``True``.
    """
    i, chunk = i_chunk
    raw_output = extract_insights_from_chunk(chunk)
    parsed = robust_parse(raw_output, raise_on_fail=False)

    if parsed is None:
        # Keep the unparseable output on disk so the failure can be debugged later
        with open(f"debug_raw_chunk_{i}.txt", "w", encoding="utf-8") as f:
            f.write(str(raw_output))
        # The placeholder uses the same keys as successfully parsed results
        # (content_elements, audience_identity_signals, engagement_drivers,
        # audience_painpoints), so downstream code sees one consistent schema.
        return i, {
            "content_elements": [],
            "audience_identity_signals": [],
            "engagement_drivers": [],
            "audience_painpoints": []
        }, True
    else:
        return i, parsed, False


# Select the range of chunks to process
selected_chunks = list(enumerate(chunks[:6]))

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(process_chunk, item) for item in selected_chunks]

    # Collect in submission order rather than completion order: the chunks still run
    # concurrently, but group_results[k] now always belongs to selected chunk k and the
    # ordering no longer changes from run to run with thread timing.
    for future in futures:
        i, result, is_failed = future.result()
        group_results.append(result)
        if is_failed:
            failed.append(i)

print(f"LLM summaries for all chunks are done. total={len(group_results)} failed={len(failed)}. failed indices={failed}")

# Cluster and rank the per-chunk phrases, then show the top entries for each category
final = aggregate_insights_with_clustering(group_results, eps=0.35)
print(final['top_content_elements'])
print(final['top_audience_insights'])
print(final['top_engagement_drivers'])
print(final['top_audience_pain_points'])


Chunking is done
LLM summaries for all chunks are done. total=6 failed=0. failed indices=[]
[("Jimin's funny expressions", 6), ("Taehyung's eyes", 4), ('BTS', 2), ("Namjoon's leadership", 2), ('Suga', 2)]
[('Bilingual fans (comments in multiple languages)', 4), ('Audience members expressing emotional connections to BTS', 4), ('International ARMY fans', 3)]
[("Emotional connection to BTS's music and performances", 6), ('Humor and comedic moments from the members', 6)]
[('Desire for more content and longer versions of videos', 4), ("Frustration with the lack of recognition for BTS's contributions", 2)]


In [11]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# 1. Collect every non-blank phrase for one category (here: content_elements)
items = []
for res in group_results:
    for phrase in res.get("content_elements", []):
        if phrase and phrase.strip():
            items.append(phrase.strip())

texts = items

# 2. Get the embeddings
from src.aggregation import get_embeddings
embs = np.array(get_embeddings(texts))

# 3. Compute the pairwise cosine-distance matrix
dists = cosine_distances(embs)

# 4. Re-run DBSCAN on the precomputed distances
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=0.35, min_samples=1, metric="precomputed")
labels = clustering.fit_predict(dists)

# 5. Drop noise points (DBSCAN labels them -1)
mask = labels != -1

filtered_labels = labels[mask]
filtered_dists = dists[mask][:, mask]

# 6. silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
#    With min_samples=1 every phrase can end up in its own cluster, so guard both ends
#    instead of letting silhouette_score raise.
n_clusters = len(set(filtered_labels))
n_points = len(filtered_labels)
if n_clusters < 2:
    print("Cannot compute silhouette score (only one cluster)")
elif n_clusters >= n_points:
    print("Cannot compute silhouette score (every phrase is its own cluster)")
else:
    score = silhouette_score(filtered_dists, filtered_labels, metric="precomputed")
    print("Silhouette Score:", score)


Silhouette Score: 0.31901039720844243


In [None]:
# Silhouette scores measured over 5 full pipeline runs, with neither the random seed
# nor the concurrent completion order fixed
import pandas as pd

# Stored as silhouette_runs rather than silhouette_score so the sklearn function of that
# name stays callable in the kernel after this cell has run.
silhouette_runs = pd.DataFrame({"result":[0.25, 0.17, 0.17, 0.09, 0.32]})
silhouette_score_mean = silhouette_runs.mean()
silhouette_score_std = silhouette_runs.std()

display(f"silhouette_score_mean: {silhouette_score_mean['result'].round(2)}", f"silhouette_score_std: {silhouette_score_std['result'].round(2)}")

'silhouette_score_mean: 0.2'

'silhouette_score_std: 0.09'

In [29]:
# Reload the comments CSV and re-chunk it (chunk_size=500) for a fresh extraction run
comments = load_comments("../data/bts_comments.csv")
chunks = chunk_comments(comments, chunk_size=500)

print("Chunking is done")

# Reset the accumulators so results from earlier runs are not mixed into this one
group_results = []
failed = []

def process_chunk(i_chunk):
    """Extract and parse insights for a single chunk.

    Args:
        i_chunk: an ``(index, chunk)`` pair, as produced by ``enumerate(chunks)``.

    Returns:
        A ``(index, insights, is_failed)`` tuple. ``insights`` is the parsed dict
        when parsing succeeds. When ``robust_parse`` returns ``None`` the raw LLM
        output is written to ``debug_raw_chunk_{index}.txt`` for inspection and
        ``insights`` is a placeholder with an empty list per category, with
        ``is_failed`` set to ``True``.
    """
    i, chunk = i_chunk
    raw_output = extract_insights_from_chunk(chunk)
    parsed = robust_parse(raw_output, raise_on_fail=False)

    if parsed is None:
        # Keep the unparseable output on disk so the failure can be debugged later
        with open(f"debug_raw_chunk_{i}.txt", "w", encoding="utf-8") as f:
            f.write(str(raw_output))
        # The placeholder uses the same keys as successfully parsed results
        # (content_elements, audience_identity_signals, engagement_drivers,
        # audience_painpoints), so downstream code sees one consistent schema.
        return i, {
            "content_elements": [],
            "audience_identity_signals": [],
            "engagement_drivers": [],
            "audience_painpoints": []
        }, True
    else:
        return i, parsed, False


# Select the range of chunks to process
selected_chunks = list(enumerate(chunks[:6]))

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(process_chunk, item) for item in selected_chunks]

    # Collect in submission order rather than completion order: the chunks still run
    # concurrently, but group_results[k] (and the CSV row k saved below) now always
    # belongs to selected chunk k instead of depending on thread timing.
    for future in futures:
        i, result, is_failed = future.result()
        group_results.append(result)
        if is_failed:
            failed.append(i)

print(group_results)
# Snapshot the extracted phrases so the downstream clustering can be evaluated on fixed inputs
eval_dbscan = pd.DataFrame({"group_results": group_results})
eval_dbscan.to_csv("../data/eval_dbscan.csv")

Chunking is done
[{'content_elements': ["Jungkook's cute moments", "BTS's brotherhood", "Jin's humor", "Taehyung's unique personality", "Jimin's dance skills"], 'audience_identity_signals': ['New ARMY members', 'Fans expressing cultural connections', 'International audience from various countries'], 'engagement_drivers': ["Relatability to BTS's struggles and growth", 'Humor in interactions and comments', "Emotional connection to members' personalities"], 'audience_painpoints': ['Desire for more BTS content', 'Frustration with negative comments about BTS', "Concerns about members' well-being during military service"]}, {'content_elements': ["Jin's humor", "Jungkook's cuteness", "BTS's camaraderie", "Hobi's dancing skills", "Suga's savage moments"], 'audience_identity_signals': ['Fans expressing deep emotional connections to BTS', 'International audience with diverse languages and cultures', 'Younger audience members, including teens and children'], 'engagement_drivers': ['Relatability t

In [32]:
# Flatten the content_elements phrases of every chunk result into one list,
# skipping results whose value is not a list and entries that are empty or blank
texts = [
    str(t).strip()
    for elems in (res.get("content_elements", []) for res in group_results)
    if isinstance(elems, list)
    for t in elems
    if t and str(t).strip()
]

print("Total content_elements:", len(texts))
print(texts[:10])


Total content_elements: 30
["Jungkook's cute moments", "BTS's brotherhood", "Jin's humor", "Taehyung's unique personality", "Jimin's dance skills", "Jin's humor", "Jungkook's cuteness", "BTS's camaraderie", "Hobi's dancing skills", "Suga's savage moments"]


In [33]:
import numpy as np
from src.aggregation import get_embeddings

# Embed the collected phrases with the project helper and stack the result into an array
# (the printed shape is the sanity check that there is one row per phrase)
embs = np.array(get_embeddings(texts))
print("Embedding shape:", embs.shape)

Embedding shape: (30, 1536)


In [36]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import silhouette_score

scores = []

# The distance matrix depends only on the fixed embeddings, so compute it once
dists = cosine_distances(embs)

# Repeat the clustering on identical inputs to check that the downstream stage is stable
for i in range(5):
    clustering = DBSCAN(
        eps=0.35,
        min_samples=1,
        metric="precomputed"
    )

    labels = clustering.fit_predict(dists)

    # Drop noise points (DBSCAN labels them -1)
    mask = labels != -1
    n_clusters = len(set(labels[mask]))

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1
    if 1 < n_clusters < mask.sum():
        score = silhouette_score(
            dists[mask][:, mask],
            labels[mask],
            metric="precomputed"
        )
    else:
        score = None

    scores.append(score)
    print(f"Run {i+1}: {score}")

# Aggregate only the runs that produced a score; np.mean / np.std cannot handle None
valid_scores = [s for s in scores if s is not None]
if valid_scores:
    print("silhouette_score_mean:", np.mean(valid_scores).round(2))
    print("silhouette_score_std:", np.std(valid_scores).round(2))
else:
    print("silhouette_score_mean: n/a (no run produced a valid silhouette score)")
    print("silhouette_score_std: n/a (no run produced a valid silhouette score)")

Run 1: 0.21873193522521206
Run 2: 0.21873193522521206
Run 3: 0.21873193522521206
Run 4: 0.21873193522521206
Run 5: 0.21873193522521206
silhouette_score_mean: 0.22
silhouette_score_std: 0.0


### Variance decomposition is done
#### Downstream clustering is stable. Variance mainly comes from upstream LLM summarization.
- Pipeline-level silhouette: 0.20 ± 0.09 (n=5, full runs), i.e. high run-to-run variance was observed
- Downstream test (fixed extracted phrases & embeddings): silhouette = 0.2187 (std = 0, n=5), i.e. the downstream clustering is stable

In [37]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# 重新计算 labels（确保和刚才一致）
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances

dists = cosine_distances(embs)
clustering = DBSCAN(eps=0.35, min_samples=1, metric="precomputed")
labels = clustering.fit_predict(dists)

# -------- Average pairwise similarity inside each cluster --------

# Map every cluster label to the row indices assigned to it; noise (-1) is left out
cluster_dict = defaultdict(list)
for idx, lab in enumerate(labels):
    if lab == -1:
        continue
    cluster_dict[lab].append(idx)

print("Total clusters:", len(cluster_dict))
print()

for lab, indices in cluster_dict.items():
    n = len(indices)

    if n >= 2:
        sim_matrix = cosine_similarity(embs[indices])

        # Strict upper triangle only: each pair is counted once and the
        # self-similarity diagonal is excluded
        pair_rows, pair_cols = np.triu_indices(n, k=1)
        avg_sim = sim_matrix[pair_rows, pair_cols].mean()

        print(f"Cluster {lab}:")
        print(f"  size = {n}")
        print(f"  avg cosine similarity = {avg_sim:.4f}")

        # Show up to three member phrases as a quick sanity check
        print("  sample texts:")
        for i in indices[:3]:
            print("   -", texts[i])

        print()
    else:
        # A single-member cluster has no pairs to average
        print(f"Cluster {lab}: only 1 item (skip)")


Total clusters: 15

Cluster 0:
  size = 4
  avg cosine similarity = 0.7829
  sample texts:
   - Jungkook's cute moments
   - Jungkook's cuteness
   - Jungkook

Cluster 1:
  size = 4
  avg cosine similarity = 0.7833
  sample texts:
   - BTS's brotherhood
   - BTS's camaraderie
   - BTS members' interactions

Cluster 2:
  size = 5
  avg cosine similarity = 0.7012
  sample texts:
   - Jin's humor
   - Jin's humor
   - Jimin's humor

Cluster 3:
  size = 2
  avg cosine similarity = 0.6574
  sample texts:
   - Taehyung's unique personality
   - Taehyung's charm and looks

Cluster 4:
  size = 2
  avg cosine similarity = 0.6850
  sample texts:
   - Jimin's dance skills
   - Hobi's dancing skills

Cluster 5:
  size = 3
  avg cosine similarity = 0.6558
  sample texts:
   - Suga's savage moments
   - Suga
   - Suga's rap skills

Cluster 6: only 1 item (skip)
Cluster 7: only 1 item (skip)
Cluster 8: only 1 item (skip)
Cluster 9: only 1 item (skip)
Cluster 10: only 1 item (skip)
Cluster 11: only 1 

### Sensitivity check

In [40]:
# sensitivity_check.ipynb cell
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.metrics import silhouette_score
from collections import defaultdict

# Assumes the following already exist from earlier cells:
# embs: numpy array shape (n, d)
# texts: list of strings aligned with embs

# Parameter grid for the DBSCAN sensitivity sweep
EPS_LIST = [0.25, 0.30, 0.35, 0.40, 0.45]
MIN_SAMPLES_LIST = [1, 2, 3]

def eval_for_params(embs, eps, min_samples):
    """Cluster `embs` with DBSCAN on cosine distances and summarize cluster quality.

    Args:
        embs: array of embeddings, one row per item.
        eps: DBSCAN neighborhood radius, applied to cosine distance.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        dict with keys:
          - ``silhouette``: silhouette score over non-noise points, or ``None`` when
            it is undefined (fewer than 2 clusters, or every non-noise point is its
            own cluster).
          - ``n_clusters``: number of clusters, excluding noise.
          - ``noise_ratio``: fraction of points labelled -1.
          - ``mean_cohesion``: mean over clusters with at least 2 members of the
            average pairwise cosine similarity inside the cluster, or ``None`` when
            no such cluster exists.
          - ``labels``: the DBSCAN label array.
    """
    dists = cosine_distances(embs)
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    labels = clustering.fit_predict(dists)
    mask = labels != -1
    n_kept = int(mask.sum())
    # n_clusters excluding noise
    n_clusters = len(set(labels[mask])) if n_kept > 0 else 0
    noise_ratio = (labels == -1).sum() / len(labels)
    sil = None
    # silhouette_score requires 2 <= n_clusters <= n_samples - 1; without the upper
    # bound a setting that leaves every point in its own cluster would raise and
    # abort the whole parameter sweep.
    if 1 < n_clusters < n_kept:
        sil = silhouette_score(dists[mask][:, mask], labels[mask], metric="precomputed")
    # cluster cohesion: avg of avg intra-cluster cosine similarity
    cluster_indices = defaultdict(list)
    for i, lab in enumerate(labels):
        if lab != -1:
            cluster_indices[lab].append(i)
    cohesion_scores = []
    for lab, idxs in cluster_indices.items():
        if len(idxs) < 2:
            # a singleton has no pairs to average
            continue
        sims = cosine_similarity(embs[idxs])
        # strict upper triangle: each pair once, diagonal excluded
        upper = sims[np.triu_indices(len(idxs), k=1)]
        cohesion_scores.append(np.mean(upper))
    mean_cohesion = np.mean(cohesion_scores) if cohesion_scores else None
    return dict(silhouette=sil, n_clusters=n_clusters, noise_ratio=noise_ratio, mean_cohesion=mean_cohesion, labels=labels)

# Metrics copied from each evaluation into the summary table, in column order
metric_columns = ("silhouette", "n_clusters", "noise_ratio", "mean_cohesion")

rows = []
for eps in EPS_LIST:
    for ms in MIN_SAMPLES_LIST:
        metrics = eval_for_params(embs, eps=eps, min_samples=ms)
        row = {"eps": eps, "min_samples": ms}
        row.update((col, metrics[col]) for col in metric_columns)
        rows.append(row)

df = pd.DataFrame(rows)
# For easier reading: best silhouette first
print(df.sort_values(by=["silhouette"], ascending=False))


     eps  min_samples  silhouette  n_clusters  noise_ratio  mean_cohesion
1   0.25            2    0.791385           3     0.666667       0.891090
2   0.25            3    0.791385           3     0.666667       0.891090
5   0.30            3    0.685606           3     0.600000       0.812199
4   0.30            2    0.668814           4     0.533333       0.786581
8   0.35            3    0.564029           4     0.466667       0.730782
7   0.35            2    0.459048           7     0.266667       0.710749
10  0.40            2    0.391564           5     0.066667       0.632583
11  0.40            3    0.391564           5     0.066667       0.632583
9   0.40            1    0.316881           7     0.000000       0.632583
0   0.25            1    0.220895          23     0.000000       0.891090
6   0.35            1    0.218732          15     0.000000       0.710749
3   0.30            1    0.210039          20     0.000000       0.786581
13  0.45            2    0.202170     

In [42]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances
from collections import defaultdict

def show_clusters(embs, texts, eps, min_samples=1):
    """Print the members of every multi-item DBSCAN cluster, largest cluster first.

    Clusters are computed on the cosine distances between rows of `embs`;
    `texts[i]` is printed as the member for row `i`. Noise points (label -1)
    and single-item clusters are not shown.
    """
    banner = "============================"
    print(f"\n{banner}")
    print(f"DBSCAN Results (eps={eps})")
    print(f"{banner}\n")

    model = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    labels = model.fit_predict(cosine_distances(embs))

    # Group member texts by cluster label, ignoring noise
    members_by_label = defaultdict(list)
    for position, label in enumerate(labels):
        if label == -1:
            continue
        members_by_label[label].append(texts[position])

    # Keep only clusters with at least two members, then order by descending size
    # (the sort is stable, so equally sized clusters keep their first-seen order)
    multi_item = [(label, members) for label, members in members_by_label.items() if len(members) >= 2]
    multi_item.sort(key=lambda pair: -len(pair[1]))

    for label, members in multi_item:
        print(f"Cluster {label} (size={len(members)}):")
        for member in members:
            print("  -", member)
        print()

# Run for eps = 0.25 and eps = 0.40
show_clusters(embs, texts, eps=0.25)
show_clusters(embs, texts, eps=0.40)



DBSCAN Results (eps=0.25)

Cluster 1 (size=4):
  - BTS's brotherhood
  - BTS's camaraderie
  - BTS members' interactions
  - BTS's camaraderie

Cluster 0 (size=3):
  - Jungkook's cute moments
  - Jungkook's cuteness
  - Jungkook's cute moments

Cluster 2 (size=3):
  - Jin's humor
  - Jin's humor
  - Jin's humor


DBSCAN Results (eps=0.4)

Cluster 2 (size=9):
  - Jin's humor
  - Jimin's dance skills
  - Jin's humor
  - Hobi's dancing skills
  - Jin's dance
  - Jimin's humor
  - Jimin
  - Jimin's character development
  - Jin's humor

Cluster 0 (size=6):
  - Jungkook's cute moments
  - Jungkook's cuteness
  - Jungkook's vocal abilities
  - Jungkook's tattoo
  - Jungkook
  - Jungkook's cute moments

Cluster 1 (size=6):
  - BTS's brotherhood
  - BTS's camaraderie
  - BTS members' interactions
  - Humorous moments in BTS videos
  - BTS
  - BTS's camaraderie

Cluster 3 (size=4):
  - Taehyung's unique personality
  - Taehyung's charm and looks
  - Taehyung's voice
  - Taehyung

Cluster 4 (si

### Trade-offs
Smaller eps (0.25) maximizes thematic granularity but increases fragmentation, while larger eps (0.40) produces more abstract, consolidated themes.
To balance semantic precision and interpretability, this project adopts eps = 0.35, min_samples = 1 as the default configuration.
Depending on analytical goals, 0.25 and 0.40 can be used as alternative settings for fine-grained exploration or high-level summarization.