In [2]:
import sys
import os

# Make the parent of the current working directory importable so the
# project-local `src.*` imports used by later cells can be resolved.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from src.data_loader import load_comments
from src.chunking import chunk_comments
from src.llm_extraction import extract_insights_from_chunk

# Tunable inputs for this cell, named so they are easy to find and change.
COMMENTS_CSV = "../data/lifestyle_comments.csv"
CHUNK_SIZE = 50

# Load the comments and split them into chunks via chunk_comments.
comments = load_comments(COMMENTS_CSV)
chunks = chunk_comments(comments, chunk_size=CHUNK_SIZE)

# Fail early with a readable message instead of an IndexError on chunks[0].
assert len(chunks) > 0, f"No chunks were produced from {COMMENTS_CSV!r}"

# Smoke test: run the extraction on the first chunk only and show what it returns.
result = extract_insights_from_chunk(chunks[0])
print(result)

```json
{
  "audience_interest_themes": [
    "Desire for more detailed content",
    "Interest in practical and relatable advice",
    "Appreciation for clear structure and pacing",
    "Request for additional content or follow-ups",
    "Curiosity about the process and time involved"
  ],
  "positive_content_drivers": [
    "Calming editing style",
    "Clear structure",
    "Motivational and relatable content"
  ],
  "recurring_pain_points": [
    "Content feels unrealistic",
    "Perceived repetitiveness",
    "Concerns about sponsorship and expense"
  ]
}
```


In [11]:
# Notebook cell: parse the LLM output of every chunk and build `group_results`
# (uses src/parsing.robust_parse).
# Prerequisites: src/parsing.py implements robust_parse(raw_output, raise_on_fail=False),
# and extract_insights_from_chunk(chunk) is in scope. The latter is expected to send
# the request to the LLM and return its raw string output (TODO confirm in src/llm_extraction).

from src.parsing import robust_parse

# Keys of the placeholder record stored for a chunk whose output cannot be parsed.
# Kept in one place so the placeholder schema is not duplicated inline.
INSIGHT_KEYS = (
    "audience_interest_themes",
    "positive_content_drivers",
    "recurring_pain_points",
)

group_results = []  # one entry per chunk, in chunk order
failed = []         # indices of the chunks whose output could not be parsed

# Guard: make sure `chunks` exists before starting the loop.
try:
    iterator = enumerate(chunks)
except NameError:
    # `from None` drops the redundant built-in NameError from the traceback,
    # leaving only the actionable message below.
    raise NameError("变量 `chunks` 未定义。请先生成 chunks（例如按视频或文本分片）。") from None

for i, chunk in iterator:
    # 1) Run the extraction for this chunk.
    raw_output = extract_insights_from_chunk(chunk)

    # 2) Parse it. With raise_on_fail=False a failed parse is expected to come
    #    back as None rather than raise, which is what the branch below checks.
    parsed = robust_parse(raw_output, raise_on_fail=False)

    # 3) On failure: record the index, dump the raw output for offline inspection,
    #    and append an empty placeholder so the result list stays aligned with
    #    the chunks and downstream steps still receive one record per chunk.
    if parsed is None:
        print(f"[Warning] chunk #{i} parse failed — saving preview to debug file.")
        failed.append(i)
        with open(f"debug_raw_chunk_{i}.txt", "w", encoding="utf-8") as f:
            f.write(str(raw_output))
        group_results.append({key: [] for key in INSIGHT_KEYS})
    else:
        # Parsed successfully -> keep the parsed record as-is.
        group_results.append(parsed)

print(f"Done. total={len(chunks)} failed={len(failed)}. failed indices={failed}")
# Next step: call aggregate_insights_with_clustering(group_results, eps=0.35)


Done. total=10 failed=0. failed indices=[]


In [10]:
from src.aggregation import aggregate_insights_with_clustering
# Aggregate the per-chunk results (eps=0.35), then print each of the three
# "top_*" entries of the returned mapping, one per line, in a fixed order.
final = aggregate_insights_with_clustering(group_results, eps=0.35)
for section in (
    "top_audience_interest_themes",
    "top_positive_content_drivers",
    "top_recurring_pain_points",
):
    print(final[section])


[('Desire for more detailed content', 10), ('Interest in practical and relatable content', 8), ('Requests for additional content or variations', 5), ('Appreciation for clear structure and pacing', 4), ('Concerns about cost and affordability', 4)]
[('Clear structure', 10), ('Motivational and relatable content', 5), ('Authenticity of the content', 5)]
[('Content feels too sponsored', 10), ('Lack of detailed explanations', 4), ('Lengthy intros', 4)]
