# 05: Annotation Pipeline (LLM 标注流程)

步骤：加载话题数据 → 预处理/过滤 → （可选）调用 LLM 标注情绪/风险 → 保存结果并汇总统计。

**重要说明（避免口径混乱）**：
- 本 notebook 主要用于 **连通性测试/小批量标注演示**，默认只抽样少量文本，避免误触发全量标注。
- 项目正式分析请以 `outputs/annotations/master/long_covid_annotations_master.jsonl` 为唯一事实来源（master）。
- 若要跑批量/全量标注，建议使用 `scripts/run_new_annotation.py` + `scripts/merge_new_annotations.py` 的流水线，而不是在 notebook 里长时间跑循环。


In [None]:
from pathlib import Path
import sys
import json
import random
import pandas as pd
from tqdm import tqdm

# Path/import bootstrap: make the project root importable so the `src`
# package can be found. The notebook lives under notebooks/, so the root
# is the parent of the current working directory.
ROOT = Path('..').resolve()
if sys.path.count(str(ROOT)) == 0:
    # Prepend so the project's own modules take precedence on import.
    sys.path.insert(0, str(ROOT))

from src.empirical import (
    LLMAnnotator,
    load_topic_dataset,
    preprocess_weibo_text,
    is_valid_for_annotation,
)

# LLM configuration (example values; adjust to your environment).
# NOTE(review): the endpoint and key below are hard-coded placeholders --
# do not commit a real API key into the notebook.
BASE_URL = 'http://10.13.12.164:7890/v1'
API_KEY = 'abc123'
MODEL_NAME = 'Qwen/Qwen3-8B'

# Input dataset path
DATA_PATH = ROOT / 'dataset/Topic_data/#新冠后遗症#_filtered.csv'

# Output path: by default this notebook writes to `intermediate`
# (demo / temporary files).
ANNOT_PATH = ROOT / 'outputs/annotations/intermediate/annotated_sample.jsonl'
ANNOT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Master data (the formal analysis treats this file as authoritative)
MASTER_PATH = ROOT / 'outputs/annotations/master/long_covid_annotations_master.jsonl'

# Figure output directory
FIG_DIR = ROOT / 'outputs/figs'
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Sampling: by default only a small sample is annotated, as a connectivity check.
# Set SAMPLE_SIZE to None or a larger number to scale up (running the full
# dataset directly inside the notebook is not recommended).
SAMPLE_SIZE = 200
SEED = 42
random.seed(SEED)

# Load the dataset
df = load_topic_dataset(DATA_PATH)

# Initialise the annotator
ann = LLMAnnotator(
    provider='openai',
    api_key=API_KEY,
    model=MODEL_NAME,
    base_url=BASE_URL,
)

# Report the current state of the master file (if it exists)
if MASTER_PATH.exists():
    # Count lines inside a context manager so the file handle is closed
    # deterministically instead of being left open until garbage collection.
    with MASTER_PATH.open('r', encoding='utf-8') as master_file:
        n_master = sum(1 for _ in master_file)
    print(f'master 标注文件存在: {MASTER_PATH} (rows={n_master})')
else:
    print(f'master 标注文件不存在: {MASTER_PATH} (本 notebook 仅会生成演示样本 {ANNOT_PATH.name})')


In [None]:
# 1) Dataset overview
print('rows:', len(df))
print(df.head())

# 2) Preprocessing: clean each text and drop samples that fail validation.
#    `mid` is kept for traceability; the cleaned text is what is sent to the LLM.
required_cols = {'mid', 'content'}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f'dataset 缺少必要字段: {missing}')

# Rows with a missing mid or content are dropped before cleaning.
raw_df = df[['mid', 'content']].dropna()
print(f'原始文本数: {len(raw_df)}')

preprocessed = []  # (mid, original_text, clean_text)
for mid, orig in raw_df.itertuples(index=False):
    clean = preprocess_weibo_text(orig, max_length=500)
    if is_valid_for_annotation(clean, min_length=5):
        # mid is stored as a string in every record.
        preprocessed.append((str(mid), orig, clean))

print(f'有效文本数: {len(preprocessed)}')

# 3) Sampling (small sample by default); skipped when SAMPLE_SIZE is None
#    or when there are no more valid texts than SAMPLE_SIZE.
if SAMPLE_SIZE is not None and len(preprocessed) > SAMPLE_SIZE:
    preprocessed = random.sample(preprocessed, SAMPLE_SIZE)
print(f'本次标注样本数: {len(preprocessed)} (SAMPLE_SIZE={SAMPLE_SIZE})')

# 4) Show a few before/after examples of the preprocessing.
#    The leading '\n' escapes below were split across physical lines in the
#    exported source (unterminated string literals); they are restored here.
print('\n--- 预处理效果示例 ---')
for i, (mid, orig, clean) in enumerate(preprocessed[:3], start=1):
    print(f'\n[样本 {i}] mid={mid}')
    print(f'原文: {orig[:100]}...')
    print(f'清洗后: {clean[:100]}...')


In [None]:
# 3) Annotate each sample with the LLM, one call per text. The cleaned text
#    is what gets annotated; mid, the original text and the cleaned text are
#    stored alongside the annotation.
#    NOTE: opening with mode 'w' truncates any previous sample file up front.
results = []
with ANNOT_PATH.open('w', encoding='utf-8') as f:
    for mid, orig, clean in tqdm(preprocessed, desc='annotating'):
        res = ann.annotate(clean, max_tokens=1024)

        # Attach traceability fields to the serialised annotation. These
        # assignments overwrite any same-named keys returned by to_dict().
        result_dict = res.to_dict()
        result_dict['mid'] = mid
        result_dict['original_text'] = orig
        result_dict['content'] = clean

        results.append(res)
        # One JSON object per line (JSONL). The '\n' terminator was split
        # across physical lines in the exported source; restored here.
        f.write(json.dumps(result_dict, ensure_ascii=False) + '\n')

print('done, saved to', ANNOT_PATH)


In [None]:
# 4) Summary statistics (prefer the master file; otherwise fall back to the
#    demo sample written by this notebook).
import pandas as pd

STATS_PATH = MASTER_PATH if MASTER_PATH.exists() else ANNOT_PATH
print('统计来源:', STATS_PATH)

df_ann = pd.read_json(STATS_PATH, lines=True)
print('rows:', len(df_ann))
print(df_ann.head())

# Warn (rather than fail) when an expected label column is absent, so the
# statistics that are still computable get printed.
required = {'emotion_class', 'risk_class'}
missing = required - set(df_ann.columns)
if missing:
    print('WARNING: 缺少字段:', missing)

# The '\n' escapes in the headings below were split across physical lines in
# the exported source (unterminated string literals); they are restored here.
print('\n=== 标注结果统计 ===')
if 'emotion_class' in df_ann.columns:
    print('\n情绪分布:')
    print(df_ann['emotion_class'].value_counts(normalize=True))
if 'risk_class' in df_ann.columns:
    print('\n风险分布:')
    print(df_ann['risk_class'].value_counts(normalize=True))

# The cross-tabulation needs both columns, i.e. nothing missing.
if not missing:
    print('\n=== 情绪 × 风险 交叉表 ===')
    print(pd.crosstab(df_ann['emotion_class'], df_ann['risk_class'], normalize='all').round(3))


In [None]:
# Recap of the output locations used by this notebook.
# NOTE: this cell only reports the paths; it does not write any file itself.
for label, location in (
    ('Figures saved to', FIG_DIR),
    ('Sample annotations saved to', ANNOT_PATH),
    ('Master annotations (reference) at', MASTER_PATH),
):
    print(label, location)


In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

# Stand-alone support: when this cell runs without the setup cell, fall back
# to paths derived from the parent directory (the notebook lives in notebooks/).
ROOT = globals().get('ROOT', Path('..').resolve())
MASTER_PATH = globals().get(
    'MASTER_PATH',
    ROOT / 'outputs/annotations/master/long_covid_annotations_master.jsonl',
)
ANNOT_PATH = globals().get(
    'ANNOT_PATH',
    ROOT / 'outputs/annotations/intermediate/annotated_sample.jsonl',
)

FIG_DIR = ROOT / 'outputs/figs'
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Prefer the master annotations; otherwise use the demo sample.
if MASTER_PATH.exists():
    STATS_PATH = MASTER_PATH
else:
    STATS_PATH = ANNOT_PATH
# The tag appears in both the figure titles and the output filenames.
if STATS_PATH == MASTER_PATH:
    source_tag = 'master'
else:
    source_tag = 'sample'

# Load the annotation results
df_ann = pd.read_json(STATS_PATH, lines=True)

# One bar chart per label column:
# (column, category order, bar colours, title prefix, filename stem)
plot_specs = (
    ('emotion_class', ['H', 'M', 'L'], ['#d62728', '#1f77b4', '#2ca02c'],
     'Emotion distribution', 'fig5_emotion_dist'),
    ('risk_class', ['risk', 'norisk'], ['#d62728', '#1f77b4'],
     'Risk distribution', 'fig5_risk_dist'),
)
for column, order, palette, title, stem in plot_specs:
    fig, ax = plt.subplots()
    # If the column is absent the axes stay empty, but the figure is still
    # titled and saved.
    if column in df_ann.columns:
        counts = df_ann[column].value_counts().reindex(order)
        counts.plot(kind='bar', ax=ax, color=palette)
    ax.set_title(f'{title} ({source_tag})')
    fig.tight_layout()
    fig.savefig(FIG_DIR / f'{stem}_{source_tag}.png', dpi=200)
plt.show()

print('统计来源:', STATS_PATH)
print('Figures saved to', FIG_DIR)
