<a href="https://colab.research.google.com/github/yuuu125/Lunette/blob/main/AI_Assistent1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai==0.28.1 python-docx notion-client langdetect pydub
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg  # 确保安装必要的依赖

import os
import re
import json
import openai
import whisper
from docx import Document
from google.colab import files, userdata
from notion_client import Client
from langdetect import detect, LangDetectException
import datetime
import tempfile
import torch
from pydub import AudioSegment
import subprocess

def _read_colab_secret(name):
    """Return the Colab secret *name*, or None when it cannot be read.

    ``userdata.get`` may raise (for example when the secret does not exist or
    the notebook has not been granted access).  Reading each secret through
    this helper keeps one missing secret from aborting the whole setup block
    and leaving the remaining module-level names undefined.
    """
    try:
        return userdata.get(name)
    except Exception as e:  # deliberate best-effort boundary for optional secrets
        print(f"⚠️ Could not read secret {name}: {str(e)}")
        return None


# Every name is always bound (possibly to None) so later code can test it safely.
OPENAI_API_KEY = _read_colab_secret('OPENAI_API_KEY')
NOTION_TOKEN = _read_colab_secret('NOTION_TOKEN')
NOTION_DB_ID = _read_colab_secret('NOTION_DB_ID')

# Notion is optional: warn, but keep going.
if not NOTION_TOKEN:
    print("⚠️ Notion token missing - feature disabled")
if not NOTION_DB_ID:
    print("⚠️ Notion DB ID missing - feature disabled")

# The OpenAI key is mandatory; main() refuses to run when openai.api_key is unset.
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
    print("✅ OpenAI API key set")
else:
    print("❌ Key retrieval failed: OPENAI_API_KEY not set")

def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds.

    The duration is read from ``ffprobe``'s ``format=duration`` entry.  If
    anything goes wrong (ffprobe missing, unparsable output, ...) a warning is
    printed and a size-based estimate is returned instead: file size divided
    by 8000 bytes per second, never less than 30.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True)
        return float(probe.stdout)
    except Exception as e:
        print(f"⚠️ 无法获取精确时长，使用估算值: {e}")
        # Rough estimate assuming about 8 KB of audio per second
        estimated_seconds = os.path.getsize(audio_path) // 8000
        return estimated_seconds if estimated_seconds > 30 else 30

def transcribe_audio(audio_path, model_size="base"):
    """Transcribe an audio file with Whisper and return the recognised text.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Whisper model name passed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the Whisper result.

    Raises:
        Any exception raised while loading the model or transcribing is
        reported on the console and then re-raised.
    """
    print(f"🔊 Starting transcription with Whisper ({model_size} model)...")

    try:
        # Prefer the GPU when one is available
        use_gpu = torch.cuda.is_available()
        device = "cuda" if use_gpu else "cpu"
        print(f"💻 Using device: {device.upper()}")

        model = whisper.load_model(model_size, device=device)
        print(f"✅ Loaded Whisper {model_size} model")

        # fp16 is only requested when running on CUDA
        options = {"fp16": use_gpu, "verbose": True, "task": "transcribe"}
        transcription = model.transcribe(audio_path, **options)["text"]

        print(f"✅ Transcription complete! Characters: {len(transcription)}")
        return transcription

    except Exception as e:
        print(f"❌ Transcription failed: {str(e)}")
        raise

def test_notion_connection():
    """Check that the configured Notion database can be retrieved.

    Builds a Notion client from ``NOTION_TOKEN`` and retrieves the database
    identified by ``NOTION_DB_ID``.

    Returns:
        True when the retrieval succeeds, False otherwise (the failure reason
        is printed; no exception escapes).
    """
    try:
        # httpx is not imported at the top of this cell, so import it here;
        # previously the bare name raised NameError and this check always failed.
        import httpx

        # trust_env=False makes httpx ignore proxy settings from environment
        # variables, which is what "explicitly disable the proxy" needs.
        # The old ``proxies=None`` was just the default and is rejected by
        # httpx releases that removed the ``proxies`` argument.
        notion = Client(
            auth=NOTION_TOKEN,
            client=httpx.Client(trust_env=False),
        )
        notion.databases.retrieve(database_id=NOTION_DB_ID)
        print("✅ Notion connection verified")
        return True
    except Exception as e:
        print(f"❌ Notion connection failed: {str(e)}")
        return False

def clean_transcript(text):
    """Strip timestamps and speaker labels from a raw transcript.

    Removes ``H:MM:SS`` / ``HH:MM:SS`` timestamps and ``Speaker <n>:`` labels,
    collapses any run of blank lines into a single blank line, and trims
    leading/trailing whitespace.
    """
    removals = (
        r'\d{1,2}:\d{2}:\d{2}',   # timestamps
        r'Speaker\s*\d+:?',        # speaker labels
    )
    for pattern in removals:
        text = re.sub(pattern, '', text)

    collapsed = re.sub(r'\n\s*\n', '\n\n', text)
    return collapsed.strip()

def segment_text(text):
    """Split text on blank lines and return the non-empty, trimmed paragraphs."""
    paragraphs = []
    for chunk in text.split('\n\n'):
        trimmed = chunk.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

def handle_transcript_input():
    """Interactively obtain a meeting transcript and return it cleaned.

    The user picks one of three input methods: upload a ``.txt``/``.docx``
    file, paste text, or upload an audio file that is transcribed with
    Whisper.  An empty upload or an unknown choice falls back to pasting.

    Returns:
        A ``(cleaned_text, segments)`` tuple: the transcript after
        ``clean_transcript`` and its paragraphs from ``segment_text``.

    Raises:
        ValueError: if an uploaded text file is neither ``.txt`` nor ``.docx``.
        Exception: whatever ``transcribe_audio`` re-raises on failure.
    """
    print("\n=== Handling Transcript Input ===")
    print("Choose input method:")
    print("1 - Upload text file (.txt or .docx)")
    print("2 - Paste text directly")
    print("3 - Upload audio file (transcribe with Whisper)")

    # Tolerate stray whitespace around the menu choice
    input_method = input("Your choice (1/2/3): ").strip()
    transcript_text = ""

    # Text file upload
    if input_method == "1":
        uploaded = files.upload()
        if not uploaded:
            print("⚠️ No files uploaded, switching to paste")
            transcript_text = input("Paste meeting transcript: ")
        else:
            filename = list(uploaded.keys())[0]
            print(f"✅ Uploaded: {filename}")
            # Compare extensions case-insensitively so "NOTES.TXT" is accepted
            lowered_name = filename.lower()

            if lowered_name.endswith('.txt'):
                # utf-8-sig also strips a leading byte-order mark if present
                transcript_text = uploaded[filename].decode('utf-8-sig')

            elif lowered_name.endswith('.docx'):
                with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
                    tmp.write(uploaded[filename])
                    tmp_path = tmp.name
                try:
                    doc = Document(tmp_path)
                    transcript_text = "\n".join(para.text for para in doc.paragraphs)
                finally:
                    # Remove the temp file even when the document cannot be parsed
                    os.unlink(tmp_path)
            else:
                raise ValueError("Unsupported file format")

    # Pasted text
    elif input_method == "2":
        transcript_text = input("Paste meeting transcript: ")

    # Audio upload
    elif input_method == "3":
        uploaded_audio = files.upload()
        if not uploaded_audio:
            print("⚠️ No audio files uploaded, switching to text paste")
            transcript_text = input("Paste meeting transcript: ")
        else:
            filename = list(uploaded_audio.keys())[0]
            print(f"✅ Uploaded audio: {filename}")

            # Persist the upload to disk, keeping the original extension
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp:
                tmp.write(uploaded_audio[filename])
                audio_path = tmp.name

            try:
                print("\n⚡ Select transcription speed:")
                print("1 - Fast (tiny model, fastest, lower accuracy)")
                print("2 - Balanced (base model, recommended)")
                print("3 - High Quality (small model, slower)")

                speed_choice = input("Your choice (1/2/3): ").strip() or "2"
                model_map = {"1": "tiny", "2": "base", "3": "small"}
                model_size = model_map.get(speed_choice, "base")

                # Duration is informational only; a failure must not block transcription
                try:
                    duration = get_audio_duration(audio_path)
                    print(f"⏱ Audio duration: {duration//60:.0f}m {duration%60:.0f}s")

                    # Rough processing-time factor per second of audio for each model
                    time_estimates = {"tiny": 0.3, "base": 0.8, "small": 2.0}
                    est_sec = duration * time_estimates[model_size]
                    print(f"⏳ Estimated processing time: ~{est_sec//60:.0f}m {est_sec%60:.0f}s")
                except Exception as e:
                    print(f"⚠️ Duration estimation failed: {e}")

                transcript_text = transcribe_audio(audio_path, model_size)
            finally:
                # Always delete the temp audio file, including when transcription raises
                os.unlink(audio_path)

    else:
        print("⚠️ Invalid option, defaulting to text paste")
        transcript_text = input("Paste meeting transcript: ")

    cleaned_text = clean_transcript(transcript_text)
    segments = segment_text(cleaned_text)

    print(f"📝 Processed text: {len(segments)} segments, {len(cleaned_text)} characters")
    return cleaned_text, segments

def analyze_with_gpt(text, language='en'):
    """Ask the OpenAI chat API to extract structured meeting information.

    Args:
        text: Transcript text; only the first 10000 characters are sent.
        language: Language code; its first two letters select the response
            language (zh/es/fr/en, anything else falls back to English).

    Returns:
        A ``(result, tokens_used)`` tuple.  On success ``result`` is the JSON
        object parsed from the model reply (with ``fallback_used`` set to True
        when no action items were returned).  On any failure ``result`` is
        ``{"error": ..., "fallback_used": True}`` and ``tokens_used`` is 0.
    """
    print("\n=== Analyzing with GPT ===")

    if not openai.api_key:
        print("❌ OpenAI API key missing")
        return {"error": "OpenAI API key not set", "fallback_used": True}, 0

    # Map the two-letter language code to the name used in the prompt
    language_names = {'zh': 'Chinese', 'es': 'Spanish', 'fr': 'French', 'en': 'English'}
    response_language = language_names.get(language[:2], 'English')

    system_prompt = f"""
    You are a professional meeting analyst. Extract key information:
    - Respond in {response_language}
    - Use this JSON format:
    {{
        "meeting_title": "Meeting Title",
        "participants": ["Attendee1", "Attendee2"],
        "summary": "Meeting summary",
        "action_items": [{{"task": "Task", "assignee": "Owner"}}],
        "key_points": {{
            "concerns": [],
            "decisions": [],
            "deadlines": [],
            "updates": []
        }},
        "meeting_type": "Meeting type",
        "platform": "Platform",
        "fallback_used": false
    }}

    Extraction rules:
    1. meeting_title: Extract from start/end or generate
    2. participants: Extract all attendees
    3. Focus on meeting start/end sections
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Meeting transcript:\n{text[:10000]}"},
    ]

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0.3
        )

        # The reply is expected to be a bare JSON document
        analysis = json.loads(completion.choices[0].message['content'])
        total_tokens = completion.usage['total_tokens']

        print(f"✅ GPT analysis complete! Tokens: {total_tokens}")
        print(f"Meeting title: {analysis.get('meeting_title', 'N/A')}")
        print(f"Participants: {len(analysis.get('participants', []))}")
        print(f"Meeting type: {analysis.get('meeting_type', 'N/A')}")
        print(f"Action items: {len(analysis.get('action_items', []))}")

        # Flag results that came back without any action items
        if not analysis.get('action_items'):
            analysis['fallback_used'] = True
            print("⚠️ No action items detected")

        return analysis, total_tokens

    except Exception as e:
        print(f"❌ GPT analysis failed: {str(e)}")
        failure = {"error": str(e), "fallback_used": True}
        return failure, 0

def create_notion_entry(meeting_data):
    """Create a page in the configured Notion database from *meeting_data*.

    Args:
        meeting_data: Dict of analysis results; the keys read here are
            meeting_title, participants, date, meeting_type, platform, summary
            plus whatever ``format_key_points`` / ``format_action_items`` read.

    Returns:
        True when the page was created, False when the Notion configuration is
        incomplete or the API call failed (the reason is printed).
    """
    if not NOTION_TOKEN or not NOTION_DB_ID:
        print("⚠️ Notion config incomplete - skipping")
        return False

    print("\n=== Syncing to Notion ===")

    def _rich_text(value, limit=2000):
        # Notion rejects a text object whose content exceeds 2000 characters,
        # so long values are split across several text objects.
        text = "" if value is None else str(value)
        chunks = [text[i:i + limit] for i in range(0, len(text), limit)] or [""]
        return [{"text": {"content": chunk}} for chunk in chunks]

    try:
        # httpx is not imported at the top of this cell; without this import
        # the bare name raised NameError and every sync attempt failed.
        import httpx

        # trust_env=False makes httpx ignore proxy environment variables
        # (the former ``proxies=None`` was only the default value).
        notion = Client(
            auth=NOTION_TOKEN,
            client=httpx.Client(trust_env=False),
        )

        participants = meeting_data.get("participants") or ["Unknown"]
        # NOTE(review): Notion select option names reportedly may not contain
        # commas; they are replaced defensively here.
        platform = str(meeting_data.get("platform") or "Unknown").replace(",", " ")

        properties = {
            "Meeting Title": {"title": _rich_text(meeting_data.get("meeting_title") or "Untitled")},
            "Participant": {"rich_text": _rich_text(", ".join(str(p) for p in participants))},
            "Date & Duration": {"date": {"start": meeting_data.get("date", datetime.datetime.now().isoformat())}},
            "Meeting Type": {"rich_text": _rich_text(meeting_data.get("meeting_type") or "Other")},
            "Platform": {"select": {"name": platform}},
            "Summary": {"rich_text": _rich_text(meeting_data.get("summary", ""))},
            "Key Points": {"rich_text": _rich_text(format_key_points(meeting_data))},
            "Action Items": {"rich_text": _rich_text(format_action_items(meeting_data))},
        }

        new_page = notion.pages.create(
            parent={"database_id": NOTION_DB_ID},
            properties=properties
        )

        print(f"✅ Notion entry created! ID: {new_page['id']}")
        return True
    except Exception as e:
        print(f"❌ Notion sync failed: {str(e)}")
        return False

def format_key_points(data):
    """Render ``data["key_points"]`` as plain text for Notion.

    Each category whose value is a non-empty list becomes an upper-cased
    heading line followed by one ``- item`` line per entry; other categories
    are skipped.  Returns an empty string when nothing qualifies.
    """
    lines = []
    for category, items in data.get("key_points", {}).items():
        if not items or not isinstance(items, list):
            continue
        lines.append(f"{category.upper()}:")
        for item in items:
            lines.append(f"- {item}")
    return "\n".join(lines)

def format_action_items(data):
    """Render ``data["action_items"]`` as one bullet line per item.

    Dict items become ``- <task> (Owner: <assignee>)`` with defaults for
    missing keys; any other item is rendered via ``str``.  Returns
    ``"No action items"`` when the value is empty or not a list.
    """
    action_items = data.get("action_items", [])
    if not (action_items and isinstance(action_items, list)):
        return "No action items"

    def _render(entry):
        if not isinstance(entry, dict):
            return f"- {str(entry)}"
        task = entry.get('task', 'Unknown task')
        assignee = entry.get('assignee', 'Unassigned')
        return f"- {task} (Owner: {assignee})"

    return "\n".join(_render(entry) for entry in action_items)

def main():
    """Run the workflow: read transcript, analyse with GPT, sync to Notion.

    Steps are recorded in a ``logs`` dict that is written to
    ``meeting_logs.json`` at the end of the run (including when the GPT step
    fails) or to ``error_log.json`` when an unexpected exception occurs.
    Nothing is returned; progress is printed to the console.
    """
    if not openai.api_key:
        print("❌ OpenAI API key missing")
        return

    logs = {"steps": [], "errors": []}

    # Probe Notion up front so configuration problems surface early
    if NOTION_TOKEN and NOTION_DB_ID:
        if not test_notion_connection():
            print("⚠️ Notion connection failed")

    try:
        # Process input
        cleaned_text, segments = handle_transcript_input()
        logs["steps"].append({
            "step": "Text input",
            "segment_count": len(segments),
            "status": "success"
        })

        # Detect language from the first 500 characters; default to English
        try:
            language = detect(cleaned_text[:500]) if cleaned_text else 'en'
        except LangDetectException:
            language = 'en'
        print(f"🌐 Detected language: {language}")

        # GPT analysis
        gpt_results, tokens_used = analyze_with_gpt(cleaned_text, language)

        if "error" in gpt_results:
            logs["steps"].append({
                "step": "GPT analysis",
                "status": "failed",
                "error": gpt_results["error"]
            })
            print(f"❌ GPT failed: {gpt_results['error']}")
            # Persist the log before bailing out; previously the failure was
            # recorded in memory and then discarded by the early return.
            with open("meeting_logs.json", "w") as f:
                json.dump(logs, f, indent=2)
            return

        logs["steps"].append({
            "step": "GPT analysis",
            "tokens_used": tokens_used,
            "meeting_title": gpt_results.get("meeting_title"),
            "participants_count": len(gpt_results.get("participants", [])),
            "meeting_type": gpt_results.get("meeting_type"),
            "action_items_count": len(gpt_results.get("action_items", [])),
            "status": "success"
        })

        # Stamp the entry with the current time and sync to Notion
        gpt_results["date"] = datetime.datetime.now().isoformat()
        notion_success = create_notion_entry(gpt_results)
        logs["steps"].append({
            "step": "Notion sync",
            "status": "success" if notion_success else "failed"
        })

        # Save logs
        with open("meeting_logs.json", "w") as f:
            json.dump(logs, f, indent=2)

        print("\n✅ Process complete! Logs saved")

    except Exception as e:
        # Top-level boundary: record the error and keep the notebook alive
        logs["errors"].append(str(e))
        print(f"\n❌ Process error: {str(e)}")
        with open("error_log.json", "w") as f:
            json.dump(logs, f, indent=2)

if __name__ == "__main__":
    main()

Collecting openai==0.28.1
  Downloading openai-0.28.1-py3-none-any.whl.metadata (11 kB)
Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.37.0
    Uninstalling openai-1.37.0:
      Successfully uninstalled openai-1.37.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.9 requires openai<2.0.0,>=1.26.0, but you have openai 0.28.1 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.1


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-774df_rk
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-774df_rk
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 76, in resolve
    coll

KeyboardInterrupt: Interrupted by user

In [1]:
# 安装兼容版本的核心包
!pip uninstall -y langchain langchain-core langchain-community langchain-openai openai notion-client
!pip install langchain==0.2.0
!pip install langchain-core==0.2.38
!pip install langchain-community==0.2.0
!pip install langchain-openai==0.1.9
!pip install openai==1.37.0
!pip install notion-client==2.0.0
!pip install tqdm python-docx langdetect pydub httpx==0.27.0
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg -y

import os
import json
import re  # 新增：用于修复JSON格式
import openai
import whisper
from docx import Document
from google.colab import files, userdata
from notion_client import Client, errors
from langdetect import detect, LangDetectException
import datetime
import tempfile
import torch
import subprocess
from tqdm import tqdm
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException  # 正确的导入路径
from pydantic import BaseModel, Field
import httpx

# Clear proxy environment variables so HTTP clients do not pick them up
for var in ['HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy']:
    os.environ.pop(var, None)

# ======================
# Initial setup
# ======================
# All credentials come from Colab secrets.  Any missing credential aborts the
# cell after printing setup instructions.
try:
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    NOTION_TOKEN = userdata.get('NOTION_TOKEN')
    NOTION_DB_ID = userdata.get('NOTION_DB_ID')
    NOTION_PAGE_ID = userdata.get('NOTION_PAGE_ID')

    missing_creds = []
    if not OPENAI_API_KEY:
        missing_creds.append("OPENAI_API_KEY")
    if not NOTION_TOKEN:
        missing_creds.append("NOTION_TOKEN")
    if not NOTION_DB_ID:
        missing_creds.append("NOTION_DB_ID")
    if not NOTION_PAGE_ID:
        missing_creds.append("NOTION_PAGE_ID")

    if missing_creds:
        raise ValueError(f"缺少凭证: {', '.join(missing_creds)}")

    print("✅ 所有凭证已设置")

except Exception as e:
    print(f"❌ 凭证获取失败: {str(e)}")
    print("\n🔧 设置说明:")
    print("1. 点击左侧边栏的钥匙图标（Colab密钥）")
    print("2. 添加以下密钥:")
    print("   - OPENAI_API_KEY: 你的OpenAI API密钥")
    print("   - NOTION_TOKEN: 你的Notion集成令牌")
    print("   - NOTION_DB_ID: Notion数据库ID")
    print("   - NOTION_PAGE_ID: 报告父页面ID")
    print("3. 添加后重新运行此单元格")
    raise

# Initialise the shared Notion client from the NOTION_TOKEN secret.
# SECURITY: an integration token used to be hard-coded on this line.  Any token
# that was committed with the notebook must be treated as leaked and revoked
# in the Notion integration settings.
notion = Client(
    auth=NOTION_TOKEN,
    client=httpx.Client(proxies=None)
)

# ======================
# 日志系统
# ======================
class MeetingLogger:
    """In-memory log of processing steps, errors and metrics for one run.

    Steps are appended with ``log_step``, metrics stored with ``log_metric``;
    the collected data can be dumped to JSON (``save_logs``) or rendered as
    text (``get_console_log``).
    """

    # Icon shown per step status in the console rendering.  Previously every
    # status other than "success" (including "started" and "warning") was
    # rendered with the failure icon.
    _STATUS_ICONS = {
        "success": "✅",
        "failed": "❌",
        "warning": "⚠️",
        "started": "▶️",
    }
    _DEFAULT_ICON = "•"

    def __init__(self):
        self.logs = {
            "start_time": datetime.datetime.now().isoformat(),
            "steps": [],
            "errors": [],
            "metrics": {}
        }

    def log_step(self, step_name, status, details=None, error=None):
        """Append a step entry.

        Args:
            step_name: Label of the step.
            status: Status string; the file uses "started", "success",
                "warning" and "failed".
            details: Optional extra information, stored as-is when truthy.
            error: Optional error; stored as ``str(error)`` when truthy.
        """
        entry = {
            "step": step_name,
            "timestamp": datetime.datetime.now().isoformat(),
            "status": status
        }
        if details:
            entry["details"] = details
        if error:
            entry["error"] = str(error)
        self.logs["steps"].append(entry)

    def log_metric(self, name, value):
        """Store (or overwrite) the metric *name* with *value*."""
        self.logs["metrics"][name] = value

    def save_logs(self, filename="meeting_logs.json"):
        """Write the collected logs as JSON to *filename* and return the name."""
        with open(filename, "w") as f:
            json.dump(self.logs, f, indent=2)
        return filename

    def get_console_log(self):
        """Return the log as human-readable text, one line per step.

        Failed steps always show an error (``未知`` when none was recorded);
        other steps show their error only when one was recorded, so warnings
        no longer lose their message.
        """
        lines = [
            "=== 会议处理日志 ===",
            f"开始时间: {self.logs['start_time']}",
        ]

        for step in self.logs["steps"]:
            icon = self._STATUS_ICONS.get(step["status"], self._DEFAULT_ICON)
            line = f"{icon} [{step['timestamp']}] {step['step']}"
            if "details" in step:
                line += f" - {step['details']}"
            if step["status"] == "failed" or "error" in step:
                line += f" - 错误: {step.get('error', '未知')}"
            lines.append(line)

        if self.logs["metrics"]:
            lines.append("")
            lines.append("=== 指标 ===")
            lines.extend(
                f"- {metric}: {value}"
                for metric, value in self.logs["metrics"].items()
            )

        return "\n".join(lines) + "\n"

logger = MeetingLogger()

# ======================
# 音频处理
# ======================
def get_audio_duration(audio_path):
    """Return the audio duration in seconds and record it as a metric.

    The value is read from ``ffprobe``'s ``format=duration`` entry.  On any
    failure a "warning" step is logged and a size-based estimate (file size
    divided by 8000, at least 30) is returned instead.
    """
    ffprobe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    try:
        probe = subprocess.run(ffprobe_cmd, capture_output=True, text=True)
        seconds = float(probe.stdout)
        logger.log_metric("音频时长(秒)", seconds)
        return seconds
    except Exception as e:
        logger.log_step("获取音频时长", "warning", error=e)
        size_based_estimate = os.path.getsize(audio_path) // 8000
        return max(30, size_based_estimate)

def transcribe_audio(audio_path, model_size="base"):
    """Transcribe an audio file with Whisper, logging each stage.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Whisper model name passed to ``whisper.load_model``.

    Returns:
        A ``(text, language)`` tuple taken from the ``"text"`` and
        ``"language"`` entries of the Whisper result.

    Raises:
        Any exception from model loading or transcription is logged as a
        failed step and re-raised.
    """
    logger.log_step("音频转录", "started", {"模型大小": model_size, "音频路径": audio_path})

    try:
        on_gpu = torch.cuda.is_available()
        device = "cuda" if on_gpu else "cpu"
        logger.log_step("硬件检查", "success", {"设备": device})

        model = whisper.load_model(model_size, device=device)
        logger.log_step("加载模型", "success")

        # fp16 is only requested when running on CUDA
        output = model.transcribe(
            audio_path,
            fp16=on_gpu,
            verbose=False,
            task="transcribe",
        )

        text, language = output["text"], output["language"]
        logger.log_step("音频转录", "success", {
            "字符数": len(text),
            "检测语言": language
        })
        return text, language

    except Exception as e:
        logger.log_step("音频转录", "failed", error=e)
        raise

# ======================
# LangChain分析（修复JSON格式问题）
# ======================
# 1. 修正MeetingAnalysis模型（保持不变，但确保字段正确）
# Schema of the structured meeting analysis requested from the LLM.  It is
# handed to JsonOutputParser in setup_langchain_chains, which derives the
# format instructions from it.
# NOTE(review): the Field descriptions are presumably embedded in the prompt
# via those format instructions, so treat them as runtime text — confirm
# before rewording.  For the same reason this model is documented with
# comments rather than a class docstring.
class MeetingAnalysis(BaseModel):
    # Title of the meeting
    meeting_title: str = Field(description="会议标题")
    # Names of the attendees
    participants: list[str] = Field(description="参与者名单")
    # Meeting summary (the description asks for 3-5 paragraphs)
    summary: str = Field(description="3-5段会议总结")
    # Key points grouped by concerns / decisions / updates / risks, each an array
    key_points: dict = Field(description="按concerns、decisions、updates、risks分组的关键点（均为数组）")
    # Action items; each dict is described as carrying task, assignee, due_date
    action_items: list[dict] = Field(description="行动项列表，包含task、assignee、due_date")
    # Kind of meeting
    meeting_type: str = Field(description="会议类型")
    # Platform the meeting was held on
    platform: str = Field(description="会议平台")


# 2. 重写链的构建逻辑（彻底弃用LLMChain，改用新方式）
def setup_langchain_chains(language='zh'):
    """Build the LangChain pipeline that analyses a meeting transcript.

    Args:
        language: Language code; its first two letters select the instruction
            text (zh/en/fr, anything else falls back to the Chinese one).

    Returns:
        A runnable composed as ``prompt | llm | parser`` that expects a dict
        with a single ``"transcript"`` key.
    """
    # Per-language instruction placed at the top of the prompt
    instructions_by_language = {
        'zh': "用中文分析会议记录，输出严格符合JSON格式，key_points的子字段均为数组（用[]包裹）",
        'en': "Analyze the meeting transcript in English, output strict JSON with key_points as arrays",
        'fr': "Analyser le procès-verbal en français, sortie JSON stricte avec key_points en tableaux"
    }
    instruction = instructions_by_language.get(language[:2], instructions_by_language['zh'])

    # Parser that also supplies the format instructions for the prompt
    output_parser = JsonOutputParser(pydantic_object=MeetingAnalysis)

    template_text = """
        {lang_instruction}

        {format_instructions}

        ### 会议记录:
        {transcript}

        请严格按照格式要求输出，确保JSON结构正确。
        """

    # Only "transcript" is supplied at call time; the rest is pre-filled
    prefilled = {
        "lang_instruction": instruction,
        "format_instructions": output_parser.get_format_instructions()
    }
    prompt = PromptTemplate(
        template=template_text,
        input_variables=["transcript"],
        partial_variables=prefilled
    )

    chat_model = ChatOpenAI(
        openai_api_key=OPENAI_API_KEY,
        temperature=0.3,
        model="gpt-3.5-turbo"
    )

    # Compose prompt -> model -> parser with the pipe syntax (no LLMChain)
    return prompt | chat_model | output_parser


# 3. 修正分析函数（确保输入变量正确传递）
def analyze_meeting(transcript, language='zh'):
    """Analyse a transcript with the LangChain pipeline.

    Args:
        transcript: Transcript text; only the first 15000 characters are used.
        language: Language code forwarded to ``setup_langchain_chains`` and
            stored in the result.

    Returns:
        On success, the parsed analysis dict extended with ``language``,
        ``date`` (current time, ISO format) and ``fallback_used`` (True when
        no action items were found).  On failure,
        ``{"error": <message>, "fallback_used": True}``.
    """
    logger.log_step("分析会议", "started", {"语言": language})
    print("\n开始分析会议内容...")

    try:
        analysis_chain = setup_langchain_chains(language)
        processed_transcript = transcript[:15000]  # cap the prompt size
        print(f"使用的转录文本长度: {len(processed_transcript)}字符")

        # The key must match the template's only input variable
        parsed = analysis_chain.invoke({"transcript": processed_transcript})

        # The parser yields whatever JSON the model produced; anything other
        # than an object cannot be used downstream.
        if not isinstance(parsed, dict):
            raise ValueError(f"模型输出不是JSON对象: {type(parsed).__name__}")

        # Attach metadata
        parsed["language"] = language
        parsed["date"] = datetime.datetime.now().isoformat()

        # Flag results without action items
        if not parsed.get("action_items"):
            logger.log_step("检查行动项", "warning", "未检测到行动项")
            print("⚠️ 未检测到行动项")
            parsed["fallback_used"] = True
        else:
            parsed["fallback_used"] = False

        # Use tolerant lookups for reporting: a key the model omitted must not
        # turn an otherwise usable analysis into a failure via KeyError.
        title = parsed.get("meeting_title", "未知")
        logger.log_step("分析会议", "success", {
            "标题": title,
            "参与者数量": len(parsed.get("participants") or []),
            "行动项数量": len(parsed.get("action_items") or [])
        })
        print(f"✅ 会议分析完成 (标题: {title})")
        return parsed

    except Exception as e:
        error_msg = f"分析会议失败: {str(e)}"
        logger.log_step("分析会议", "failed", error=error_msg)
        print(f"❌ {error_msg}")
        return {"error": error_msg, "fallback_used": True}

# 新增：修复JSON格式的工具函数
def fix_json_format(json_str):
    """Repair a few common mistakes in JSON text produced by the model.

    Three fixes are applied:

    1. ``"responsible_person"`` keys are renamed to ``"assignee"``.
    2. Trailing commas before ``]`` or ``}`` are removed.
    3. A ``"risks"`` value written with set-like braces
       (``"risks": {"a", "b"}``) is rewritten as an array.  The rewrite is
       only applied when the brace contents form a valid JSON array body, so
       a genuine object value is left untouched.

    The previous implementation used a variable-width look-behind, which
    Python's ``re`` module rejects, so every call raised ``re.error``.

    Args:
        json_str: Raw JSON text.

    Returns:
        The repaired JSON text (not parsed).
    """
    def _risks_to_array(match):
        inner = match.group(1)
        try:
            json.loads(f"[{inner}]")
        except ValueError:
            # Not a plain list of values (e.g. key/value pairs): keep as-is
            return match.group(0)
        return f'"risks": [{inner}]'

    # Rename the alternative assignee key
    json_str = json_str.replace('"responsible_person"', '"assignee"')

    # Drop trailing commas first so the array check below sees clean content
    json_str = re.sub(r',\s*([\]}])', r' \1', json_str)

    # Convert brace-delimited risks (without nested braces) to an array
    return re.sub(r'"risks":\s*\{([^{}]*)\}', _risks_to_array, json_str)
# ======================
# Notion报告生成
# ======================
def _notion_rich_text(content, limit=2000):
    """Return a Notion rich_text array for *content*.

    The value is converted with ``str`` (None becomes an empty string) and
    split into text objects of at most *limit* characters, because Notion
    rejects a text object whose content is longer than 2000 characters.
    """
    text = "" if content is None else str(content)
    if not text:
        return [{"text": {"content": ""}}]
    return [{"text": {"content": text[i:i + limit]}} for i in range(0, len(text), limit)]


def _notion_text_block(block_type, content, **extra):
    """Build one Notion block of *block_type* whose rich text is *content*."""
    return {
        "object": "block",
        "type": block_type,
        block_type: {"rich_text": _notion_rich_text(content), **extra},
    }


def _notion_table_row(values):
    """Build a Notion table_row block with one cell per entry of *values*."""
    return {
        "object": "block",
        "type": "table_row",
        "table_row": {"cells": [_notion_rich_text(value) for value in values]},
    }


def create_notion_report_page(meeting_data, transcript, logs):
    """Create a Notion child page containing the meeting report.

    The page is created under ``NOTION_PAGE_ID`` and filled with meeting
    details, summary, key points, an action-item table and the console log.

    Args:
        meeting_data: Analysis dict (meeting_title, date, participants,
            language, platform, meeting_type, summary, key_points,
            action_items are read).
        transcript: Currently unused; kept for interface compatibility.
        logs: Currently unused; the log text comes from the module ``logger``.

    Returns:
        The URL of the new page (empty string when the API response has no
        ``url``), or None when the parent page is inaccessible or any Notion
        call fails.
    """
    logger.log_step("创建Notion报告", "started")

    try:
        # Verify the parent page is reachable before creating anything
        try:
            parent_page = notion.pages.retrieve(NOTION_PAGE_ID)
            # ``or [{}]`` also covers an empty title list, which previously
            # raised IndexError and was reported as an inaccessible page.
            title_parts = parent_page.get('properties', {}).get('title', {}).get('title') or [{}]
            page_title = title_parts[0].get('plain_text', '无标题')
            logger.log_step("父页面检查", "success", {"页面ID": NOTION_PAGE_ID, "标题": page_title})
            print(f"✅ 成功访问父页面: {page_title} (ID: {NOTION_PAGE_ID[:8]}...)")
        except errors.APIResponseError as e:
            if e.status == 404:
                error_msg = f"父页面不存在 (ID: {NOTION_PAGE_ID})。请检查ID是否正确。"
            elif e.status == 403:
                error_msg = f"没有访问父页面的权限 (ID: {NOTION_PAGE_ID})。请将页面共享给Notion集成。"
            else:
                error_msg = f"访问父页面失败: {str(e)}"
            logger.log_step("父页面检查", "failed", error=error_msg)
            print(f"❌ {error_msg}")
            return None
        except Exception as e:
            error_msg = f"父页面检查出错: {str(e)}"
            logger.log_step("父页面检查", "failed", error=error_msg)
            print(f"❌ {error_msg}")
            return None

        # Create the child page (title capped at 200 characters)
        report_title = str(meeting_data.get("meeting_title") or "会议报告")[:200]
        new_page = notion.pages.create(
            parent={"page_id": NOTION_PAGE_ID},
            properties={"title": {"title": [{"text": {"content": report_title}}]}}
        )
        page_id = new_page["id"]
        logger.log_step("创建子页面", "success", {"页面ID": page_id})
        print(f"✅ 已创建子页面 (ID: {page_id[:8]}...)")

        children_blocks = []

        # 1. Meeting details
        participants_text = ', '.join(str(p) for p in (meeting_data.get('participants') or []))
        details_text = f"""
        **日期**: {meeting_data.get('date', '未知')}
        **参与者**: {participants_text}
        **语言**: {meeting_data.get('language', '未知')}
        **平台**: {meeting_data.get('platform', '未知')}
        **会议类型**: {meeting_data.get('meeting_type', '未知')}
        """
        children_blocks.append(_notion_text_block("heading_2", "会议详情"))
        children_blocks.append(_notion_text_block("paragraph", details_text.strip()))

        # 2. Summary
        children_blocks.append(_notion_text_block("heading_2", "总结"))
        children_blocks.append(_notion_text_block("paragraph", meeting_data.get('summary', '')))

        # 3. Key points: one heading per category, one bullet per item
        children_blocks.append(_notion_text_block("heading_2", "关键点"))
        key_points = meeting_data.get('key_points') or {}
        if not isinstance(key_points, dict):
            key_points = {}
        for category, items in key_points.items():
            children_blocks.append(_notion_text_block("heading_3", str(category).capitalize()))

            # Model output is not schema-validated: normalise odd shapes
            # instead of crashing after the page has already been created.
            if isinstance(items, str):
                items = [items]
            elif isinstance(items, dict):
                items = [f"{key}: {value}" for key, value in items.items()]
            elif not isinstance(items, (list, tuple)):
                items = []

            for item in items:
                children_blocks.append(_notion_text_block("bulleted_list_item", item))

        # 4. Action items as a four-column table
        children_blocks.append(_notion_text_block("heading_2", "行动项"))
        action_items = meeting_data.get('action_items') or []
        if not isinstance(action_items, list):
            action_items = [action_items]

        table_rows = [_notion_table_row(["序号", "任务", "负责人", "截止日期"])]
        for idx, item in enumerate(action_items, start=1):
            if isinstance(item, dict):
                task = item.get('task', '')
                assignee = item.get('assignee') or '未分配'
                due_date = item.get('due_date') or '无'
            else:
                # Plain (non-dict) entries are treated as the task text
                task, assignee, due_date = item, '未分配', '无'
            table_rows.append(_notion_table_row([idx, task, assignee, due_date]))

        children_blocks.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 4,
                "has_column_header": True,
                "has_row_header": False,
                "children": table_rows
            }
        })

        # 5. Processing logs as a code block (a language value is required)
        children_blocks.append(_notion_text_block("heading_2", "Processing Logs"))
        children_blocks.append(
            _notion_text_block("code", logger.get_console_log(), language="plain text")
        )

        # Append the content in batches: Notion accepts at most 100 children
        # per append request.
        batch_size = 100
        for start in range(0, len(children_blocks), batch_size):
            notion.blocks.children.append(
                block_id=page_id,
                children=children_blocks[start:start + batch_size]
            )
        logger.log_step("添加内容到页面", "success")
        print(f"✅ 已添加内容到子页面")

        # Link to the database (best effort).
        # NOTE(review): the new page is a child of a page, and the relation is
        # given a database id rather than page ids, so this update looks like
        # it can only fail and be downgraded to a warning — confirm intent.
        if NOTION_DB_ID:
            try:
                notion.pages.update(
                    page_id=page_id,
                    properties={
                        "Database Relation": {
                            "relation": [{"id": NOTION_DB_ID}]
                        }
                    }
                )
                logger.log_step("关联数据库", "success")
                print(f"✅ 已关联到数据库 (ID: {NOTION_DB_ID[:8]}...)")
            except Exception as e:
                logger.log_step("关联数据库", "warning", error=str(e))
                print(f"⚠️ 关联数据库失败: {str(e)}")

        report_url = new_page.get("url", "")
        logger.log_step("生成Notion报告", "success", {"URL": report_url})
        return report_url

    except Exception as e:
        error_details = f"Notion API错误: {str(e)}"
        # Include the raw response body when the exception carries one
        if hasattr(e, 'response') and hasattr(e.response, 'content'):
            error_details += f"\n响应: {e.response.content.decode('utf-8')}"
        logger.log_step("生成Notion报告", "failed", error=error_details)
        print(f"❌ Notion操作失败: {error_details}")
        return None

# ======================
# 权限测试函数
# ======================
def test_notion_permissions():
    """Check that the Notion token is valid and the parent page is reachable.

    Runs two probes against the module-level ``notion`` client and prints a
    human-readable diagnosis for each:

    1. ``notion.users.me()`` to validate the integration token.
    2. ``notion.pages.retrieve(NOTION_PAGE_ID)`` to validate page access.

    Returns:
        bool: True when both probes succeed, False as soon as one fails.
    """
    print("\n=== 开始Notion权限测试 ===")
    print(f"使用的父页面ID: {NOTION_PAGE_ID[:8]}... (完整: {NOTION_PAGE_ID})")

    # 1. Validate the integration token.
    # NOTE(review): assumes `errors` is `notion_client.errors`. That module
    # exposes APIResponseError (with an HTTP `status`), not a dedicated
    # "UnauthorizedError", so an invalid token is recognised by status 401,
    # the same way step 2 below inspects 404/403.
    try:
        user_info = notion.users.me()
    except errors.APIResponseError as e:
        if e.status == 401:
            print(f"❌ 集成令牌无效 (NOTION_TOKEN错误)")
        else:
            print(f"❌ 验证集成令牌时出错: {str(e)}")
        return False
    except Exception as e:
        print(f"❌ 验证集成令牌时出错: {str(e)}")
        return False

    # Per the Notion API reference the workspace name of a bot user is nested
    # under "bot"; the top-level key is still honoured first in case the
    # client flattens it.
    workspace_name = (
        user_info.get('workspace_name')
        or (user_info.get('bot') or {}).get('workspace_name')
        or '未知'
    )
    print(f"✅ 集成令牌有效 (所属工作空间: {workspace_name})")

    # 2. Validate access to the parent page.
    try:
        page = notion.pages.retrieve(NOTION_PAGE_ID)
    except errors.APIResponseError as e:
        if e.status == 404:
            print(f"❌ 父页面不存在 (ID错误或页面已删除)")
        elif e.status == 403:
            print(f"❌ 没有访问权限 (请将页面共享给集成)")
        else:
            print(f"❌ 访问页面时出错 (状态码: {e.status}): {str(e)}")
        return False
    except Exception as e:
        print(f"❌ 访问页面时发生未知错误: {str(e)}")
        return False

    # The page was retrieved, so access is confirmed. An untitled page has an
    # empty title list; that must not be reported as an access failure.
    title_parts = page.get('properties', {}).get('title', {}).get('title') or []
    page_title = "".join(part.get('plain_text', '') for part in title_parts) or '无标题'
    print(f"✅ 成功访问父页面: {page_title}")
    return True

# ======================
# 输入处理
# ======================
def _prompt_with_default(prompt, default):
    """Ask the user for a menu choice; return *default* on empty or unavailable input."""
    try:
        return input(prompt).strip() or default
    except Exception:
        # stdin may be unavailable (non-interactive run). Unlike a bare
        # `except:`, this still lets KeyboardInterrupt abort the workflow.
        return default


def _detect_transcript_language(text):
    """Best-effort language detection on the first 500 characters of *text*.

    Logs the outcome under the "检测语言" step.

    Returns:
        tuple: ``(lang, detected)`` where ``lang`` falls back to ``'fr'`` and
        ``detected`` is False when detection was impossible or failed.
    """
    sample = text[:500]
    lang = None
    if sample.strip():
        try:
            lang = detect(sample)
        except Exception:
            # Detection is a convenience, never a reason to abort the run.
            lang = None

    if lang is None:
        logger.log_step("检测语言", "warning", "使用默认语言法语")
        return 'fr', False

    logger.log_step("检测语言", "success", {"语言": lang})
    return lang, True


def _read_docx_text(data):
    """Return the paragraph text of a .docx document given as raw bytes."""
    with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
        tmp.write(data)
        docx_path = tmp.name
    # The file is closed (and therefore fully flushed) before it is parsed.
    try:
        doc = Document(docx_path)
        return "\n".join(p.text for p in doc.paragraphs)
    finally:
        os.unlink(docx_path)


def _transcribe_uploaded_audio():
    """Upload an audio file, pick a Whisper model and transcribe it.

    Returns:
        tuple | None: ``(transcript, detected_lang)``, or None when nothing
        was uploaded (the caller then shows the menu again).

    Raises:
        ValueError: if the uploaded file has an unsupported extension.
    """
    uploaded = files.upload()
    if not uploaded:
        logger.log_step("上传音频", "failed", "未上传任何文件")
        print("⚠️ 未上传文件，切换到文本输入")
        return None

    filename = next(iter(uploaded))
    logger.log_step("上传音频", "success", {"文件名": filename, "大小": len(uploaded[filename])})
    print(f"✅ 已上传音频文件: {filename}")

    ext = os.path.splitext(filename)[1].lower()
    supported_audio = ['.mp3', '.wav', '.m4a', '.opus']
    if ext not in supported_audio:
        error = f"不支持的音频格式: {ext} (支持: {', '.join(supported_audio)})"
        logger.log_step("处理音频", "failed", error=error)
        print(f"❌ {error}")
        raise ValueError(error)

    # Model selection happens before the temp file exists so that aborting
    # at the prompt cannot leave a stray file behind.
    print("\n⚡ 选择转录模型:")
    print("1: 快速 (tiny, 低精度)")
    print("2: 平衡 (base, 推荐)")
    print("3: 高精度 (small, 较慢)")
    model_choice = _prompt_with_default("请选择模型 (1/2/3): ", "2")
    model_size = {"1": "tiny", "2": "base", "3": "small"}.get(model_choice, "base")
    print(f"使用模型: {model_size}")

    with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
        tmp.write(uploaded[filename])
        audio_path = tmp.name

    try:
        duration = get_audio_duration(audio_path)
        print(f"音频时长: {duration:.1f}秒，开始转录...")
        transcript, detected_lang = transcribe_audio(audio_path, model_size)
    finally:
        # Remove the temp copy even when probing or transcription fails.
        os.unlink(audio_path)

    print(f"✅ 转录完成 (语言: {detected_lang})")
    return transcript, detected_lang


def _read_uploaded_text_file():
    """Upload a .txt or .docx file and return its text with the detected language.

    Returns:
        tuple | None: ``(transcript, lang)``, or None when nothing was
        uploaded (the caller then shows the menu again).

    Raises:
        ValueError: if the file is neither .txt nor .docx.
        Exception: any decoding/parsing error is logged and re-raised.
    """
    uploaded = files.upload()
    if not uploaded:
        logger.log_step("上传文本", "failed", "未上传任何文件")
        print("⚠️ 未上传文件，切换到直接粘贴")
        return None

    filename = next(iter(uploaded))
    logger.log_step("上传文本", "success", {"文件名": filename})
    print(f"✅ 已上传文件: {filename}")

    try:
        # Case-insensitive match, consistent with the audio branch.
        lowered = filename.lower()
        if lowered.endswith('.txt'):
            # utf-8-sig also accepts plain UTF-8 and drops a leading BOM.
            transcript = uploaded[filename].decode('utf-8-sig')
        elif lowered.endswith('.docx'):
            transcript = _read_docx_text(uploaded[filename])
        else:
            raise ValueError(f"不支持的文件格式: {filename} (支持: .txt, .docx)")

        lang, _ = _detect_transcript_language(transcript)
        print(f"✅ 读取完成 (检测语言: {lang})")
        return transcript, lang
    except Exception as e:
        logger.log_step("处理文本文件", "failed", error=str(e))
        print(f"❌ 处理文件出错: {str(e)}")
        raise


def _read_pasted_transcript():
    """Read a transcript pasted line by line; an empty line (or EOF) ends it.

    Returns:
        tuple | None: ``(transcript, lang)``, or None when nothing was
        entered (the caller then shows the menu again).
    """
    print("\n请粘贴会议记录 (粘贴后按Enter，输入空行结束):")
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            break
        if not line:
            break
        lines.append(line)
    transcript = "\n".join(lines)

    if not transcript.strip():
        logger.log_step("输入文本", "failed", "未输入任何内容")
        print("⚠️ 未输入任何内容，重新选择输入方式")
        return None

    lang, detected = _detect_transcript_language(transcript)
    if detected:
        print(f"✅ 已输入文本 (检测语言: {lang})")
    else:
        print(f"✅ 已输入文本 (使用默认语言: 法语)")
    return transcript, lang


def handle_transcript_input():
    """Interactively obtain the meeting transcript and its language.

    Shows a three-way menu (audio upload, text-file upload, pasted text) and
    delegates to the matching reader. When the reader yields nothing (no
    upload, empty paste) or the choice is invalid, the menu is shown again.

    Returns:
        tuple: ``(transcript, language_code)``.

    Raises:
        ValueError: for unsupported audio or text file formats.
        Exception: errors from transcription or file parsing propagate.
    """
    logger.log_step("处理输入", "started")

    print("\n=== 输入方式选择 ===")
    print("1: 上传音频文件 (.mp3/.wav/.m4a/.opus)")
    print("2: 上传文本文件 (.txt/.docx)")
    print("3: 直接粘贴文本")

    choice = _prompt_with_default("请选择输入方式 (1/2/3): ", "1")
    readers = {
        "1": _transcribe_uploaded_audio,
        "2": _read_uploaded_text_file,
        "3": _read_pasted_transcript,
    }
    reader = readers.get(choice)

    if reader is None:
        # NOTE(review): the messages announce a fallback to audio input, but
        # the menu is simply shown again; confirm which is intended.
        logger.log_step("选择输入方式", "warning", "无效选择，使用默认音频输入")
        print("⚠️ 无效选择，默认使用音频输入")
        return handle_transcript_input()

    result = reader()
    if result is None:
        return handle_transcript_input()
    return result

# ======================
# 主函数
# ======================
def main():
    """Run the whole workflow: permission check, input, analysis, Notion report.

    Every failure is logged through the module-level ``logger``, written to
    ``error_logs.json`` and reported on stdout; nothing is raised to the
    caller for ordinary exceptions.
    """
    logger.log_step("工作流程", "started")
    print("=== 会议记录处理工具 ===")

    try:
        # Fail fast when the Notion token or the parent page is unusable.
        if not test_notion_permissions():
            print("\n❌ 权限测试未通过，请先解决上述问题")
            log_file = logger.save_logs("error_logs.json")
            print(f"错误日志已保存到: {log_file}")
            return

        # Obtain the transcript.
        transcript, language = handle_transcript_input()
        logger.log_metric("转录文本长度", len(transcript))

        # Analyse the meeting.
        meeting_data = analyze_meeting(transcript, language)
        if "error" in meeting_data:
            raise RuntimeError(f"分析失败: {meeting_data['error']}")

        # Create the Notion report.
        print("\n开始创建Notion报告...")
        report_url = create_notion_report_page(meeting_data, transcript, logger.logs)

        if not report_url:
            raise RuntimeError("创建Notion报告失败")

        # Done.
        log_file = logger.save_logs()
        print(f"\n🎉 处理完成！")
        print(f"📄 会议报告URL: {report_url}")
        print(f"📋 日志文件: {log_file}")

        # The clickable link is a notebook nicety: the URL is already printed
        # above, so a rendering problem must not turn a finished run into a
        # reported failure.
        try:
            from IPython.display import HTML, display
            display(HTML(f'<a href="{report_url}" target="_blank">点击打开Notion报告</a>'))
        except Exception as e:
            logger.log_step("显示报告链接", "warning", error=str(e))

    except Exception as e:
        logger.log_step("工作流程", "failed", error=str(e))
        log_file = logger.save_logs("error_logs.json")
        print(f"\n❌ 处理失败！")
        print(f"错误详情: {str(e)}")
        print(f"错误日志已保存到: {log_file}")

# Run the workflow only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

Found existing installation: langchain 0.2.0
Uninstalling langchain-0.2.0:
  Successfully uninstalled langchain-0.2.0
Found existing installation: langchain-core 0.2.43
Uninstalling langchain-core-0.2.43:
  Successfully uninstalled langchain-core-0.2.43
Collecting langchain==0.2.0
  Using cached langchain-0.2.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain==0.2.0)
  Using cached langchain_core-0.2.43-py3-none-any.whl.metadata (6.2 kB)
Using cached langchain-0.2.0-py3-none-any.whl (973 kB)
Using cached langchain_core-0.2.43-py3-none-any.whl (397 kB)
Installing collected packages: langchain-core, langchain
Successfully installed langchain-0.2.0 langchain-core-0.2.43
Collecting langchain-core==0.2.38
  Using cached langchain_core-0.2.38-py3-none-any.whl.metadata (6.2 kB)
Using cached langchain_core-0.2.38-py3-none-any.whl (396 kB)
Installing collected packages: langchain-core
  Attempting uninstall: langchain-core
    Found existing installa

Saving DUO.mp3 to DUO.mp3
✅ 已上传音频文件: DUO.mp3

⚡ 选择转录模型:
1: 快速 (tiny, 低精度)
2: 平衡 (base, 推荐)
3: 高精度 (small, 较慢)
请选择模型 (1/2/3): 1
使用模型: tiny
音频时长: 1783.4秒，开始转录...


100%|█████████████████████████████████████| 72.1M/72.1M [00:01<00:00, 65.1MiB/s]


Detected language: English


100%|██████████| 178604/178604 [03:39<00:00, 813.19frames/s]


✅ 转录完成 (语言: en)

开始分析会议内容...
使用的转录文本长度: 15000字符
✅ 会议分析完成 (标题: French Podcast Episode Featuring Simone Ego)

开始创建Notion报告...
✅ 成功访问父页面: Parent Page (ID: 2335fee1...)
✅ 已创建子页面 (ID: 2335fee1...)
✅ 已添加内容到子页面
⚠️ 关联数据库失败: Invalid property identifier Database Relation

🎉 处理完成！
📄 会议报告URL: https://www.notion.so/French-Podcast-Episode-Featuring-Simone-Ego-2335fee18e3781ea961acd89a8144485
📋 日志文件: meeting_logs.json
