In [None]:
import requests
import json
import os
from datetime import datetime
from pydantic import BaseModel, Field
from typing import List, Dict
from collections import defaultdict

# Pydantic model definition
class ClassifiedEntities(BaseModel):
    """Entities of one type, grouped together.

    Each instance carries an entity type label, the entity names collected
    for that type, and a count. The count is a required field supplied by
    whoever builds the instance; this model does not derive it from
    ``entities``.
    """
    # Entity type label, e.g. "dis" or "sym".
    type: str = Field(description="实体类型")
    # Entity names belonging to this type; defaults to a fresh empty list.
    entities: List[str] = Field(default_factory=list, description="该类型的所有实体")
    # Number of entities of this type (required, no default).
    count: int = Field(description="该类型的实体数量")

def test_api_connection():
    """
    Probe the NER endpoint with a tiny request and report whether it answers.

    Prints the status code, response headers and the first 200 characters of
    the response body along the way.

    Returns:
        True when the service replies with HTTP 200; False for any other
        status code or when the request raises.
    """
    endpoint = "https://zjlchat-ner.vip.cpolar.cn/predict"
    request_headers = {"Content-Type": "application/json"}
    payload = {"text": "测试文本"}

    print("正在测试API连接...")
    print(f"API地址: {endpoint}")

    try:
        reply = requests.post(endpoint, json=payload, headers=request_headers, timeout=10)
        status = reply.status_code
        print(f"状态码: {status}")
        print(f"响应头: {dict(reply.headers)}")
        print(f"响应内容: {reply.text[:200]}")  # only the first 200 characters

        reachable = status == 200
        print("✓ API连接正常" if reachable else f"✗ API返回错误状态码: {status}")
        return reachable
    except Exception as exc:
        # Best-effort probe: any failure simply means "not reachable".
        print(f"✗ API连接失败: {exc}")
        return False

def split_into_sentences(text):
    """
    Split text into sentences, keeping each sentence's closing punctuation.

    A sentence is a run of characters that ends at one or more of the
    terminators ``。``, ``！``, ``？`` or a newline. A run of consecutive
    terminators (for example ``？！``) stays attached to the sentence it
    closes. Trailing text without a terminator is returned as a final
    sentence. Sentences are stripped of surrounding whitespace, and pieces
    that contain only whitespace and/or terminators are skipped.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty sentence strings in their original order.
    """
    import re

    if not text:
        return []

    sentences = []
    # Group 1 is the sentence body, group 2 the (possibly empty) run of
    # terminators that follows it. Matching the whole run at once keeps
    # stacked punctuation such as "？！" together with its sentence.
    for match in re.finditer(r'([^。！？\n]+)([。！？\n]*)', text):
        body, terminators = match.groups()
        if not body.strip():  # skip pieces with no real content
            continue
        sentences.append((body + terminators).strip())

    return sentences

def _ner_failure(sentence, reason):
    """Build the record stored for a sentence whose NER request did not succeed."""
    return {"text": sentence, "entities": [], "error": reason}


def _recognize_entities(sentence, url, headers):
    """POST one sentence to the NER service and return its result record."""
    try:
        response = requests.post(url, json={"text": sentence}, headers=headers, timeout=30)

        # 405 is reported with its own message instead of the generic HTTP error.
        if response.status_code == 405:
            print("  ✗ 405错误：方法不允许")
            return _ner_failure(sentence, "405 Method Not Allowed")

        response.raise_for_status()
        payload = response.json()

        # Keep only the entity name and its type from each returned item.
        simplified = []
        if "entities" in payload and isinstance(payload["entities"], list):
            simplified = [
                {"entity": item.get("entity", ""), "type": item.get("type", "")}
                for item in payload["entities"]
            ]

        print(f"  ✓ 识别到 {len(simplified)} 个实体")
        return {"text": sentence, "entities": simplified}

    # ConnectionError is checked before Timeout so that exceptions deriving
    # from both are reported as connection errors.
    except requests.exceptions.ConnectionError:
        print("  ✗ 连接错误")
        return _ner_failure(sentence, "连接错误")
    except requests.exceptions.Timeout:
        print("  ✗ 请求超时")
        return _ner_failure(sentence, "请求超时")
    except requests.exceptions.HTTPError:
        print("  ✗ HTTP错误")
        return _ner_failure(sentence, "HTTP错误")
    except Exception as e:
        print(f"  ✗ 其他错误: {e}")
        return _ner_failure(sentence, str(e))


def _classify_entities(results):
    """Group recognised entity names by type; return (classified list, mention count)."""
    names_by_type = defaultdict(set)  # a set per type removes duplicates
    mention_count = 0

    for record in results:
        entities = record.get("entities", [])
        mention_count += len(entities)
        for entity in entities:
            entity_type = entity.get("type", "")
            name = entity.get("entity", "")
            # Entries without a type or without a name carry no usable
            # information, so they are left out of the classified output.
            if entity_type and name:
                names_by_type[entity_type].add(name)

    classified = [
        ClassifiedEntities(type=entity_type, entities=sorted(names), count=len(names))
        for entity_type, names in sorted(names_by_type.items())
    ]
    return classified, mention_count


def process_ner_text():
    """
    Extract medical named entities from ner.txt as input for a knowledge graph.

    Steps:
    1. Read ner.txt, join all non-empty lines and split the text into sentences.
    2. Send every sentence to the NER service.
    3. Group the recognised entities by type and de-duplicate them.
    4. Keep only the ``sym`` and ``dis`` types and save them to
       ``ner_results_classified.json`` in the output directory.

    Entity types and the knowledge-graph labels they correspond to:
    - dis (disease) -> Disease
    - sym (symptom) -> Symptom
    - ite (test item) -> Test
    - pro (procedure/treatment) -> Treatment
    - mic (microorganism) -> Pathogen
    - dru (drug) -> Drug
    - bod (body part) -> BodyPart

    Returns:
        ``(output_file, results)`` on success, where ``results`` holds one
        record per processed sentence (``text``, ``entities`` and, for failed
        requests, ``error``). ``(None, None)`` when the input file cannot be
        read or a later processing step fails.
    """
    # API endpoint
    url = "https://zjlchat-ner.vip.cpolar.cn/predict"
    headers = {"Content-Type": "application/json"}

    # File locations
    ner_file_path = r"O:\MyProject\Knowleage\ner.txt"
    output_dir = r"O:\MyProject\Knowleage"

    # Read the input file. Only this step is reported as a read error.
    try:
        with open(ner_file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"错误: 找不到文件 {ner_file_path}")
        return None, None
    except Exception as e:
        print(f"读取文件时发生错误: {e}")
        return None, None

    try:
        print(f"成功读取文件，共 {len(lines)} 行")

        # Join all non-empty lines
        full_text = ' '.join(line.strip() for line in lines if line.strip())
        print(f"拼接后文本总长度: {len(full_text)} 字符")

        # Split into sentences
        sentences = split_into_sentences(full_text)
        print(f"分割后共 {len(sentences)} 个句子\n")

        # Run NER on every sentence
        results = []
        for idx, sentence in enumerate(sentences, 1):
            if not sentence:  # skip empty sentences
                continue

            print(f"正在处理第 {idx}/{len(sentences)} 句...")
            print(f"  文本长度: {len(sentence)} 字符")
            if len(sentence) > 50:
                print(f"  内容预览: {sentence[:50]}...")

            results.append(_recognize_entities(sentence, url, headers))

        # Group by type and de-duplicate
        classified_list, total_entity_count = _classify_entities(results)

        # Report every entity type that was recognised
        print(f"\n{'='*60}")
        print("NER识别完成")
        print(f"{'='*60}")
        print("\n统计信息：")
        print(f"  - 总句子数: {len(results)}")
        print(f"  - 总实体数: {total_entity_count}")
        print(f"  - 实体类型数: {len(classified_list)}")
        print("\n识别到的实体类型：")
        for classified in classified_list:
            print(f"  - {classified.type}: {classified.count} 个不同实体")

        # Filter: keep only sym and dis
        print(f"\n{'='*60}")
        print("过滤实体类型：只保留症状和疾病")
        print(f"{'='*60}")

        kept_types = ['sym', 'dis']
        filtered_list = []
        removed_count = 0

        for classified in classified_list:
            if classified.type in kept_types:
                filtered_list.append(classified)
                print(f"  ✓ 保留 {classified.type}: {classified.count} 个实体")
            else:
                removed_count += classified.count
                print(f"  ✗ 移除 {classified.type}: {classified.count} 个实体")

        # Save the filtered result
        output_file = os.path.join(output_dir, "ner_results_classified.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(
                [item.model_dump() for item in filtered_list],
                f,
                ensure_ascii=False,
                indent=2
            )

        # Final summary
        final_entity_count = sum(item.count for item in filtered_list)
        # Kept and removed counts are distinct-entity counts, so the ratio is
        # taken against the distinct total rather than the raw mention count.
        # It is reported as 0 when nothing was recognised at all.
        distinct_total = final_entity_count + removed_count
        kept_ratio = final_entity_count / distinct_total * 100 if distinct_total else 0.0

        print(f"\n{'='*60}")
        print("处理完成！")
        print(f"{'='*60}")
        print(f"结果已保存到: {output_file}")
        print("\n最终统计：")
        print(f"  - 总句子数: {len(results)}")
        print(f"  - 识别实体数: {total_entity_count}")
        print(f"  - 保留实体数: {final_entity_count}")
        print(f"  - 移除实体数: {removed_count}")
        print(f"  - 保留比例: {kept_ratio:.1f}%")
        print("\n保存的实体类型：")
        for classified in filtered_list:
            type_name = "疾病" if classified.type == "dis" else "症状"
            print(f"  - {type_name} ({classified.type}): {classified.count} 个不同实体")

        return output_file, results

    except Exception as e:
        # Anything failing after the file was read is a processing error.
        print(f"处理过程中发生错误: {e}")
        return None, None

# Probe the API first, then process the file whether or not the probe passed.
print("=" * 60)
api_ok = test_api_connection()
print("=" * 60)

# Only the announcement depends on the probe result; processing always runs.
print(
    "\nAPI测试通过，开始处理文件...\n"
    if api_ok
    else "\n警告：API测试失败，但仍然尝试处理文件...\n"
)
output_file, results = process_ner_text()


In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import json
import os
from collections import Counter

# Configure the LLM client.
# ChatOpenAI is pointed at a custom base_url instead of the default OpenAI
# endpoint, and asks it for the "qwen2.5:14b" model.
llm = ChatOpenAI(
    model="qwen2.5:14b",
    base_url="https://zjlchat.vip.cpolar.cn/v1",
    # NOTE(review): placeholder key; presumably the endpoint does not check it - confirm.
    api_key="EMPTY",
    # Sampling settings passed to the model.
    temperature=0.1,
    top_p=0.8
)

# Human-readable description of each entity type, inserted into the LLM prompts.
_ENTITY_TYPE_DESCRIPTIONS = {
    'bod': '身体部位（如皮肤、筋膜、肌肉、器官等）',
    'dep': '医院科室（如急诊科、外科、感染科等）',
    'dis': '疾病名称（如感染、炎症、综合征等）',
    'dru': '药物名称（如抗生素、激素、中成药等）',
    'equ': '医疗设备（如呼吸机、监护仪等）',
    'ite': '检查项目或指标（如血常规、C反应蛋白等）',
    'mic': '微生物或病原体（如细菌、病毒等）',
    'pro': '医疗操作或程序（如手术、清创、检查等）',
    'sym': '症状或体征（如发热、疼痛、水肿等）'
}

# Number of entities sent to the LLM in one validation request.
_VALIDATION_BATCH_SIZE = 30


def _entity_type_label(entity_type):
    """Return the display label for "dis"/"sym", or the raw type code otherwise."""
    if entity_type == "dis":
        return "疾病"
    if entity_type == "sym":
        return "症状"
    return entity_type


def _deduplicate_groups(groups):
    """Drop case-insensitive duplicates in every group; return (groups, duplicates removed)."""
    deduplicated = []
    duplicate_count = 0

    for group in groups:
        entity_type = group['type']
        original_entities = group['entities']

        # Compare on the lower-cased, stripped form; keep first-seen order.
        seen = set()
        unique_entities = []
        for entity in original_entities:
            key = entity.lower().strip()
            if key in seen:
                duplicate_count += 1
                continue
            seen.add(key)
            unique_entities.append(entity.strip())

        deduplicated.append({
            'type': entity_type,
            'entities': unique_entities,
            'count': len(unique_entities)
        })

        dropped = len(original_entities) - len(unique_entities)
        if dropped:
            print(f"  {entity_type}: {len(original_entities)} → {len(unique_entities)} (去除 {dropped} 个)")

    return deduplicated, duplicate_count


def _validate_entity_batch(batch, entity_type, type_desc, batch_number):
    """Ask the LLM which entities of one batch are valid; return the entities to keep.

    If the call fails or the reply contains no JSON array, the whole batch is kept.
    """
    system_prompt = f"""你是一个医学实体验证专家。你的任务是检查给定的实体列表，判断哪些实体属于指定的类别，哪些不属于或明显错误。

当前类别：{entity_type} - {type_desc}

验证标准：
1. 实体必须是完整的医学术语，不能是句子片段或无意义内容
2. 实体必须明确属于当前类别
3. 剔除明显的识别错误（如乱码、标点符号、数字串等）
4. 保留缩写、专业术语、中英文混合的医学术语

输出格式：只输出JSON数组，包含有效的实体名称
["实体1", "实体2", "实体3"]

重要：直接输出JSON数组，不要任何解释或markdown格式。"""

    user_prompt = f"""请检查以下实体列表，只保留属于"{type_desc}"类别的有效实体：

{json.dumps(batch, ensure_ascii=False)}

输出有效实体的JSON数组："""

    try:
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ]
        response_text = llm.invoke(messages).content.strip()

        if '[' not in response_text or ']' not in response_text:
            # No JSON array in the reply: keep everything.
            print(f"  批次 {batch_number}: 解析失败，保留所有实体")
            return list(batch)

        # Take the outermost [...] span of the reply.
        json_text = response_text[response_text.find('['):response_text.rfind(']') + 1]
        reply_items = json.loads(json_text)

        # Validation may only select from the batch. Anything else in the
        # reply (invented names, non-string items) is ignored so it cannot
        # leak into the saved data or break later sorting.
        approved = {item.strip() for item in reply_items if isinstance(item, str)}
        kept = [entity for entity in batch if entity in approved]

        if len(kept) < len(batch):
            print(f"  批次 {batch_number}: 保留 {len(kept)}/{len(batch)} 个实体")
        return kept

    except Exception as e:
        print(f"  批次 {batch_number}: 处理出错 ({e})，保留所有实体")
        return list(batch)


def _validate_groups(groups):
    """Run LLM validation over every group; return (cleaned groups, entities removed)."""
    cleaned = []
    total_removed = 0

    for group in groups:
        entity_type = group['type']
        entities = group['entities']
        type_desc = _ENTITY_TYPE_DESCRIPTIONS.get(entity_type, entity_type)

        print(f"正在检查 {entity_type} 类型的 {len(entities)} 个实体...")

        valid_entities = []
        for start in range(0, len(entities), _VALIDATION_BATCH_SIZE):
            batch = entities[start:start + _VALIDATION_BATCH_SIZE]
            batch_number = start // _VALIDATION_BATCH_SIZE + 1
            valid_entities.extend(
                _validate_entity_batch(batch, entity_type, type_desc, batch_number)
            )

        # De-duplicate once more and sort; the count is taken from this final
        # list so it always matches the stored entities.
        unique_valid = sorted(set(valid_entities))
        removed_count = len(entities) - len(unique_valid)
        total_removed += removed_count

        cleaned.append({
            'type': entity_type,
            'entities': unique_valid,
            'count': len(unique_valid)
        })

        if removed_count > 0:
            print(f"  ✓ {entity_type}: {len(entities)} → {len(unique_valid)} (剔除 {removed_count} 个)\n")
        else:
            print(f"  ✓ {entity_type}: 保持 {len(unique_valid)} 个\n")

    return cleaned, total_removed


def _merge_entity_group(entity_group):
    """Ask the LLM to merge similar entities of one group; return (group, entities merged).

    The original group is returned unchanged, with a merged count of 0, when
    the call fails or the reply cannot be used.
    """
    entity_type = entity_group['type']
    entities = entity_group['entities']

    print(f"正在分析 {entity_type} 类型的 {len(entities)} 个实体...")

    system_prompt = f"""你是医学实体标准化专家。任务是识别并合并语义相似或重复的医学实体。

实体类型：{entity_type} - {'疾病' if entity_type == 'dis' else '症状'}

合并规则：
1. 语义完全相同的实体应合并（如"发热"和"发烧"）
2. 一般概念和特殊概念可合并（如"疼痛"可包含"剧烈疼痛"）
3. 带修饰词的实体可与核心概念合并（如"糖尿病患者"合并到"糖尿病"）
4. 同一疾病的不同称呼应合并（如"NSTIs"和"坏死性软组织感染"）
5. 保留最标准、最常用的名称作为合并后的名称
6. 如果不确定是否应该合并，保持独立

输出格式：JSON对象，包含合并映射和最终实体列表
{{
  "merges": [
    {{"original": ["实体1", "实体2"], "merged": "标准名称", "reason": "合并原因"}},
  ],
  "final_entities": ["标准实体1", "标准实体2", ...]
}}

重要：直接输出JSON，不要markdown格式。"""

    user_prompt = f"""请分析以下{len(entities)}个实体，识别并合并相似的实体：

{json.dumps(entities, ensure_ascii=False)}

输出合并结果的JSON："""

    try:
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ]
        response_text = llm.invoke(messages).content.strip()

        if '{' not in response_text or '}' not in response_text:
            print(f"  ⚠ {entity_type}: 解析失败，保持原样\n")
            return entity_group, 0

        # Strip a markdown code fence if the model added one anyway.
        if '```json' in response_text:
            response_text = response_text.split('```json')[1].split('```')[0].strip()
        elif '```' in response_text:
            response_text = response_text.split('```')[1].split('```')[0].strip()

        json_text = response_text[response_text.find('{'):response_text.rfind('}') + 1]
        merge_result = json.loads(json_text)

        usable = (
            isinstance(merge_result, dict)
            and 'merges' in merge_result
            and isinstance(merge_result.get('final_entities'), list)
        )
        # Keep only non-empty string names, de-duplicated, so that the stored
        # count matches the stored list.
        final = sorted({
            name.strip()
            for name in (merge_result['final_entities'] if usable else [])
            if isinstance(name, str) and name.strip()
        })
        if not final:
            # A missing, malformed or empty final list must not wipe the group.
            print(f"  ⚠ {entity_type}: 解析失败，保持原样\n")
            return entity_group, 0

        # Show the merge details. The tally is kept local and only reported
        # to the caller once the whole reply has been processed successfully.
        merged_count = 0
        merges = merge_result['merges']
        if merges:
            print(f"  发现 {len(merges)} 组需要合并的实体：")
            for merge in merges:
                originals = merge.get('original', [])
                merged = merge.get('merged', '')
                reason = merge.get('reason', '')
                if len(originals) > 1:
                    print(f"    • {' + '.join(originals)} → {merged}")
                    print(f"      原因: {reason}")
                    merged_count += len(originals) - 1

        reduction = len(entities) - len(final)
        if reduction > 0:
            print(f"  ✓ {entity_type}: {len(entities)} → {len(final)} (合并 {reduction} 个)\n")
        else:
            print(f"  ✓ {entity_type}: 保持 {len(final)} 个（无需合并）\n")

        return {'type': entity_type, 'entities': final, 'count': len(final)}, merged_count

    except Exception as e:
        print(f"  ✗ {entity_type}: 处理出错 ({e})，保持原样\n")
        return entity_group, 0


def clean_entities():
    """
    Clean the entities stored in ner_results_classified.json.

    Steps:
    1. Remove duplicate entities (case-insensitive, per type).
    2. Use the LLM to drop entities that are invalid for their type.
    3. Use the LLM to merge similar diseases (``dis``) and symptoms (``sym``).

    The input file may be either a plain list of entity groups or a dict with
    an ``entities`` key; in the latter case the ``relationships``,
    ``relationship_count`` and ``source_text_length`` values are carried over.
    The cleaned data is written back to the same file.

    Returns:
        The list of cleaned entity groups (dicts with ``type``, ``entities``
        and ``count``), or ``None`` when the file is missing or any step
        raises.
    """
    classified_file = r"O:\MyProject\Knowleage\ner_results_classified.json"

    try:
        # Load the current data
        with open(classified_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print("=" * 60)
        print("实体清理智能体启动")
        print("=" * 60)
        print()

        # Detect which of the two file layouts is in use
        has_relationships = isinstance(data, dict) and 'entities' in data
        classified_entities = data['entities'] if has_relationships else data

        print("原始数据统计：")
        total_entities_before = 0
        for entity_group in classified_entities:
            count = entity_group['count']
            total_entities_before += count
            type_name = _entity_type_label(entity_group['type'])
            print(f"  - {type_name} ({entity_group['type']}): {count} 个")
        print(f"  总计: {total_entities_before} 个实体\n")

        # Step 1: de-duplicate
        print("=" * 60)
        print("步骤 1: 去除重复实体")
        print("=" * 60)

        deduplicated_entities, duplicate_count = _deduplicate_groups(classified_entities)
        print(f"\n✓ 共去除 {duplicate_count} 个重复实体\n")

        # Step 2: LLM validation
        print("=" * 60)
        print("步骤 2: 使用LLM验证实体有效性")
        print("=" * 60)
        print("（这可能需要几分钟，请耐心等待）\n")

        cleaned_entities, total_removed = _validate_groups(deduplicated_entities)
        print(f"✓ 共剔除 {total_removed} 个无效实体\n")

        # Step 3: merge similar entities (dis and sym only)
        print("=" * 60)
        print("步骤 3: 合并相似实体")
        print("=" * 60)
        print("（针对疾病和症状进行智能合并）\n")

        merged_entities = []
        total_merged = 0
        for entity_group in cleaned_entities:
            mergeable = (
                entity_group['type'] in ['dis', 'sym']
                and len(entity_group['entities']) > 1
            )
            if mergeable:
                entity_group, merged_count = _merge_entity_group(entity_group)
                total_merged += merged_count
            # Other types, and groups with a single entity, are kept as they are.
            merged_entities.append(entity_group)

        if total_merged > 0:
            print(f"✓ 共合并 {total_merged} 个相似实体\n")
        else:
            print("✓ 未发现需要合并的实体\n")

        cleaned_entities = merged_entities

        # Save the cleaned data
        print("=" * 60)
        print("保存清理结果")
        print("=" * 60)

        if has_relationships:
            # Carry the existing relationship data over unchanged
            result_data = {
                'entities': cleaned_entities,
                'relationships': data.get('relationships', []),
                'relationship_count': data.get('relationship_count', 0),
                'source_text_length': data.get('source_text_length', 0)
            }
        else:
            result_data = cleaned_entities

        with open(classified_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"✓ 清理后的数据已保存到: {classified_file}\n")

        # Cleaning statistics
        print("=" * 60)
        print("清理结果统计")
        print("=" * 60)

        total_entities_after = sum(eg['count'] for eg in cleaned_entities)
        # An input without any entities must not crash the summary.
        keep_rate = (
            total_entities_after / total_entities_before * 100
            if total_entities_before
            else 0.0
        )

        print(f"清理前总实体数: {total_entities_before}")
        print(f"清理后总实体数: {total_entities_after}")
        print(f"总共处理: {total_entities_before - total_entities_after} 个实体")
        print(f"  - 剔除无效: {total_removed} 个")
        print(f"  - 合并重复: {total_merged} 个")
        print(f"保留率: {keep_rate:.1f}%\n")

        print("各类型实体统计：")
        for entity_group in cleaned_entities:
            type_name = _entity_type_label(entity_group['type'])
            print(f"  - {type_name} ({entity_group['type']}): {entity_group['count']} 个")

        # Show a sample of the cleaned entities
        print("\n" + "=" * 60)
        print("清理后的实体样例")
        print("=" * 60)
        for entity_group in cleaned_entities:
            entity_type = entity_group['type']
            entities = entity_group['entities']
            type_name = _entity_type_label(entity_type)

            # Diseases and symptoms show up to 10 samples, other types 5
            display_count = 10 if entity_type in ['dis', 'sym'] else 5

            print(f"\n{type_name} ({entity_type}) - 共 {len(entities)} 个:")
            for entity in entities[:display_count]:
                print(f"  • {entity}")
            if len(entities) > display_count:
                print(f"  ... 还有 {len(entities) - display_count} 个")

        return cleaned_entities

    except FileNotFoundError as e:
        print(f"✗ 错误: 找不到文件 - {e}")
        return None
    except Exception as e:
        print(f"✗ 处理过程中发生错误: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the entity cleaning task
print("=" * 60)
print("智能实体清理任务")
print("=" * 60)
print("\n功能：")
print("  1. 去除重复实体")
print("  2. 使用LLM剔除无效实体")
print("  3. 智能合并相似的疾病和症状")
print("\n开始处理...\n")

result = clean_entities()

# clean_entities() returns None on failure. A successful run may return an
# empty list, so compare against None instead of relying on truthiness.
if result is not None:
    print("\n" + "=" * 60)
    print("✓ 实体清理完成！")
    print("=" * 60)
    print("\n完成的处理：")
    print("  ✓ 去重 - 移除完全重复的实体")
    print("  ✓ 验证 - 剔除无效或错误的实体")
    print("  ✓ 合并 - 整合语义相似的疾病和症状")
    print("\n结果已保存到 ner_results_classified.json")
else:
    print("\n✗ 实体清理失败")
