In [3]:
import re
import time
import json
from pymongo import MongoClient
import dashscope
from http import HTTPStatus

# ================= 配置区域 =================
dashscope.api_key = 'sk-fa13f585000140deabdfa506b25a7f3d' 

MONGO_URI = 'mongodb://admin:12345678@192.168.16.138:27017/?authSource=admin'
DB_NAME = "test"
COLLECTION_NAME = "overseas_website_data_all"

# ===========================================

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
source_collection = db[COLLECTION_NAME]

# --- 核心改进：定义“伴生词”列表 ---
# 只有当短关键词（如 Li, Co, U）周围出现这些词时，我们才认为它是化学元素
VALIDATION_CONTEXT_WORDS = {
    "mine", "mining", "mineral", "resource", "deposit", "reserve", "exploration",
    "metal", "metallic", "element", "oxide", "hydroxide", "carbonate", "alloy",
    "battery", "batteries", "cathode", "anode", "ev", "vehicle", "storage",
    "energy", "nuclear", "power", "fuel", "reactor", "plant", # 针对 U (铀)
    "production", "produce", "export", "import", "supply", "chain", "tonnes", "tons",
    "refinery", "processing", "smelting", "grade", "project"
}

# --- 关键词配置 (保持不变) ---
CHINA_KEYWORDS = [
    "China", "Chinese", "Beijing", "PRC", "People's Republic of China", "Mainland China", 
    "State Council", "Zhongnanhai", "CCP", "Communist Party of China", "Xi Jinping", 
    "Politburo", "Yuan", "RMB", "Renminbi", "Belt and Road", "BRI", "Made in China", 
    "Huawei", "Tencent", "Alibaba", "ByteDance", "TikTok", "PLA", "People's Liberation Army", 
    "Shanghai", "Shenzhen", "Hong Kong", "Macau", "Macao", "Taiwan", "Taipei", 
    "Xinjiang", "Tibet", "South China Sea", "Sino-"
]
CHINA_KEYWORD_REGEX = [re.compile(r'\bsino-', re.IGNORECASE)] + [
    re.compile(r'\b' + re.escape(kw.lower()) + r'\b', re.IGNORECASE) for kw in CHINA_KEYWORDS if kw != "Sino-"
]

STRATEGIC_RESOURCES_CONFIG = {
    "稀土": {
        "base_keywords": ["rare earth", "rare earth elements", "REE"], 
        "detailed_tags": {
            "轻稀土": ["Light Rare Earth Elements", "LREE", "Lanthanum", "La", "Cerium", "Ce", "Praseodymium", "Pr", "Neodymium", "Nd", "Promethium", "Pm", "Samarium", "Sm", "Europium", "Eu"],
            "重稀土": ["Heavy Rare Earth Elements", "HREE", "Gadolinium", "Gd", "Terbium", "Tb", "Dysprosium", "Dy", "Holmium", "Ho", "Erbium", "Er", "Thulium", "Tm", "Ytterbium", "Yb", "Lutetium", "Lu", "Yttrium", "Y", "Scandium", "Sc"],
            "永磁材料": ["Permanent Magnets", "NdFeB", "Neodymium Iron Boron", "SmCo", "Samarium Cobalt", "rare earth magnet"],
            "开采与分离技术": ["Mining Technology", "extraction technology", "Separation & Purification", "rare earth separation", "purification process", "solvent extraction"],
            "回收利用": ["Recycling", "rare earth recycling", "urban mining"]
        }
    },
    "矿产": {
        "base_keywords": ["mineral", "minerals", "critical minerals", "strategic minerals"],
        "detailed_tags": {
            "能源矿产": ["Energy Minerals", "Uranium", "U", "Thorium", "Th"],
            "金属矿产": [
                "Metallic Minerals", "Lithium", "Li", "Cobalt", "Co", "Nickel", "Ni", 
                "Tungsten", "W", "Tin", "Sn", "Antimony", "Sb", "Beryllium", "Be", 
                "Niobium", "Nb", "Tantalum", "Ta", "Zirconium", "Zr"
            ],
            "非金属矿产": ["Non-metallic Minerals", "Fluorite", "Graphite", "Quartz"],
            "采矿技术": ["Mining technology", "deep sea mining", "open-pit mining"]
        }
    },
    "能源": {
        "base_keywords": ["energy", "energy resources"],
        "detailed_tags": {
            "传统能源": ["Tradition Energy", "Petroleum", "Oil", "Coal", "Natural Gas", "LNG", "Fossil fuel"],
            "新能源": ["New Energy", "Renewables", "Renewable energy", "Hydrogen Energy", "Solar Power", "Photovoltaic", "Nuclear Power", "Wind Power", "Energy Storage", "Battery storage"],
            "能源运输": ["Energy Transport", "pipeline", "oil tanker", "LNG carrier", "energy grid", "transmission line"]
        }
    }
}

def extract_expanded_context(text, match_obj, max_chars=800):
    keyword_start = match_obj.start()
    keyword_end = match_obj.end()
    text_len = len(text)
    punctuations = ['.', '?', '!']
    
    curr_sent_start = 0
    for i in range(keyword_start, -1, -1):
        if text[i] in punctuations:
            curr_sent_start = i + 1
            break
            
    curr_sent_end = text_len
    for i in range(keyword_end, text_len):
        if text[i] in punctuations:
            curr_sent_end = i + 1
            break
            
    prev_sent_start = 0 
    if curr_sent_start > 0:
        for i in range(curr_sent_start - 2, -1, -1):
            if text[i] in punctuations:
                prev_sent_start = i + 1
                break
    
    next_sent_end = text_len
    if curr_sent_end < text_len:
        for i in range(curr_sent_end + 1, text_len):
            if text[i] in punctuations:
                next_sent_end = i + 1
                break
                
    expanded_text = text[prev_sent_start : next_sent_end].strip()
    if len(expanded_text) > max_chars:
        start = max(0, keyword_start - 300)
        end = min(text_len, keyword_end + 300)
        expanded_text = "..." + text[start:end] + "..."
        
    expanded_text = re.sub(r'\s+', ' ', expanded_text)
    return expanded_text

def verify_with_qwen(keyword, context, tag_name):
    prompt = f"""
    你是一个专业的战略资源情报分析师。
    
    【任务】
    判断提供的文本片段中，特定的“关键词”是否在语义上属于“目标分类”。
    
    【输入信息】
    1. 目标分类: "{tag_name}"
    2. 待分析关键词: "{keyword}"
    3. 文本片段 (包含关键词的前后句上下文): 
    "{context}"

    【判断逻辑】
    请阅读整个文本片段，分析该关键词的含义。
    - 如果关键词明确指代"{tag_name}"（例如 'Li' 指代锂元素，'Co' 指代钴矿），返回 true。
    - 如果关键词是缩写、人名的一部分、或其他普通单词（例如 'Co' 是 Co-operation, 'W' 是 George W. Bush），返回 false。
    - 如果语境完全无关，返回 false。

    【输出格式】
    请仅返回标准 JSON 字符串：
    {{
        "is_match": true,
        "reason": "请用一句话中文解释，例如：根据后文提到的电池生产，这里的Li指代锂资源。"
    }}
    """

    try:
        response = dashscope.Generation.call(
            model='qwen-turbo',
            prompt=prompt,
            result_format='message',
        )

        if response.status_code == HTTPStatus.OK:
            content = response.output.choices[0].message.content
            content = content.replace('```json', '').replace('```', '').strip()
            return content
        else:
            return f'{{"is_match": false, "reason": "API Error: {response.code}"}}'
            
    except Exception as e:
        return f'{{"is_match": false, "reason": "Exception: {str(e)}"}}'

# --- 核心修改：结合了正则 + 上下文伴生词验证的匹配函数 ---
def check_keywords_with_hybrid_logic(text, keywords):
    found_items = []
    seen_kws = set()

    for kw in keywords:
        if kw.lower() in seen_kws:
            continue

        # 1. 构建正则
        pattern_str = r'\b' + re.escape(kw) + r'\b'
        
        # 针对极短词 (U, W, Co, Li, Be)
        is_short_keyword = len(kw) <= 2
        
        if is_short_keyword:
            # 规则 A: 排除后面紧跟点号的情况 (排除 U.S., Co. Ltd)
            pattern_str = r'\b' + re.escape(kw) + r'(?!\.)\b'
            # 规则 B: 强制大小写敏感 (text搜索时不加 re.IGNORECASE)
            matches = list(re.finditer(pattern_str, text))
        else:
            # 普通词忽略大小写
            matches = list(re.finditer(pattern_str, text, re.IGNORECASE))
            
        for match in matches:
            match_word = match.group()
            match_index = match.start()
            
            # --- 2. 短词特殊验证逻辑 (Proximity Check) ---
            if is_short_keyword:
                # 再次检查：如果是 Be，绝对不能是全小写的 be (动词)
                if kw == "Be" and match_word == "be":
                    continue
                
                # 提取前后 60 个字符的小窗口来检查伴生词
                window_start = max(0, match_index - 60)
                window_end = min(len(text), match.end() + 60)
                window_text = text[window_start:window_end].lower()
                
                # 核心过滤：如果窗口内没有出现任何一个“验证伴生词”，则丢弃
                # 这样可以过滤掉 "Li Changchun" (周围无 mining/battery)，但保留 "Li production"
                has_context = any(v_word in window_text for v_word in VALIDATION_CONTEXT_WORDS)
                if not has_context:
                    continue

            # 提取完整上下文并保存
            context = extract_expanded_context(text, match)
            found_items.append((kw, context))
            seen_kws.add(kw.lower())
            break # 一个词只记录一次

    return found_items

def test_strategic_resource_tagging():
    print("--- 开始测试：混合逻辑匹配 (正则 + 伴生词验证) + AI 深度验证 ---\n")
    print("逻辑说明：对于 Li, Co, U 等短词，只有当周围出现 mining, battery, resource 等词时才会被选中。\n")
    
    cursor = source_collection.find({}, {"title": 1, "content": 1, "link": 1}).limit(20)
    
    total_processed = 0
    matched_docs = 0
    
    for doc in cursor:
        total_processed += 1
        content = doc.get("content", "")
        title = doc.get("title", "")
        link = doc.get("link", "")
        
        full_text = (str(title) + ". " + str(content)).strip()
        if not full_text:
            continue

        is_china_related = False
        for regex in CHINA_KEYWORD_REGEX:
            if regex.search(full_text.lower()):
                is_china_related = True
                break
        
        doc_matches = {}
        has_match_in_doc = False

        for level_2_tag, config in STRATEGIC_RESOURCES_CONFIG.items():
            # 使用新的混合逻辑函数
            base_hits_tuples = check_keywords_with_hybrid_logic(full_text, config["base_keywords"])
            
            detailed_hits_info = {}
            has_detailed = False
            
            for level_3_tag, sub_keywords in config["detailed_tags"].items():
                hits_tuples = check_keywords_with_hybrid_logic(full_text, sub_keywords)
                if hits_tuples:
                    detailed_hits_info[level_3_tag] = hits_tuples
                    has_detailed = True
            
            if base_hits_tuples or has_detailed:
                doc_matches[level_2_tag] = {
                    "base": base_hits_tuples,
                    "detailed": detailed_hits_info
                }
                has_match_in_doc = True
        
        if has_match_in_doc:
            matched_docs += 1
            print(f"[{total_processed}] 文档: {title[:60]}...")
            print(f"   Link: {link}")
            print(f"   [中国相关]: {'是' if is_china_related else '否'}")
            print(f"   ★ 一级标签: 【战略资源】") 

            for level_2, data in doc_matches.items():
                base_tuples = data['base']
                detailed_dict = data['detailed']
                
                print(f"      ├── 二级标签: 【{level_2}】")
                
                if not detailed_dict and not base_tuples:
                    print("      │     (无匹配)")
                elif base_tuples and not detailed_dict:
                    print(f"      │     [仅原有关键词匹配]: { [x[0] for x in base_tuples] }")

                if detailed_dict:
                    print(f"      │     [新增三级标签匹配 & AI 分析]:")
                    for level_3, hits in detailed_dict.items():
                        print(f"      │       + 三级分类: <{level_3}>")
                        for kw, ctx in hits:
                            print(f"      │           --------------------------------------------------")
                            print(f"      │           关键词: [{kw}]")
                            print(f"      │           上下文: \"{ctx}\"")
                            
                            ai_result_json = verify_with_qwen(kw, ctx, level_3)
                            
                            try:
                                res = json.loads(ai_result_json)
                                is_match = res.get("is_match")
                                reason = res.get("reason")
                                icon = "✅" if is_match else "❌"
                                print(f"      │           {icon} AI 结论: {reason}")
                            except:
                                print(f"      │           ⚠️ AI JSON 解析失败: {ai_result_json}")
                            
                            time.sleep(0.3) 
                
                print("      │")
            print("=" * 90)

    print(f"\n测试结束。")
    print(f"共扫描文档: {total_processed}")
    print(f"命中【战略资源】相关文档: {matched_docs}")

if __name__ == "__main__":
    try:
        test_strategic_resource_tagging()
    except Exception as e:
        print(f"程序运行出错: {e}")

--- 开始测试：混合逻辑匹配 (正则 + 伴生词验证) + AI 深度验证 ---

逻辑说明：对于 Li, Co, U 等短词，只有当周围出现 mining, battery, resource 等词时才会被选中。

[2] 文档: The low road to national insecurity: how state’s diplomacy f...
   Link: https://www.aspi.org.au/strategist-posts/the-low-road-to-national-insecurity-how-states-diplomacy-fractures-australia/
   [中国相关]: 是
   ★ 一级标签: 【战略资源】
      ├── 二级标签: 【矿产】
      │     [新增三级标签匹配 & AI 分析]:
      │       + 三级分类: <金属矿产>
      │           --------------------------------------------------
      │           关键词: [Li]
      │           上下文: "It creates a veneer of legitimacy for engagements that require the highest level of scrutiny. We have already seen the corrosive effect of programs such as the Confucius Institutes, which former senior Chinese Communist Party official Li Changchun described as ‘an important part of China’s overseas propaganda set-up’. These institutes have been linked to censorship of politically sensitive topics and have raised serious questions about academic freedo