In [1]:
import json
import os
import re
import time
from http import HTTPStatus

import dashscope
from pymongo import MongoClient

In [None]:
# ================= Configuration =================
# Secrets are read from the environment so they never live in the notebook.
# NOTE(review): the API key and MongoDB password that used to be hardcoded in
# this cell should be treated as leaked and rotated.

# 1. Alibaba Qwen (DashScope) API key, taken from DASHSCOPE_API_KEY.
dashscope.api_key = os.environ.get("DASHSCOPE_API_KEY")

# 2. MongoDB settings. Put credentials inside MONGO_URI in the environment,
#    e.g. mongodb://user:password@host:27017/?authSource=admin
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017/")
DB_NAME = os.environ.get("MONGO_DB_NAME", "test")
COLLECTION_NAME = os.environ.get("MONGO_COLLECTION_NAME", "overseas_website_data_all")

# =================================================

In [3]:
# --- Database connection ---
# Open the client once and resolve the database / collection handles that the
# rest of the notebook reads from.
client = MongoClient(MONGO_URI)
db = client.get_database(DB_NAME)
source_collection = db.get_collection(COLLECTION_NAME)

In [4]:
# --- China relevance filter (original logic kept) ---
# Terms whose presence marks a document as China-related.
CHINA_KEYWORDS = [
    # Country, capital and state names
    "China", "Chinese", "Beijing", "PRC", "People's Republic of China", "Mainland China",
    # Government and party
    "State Council", "Zhongnanhai", "CCP", "Communist Party of China", "Xi Jinping",
    # Politics, currency and trade initiatives
    "Politburo", "Yuan", "RMB", "Renminbi", "Belt and Road", "BRI", "Made in China",
    # Companies and military
    "Huawei", "Tencent", "Alibaba", "ByteDance", "TikTok", "PLA", "People's Liberation Army",
    # Cities and regions
    "Shanghai", "Shenzhen", "Hong Kong", "Macau", "Macao", "Taiwan", "Taipei",
    "Xinjiang", "Tibet", "South China Sea", "Sino-",
]

# One compiled, case-insensitive pattern per keyword. "Sino-" is a prefix, so
# its pattern has a leading word boundary only and is placed first; every other
# keyword is matched as a whole word.
CHINA_KEYWORD_REGEX = [
    re.compile(r'\bsino-', re.IGNORECASE),
    *(
        re.compile(rf'\b{re.escape(term.lower())}\b', re.IGNORECASE)
        for term in CHINA_KEYWORDS
        if term != "Sino-"
    ),
]

In [5]:
# --- Detailed three-level keyword definitions for the strategic-resources domain ---
# Structure: category -> {'base_keywords': [legacy coarse keywords],
#                         'detailed_tags': {level-3 tag: [detailed keyword list]}}
# The top-level keys (稀土, 矿产, 能源) are the LEVEL-2 tags; all of them belong to
# the single LEVEL-1 tag 战略资源 (strategic resources).
STRATEGIC_RESOURCES_CONFIG = {
    "稀土": {
        # Legacy coarse match: only checks for "rare earth"-style terms.
        "base_keywords": ["rare earth", "rare earth elements", "REE"],
        "detailed_tags": {
            "轻稀土": ["Light Rare Earth Elements", "LREE", "Lanthanum", "La", "Cerium", "Ce", "Praseodymium", "Pr", "Neodymium", "Nd", "Promethium", "Pm", "Samarium", "Sm", "Europium", "Eu"],
            "重稀土": ["Heavy Rare Earth Elements", "HREE", "Gadolinium", "Gd", "Terbium", "Tb", "Dysprosium", "Dy", "Holmium", "Ho", "Erbium", "Er", "Thulium", "Tm", "Ytterbium", "Yb", "Lutetium", "Lu", "Yttrium", "Y", "Scandium", "Sc"],
            "永磁材料": ["Permanent Magnets", "NdFeB", "Neodymium Iron Boron", "SmCo", "Samarium Cobalt", "rare earth magnet"],
            "开采与分离技术": ["Mining Technology", "extraction technology", "Separation & Purification", "rare earth separation", "purification process", "solvent extraction"],
            "回收利用": ["Recycling", "rare earth recycling", "urban mining"]
        }
    },
    "矿产": {
        # Legacy coarse match: only checks for "mineral"-style terms.
        "base_keywords": ["mineral", "minerals", "critical minerals", "strategic minerals"],
        "detailed_tags": {
            # Single-letter symbols such as "U" are prone to false positives, so
            # the full element names are listed alongside them.
            "能源矿产": ["Energy Minerals", "Uranium", "U", "Thorium", "Th"],
            "金属矿产": [
                "Metallic Minerals",
                "Lithium", "Li", "Cobalt", "Co", "Nickel", "Ni",
                "Tungsten", "W", "Tin", "Sn", "Antimony", "Sb",
                "Beryllium", "Be", "Niobium", "Nb", "Tantalum", "Ta", "Zirconium", "Zr"
            ],
            "非金属矿产": ["Non-metallic Minerals", "Fluorite", "Graphite", "Quartz"],
            "采矿技术": ["Mining technology", "deep sea mining", "open-pit mining"]
        }
    },
    "能源": {
        # Legacy coarse match: only checks for "energy"-style terms.
        "base_keywords": ["energy", "energy resources"],
        "detailed_tags": {
            # "Traditional Energy" replaces the former typo "Tradition Energy",
            # which could not match normal English text as a whole phrase.
            "传统能源": ["Traditional Energy", "Petroleum", "Oil", "Coal", "Natural Gas", "LNG", "Fossil fuel"],
            "新能源": ["New Energy", "Renewables", "Renewable energy", "Hydrogen Energy", "Solar Power", "Photovoltaic", "Nuclear Power", "Wind Power", "Energy Storage", "Battery storage"],
            "能源运输": ["Energy Transport", "pipeline", "oil tanker", "LNG carrier", "energy grid", "transmission line"]
        }
    }
}

In [6]:
# Characters treated as sentence terminators.
_SENTENCE_ENDINGS = ('.', '?', '!')
# Largest number of characters kept on each side of the keyword in fallback mode.
_FALLBACK_HALF_WINDOW = 300
_ELLIPSIS = "..."


def _last_sentence_ending(text, stop):
    """Return the index of the last sentence terminator in text[:stop], or -1."""
    if stop <= 0:
        return -1
    return max(text.rfind(mark, 0, stop) for mark in _SENTENCE_ENDINGS)


def _first_sentence_ending(text, start):
    """Return the index of the first sentence terminator in text[start:], or -1."""
    positions = [pos for pos in (text.find(mark, start) for mark in _SENTENCE_ENDINGS) if pos != -1]
    return min(positions) if positions else -1


def extract_expanded_context(text, match_obj, max_chars=800):
    """
    Return [previous sentence] + [current sentence] + [next sentence] around a match.

    Sentence boundaries are simply the characters '.', '?' and '!'. Abbreviations
    and decimals (e.g. "U.S.", "3.5") are therefore also treated as boundaries,
    which can shorten the extracted context.

    Args:
        text: The full text that was searched.
        match_obj: The ``re.Match`` for the keyword inside ``text``.
        max_chars: Upper bound on the extracted context. When the three-sentence
            span is longer (typically because punctuation is missing), a window
            centred on the keyword is returned instead. The window keeps at most
            300 characters per side and is shrunk further so that the result,
            including "..." markers, fits within ``max_chars`` whenever the
            keyword itself fits.

    Returns:
        The context with every run of whitespace collapsed to a single space.
    """
    keyword_start = match_obj.start()
    keyword_end = match_obj.end()
    text_len = len(text)

    # --- 1. Boundaries of the sentence containing the keyword ---
    # Slice-based search cannot index past the end of the text, so a zero-width
    # match at the very end is handled safely.
    curr_sent_start = _last_sentence_ending(text, keyword_start + 1) + 1

    ending = _first_sentence_ending(text, keyword_end)
    curr_sent_end = ending + 1 if ending != -1 else text_len

    # --- 2. Start of the previous sentence ---
    # Skip the terminator that closes the previous sentence (curr_sent_start - 1)
    # and look for the one before it.
    prev_sent_start = 0
    if curr_sent_start > 0:
        prev_sent_start = _last_sentence_ending(text, curr_sent_start - 1) + 1

    # --- 3. End of the next sentence ---
    next_sent_end = text_len
    if curr_sent_end < text_len:
        ending = _first_sentence_ending(text, curr_sent_end + 1)
        if ending != -1:
            next_sent_end = ending + 1

    # --- 4. Slice and tidy ---
    expanded_text = text[prev_sent_start:next_sent_end].strip()

    # Safety limit: fall back to a plain character window around the keyword.
    if len(expanded_text) > max_chars:
        keyword_len = keyword_end - keyword_start
        budget = max(0, (max_chars - keyword_len - 2 * len(_ELLIPSIS)) // 2)
        half_window = min(_FALLBACK_HALF_WINDOW, budget)
        start = max(0, keyword_start - half_window)
        end = min(text_len, keyword_end + half_window)
        # Mark only the sides where text was really cut off.
        prefix = _ELLIPSIS if start > 0 else ""
        suffix = _ELLIPSIS if end < text_len else ""
        expanded_text = prefix + text[start:end] + suffix

    # Collapse newlines and repeated whitespace.
    return re.sub(r'\s+', ' ', expanded_text)

In [7]:
def _qwen_failure_json(reason):
    """Serialize a negative verdict with ``reason`` as a valid JSON string."""
    return json.dumps({"is_match": False, "reason": reason}, ensure_ascii=False)


def verify_with_qwen(keyword, context, tag_name):
    """
    Ask the Qwen model whether ``keyword``, as used in ``context``, belongs to ``tag_name``.

    The prompt tells the model that ``context`` is the keyword's sentence plus
    its neighbouring sentences.

    Args:
        keyword: The matched keyword to analyse.
        context: Text fragment surrounding the keyword.
        tag_name: Target category the keyword is checked against.

    Returns:
        A string intended to be parsed as JSON with the keys ``is_match`` and
        ``reason``. On success this is the model's reply with Markdown code
        fences and any text outside the outermost ``{...}`` removed; it is not
        validated here. On an API error or exception it is always valid JSON
        with ``is_match`` set to false and the error text as ``reason``.
    """
    prompt = f"""
    你是一个专业的战略资源情报分析师。
    
    【任务】
    判断提供的文本片段中，特定的“关键词”是否在语义上属于“目标分类”。
    
    【输入信息】
    1. 目标分类: "{tag_name}"
    2. 待分析关键词: "{keyword}"
    3. 文本片段 (包含关键词的前后句上下文): 
    "{context}"

    【判断逻辑】
    请阅读整个文本片段，分析该关键词的含义。
    - 如果关键词明确指代"{tag_name}"（例如 'Li' 指代锂元素，'Co' 指代钴矿），返回 true。
    - 如果关键词是缩写、人名的一部分、或其他普通单词（例如 'Co' 是 Co-operation, 'W' 是 George W. Bush），返回 false。
    - 如果语境完全无关，返回 false。

    【输出格式】
    请仅返回标准 JSON 字符串：
    {{
        "is_match": true,
        "reason": "请用一句话中文解释，例如：根据后文提到的电池生产，这里的Li指代锂资源。"
    }}
    """

    try:
        response = dashscope.Generation.call(
            model='qwen-turbo',  # turbo is sufficient here and fast
            prompt=prompt,
            result_format='message',
        )

        if response.status_code != HTTPStatus.OK:
            return _qwen_failure_json(f"API Error: {response.code}")

        content = response.output.choices[0].message.content
        # Strip Markdown code-fence markers.
        content = content.replace('```json', '').replace('```', '').strip()
        # Drop any chatter around the JSON object so the caller can parse it.
        first_brace = content.find('{')
        last_brace = content.rfind('}')
        if first_brace != -1 and last_brace > first_brace:
            content = content[first_brace:last_brace + 1]
        return content

    except Exception as e:
        # json.dumps escapes quotes and newlines in the message; building the
        # JSON by string interpolation produced unparseable output for those.
        return _qwen_failure_json(f"Exception: {e}")

In [8]:
def check_keywords_with_context(text, keywords):
    """
    Find which of ``keywords`` occur in ``text`` and capture context for each hit.

    Keywords of one or two characters (e.g. U, W, Li) are matched
    case-sensitively as whole words; longer keywords are matched
    case-insensitively as whole words. Only the first occurrence of each
    keyword is used, and a keyword that differs only by case from one already
    recorded is skipped.

    Returns:
        A list of ``(keyword, context_string)`` tuples in the order of ``keywords``.
    """
    hits = []
    recorded = set()  # lower-cased keywords that already produced a hit

    for keyword in keywords:
        folded = keyword.lower()
        if folded in recorded:
            continue

        # Short symbols stay case-sensitive to limit false positives.
        is_short = len(keyword) <= 2
        needle = keyword if is_short else folded
        flags = 0 if is_short else re.IGNORECASE
        found = re.search(r'\b' + re.escape(needle) + r'\b', text, flags)

        if not found:
            continue

        hits.append((keyword, extract_expanded_context(text, found)))
        recorded.add(folded)

    return hits

In [9]:
def _collect_resource_matches(full_text):
    """Return {level-2 tag: {"base": hits, "detailed": {level-3 tag: hits}}} for tags with any hit."""
    doc_matches = {}

    # Walk every level-2 tag (稀土, 矿产, 能源).
    for level_2_tag, config in STRATEGIC_RESOURCES_CONFIG.items():
        # A. Legacy coarse match.
        base_hits_tuples = check_keywords_with_context(full_text, config["base_keywords"])

        # B. Detailed level-3 match; keep only level-3 tags with at least one hit.
        detailed_hits_info = {}
        for level_3_tag, sub_keywords in config["detailed_tags"].items():
            hits_tuples = check_keywords_with_context(full_text, sub_keywords)
            if hits_tuples:
                detailed_hits_info[level_3_tag] = hits_tuples

        # Record the level-2 tag when either the base or a detailed keyword hit.
        if base_hits_tuples or detailed_hits_info:
            doc_matches[level_2_tag] = {
                "base": base_hits_tuples,
                "detailed": detailed_hits_info
            }

    return doc_matches


def test_strategic_resource_tagging(limit=20):
    """
    Smoke-test the three-level keyword tagging with AI verification and print a report.

    Reads up to ``limit`` documents from ``source_collection``. For each one it
    checks China relevance, collects base and detailed keyword hits, and sends
    every detailed hit with its context to ``verify_with_qwen``. Nothing is
    written back to the database.

    Args:
        limit: Maximum number of documents to scan (default 20).
    """
    print("--- 开始测试：三级关键词匹配 + 前后句上下文 + AI 深度验证 ---\n")
    print("提示：系统将自动提取 [前一句 + 当前句 + 后一句] 发送给大模型进行分析。\n")
    print("注意：因为需要调用 API，处理速度会变慢，请耐心等待...\n")

    cursor = source_collection.find({}, {"title": 1, "content": 1, "link": 1}).limit(limit)

    total_processed = 0
    matched_docs = 0

    for doc in cursor:
        total_processed += 1
        # Missing or None fields become "" so they neither pollute the text
        # with the word "None" nor break the title slice below.
        title = str(doc.get("title") or "")
        content = str(doc.get("content") or "")
        link = doc.get("link", "")

        full_text = (title + " " + content).strip()
        if not full_text:
            continue

        # 1. China relevance (lower-case the text once, not once per pattern).
        lowered_text = full_text.lower()
        is_china_related = any(regex.search(lowered_text) for regex in CHINA_KEYWORD_REGEX)

        # 2. Keyword matching.
        doc_matches = _collect_resource_matches(full_text)
        if not doc_matches:
            continue

        # 3. Report.
        matched_docs += 1
        print(f"[{total_processed}] 文档: {title[:60]}...")
        print(f"   Link: {link}")
        cn_status = "是" if is_china_related else "否"
        print(f"   [中国相关]: {cn_status}")

        # Level-1 tag is fixed to "strategic resources".
        print("   ★ 一级标签: 【战略资源】")

        for level_2, data in doc_matches.items():
            base_tuples = data['base']
            detailed_dict = data['detailed']

            # Level-2 tag.
            print(f"      ├── 二级标签: 【{level_2}】")

            # Legacy / base keyword hits.
            if base_tuples:
                print("      │     [原有关键词匹配]:")
                for kw, ctx in base_tuples:
                    print(f"      │       • 词: {kw} -> 句: \"{ctx}\"")
            else:
                print("      │     [原有关键词匹配]: (无)")

            # Level-3 hits, each verified by the model.
            if detailed_dict:
                print("      │     [新增三级标签匹配 & AI 分析]:")
                for level_3, hits in detailed_dict.items():
                    print(f"      │       + 三级分类: <{level_3}>")
                    for kw, ctx in hits:
                        print("      │           --------------------------------------------------")
                        print(f"      │           关键词: [{kw}]")
                        print(f"      │           上下文: \"{ctx}\"")

                        ai_result_json = verify_with_qwen(kw, ctx, level_3)

                        try:
                            res = json.loads(ai_result_json)
                            is_match = res.get("is_match")
                            reason = res.get("reason")
                            icon = "✅" if is_match else "❌"
                            print(f"      │           {icon} AI 结论: {reason}")
                        except (ValueError, AttributeError, TypeError):
                            # Invalid JSON, or JSON that is not an object.
                            print(f"      │           ⚠️ AI JSON 解析失败: {ai_result_json}")

                        # Brief pause between API calls.
                        time.sleep(0.3)

            print("      │")
        print("=" * 90)

    print(f"\n测试结束。")
    print(f"共扫描文档: {total_processed}")
    print(f"命中【战略资源】相关文档: {matched_docs}")

In [None]:
# Entry point: run the tagging smoke test and report any failure as a single
# line instead of a traceback.
if __name__ == "__main__":
    try:
        test_strategic_resource_tagging()
    except Exception as run_error:
        print(f"程序运行出错: {run_error}")

--- 开始测试：三级关键词匹配 + 前后句上下文 + AI 深度验证 ---

提示：系统将自动提取 [前一句 + 当前句 + 后一句] 发送给大模型进行分析。

注意：因为需要调用 API，处理速度会变慢，请耐心等待...

[2] 文档: The low road to national insecurity: how state’s diplomacy f...
   Link: https://www.aspi.org.au/strategist-posts/the-low-road-to-national-insecurity-how-states-diplomacy-fractures-australia/
   [中国相关]: 是
   ★ 一级标签: 【战略资源】
      ├── 二级标签: 【矿产】
      │     [原有关键词匹配]: (无)
      │     [新增三级标签匹配 & AI 分析]:
      │       + 三级分类: <金属矿产>
      │           --------------------------------------------------
      │           关键词: [Li]
      │           上下文: "It creates a veneer of legitimacy for engagements that require the highest level of scrutiny. We have already seen the corrosive effect of programs such as the Confucius Institutes, which former senior Chinese Communist Party official Li Changchun described as ‘an important part of China’s overseas propaganda set-up’. These institutes have been linked to censorship of politically sensitive topics and have raised serious 