In [1]:
import hashlib
import os
import re
from datetime import datetime, timezone

import pymongo

# 1. Connection setup
# The connection string can be overridden through the MONGODB_URI environment
# variable so that credentials do not have to live in the source file; the
# literal below is kept only as a backward-compatible fallback.
# NOTE(review): the fallback embeds an admin password -- rotate it and drop the
# literal once every environment that runs this script sets MONGODB_URI.
client = pymongo.MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb://admin:12345678@192.168.16.138:27017/?authSource=admin",
    )
)
db = client["test"]
collection = db["overseas_website_tag"]
# Destructive: removes every existing document in the collection before the
# rules are regenerated further down in this script.
collection.delete_many({})

# --- Shared dependency word groups (synonym lists) ---
# Each list holds the context words associated with one top-level category.
DEP_RARE_EARTH = [
    "rare earth",
    "REE",
    "LREE",
    "HREE",
    "critical mineral",
    "strategic resource",
]
DEP_MINERAL = [
    "mineral",
    "mining",
    "resource",
    "metal",
    "ore",
]
DEP_ENERGY = [
    "energy",
    "power",
    "fuel",
    "electricity",
    "resource",
]

# 2. Data source
# Three-level taxonomy: top-level category -> sub-category -> tags.
# Every label is bilingual, written as "A（B）" or "A(B)" with one side Chinese
# and the other English. A sub-category whose tag list contains only its own
# label contributes no child tags (the build loop skips a tag whose English
# name equals its sub-category's English name).
data_source = {
    "稀土（rare earth）": {
        "Light Rare Earth Elements（轻稀土）": [
            "La（镧）", "Ce（铈）", "Pr（镨）", "Nd（钕）",
            "Pm（钷）", "Sm（钐）", "Eu（铕）",
        ],
        "Heavy Rare Earth Elements（重稀土）": [
            "Gd（钆）", "Tb（铽）", "Dy（镝）", "Ho（钬）", "Er（铒）",
            "Tm（铥）", "Yb（镱）", "Lu（镥）", "Y（钇）",
        ],
        "Permanent Magnets（永磁材料）": [
            "NdFeB (钕铁硼)",
            "SmCo (钐钴)",
        ],
        "Mining Technology（开采技术）": [
            "Mining Technology（开采技术）",
        ],
        "Separation & Purification（分离提纯）": [
            "Separation & Purification（分离提纯）",
        ],
        "Recycling（回收利用）": [
            "Recycling（回收利用）",
        ],
    },
    "矿产（mineral）": {
        "Energy Minerals（能源矿产）": [
            "U(铀)",
            "Th(钍)",
        ],
        "Metallic Minerals（金属矿产）": [
            "Li（锂）", "Co（钴）", "Ni（镍）", "W（钨）", "Sn（锡）",
            "Sb（锑）", "Be（铍）", "Nb（铌）", "Ta（钽）", "Zr（锆）",
        ],
        "Non-metallic Minerals（非金属矿产）": [
            "Fluorite（萤石）",
            "Graphite（石墨）",
            "Quartz（石英）",
        ],
        "Mining technology（采矿技术）": [
            "Mining technology（采矿技术）",
        ],
    },
    "能源（energy）": {
        # NOTE(review): "Tradition Energy" looks like a typo for "Traditional
        # Energy". Left unchanged because the English name is stored as the
        # match keyword and hashed into parentId -- confirm before renaming.
        "Tradition Energy（传统能源）": [
            "Petroleum（石油）",
            "Coal（煤炭）",
            "Natural Gas（天然气）",
        ],
        "New Energy（新能源）": [
            "Renewables（可再生能源）",
            "Hydrogen Energy（氢能）",
            "Solar Power（太阳能）",
            "Nuclear Power（核能）",
            "Wind Power（风能）",
            "Energy Storage（储能技术）",
        ],
        "Energy Transport（能源运输）": [
            "Energy Transport（能源运输）",
        ],
    },
}

# Fixed classification labels copied into every generated rule document.
LEVEL_1 = "敏感产业板块"
LEVEL_2 = "战略资源"

def parse_term(text):
    """Split a bilingual label of the form ``A(B)`` into its two names.

    Full-width parentheses are first normalised to ASCII ones. If the label
    then matches ``A(B)``, both parts are stripped and ``A`` is inspected for
    characters in the range U+4E00..U+9FFF: when it contains one, ``A`` is the
    Chinese name and ``B`` the English name; otherwise the roles are swapped.
    A label with no parenthesised part is returned (normalised) for both.

    Returns:
        A dict with the keys ``"cn"`` and ``"en"``.
    """
    normalized = text.translate(str.maketrans("（）", "()"))

    found = re.match(r"(.*)\((.*)\)", normalized)
    if found is None:
        return {"cn": normalized, "en": normalized}

    outer, inner = (piece.strip() for piece in found.groups())
    if re.search(r"[\u4e00-\u9fff]", outer) is not None:
        chinese, english = outer, inner
    else:
        chinese, english = inner, outer
    return {"cn": chinese, "en": english}

def get_dependency_config(level_1_en, keyword_en):
    """Return the context words that must accompany ``keyword_en``, if any.

    A keyword is treated as "weak" when it is at most two characters long or
    is an exact, case-sensitive member of a fixed list of short or very common
    words. Weak keywords receive the synonym list of their top-level category,
    selected by a case-insensitive substring test on ``level_1_en`` ("rare
    earth", then "mineral", then "energy"); when none of those matches the
    result is ``[level_1_en]``. Every other keyword is considered distinctive
    enough on its own and receives an empty list, meaning no dependency check.

    Args:
        level_1_en: English name of the keyword's top-level category.
        keyword_en: English keyword being configured.

    Returns:
        A list of dependency words; empty when no check is needed.
    """
    # Very short or high-frequency generic words that always need context.
    always_weak = ("La", "Y", "U", "W", "Li", "Be", "Co", "Oil", "Gas", "Coal", "Power")

    needs_context = len(keyword_en) <= 2 or keyword_en in always_weak
    if not needs_context:
        # Distinctive keyword: no dependency check.
        return []

    category = level_1_en.lower()
    if "rare earth" in category:
        return DEP_RARE_EARTH
    if "mineral" in category:
        return DEP_MINERAL
    if "energy" in category:
        return DEP_ENERGY
    # Unknown category: fall back to the category name itself.
    return [level_1_en]

def _build_rule(entry, parent, root, parent_id, deps, *, is_self_category, created_at):
    """Assemble one rule document for ``entry`` filed under sub-category ``parent``.

    ``entry``, ``parent`` and ``root`` are ``{"cn": ..., "en": ...}`` dicts as
    returned by ``parse_term``; ``root`` is the top-level category. ``deps`` is
    copied so that documents never share a list object with each other or with
    the module-level synonym lists.
    """
    return {
        "match_keyword": entry["en"],
        # An empty list means no dependency check is required for this keyword.
        "dependency_keywords": list(deps),

        "name_cn": entry["cn"], "name_en": entry["en"],
        "parent_cn": parent["cn"], "parent_en": parent["en"],
        "dependency_cn": root["cn"],

        "parentId": parent_id,
        "is_self_category": is_self_category,
        "level_1": LEVEL_1, "level_2": LEVEL_2,
        "create_time": created_at,
    }


# One timezone-aware timestamp shared by the whole batch. PyMongo treats naive
# datetimes as UTC, so a naive local ``datetime.now()`` would be stored shifted
# by the machine's UTC offset.
batch_time = datetime.now(timezone.utc)

documents = []

for k1_raw, sub_groups in data_source.items():
    k1_obj = parse_term(k1_raw)

    for k2_raw, tags in sub_groups.items():
        k2_obj = parse_term(k2_raw)
        # MD5 of the sub-category's English name serves as a stable identifier
        # shared by the category document and all of its tags (not a security
        # use of the hash).
        parent_id = hashlib.md5(k2_obj["en"].encode("utf-8")).hexdigest()

        # 1. The sub-category itself is stored as a rule. It is its own parent
        #    and carries no dependency words.
        documents.append(
            _build_rule(
                k2_obj, k2_obj, k1_obj, parent_id, [],
                is_self_category=True,
                created_at=batch_time,
            )
        )

        for tag_raw in tags:
            tag_obj = parse_term(tag_raw)
            # A tag identical to its sub-category was already stored above.
            if tag_obj["en"] == k2_obj["en"]:
                continue

            # 2. Decide whether this tag needs dependency words, and which.
            deps = get_dependency_config(k1_obj["en"], tag_obj["en"])

            documents.append(
                _build_rule(
                    tag_obj, k2_obj, k1_obj, parent_id, deps,
                    is_self_category=False,
                    created_at=batch_time,
                )
            )

if documents:
    collection.insert_many(documents)
    print(f"成功更新 {len(documents)} 条智能规则。")
    # Index the lookup field used for keyword matching.
    collection.create_index([("match_keyword", 1)])

成功更新 55 条智能规则。
