In [1]:
import hashlib
import os
import re
from datetime import datetime, timezone

import pymongo

# 1. Connection settings.
# The URI can be overridden through the MONGO_URI environment variable so that
# credentials do not have to live in the source file; the literal below is
# kept only as a backward-compatible default.
# NOTE(review): the default embeds an admin password -- rotate it and drop the
# fallback once every environment sets MONGO_URI.
client = pymongo.MongoClient(
    os.environ.get(
        "MONGO_URI",
        "mongodb://admin:12345678@192.168.16.138:27017/?authSource=admin",
    )
)
db = client["test"]
collection = db["overseas_website_tag"]
# Remove all existing documents so the script regenerates the full tag set.
# NOTE(review): this runs before the new documents are built, so a failure
# later in the script leaves the collection empty.
collection.delete_many({})

# 2. Raw data.
# Three-level mapping: dependency keyword -> parent category -> list of tags.
# Every label is bilingual, written as "A（B）" or "A (B)"; both full-width and
# ASCII parentheses occur in the data.
# A category whose only tag repeats the category label is a "self category".
data_source = {
    "稀土（rare earth）": {
        "Light Rare Earth Elements（轻稀土）": [
            "La（镧）",
            "Ce（铈）",
            "Pr（镨）",
            "Nd（钕）",
            "Pm（钷）",
            "Sm（钐）",
            "Eu（铕）",
        ],
        "Heavy Rare Earth Elements（重稀土）": [
            "Gd（钆）",
            "Tb（铽）",
            "Dy（镝）",
            "Ho（钬）",
            "Er（铒）",
            "Tm（铥）",
            "Yb（镱）",
            "Lu（镥）",
            "Y（钇）",
        ],
        "Permanent Magnets（永磁材料）": [
            "NdFeB (钕铁硼)",
            "SmCo (钐钴)",
        ],
        "Mining Technology（开采技术）": [
            "Mining Technology（开采技术）",
        ],
        "Separation & Purification（分离提纯）": [
            "Separation & Purification（分离提纯）",
        ],
        "Recycling（回收利用）": [
            "Recycling（回收利用）",
        ],
    },
    "矿产（mineral）": {
        "Energy Minerals（能源矿产）": [
            "U(铀)",
            "Th(钍)",
        ],
        "Metallic Minerals（金属矿产）": [
            "Li（锂）",
            "Co（钴）",
            "Ni（镍）",
            "W（钨）",
            "Sn（锡）",
            "Sb（锑）",
            "Be（铍）",
            "Nb（铌）",
            "Ta（钽）",
            "Zr（锆）",
        ],
        "Non-metallic Minerals（非金属矿产）": [
            "Fluorite（萤石）",
            "Graphite（石墨）",
            "Quartz（石英）",
        ],
        "Mining technology（采矿技术）": [
            "Mining technology（采矿技术）",
        ],
    },
    "能源（energy）": {
        # NOTE(review): "Tradition Energy" looks like a typo for
        # "Traditional Energy"; left unchanged because the English name is
        # stored as parent_en and hashed into parentId.
        "Tradition Energy（传统能源）": [
            "Petroleum（石油）",
            "Coal（煤炭）",
            "Natural Gas（天然气）",
        ],
        "New Energy（新能源）": [
            "Renewables（可再生能源）",
            "Hydrogen Energy（氢能）",
            "Solar Power（太阳能）",
            "Nuclear Power（核能）",
            "Wind Power（风能）",
            "Energy Storage（储能技术）",
        ],
        "Energy Transport（能源运输）": [
            "Energy Transport（能源运输）",
        ],
    },
}

# Fixed top-level classification values written onto every document.
LEVEL_1 = "敏感产业板块"
LEVEL_2 = "战略资源"

# --- Parsing helper ---
def parse_term(text):
    """Split a bilingual label into its Chinese and English parts.

    The label is expected to look like ``"A（B）"`` or ``"A (B)"`` with one
    side Chinese and the other English; full-width and ASCII parentheses are
    both accepted.

    Example:
        ``"Light Rare Earth Elements（轻稀土）"`` ->
        ``{"en": "Light Rare Earth Elements", "cn": "轻稀土"}``

    Args:
        text: The raw label string.

    Returns:
        A dict with the keys ``"cn"`` and ``"en"``. If the part before the
        parentheses contains a Chinese character it becomes ``"cn"`` and the
        parenthesised part ``"en"``; otherwise the roles are swapped. When
        the label has no parenthesised part, both keys hold the whole label
        with surrounding whitespace removed.
    """
    # Normalise full-width parentheses so one pattern covers both styles.
    text = text.replace("（", "(").replace("）", ")")
    match = re.match(r"(.*)\((.*)\)", text)
    if not match:
        # No parenthesised part: use the same name for both languages. Strip
        # here as well so both branches return trimmed values.
        name = text.strip()
        return {"cn": name, "en": name}

    part_a = match.group(1).strip()
    part_b = match.group(2).strip()
    # The side containing a character in U+4E00..U+9FFF is the Chinese name.
    if any("\u4e00" <= char <= "\u9fff" for char in part_a):
        return {"cn": part_a, "en": part_b}
    return {"cn": part_b, "en": part_a}

# Flatten the three-level source mapping into one MongoDB document per tag.
documents = []

# One timezone-aware UTC timestamp for the whole batch. PyMongo treats naive
# datetimes as UTC, so a naive local ``datetime.now()`` would be stored
# shifted by the local UTC offset.
created_at = datetime.now(timezone.utc)

for k1_raw, sub_groups in data_source.items():
    # 1. Dependency layer (keyword 1: rare earth / mineral / energy).
    k1_obj = parse_term(k1_raw)

    for k2_raw, tags in sub_groups.items():
        # 2. Parent layer (keyword 2: light rare earth, energy minerals, ...).
        k2_obj = parse_term(k2_raw)

        # parentId is the MD5 hex digest of the parent's English name, so the
        # same parent name always yields the same id.
        parent_id = hashlib.md5(k2_obj["en"].encode("utf-8")).hexdigest()

        for tag_raw in tags:
            # 3. Tag layer (a concrete element or technology).
            tag_obj = parse_term(tag_raw)

            # A "self category" tag has the same English name as its parent
            # (e.g. "Mining Technology" listed under "Mining Technology").
            # The flag is stored for downstream tagging code, which is
            # expected to omit the parenthesised part when it is True.
            is_self = tag_obj["en"] == k2_obj["en"]

            doc = {
                # --- Core matching fields (looked up by the consuming code) ---
                "match_keyword": tag_obj["en"],          # e.g. "La"
                "dependency_keyword": k1_obj["en"],      # e.g. "rare earth"

                # --- Full metadata, kept so tags can be assembled later ---
                "name_cn": tag_obj["cn"],                # e.g. 镧
                "name_en": tag_obj["en"],                # e.g. La

                "parent_cn": k2_obj["cn"],               # e.g. 轻稀土
                "parent_en": k2_obj["en"],               # e.g. Light Rare Earth Elements

                "dependency_cn": k1_obj["cn"],           # e.g. 稀土
                "dependency_en": k1_obj["en"],           # e.g. rare earth

                # --- Structural helpers ---
                "parentId": parent_id,
                "is_self_category": is_self,
                "level_1": LEVEL_1,
                "level_2": LEVEL_2,

                "create_time": created_at,
            }
            documents.append(doc)

# Persist the generated configuration documents.
if documents:
    collection.insert_many(documents)
    print(f"成功插入 {len(documents)} 条配置数据。")

    # Compound index for the frequent lookup path in the consuming code:
    # match on the keyword first, then check the dependency keyword.
    lookup_index = [
        ("match_keyword", pymongo.ASCENDING),
        ("dependency_keyword", pymongo.ASCENDING),
    ]
    collection.create_index(lookup_index)
    print("索引创建完成。")

成功插入 47 条配置数据。
索引创建完成。
