#### 标准化加载所有公式（PGA+SA）

In [None]:

# 步骤1：标准化加载所有公式（递归 models）
import json, pathlib, re

BASE_DIR = pathlib.Path().resolve().parent
print(f"数据目录: {BASE_DIR}")
PGA_DIR = BASE_DIR / "PGA"
SA_DIR  = BASE_DIR / "Spectral_Ordinates"
NUM_RE = re.compile(r"[-+]?\\d+(\\.\\d+)?([eE][-+]?\\d+)?")

def _iter_files():
    for folder in (PGA_DIR, SA_DIR):
        if not folder.exists():
            continue
        for f in sorted(folder.glob("*.json")):
            yield f

def _read_json_file(fp):
    try:
        txt = fp.read_text(encoding="utf-8")
        data = json.loads(txt)
        # 顶层是 dict，且有 models 列表
        if isinstance(data, dict) and "models" in data and isinstance(data["models"], list):
            for obj in data["models"]:
                if isinstance(obj, dict):
                    yield obj
        # 顶层直接是公式对象
        elif isinstance(data, dict):
            yield data
        elif isinstance(data, list):
            for obj in data:
                if isinstance(obj, dict):
                    yield obj
    except Exception:
        return

def _skeleton(eq: str) -> str:
    if not eq: return ""
    eq = eq.strip()
    eq = NUM_RE.sub("C", eq)
    return re.sub(r"\\s+", " ", eq)

def normalize(raw):
    model = raw.get("model") or raw.get("Model") or raw.get("name") or raw.get("model_id")
    eq = raw.get("equation") or raw.get("formula")
    coeffs = raw.get("coefficients") or raw.get("coeffs") or {}
    # 兼容参数字段
    if not coeffs and "parameters" in raw and isinstance(raw["parameters"], dict):
        coeffs = raw["parameters"]
    ftype = (raw.get("type") or "PGA").upper()
    try:
        period = float(raw.get("period_s", 0.0))
    except Exception:
        period = 0.0
    rid = raw.get("id") or raw.get("model_id") or f"{model}_{ftype}_{period}"
    variables = raw.get("variables", {})
    # === 关键：把所有需要的字段都保留 ===
    record = {
        "id": rid,
        "model": model,
        "model_id": raw.get("model_id", ""),
        "author_year": raw.get("author_year", ""),
        "equation": eq if isinstance(eq, str) else "; ".join(eq) if isinstance(eq, list) else str(eq),
        "equation_desc": raw.get("equation_desc", ""),
        "application_scope": raw.get("application_scope", ""),
        "parameters": raw.get("parameters", {}),
        "type": ftype,
        "period_s": period,
        "equation_skeleton": _skeleton(eq if isinstance(eq, str) else eq[0] if isinstance(eq, list) else str(eq)),
        "coefficients": coeffs,
        "variables": variables,
        "reference": raw.get("reference", "") or raw.get("author_year", ""),
        "notes": raw.get("notes", "") or raw.get("equation_desc", ""),
        "output_unit": raw.get("output_unit", "")
    }
    return record

def load_all():
    out = []
    for fp in _iter_files():
        for obj in _read_json_file(fp):
            rec = normalize(obj)
            if rec:
                out.append(rec)
    print(f"共加载公式 {len(out)} 条")
    return out

formula_records = load_all()
formula_records[:2]  # 查看前两条

数据目录: D:\LLM\LangChain\data_retrieval_agent\z_self_evolving_test
共加载公式 801 条


[{'id': 'PGA_1',
  'model': 'PGA_1',
  'type': 'PGA',
  'period_s': 0.0,
  'equation': 'a = c * exp(α*M) * R^(-β)',
  'equation_skeleton': 'a = c * exp(α*M) * R^(-β)',
  'coefficients': {'a': '峰值地面加速度，单位：cm/s²',
   'c': '经验系数，取值2000',
   'α': '震级影响系数，取值0.8',
   'β': '距离衰减系数，取值2',
   'M': '地震震级（未明确，默认M_s）',
   'R': '震源距离（未明确，默认震中距）',
   'σ': '标准差，未报告'},
  'variables': {},
  'reference': 'Esteva and Rosenblueth (1964)',
  'notes': '首个经典PGA实证模型，基于早期强震记录，采用指数-幂次形式描述PGA与震级、距离的关系',
  'output_unit': ''},
 {'id': 'PGA_2',
  'model': 'PGA_2',
  'type': 'PGA',
  'period_s': 0.0,
  'equation': 'a = (a₁ / √T_G) * 10^(a₂*M - P*log₁₀R + Q); P = a₃ + a₄ / R; Q = a₅ + a₆ / R',
  'equation_skeleton': 'a = (a₁ / √T_G) * 10^(a₂*M - P*log₁₀R + Q)',
  'coefficients': {'a': '峰值地面加速度，单位：cm/s²',
   'a₁': '经验系数，取值5',
   'a₂': '震级影响系数，取值0.61',
   'a₃': '距离项系数1，取值1.66',
   'a₄': '距离项系数2，取值3.60',
   'a₅': '距离项系数3，取值0.167',
   'a₆': '距离项系数4，取值-1.83',
   'T_G': '场地固有周期，单位：s（需根据场地条件确定）',
   'M': '地震震级（未明确，默认M_L）',
 

#### 构建 embedding 文本

In [27]:
# 步骤2：构建 embedding 文本
def build_embedding_text(rec):
    # 参数名与含义拼接
    param_str = ""
    params = rec.get("parameters", {})
    if isinstance(params, dict):
        param_str = " ".join([f"{k}:{v}" for k, v in params.items()])
    # 其他关键信息拼接
    return (
        f"model_id={rec.get('model_id','')} "
        f"author_year={rec.get('author_year','')} "
        f"equation_desc={rec.get('equation_desc','')} "
        f"application_scope={rec.get('application_scope','')} "
        f"equation={rec.get('equation','')} "
        f"parameters={param_str}"
    )

embedding_texts = [build_embedding_text(r) for r in formula_records]
for t in embedding_texts[:2]:
    print(t)

model_id= author_year= equation_desc= application_scope= equation=a = c * exp(α*M) * R^(-β) parameters=
model_id= author_year= equation_desc= application_scope= equation=a = (a₁ / √T_G) * 10^(a₂*M - P*log₁₀R + Q); P = a₃ + a₄ / R; Q = a₅ + a₆ / R parameters=


#### 生成向量

In [22]:
# 步骤3：生成向量（用 Qwen embedding）
import dashscope
import numpy as np
import time
import os

EMBED_DIM = 1536  # Qwen embedding 默认维度

try:
    import dashscope
    from dotenv import load_dotenv
    load_dotenv()
    DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
    if DASHSCOPE_API_KEY:
        dashscope.api_key = DASHSCOPE_API_KEY
except ImportError:
    dashscope = None
    DASHSCOPE_API_KEY = None

def qwen_embedding(text, model="text-embedding-v1"):
    dashscope.api_key = DASHSCOPE_API_KEY
    # Qwen API 支持批量，但这里单条
    rsp = dashscope.TextEmbedding.call(model=model, input=text)
    if rsp.status_code == 200:
        return rsp.output["embeddings"][0]["embedding"]
    else:
        print("Qwen embedding error:", rsp.message)
        return [0.0] * EMBED_DIM

# 批量生成（如有速率限制可加 sleep）
embeddings = []
for t in embedding_texts:
    emb = qwen_embedding(t)
    embeddings.append(emb)
    time.sleep(0.2)  # 避免QPS超限
embeddings = np.array(embeddings, dtype="float32")
print("向量 shape:", embeddings.shape)

向量 shape: (801, 1536)


#### 构建 FAISS 索引并保存元数据

In [23]:
# 步骤4：构建 FAISS 索引并保存元数据
import faiss, os

INDEX_FILE = "pga_formula_index.faiss"
META_FILE = "pga_formula_meta.jsonl"

index = faiss.IndexFlatL2(EMBED_DIM)
index.add(embeddings)
faiss.write_index(index, INDEX_FILE)
print("已写入索引:", INDEX_FILE)

with open(META_FILE, "w", encoding="utf-8") as f:
    for rec in formula_records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print("已写入元数据:", META_FILE)

已写入索引: pga_formula_index.faiss
已写入元数据: pga_formula_meta.jsonl


#### RAG 检索函数

In [24]:
# 步骤5：RAG 检索函数
def search_formula(query, topk=3):
    # 使用与索引相同的 embedding 方法
    q_emb = np.array(qwen_embedding(query), dtype="float32").reshape(1, -1)
    D, I = index.search(q_emb, topk)
    results = []
    with open(META_FILE, "r", encoding="utf-8") as f:
        meta = [json.loads(line) for line in f]
    for idx, dist in zip(I[0], D[0]):
        if idx < 0 or idx >= len(meta): continue
        rec = meta[idx]
        results.append({"dist": float(dist), "record": rec})
    return results

# 示例
query = "地震动峰值加速度的经验公式"
hits = search_formula(query, topk=3)
for i, hit in enumerate(hits, 1):
    print(f"Top{i}: dist={hit['dist']:.3f} model={hit['record']['model']} eq={hit['record']['equation'][:50]}")

Top1: dist=6586.401 model=PGA_21 eq=\overline{a} = a₁ \overline{R}^{a₂} exp(a₃ M)
Top2: dist=6626.491 model=Spectral_097 eq=地壳地震：ln SA'(T) = C₁(T) + C₄AS*(M - 6) + C₃AS(T)*(8
Top3: dist=6718.464 model=Spectral_057 eq=ln[y(f)] = C₁ + C₂*(M - 6) + C₃*(M - 6)² + C₄*lnR 


#### RAG生成（拼接检索结果，供LLM生成答案）

In [25]:
# 步骤6：RAG生成（拼接检索结果，供LLM生成答案）
def build_rag_context(query, topk=3):
    hits = search_formula(query, topk=topk)
    context = "\n\n".join(
        f"【模型】{h['record']['model']}【类型】{h['record']['type']}【公式】{h['record']['equation']}\n【参考】{h['record']['reference']}"
        for h in hits
    )
    return context

# 示例
rag_context = build_rag_context("地震动峰值加速度的经验公式", topk=2)
print("RAG上下文：\n", rag_context)

RAG上下文：
 【模型】PGA_21【类型】PGA【公式】\overline{a} = a₁ \overline{R}^{a₂} exp(a₃ M)
【参考】Ambraseys (1978b)

【模型】Spectral_097【类型】PGA【公式】地壳地震：ln SA'(T) = C₁(T) + C₄AS*(M - 6) + C₃AS(T)*(8.5 - M)² + C₅(T)*r + (C₈(T) + C₆AS*(M - 6))*ln[√(r² + C₁0AS²(T))] + C₄6(T)*r_VOL + {C₂(T)*r + C₄4(T) + (C₉(T) + C₇(T)*(M - 6))*(ln[√(r² + C₁0AS²(T))] - ln C₁0AS)} + {C₂9(T)} + {C₃0AS(T)*ln(PGA_WA' + 0.03) + C₄3(T)} + C₃2*CN + C₃3AS(T)*CR + 悬挂墙项（参考Section 4.79）; 俯冲带地震：ln SA'(T) = C₁1(T) + [C₁2Y + (C₁7Y(T) - C₁7(T))*C₁9Y] + C₁3Y(T)*(10 - M)³ + C₁7(T)*ln[r + C₁8Y*exp(C₁9Y*M)] + C₂0(T)*H_C + C₂4(T)*SI + C₄6(T)*r_VOL*(1 - DS) + {C₄4(T) + C₁6(T)*(ln[r + C₁8Y*exp(C₁9Y*M)] - ln[C₁8Y*exp(C₁9Y*M)])} + {C₂9(T)} + {C₃0Y(T)*ln(PGA_WA' + 0.03) + C₄3(T)}
【参考】McVerry et al. (2000)
