In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re
import glob
import json
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# ===================== 1. 名称标准化 =====================

def normalize_strain_name(name: str) -> str:
    """
    将菌株名称统一标准化，适用于 Subject 字段和文件名 stem。

    规则：
    - 下划线 -> 空格
    - ATCC_10987 / ATCC10987 -> ATCC 10987
    - 同理处理 DSM / JCM / NBRC 等编号
    - sp, sp., spp, spp. 统一成 'sp.'
    - 多个空格压缩成一个
    """
    name = name.replace("_", " ")

    name = re.sub(r"(ATCC)\s*[_]?\s*(\d+)", r"\1 \2", name, flags=re.IGNORECASE)
    name = re.sub(r"(DSM)\s*[_]?\s*(\d+)",  r"\1 \2", name, flags=re.IGNORECASE)
    name = re.sub(r"(JCM)\s*[_]?\s*(\d+)",  r"\1 \2", name, flags=re.IGNORECASE)
    name = re.sub(r"(NBRC)\s*[_]?\s*(\d+)", r"\1 \2", name, flags=re.IGNORECASE)

    name = re.sub(r"\bsp[p]?\.?\b", "sp.", name, flags=re.IGNORECASE)

    name = re.sub(r"\s+", " ", name).strip()
    return name


# ===================== 2. 从 JSON 中解析 TaxID =====================

def parse_taxid_from_meta(meta: dict):
    """
    尝试从 JSON dict 中抽取 TaxID：
    - 顶层 key: Taxid/taxid/TaxID/tax_id
    - 一层嵌套 dict 中的同名 key
    """
    if not isinstance(meta, dict):
        return None

    # 顶层
    for key in ["Taxid", "taxid", "TaxID", "tax_id"]:
        if key in meta:
            return meta[key]

    # 一层嵌套
    for v in meta.values():
        if isinstance(v, dict):
            for key in ["Taxid", "taxid", "TaxID", "tax_id"]:
                if key in v:
                    return v[key]

    return None


def build_taxid_from_json(json_root: Path) -> pd.DataFrame:
    """
    遍历 json_root 下的 *.assembly_meta.json，
    返回 DataFrame: [NormalizedSubject, TaxID_json, JsonPath]
    """
    records = []

    json_paths = sorted(json_root.glob("*.assembly_meta.json"))
    print(f"[INFO] 在 {json_root} 中共发现 {len(json_paths)} 个 *.assembly_meta.json 文件")

    for path in json_paths:
        fname = path.name                                  # Bacillus_subtilis_PCM2850.assembly_meta.json
        stem  = fname[:-len(".assembly_meta.json")]        # Bacillus_subtilis_PCM2850
        norm_subject = normalize_strain_name(stem)         # Bacillus subtilis PCM2850

        try:
            with path.open("r", encoding="utf-8") as f:
                meta = json.load(f)
        except Exception as e:
            print(f"[WARN] 解析 JSON 失败：{fname}, error={e}")
            records.append({
                "NormalizedSubject": norm_subject,
                "TaxID_json": np.nan,
                "JsonPath": str(path),
            })
            continue

        taxid = parse_taxid_from_meta(meta)
        if taxid is None:
            print(f"[WARN] {fname} 中未找到 Taxid 字段，标记为缺失。")

        records.append({
            "NormalizedSubject": norm_subject,
            "TaxID_json": taxid,
            "JsonPath": str(path),
        })

    df_json = pd.DataFrame(records)
    print(f"[INFO] 从 JSON 中成功解析 TaxID 的菌株数：{df_json['TaxID_json'].notna().sum()}")
    return df_json

In [3]:
current_path = Path.cwd()
HOME_DIR = current_path.parent

PROCESS_DIR = HOME_DIR / "data/1-processed_data"

STRAIN_FILE = PROCESS_DIR / "strain_species_866_norm.tsv"
JSON_ROOT = Path("/apdcephfs_qy3/share_2932069/kangcz/StrainNetwork/Strain")
OUT_TAXMAP_PATH = PROCESS_DIR / "strain_taxid_from_json.tsv"

print(f"[INFO] HOME_DIR      = {HOME_DIR}")
print(f"[INFO] PROCESS_DIR   = {PROCESS_DIR}")
print(f"[INFO] STRAIN_FILE   = {STRAIN_FILE}")
print(f"[INFO] JSON_ROOT     = {JSON_ROOT}")

[INFO] HOME_DIR      = /opt/ai4g_chriszyyang/buddy1/2_project_ongoing/4-antibio_resistance/PANACEA
[INFO] PROCESS_DIR   = /opt/ai4g_chriszyyang/buddy1/2_project_ongoing/4-antibio_resistance/PANACEA/data/1-processed_data
[INFO] STRAIN_FILE   = /opt/ai4g_chriszyyang/buddy1/2_project_ongoing/4-antibio_resistance/PANACEA/data/1-processed_data/strain_species_866_norm.tsv
[INFO] JSON_ROOT     = /apdcephfs_qy3/share_2932069/kangcz/StrainNetwork/Strain


In [4]:
# 1) 读取 strain_species_XXX_norm.tsv，拿到所有 NormalizedSubject
df_strain = pd.read_csv(STRAIN_FILE, sep="\t")
df_strain["NormalizedSubject"] = df_strain["NormalizedSubject"].astype(str)

unique_subjects = sorted(df_strain["NormalizedSubject"].dropna().unique())
print(f"[INFO] strain_species_XXX_norm.tsv 中共有 {len(unique_subjects)} 个不同 NormalizedSubject")

df_subjects = pd.DataFrame({"NormalizedSubject": unique_subjects})
df_subjects.head()

[INFO] strain_species_XXX_norm.tsv 中共有 866 个不同 NormalizedSubject


Unnamed: 0,NormalizedSubject
0,Acinetobacter baumannii 123C
1,Acinetobacter baumannii 309C
2,Acinetobacter baumannii 432B
3,Acinetobacter baumannii 518B
4,Acinetobacter baumannii A2265


In [10]:
# 2) 从 JSON 解析 TaxID
df_json = build_taxid_from_json(JSON_ROOT)
df_json["NormalizedSubject"] = df_json["NormalizedSubject"].astype(str)

# 3) 只保留我们关心的菌株
df_merged = df_subjects.merge(
    df_json[["NormalizedSubject", "TaxID_json"]],
    on="NormalizedSubject",
    how="left"
)

# 4) 类型清洗 + Source 标记
def _coerce_int(x):
    try:
        return int(x)
    except Exception:
        return np.nan

df_merged["TaxID"] = df_merged["TaxID_json"].apply(_coerce_int)
df_merged["Source"] = np.where(df_merged["TaxID"].notna(), "json", "missing")

print("\n[INFO] TaxID 来源统计")
print(df_merged["Source"].value_counts())

[INFO] 在 /apdcephfs_qy3/share_2932069/kangcz/StrainNetwork/Strain 中共发现 1759 个 *.assembly_meta.json 文件
[INFO] 从 JSON 中成功解析 TaxID 的菌株数：1759

[INFO] TaxID 来源统计
Source
json    876
Name: count, dtype: int64


In [23]:
# 5) 保存结果
out_df = (
    df_merged[["NormalizedSubject", "TaxID", "Source"]]
    .sort_values("NormalizedSubject")
    .drop_duplicates(subset=["NormalizedSubject"], keep="first")
    .reset_index(drop=True)
)

PROCESS_DIR.mkdir(parents=True, exist_ok=True)
out_df.to_csv(OUT_TAXMAP_PATH, sep="\t", index=False)

print(f"[DONE] 已保存 JSON 来源的 TaxID 映射到：{OUT_TAXMAP_PATH}")

[DONE] 已保存 JSON 来源的 TaxID 映射到：/opt/ai4g_chriszyyang/buddy1/2_project_ongoing/4-antibio_resistance/PANACEA/data/1-processed_data/strain_taxid_from_json.tsv
