# generate XLSX distribution

In [1]:
import cobra
import pandas as pd
import requests
import re
import time

# ============================================================================
# 配置区域 - 在这里修改模型文件和输出文件名
# ============================================================================
MODELS_CONFIG = [
    {
        "model_file": "other model/iCW773R.xml",
        "model_name": "iCW773R",
        "output_excel": "8.analysis/reaction_distribution_iCW773R.xlsx",
        "use_bigg_mapping": True  # ✅ 使用BiGG映射表
    },
    {
        "model_file": "other model/iCGB21FR.xml",
        "model_name": "iCGB21FR",
        "output_excel": "8.analysis/reaction_distribution_iCGB21FR.xlsx",
        "use_bigg_mapping": True  # ✅ 使用BiGG映射表
    },
    {
        "model_file": "iCZ870_CGXII.xml",
        "model_name": "iCZ870",
        "output_excel": "8.analysis/reaction_distribution_iCZ870.xlsx",
        "use_bigg_mapping": False  # ✅ 从注释提取
    }
]

# BiGG映射表路径
BIGG_MAPPING_FILE = "4. 添加linker/bigg_models_reactions.tsv"

# 生物量反应列表
BIOMASS_REACTIONS = [
    "ARATH", "DNA", "IONS", "MYCOLI", "PEPTH", "PPTHTH",
    "PROTEIN", "SOL_POOL", "CG_biomass_cgl_ATCC13032",
    "TMCM_FREE", "FREENTCO", "TDCM_FREE", "Growth"
]

# KEGG API基础URL
BASE_URL = "https://rest.kegg.jp/"

# 主要代谢类别映射
PATHWAY_CATEGORIES = {
    'Carbon metabolism': ['00010', '00020', '00030', '00040', '00051', '00052', '00053', '00500', '00520', '00620', '00630', '00640', '00650', '00660'],
    'Energy metabolism': ['00190', '00195', '00196', '00710', '00720', '00680', '00910', '00920'],
    'Lipid metabolism': ['00061', '00062', '00071', '00072', '00073', '00100', '00120', '00140', '00561', '00564', '00565', '00590', '00591', '00592'],
    'Nucleotide metabolism': ['00230', '00240', '00250', '00260'],
    'Amino acid metabolism': ['00250', '00260', '00270', '00280', '00290', '00300', '00310', '00220', '00330', '00340', '00350', '00360', '00380', '00400'],
    'Glycan metabolism': ['00510', '00513', '00512', '00515', '00514', '00532', '00534', '00533', '00531', '00563', '00601', '00603', '00604', '00540', '00550', '00511', '00571', '00572'],
    'Cofactor and vitamin metabolism': ['00730', '00740', '00750', '00760', '00770', '00780', '00785', '00790', '00670', '00830', '00860', '00130'],
    'Secondary metabolite metabolism': ['00232', '00965', '00966', '00402', '00311', '00312', '00332', '00331', '00901', '00403', '00940', '00945', '00944', '00942', '00943', '00901', '00904', '00981', '00950']
}

# 优先匹配的代谢途径
PRIORITY_PATHWAYS = {
    "Carbon metabolism": "01200",
    "Biosynthesis of cofactors": "01240",
    "Nucleotide metabolism": "01232",
    "Biosynthesis of amino acids": "01230",
    "Fatty acid metabolism": "01212"
}

# ============================================================================
# 函数定义
# ============================================================================

def clean_reaction_id(reaction_id):
    """清理反应ID（移除下划线后的部分）"""
    if '_' in reaction_id:
        return reaction_id.split('_')[0]
    return reaction_id

def extract_kegg_id_from_bigg(cleaned_id, bigg_mapping_df):
    """从BiGG映射表提取KEGG反应ID"""
    if bigg_mapping_df.empty:
        return None

    matched_rows = bigg_mapping_df[bigg_mapping_df['bigg_id'] == cleaned_id]

    if not matched_rows.empty:
        for _, row in matched_rows.iterrows():
            if 'database_links' in row and pd.notna(row['database_links']):
                database_links = row['database_links']
                if isinstance(database_links, str):
                    # 匹配KEGG反应ID格式
                    kegg_pattern = re.compile(r'KEGG Reaction: http://identifiers.org/kegg.reaction/([rR]\d{5})')
                    match = kegg_pattern.search(database_links)
                    if match:
                        return match.group(1)
    return None

def extract_kegg_id_from_annotation(reaction):
    """从COBRApy反应对象的annotation中提取KEGG反应ID"""
    try:
        if hasattr(reaction, 'annotation') and reaction.annotation:
            annotation = reaction.annotation

            # 尝试多种可能的键名
            possible_keys = ['kegg.reaction', 'KEGG Reaction', 'kegg', 'KEGG']

            for key in possible_keys:
                if key in annotation:
                    kegg_value = annotation[key]

                    # 可能是字符串或列表
                    if isinstance(kegg_value, str):
                        kegg_ids = [kegg_value]
                    elif isinstance(kegg_value, list):
                        kegg_ids = kegg_value
                    else:
                        continue

                    # 提取第一个有效的KEGG ID
                    for kegg_id in kegg_ids:
                        kegg_id = str(kegg_id)
                        if 'kegg.reaction/' in kegg_id:
                            kegg_id = kegg_id.split('kegg.reaction/')[-1]

                        # 验证格式 (R + 5位数字)
                        if re.match(r'^[rR]\d{5}$', kegg_id):
                            return kegg_id.upper()

            # 也尝试从identifiers.org链接中提取
            if 'identifiers.org' in str(annotation):
                for value in annotation.values():
                    if isinstance(value, str) and 'identifiers.org/kegg.reaction/' in value:
                        match = re.search(r'kegg\.reaction/([rR]\d{5})', value)
                        if match:
                            return match.group(1).upper()
                    elif isinstance(value, list):
                        for item in value:
                            if isinstance(item, str) and 'identifiers.org/kegg.reaction/' in item:
                                match = re.search(r'kegg\.reaction/([rR]\d{5})', item)
                                if match:
                                    return match.group(1).upper()

        return None

    except Exception as e:
        return None

def get_pathways_for_reaction(kegg_id):
    """从KEGG获取反应对应的pathway信息"""
    if not kegg_id:
        return []

    try:
        url = f"{BASE_URL}get/{kegg_id}"
        response = requests.get(url)

        if response.status_code == 200:
            pathway_section = False
            pathways = []

            for line in response.text.splitlines():
                if line.startswith("PATHWAY"):
                    pathway_section = True
                    parts = line.split()
                    if len(parts) >= 2:
                        pathway_id = parts[1]
                        pathway_name = ' '.join(parts[2:])
                        pathways.append((pathway_id, pathway_name))
                elif pathway_section and line.startswith(" "):
                    parts = line.strip().split()
                    if parts:
                        pathway_id = parts[0]
                        pathway_name = ' '.join(parts[1:])
                        pathways.append((pathway_id, pathway_name))
                elif pathway_section and not line.startswith(" "):
                    pathway_section = False

            return pathways
        else:
            if response.status_code != 404:
                print(f"获取反应 {kegg_id} 的信息失败: {response.status_code}")
            return []

    except Exception as e:
        print(f"获取反应 {kegg_id} 的途径信息时出错: {str(e)}")
        return []

def classify_reaction_by_pathway(pathways):
    """根据pathway信息和优先级进行分类"""
    if not pathways:
        return None

    priority_order = [
        "Carbon metabolism",
        "Biosynthesis of amino acids",
        "Nucleotide metabolism",
        "Biosynthesis of cofactors",
        "Fatty acid metabolism"
    ]

    # 第一轮：通过精确ID匹配检查所有相关途径
    pathway_ids = [p_id for p_id, _ in pathways]
    matched_priorities = []

    for priority_name, priority_id in PRIORITY_PATHWAYS.items():
        if priority_id in pathway_ids:
            matched_priorities.append(priority_name)

    if matched_priorities:
        for priority in priority_order:
            if priority in matched_priorities:
                return priority
        return matched_priorities[0]

    # 第二轮：尝试从pathway名称中匹配优先级关键词
    pathway_names = [p_name for _, p_name in pathways]
    matched_by_name = []

    for priority_name in priority_order:
        for pathway_name in pathway_names:
            if priority_name.lower() in pathway_name.lower():
                matched_by_name.append(priority_name)
                break

    if matched_by_name:
        return matched_by_name[0]

    # 第三轮：查找是否属于某个特定类别（根据pathway ID）
    for pathway_id, _ in pathways:
        for category, pid_list in PATHWAY_CATEGORIES.items():
            if pathway_id in pid_list:
                return category

    return pathways[0][1]

def process_model(model_file, output_excel, use_bigg_mapping, bigg_mapping_df=None):
    """处理单个模型并生成Excel文件"""
    print(f"\n{'='*60}")
    print(f"开始处理模型: {model_file}")
    if use_bigg_mapping:
        print(f"获取KEGG ID方式: BiGG映射表")
    else:
        print(f"获取KEGG ID方式: 模型注释")
    print(f"{'='*60}")

    # 读取模型
    model = cobra.io.read_sbml_model(model_file)
    print(f"模型读取成功，包含 {len(model.reactions)} 个反应")

    reactions_data = []
    processed_cleaned_ids = set()

    # 第一步：功能分类
    print("\n第一步：进行功能分类...")
    for reaction in model.reactions:
        reaction_id = reaction.id

        if reaction_id.endswith("_reverse"):
            continue

        if reaction_id.startswith("EX_"):
            reaction_type = "exchange"
        elif reaction_id in BIOMASS_REACTIONS or "biomass" in reaction_id.lower() or "bio_" in reaction_id.lower():
            reaction_type = "biomass"
        else:
            has_external_metabolite = False
            for metabolite in reaction.metabolites:
                if "_e" in metabolite.id:
                    has_external_metabolite = True
                    break

            if has_external_metabolite:
                reaction_type = "transport"
            else:
                continue

        reactions_data.append({
            'reaction_id': reaction_id,
            'cleaned_id': reaction_id,
            'reaction_name': reaction.name if reaction.name else reaction_id,
            'reaction_type': reaction_type,
            'kegg_id': None,
            'pathway_info': "None"
        })

    print(f"功能分类完成，已分类 {len(reactions_data)} 个反应")

    # 第二步：KEGG分类
    print("\n第二步：通过KEGG进行代谢途径分类...")
    total_to_classify = 0
    for reaction in model.reactions:
        if not reaction.id.endswith("_reverse"):
            already_classified = any(d['reaction_id'] == reaction.id for d in reactions_data)
            if not already_classified:
                total_to_classify += 1

    print(f"需要通过KEGG分类的反应数: {total_to_classify}")

    classified_count = 0
    has_kegg_count = 0
    no_kegg_count = 0
    no_pathway_count = 0

    for i, reaction in enumerate(model.reactions):
        reaction_id = reaction.id

        if reaction_id.endswith("_reverse"):
            continue

        already_classified = any(d['reaction_id'] == reaction_id for d in reactions_data)
        if already_classified:
            continue

        cleaned_id = clean_reaction_id(reaction_id)

        # 检查是否已处理过（去重）
        if cleaned_id in processed_cleaned_ids:
            continue

        processed_cleaned_ids.add(cleaned_id)

        reaction_type = "Other"
        kegg_id = None
        pathway_info = []

        # 根据配置选择获取KEGG ID的方式
        if use_bigg_mapping:
            # 从BiGG映射表获取
            kegg_id = extract_kegg_id_from_bigg(cleaned_id, bigg_mapping_df)
        else:
            # 从模型注释获取
            kegg_id = extract_kegg_id_from_annotation(reaction)

        if kegg_id:
            has_kegg_count += 1

            # API限速
            if classified_count > 0 and classified_count % 10 == 0:
                time.sleep(1)

            # 获取pathway信息
            pathway_info = get_pathways_for_reaction(kegg_id)

            if pathway_info:
                pathway_category = classify_reaction_by_pathway(pathway_info)
                if pathway_category:
                    reaction_type = pathway_category
            else:
                no_pathway_count += 1
        else:
            no_kegg_count += 1

        reactions_data.append({
            'reaction_id': reaction_id,
            'cleaned_id': cleaned_id,
            'reaction_name': reaction.name if reaction.name else reaction_id,
            'reaction_type': reaction_type,
            'kegg_id': kegg_id if kegg_id else "None",
            'pathway_info': "; ".join([f"{pid}: {pname}" for pid, pname in pathway_info]) if pathway_info else "None"
        })

        classified_count += 1
        if classified_count % 50 == 0 or classified_count == total_to_classify:
            print(f"已处理 {classified_count}/{total_to_classify} 个反应")

    # 打印统计信息
    print(f"\nKEGG ID统计:")
    print(f"  - 找到KEGG ID: {has_kegg_count}")
    print(f"  - 未找到KEGG ID: {no_kegg_count}")
    print(f"  - 有KEGG但无途径: {no_pathway_count}")

    # 创建DataFrame并保存
    reactions_df = pd.DataFrame(reactions_data)
    print(f"\n处理完成，共 {len(reactions_df)} 个反应")

    reactions_df.to_excel(output_excel, index=False)
    print(f"结果已保存到: {output_excel}")

    # 打印反应类型统计
    reaction_type_counts = reactions_df['reaction_type'].value_counts()
    print(f"\n反应类型统计:")
    for category, count in reaction_type_counts.items():
        print(f"  - {category}: {count}个反应")

    return reactions_df

# ============================================================================
# 主程序
# ============================================================================

if __name__ == "__main__":
    print("="*60)
    print("代谢模型反应分类批处理程序")
    print("="*60)

    # 检查是否需要BiGG映射表
    needs_bigg = any(config["use_bigg_mapping"] for config in MODELS_CONFIG)

    bigg_mapping_df = pd.DataFrame()
    if needs_bigg:
        print(f"\n读取BiGG映射表: {BIGG_MAPPING_FILE}")
        try:
            bigg_mapping_df = pd.read_csv(BIGG_MAPPING_FILE, sep='\t')
            print(f"BiGG映射表读取成功，包含 {len(bigg_mapping_df)} 条记录")
        except Exception as e:
            print(f"警告: 无法读取BiGG映射表: {str(e)}")
            print("需要BiGG映射表的模型将无法正确分类")

    # 处理所有模型
    results = {}
    for config in MODELS_CONFIG:
        try:
            result_df = process_model(
                config["model_file"],
                config["output_excel"],
                config["use_bigg_mapping"],
                bigg_mapping_df if config["use_bigg_mapping"] else None
            )
            results[config["model_name"]] = result_df
        except Exception as e:
            print(f"\n处理模型 {config['model_name']} 时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    print("\n" + "="*60)
    print("所有模型处理完成！")
    print("="*60)
    print("\n生成的文件:")
    for config in MODELS_CONFIG:
        print(f"  - {config['output_excel']}")

代谢模型反应分类批处理程序

读取BiGG映射表: 4. 添加linker/bigg_models_reactions.tsv
BiGG映射表读取成功，包含 28301 条记录

开始处理模型: other model/iCW773R.xml
获取KEGG ID方式: BiGG映射表


'' is not a valid SBML 'SId'.


Set parameter Username
Set parameter LicenseID to value 2723056
Academic license - for non-commercial use only - expires 2026-10-16
模型读取成功，包含 1850 个反应

第一步：进行功能分类...
功能分类完成，已分类 580 个反应

第二步：通过KEGG进行代谢途径分类...
需要通过KEGG分类的反应数: 1011
已处理 50/1011 个反应
已处理 100/1011 个反应
已处理 150/1011 个反应
已处理 200/1011 个反应
已处理 250/1011 个反应
已处理 300/1011 个反应
已处理 350/1011 个反应
已处理 400/1011 个反应
已处理 450/1011 个反应
已处理 500/1011 个反应
已处理 550/1011 个反应
已处理 600/1011 个反应
已处理 650/1011 个反应
已处理 700/1011 个反应

KEGG ID统计:
  - 找到KEGG ID: 422
  - 未找到KEGG ID: 286
  - 有KEGG但无途径: 24

处理完成，共 1288 个反应
结果已保存到: 8.analysis/reaction_distribution_iCW773R.xlsx

反应类型统计:
  - transport: 405个反应
  - Other: 310个反应
  - exchange: 163个反应
  - Nucleotide metabolism: 66个反应
  - Biosynthesis of amino acids: 66个反应
  - Biosynthesis of cofactors: 65个反应
  - Carbon metabolism: 50个反应
  - Fatty acid metabolism: 21个反应
  - Aminoacyl-tRNA biosynthesis: 19个反应
  - Purine metabolism: 17个反应
  - biomass: 12个反应
  - Terpenoid backbone biosynthesis: 8个反应
  - Starch and sucrose m

SBML package 'layout' not supported by cobrapy, information is not parsed
https://juser.fz-juelich.de/record/188973 does not conform to 'http(s)://identifiers.org/collection/id' or'http(s)://identifiers.org/COLLECTION:id


模型读取成功，包含 1539 个反应

第一步：进行功能分类...
功能分类完成，已分类 433 个反应

第二步：通过KEGG进行代谢途径分类...
需要通过KEGG分类的反应数: 1106
已处理 50/1106 个反应
已处理 100/1106 个反应
已处理 150/1106 个反应
已处理 200/1106 个反应
已处理 250/1106 个反应
已处理 300/1106 个反应
已处理 350/1106 个反应
已处理 400/1106 个反应
已处理 450/1106 个反应
已处理 500/1106 个反应
已处理 550/1106 个反应
已处理 600/1106 个反应
已处理 650/1106 个反应
已处理 700/1106 个反应
已处理 750/1106 个反应
已处理 800/1106 个反应
已处理 850/1106 个反应
已处理 900/1106 个反应
已处理 950/1106 个反应
已处理 1000/1106 个反应
已处理 1050/1106 个反应

KEGG ID统计:
  - 找到KEGG ID: 486
  - 未找到KEGG ID: 599
  - 有KEGG但无途径: 36

处理完成，共 1518 个反应
结果已保存到: 8.analysis/reaction_distribution_iCGB21FR.xlsx

反应类型统计:
  - Other: 635个反应
  - transport: 250个反应
  - exchange: 182个反应
  - Biosynthesis of cofactors: 72个反应
  - Biosynthesis of amino acids: 72个反应
  - Nucleotide metabolism: 64个反应
  - Carbon metabolism: 61个反应
  - Purine metabolism: 17个反应
  - Cysteine and methionine metabolism: 10个反应
  - Glyoxylate and dicarboxylate metabolism: 9个反应
  - Terpenoid backbone biosynthesis: 9个反应
  - Pentose and glucuronate i

No objective coefficients in model. Unclear what should be optimized


模型读取成功，包含 1753 个反应

第一步：进行功能分类...
功能分类完成，已分类 433 个反应

第二步：通过KEGG进行代谢途径分类...
需要通过KEGG分类的反应数: 1320
已处理 50/1320 个反应
已处理 100/1320 个反应
已处理 150/1320 个反应
已处理 200/1320 个反应
已处理 250/1320 个反应
已处理 300/1320 个反应
已处理 350/1320 个反应
已处理 400/1320 个反应
已处理 450/1320 个反应
已处理 500/1320 个反应
已处理 550/1320 个反应
已处理 600/1320 个反应
已处理 650/1320 个反应
已处理 700/1320 个反应
已处理 750/1320 个反应
已处理 800/1320 个反应
已处理 850/1320 个反应
已处理 900/1320 个反应
已处理 950/1320 个反应
已处理 1000/1320 个反应
已处理 1050/1320 个反应
已处理 1100/1320 个反应
已处理 1150/1320 个反应
已处理 1200/1320 个反应
已处理 1250/1320 个反应
已处理 1300/1320 个反应
已处理 1320/1320 个反应

KEGG ID统计:
  - 找到KEGG ID: 997
  - 未找到KEGG ID: 323
  - 有KEGG但无途径: 153

处理完成，共 1753 个反应
结果已保存到: 8.analysis/reaction_distribution_iCZ870.xlsx

反应类型统计:
  - Other: 476个反应
  - transport: 239个反应
  - exchange: 172个反应
  - Biosynthesis of cofactors: 104个反应
  - Nucleotide metabolism: 80个反应
  - Biosynthesis of amino acids: 70个反应
  - Carbon metabolism: 70个反应
  - Fatty acid metabolism: 49个反应
  - Purine metabolism: 28个反应
  - biomass: 22个反应
  - Ami

# comprare rxn distribution

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec

# ============================================================================
# 配置区域
# ============================================================================

# 模型配置
MODELS_CONFIG = [
    # 第1个元素：对应最左侧饼图（iCZ870）
    {
        "excel_file": "8.analysis/reaction_distribution_iCZ870.xlsx",
        "model_name": "iCZ870"
    },
    # 第2个元素：对应中间饼图（iCW773R）
    {
        "excel_file": "8.analysis/reaction_distribution_iCW773R.xlsx",
        "model_name": "iCW773R"
    },
    # 第3个元素：对应最右侧饼图（iCGB21FR）
    {
        "excel_file": "8.analysis/reaction_distribution_iCGB21FR.xlsx",
        "model_name": "iCGB21FR"
    }
]

# 输出文件名
OUTPUT_FILE = "Reaction_distribution_comparison_3models.png"

# 图表参数
MIN_CATEGORY_SIZE = 21  # 小于此值的类别合并为"Other"
DPI = 600
FIGURE_SIZE = (30, 10)  # 三个饼图并排，增加宽度

# 出版级样式设置
plt.rcParams.update({
    'font.family': 'Arial',
    'font.size': 10,
    'axes.linewidth': 1.0,
    'axes.labelsize': 12,
    'xtick.major.width': 1.0,
    'ytick.major.width': 1.0,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.frameon': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'figure.dpi': 300
})

# 配色方案
COLORS = [
    '#91CCC0',  # 薄荷绿
    '#7FABD1',  # 浅蓝色
    '#F7AC53',  # 橙色
    '#EC6E66',  # 珊瑚红
    '#B5CE4E',  # 黄绿色
    '#BD7795',  # 紫红色
    '#963B79',  # 紫色
    '#C7C1DE',  # 淡紫色
    '#EEB6D4',  # 粉红色
    '#7C7979',  # 灰色 (Other类别专用)
]

# 重要类别（不会被合并到Other）
IMPORTANT_CATEGORIES = ["transport", "exchange", "Purine metabolism", "biomass"]

# ============================================================================
# 函数定义
# ============================================================================

def process_reaction_data(excel_file, min_category_size=21):
    """处理反应数据"""
    print(f"\n读取数据: {excel_file}")
    df = pd.read_excel(excel_file)
    print(f"  总反应数: {len(df)}")

    if 'reaction_type' not in df.columns:
        raise ValueError(f"Excel文件 {excel_file} 缺少 'reaction_type' 列")

    # 清理反应类型名称
    df['clean_reaction_type'] = df['reaction_type'].apply(
        lambda x: str(x).strip('\'"() ') if pd.notna(x) else "Other"
    )

    # 计算类别数量
    reaction_counts = df['clean_reaction_type'].value_counts()

    # 找出需要合并的小类别
    small_categories = [
        cat for cat in reaction_counts[reaction_counts < min_category_size].index
        if cat not in IMPORTANT_CATEGORIES and cat != "Other"
    ]

    # 合并小类别
    df_vis = df.copy()
    if small_categories:
        print(f"  合并 {len(small_categories)} 个小类别到 'Other'")
        df_vis.loc[df_vis['clean_reaction_type'].isin(small_categories), 'clean_reaction_type'] = "Other"

    # 重新计算
    reaction_counts = df_vis['clean_reaction_type'].value_counts()

    # 排序，Other放最后
    if "Other" in reaction_counts:
        other_count = reaction_counts["Other"]
        reaction_counts = reaction_counts[reaction_counts.index != "Other"]
        reaction_counts = reaction_counts.sort_values(ascending=False)
        reaction_counts["Other"] = other_count
    else:
        reaction_counts = reaction_counts.sort_values(ascending=False)

    return pd.DataFrame({
        'category': reaction_counts.index,
        'count': reaction_counts.values
    })

def get_consistent_color_mapping(*data_list):
    """为所有数据集创建一致的颜色映射"""
    # 合并所有数据集
    combined_data = pd.concat(data_list, ignore_index=True)

    # 按总数排序
    combined_counts = combined_data.groupby('category')['count'].sum().reset_index()
    combined_counts = combined_counts.sort_values('count', ascending=False)

    all_categories = combined_counts['category'].tolist()

    # Other类别移到最后
    if "Other" in all_categories:
        all_categories.remove("Other")
        all_categories.append("Other")

    # 创建颜色映射
    color_mapping = {}
    for i, category in enumerate(all_categories):
        if category == "Other":
            color_mapping[category] = '#7C7979'
        else:
            color_idx = i % (len(COLORS) - 1)
            color_mapping[category] = COLORS[color_idx]

    return color_mapping

def ensure_same_categories(*data_list):
    """确保所有数据集有相同的类别（用于一致性）"""
    # 获取所有唯一类别
    all_categories = set()
    for data in data_list:
        all_categories.update(data['category'].tolist())

    # 为每个数据集添加缺失的类别（计数为0）
    result_list = []
    for data in data_list:
        data_copy = data.copy()
        for cat in all_categories:
            if cat not in data_copy['category'].values:
                new_row = pd.DataFrame({'category': [cat], 'count': [0]})
                data_copy = pd.concat([data_copy, new_row], ignore_index=True)

        # 排序，Other放最后
        if "Other" in data_copy['category'].values:
            other_row = data_copy[data_copy['category'] == "Other"]
            data_copy = data_copy[data_copy['category'] != "Other"]
            data_copy = data_copy.sort_values(by='count', ascending=False)
            data_copy = pd.concat([data_copy, other_row], ignore_index=True)
        else:
            data_copy = data_copy.sort_values(by='count', ascending=False)

        data_copy = data_copy.reset_index(drop=True)
        result_list.append(data_copy)

    return tuple(result_list)

def create_pie_chart(data, ax, title, color_mapping):
    """在指定的axes上创建饼图"""
    # 过滤掉计数为0的类别
    data = data[data['count'] > 0].copy()

    categories = data['category'].tolist()
    counts = data['count'].tolist()

    # 计算百分比
    total = sum(counts)
    percentages = [(count / total) * 100 for count in counts]

    # 获取颜色
    color_list = [color_mapping.get(category, COLORS[0]) for category in categories]

    # 设置explode（仅Other类别）
    explode = np.zeros(len(categories))
    if "Other" in categories:
        explode[categories.index("Other")] = 0.1

    # 创建饼图
    wedges, _, autotexts = ax.pie(
        counts,
        labels=None,
        autopct='%1.1f%%',
        startangle=90,
        explode=explode,
        colors=color_list,
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        pctdistance=0.75,
        textprops={'fontsize': 10, 'weight': 'bold', 'color': 'black'}
    )

    # 调整小切片的标签
    for i, autotext in enumerate(autotexts):
        if percentages[i] < 1.0:
            autotext.set_fontsize(0)  # 隐藏小于1%的标签
        else:
            autotext.set_fontsize(9)
        autotext.set_fontweight('bold')

    # 创建图例标签
    legend_labels = [f"{cat} ({count}, {pct:.1f}%)"
                    for cat, count, pct in zip(categories, counts, percentages)]

    # 添加图例
    legend = ax.legend(
        wedges,
        legend_labels,
        loc="center left",
        bbox_to_anchor=(0.95, 0.5),
        frameon=False,
        handlelength=1.0,
        handletextpad=0.3,
        labelspacing=0.6,
        columnspacing=0.8
    )

    # 设置图例文本属性
    for text in legend.get_texts():
        text.set_color('black')
        text.set_ha('left')

    # 获取图例位置
    legend_bbox = legend.get_window_extent(ax.figure.canvas.get_renderer())
    legend_bbox = legend_bbox.transformed(ax.transAxes.inverted())

    # 添加图例标题
    plt.text(
        legend_bbox.x0,
        legend_bbox.y1 + 0.05,
        "Reaction Types",
        transform=ax.transAxes,
        fontsize=14,
        fontweight='bold',
        color='black',
        ha='left'
    )

    # 添加图表标题
    if title:
        ax.set_title(
            title,
            fontsize=20,
            fontweight='bold',
            pad=15,
            y=0.98,
            loc='center'
        )

    # 确保饼图是圆形的
    ax.set_aspect('equal')

def create_three_model_comparison(data_list, model_names, output_file,
                                 color_mapping, dpi=600, figsize=(30, 10)):
    """创建三模型对比图"""
    print(f"\n创建三模型对比图...")

    # 创建图形
    fig = plt.figure(figsize=figsize, facecolor='white')

    # 创建网格布局
    gs = gridspec.GridSpec(1, 3, width_ratios=[1, 1, 1], wspace=0.4)

    # 为每个模型创建饼图
    axes = []
    for i in range(3):
        ax = plt.subplot(gs[i])
        axes.append(ax)

        create_pie_chart(
            data_list[i],
            ax,
            f'Reaction distribution of {model_names[i]}',
            color_mapping
        )

    # 保存图形
    plt.savefig(output_file, dpi=dpi, bbox_inches='tight',
                facecolor='white', pad_inches=0.5)
    print(f"对比图已保存: {output_file}")

    plt.close()

# ============================================================================
# 主程序
# ============================================================================

if __name__ == "__main__":
    print("="*60)
    print("三模型反应分布对比可视化程序")
    print("="*60)

    # 处理所有模型的数据
    data_list = []
    model_names = []

    for config in MODELS_CONFIG:
        data = process_reaction_data(
            config["excel_file"],
            MIN_CATEGORY_SIZE
        )
        data_list.append(data)
        model_names.append(config["model_name"])

    # 确保所有数据集有相同的类别
    print(f"\n统一类别...")
    data_list = list(ensure_same_categories(*data_list))

    # 创建一致的颜色映射
    print(f"创建颜色映射...")
    color_mapping = get_consistent_color_mapping(*data_list)

    # 创建三模型对比图
    create_three_model_comparison(
        data_list,
        model_names,
        OUTPUT_FILE,
        color_mapping,
        DPI,
        FIGURE_SIZE
    )

    print("\n" + "="*60)
    print("完成！")
    print("="*60)

三模型反应分布对比可视化程序

读取数据: 8.analysis/reaction_distribution_iCZ870.xlsx
  总反应数: 1753
  合并 97 个小类别到 'Other'

读取数据: 8.analysis/reaction_distribution_iCW773R.xlsx
  总反应数: 1288
  合并 40 个小类别到 'Other'

读取数据: 8.analysis/reaction_distribution_iCGB21FR.xlsx
  总反应数: 1518
  合并 46 个小类别到 'Other'

统一类别...
创建颜色映射...

创建三模型对比图...
对比图已保存: Reaction_distribution_comparison_3models.png

完成！
