In [16]:
import pandas as pd
import os
import glob
from config import DATABASE_DIR, DATABASE_CSV_FILES, validate_config

# 全局变量，用于存储加载后的数据库DataFrames
DB = {} 

def load_database():
    """
    加载数据库目录中的所有CSV文件到全局的DB字典中。
    使用config.py中的配置。
    """
    print("正在加载数据库到内存...")
    
    # 验证配置
    config_errors = validate_config()
    if config_errors:
        print("配置错误:")
        for error in config_errors:
            print(f"  - {error}")
        return
    
    if not DATABASE_DIR.exists():
        print(f"致命错误：数据库目录未找到于 '{DATABASE_DIR}'。Agent无法启动。")
        return

    # 使用配置中的CSV文件列表
    for csv_file in DATABASE_CSV_FILES:
        file_path = DATABASE_DIR / csv_file
        if not file_path.exists():
            print(f"警告：数据表文件 '{csv_file}' 不存在于 '{DATABASE_DIR}'")
            continue
            
        try:
            # 使用文件名（不含扩展名）作为字典的键
            key = csv_file.split('.')[0]
            DB[key] = pd.read_csv(file_path)
            print(f"  - 已加载数据表 '{key}' (共 {len(DB[key])} 行)")
        except Exception as e:
            print(f"  - 加载数据表 '{csv_file}' 失败: {e}")
    
    print("数据库加载完成。")

# 当此模块被首次导入时，自动执行加载数据库的操作
load_database()

正在加载数据库到内存...
  - 已加载数据表 '1_reactions_core' (共 3485 行)
  - 已加载数据表 '2_enzymes' (共 3485 行)
  - 已加载数据表 '3_experimental_conditions' (共 3485 行)
  - 已加载数据表 '4_activity_performance' (共 3485 行)
  - 已加载数据表 '5_reaction_participants' (共 16146 行)
  - 已加载数据表 '6_kinetic_parameters' (共 4756 行)
  - 已加载数据表 '7_mutants_characterized' (共 2388 行)
  - 已加载数据表 '8_inhibitors_main' (共 2466 行)
  - 已加载数据表 '9_inhibition_params' (共 1838 行)
  - 已加载数据表 '10_auxiliary_factors' (共 2740 行)
数据库加载完成。


In [17]:
from google.adk.tools import FunctionTool
import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Any
from config import QUERY_CONFIG, ANALYSIS_CONFIG
import re

In [34]:
def _enzyme_name_or_synonym_match(df, enzyme_name):
    """
    支持enzyme_name和enzyme_synonyms（|分隔）模糊匹配
    """
    if 'enzyme_synonyms' not in df.columns:
        return df['enzyme_name'].str.contains(enzyme_name, case=False, na=False)
    # 先对synonyms做分割，生成布尔Series
    def match_synonyms(synonyms):
        if pd.isnull(synonyms):
            return False
        for syn in str(synonyms).split('|'):
            if enzyme_name.lower() in syn.lower():
                return True
        return False
    return (
        df['enzyme_name'].str.contains(enzyme_name, case=False, na=False) |
        df['enzyme_synonyms'].apply(match_synonyms)
    )

def find_reactions_by_enzyme(
    enzyme_name: str,
    organism: str,
    max_results: int
) -> str:
    """
    根据酶名称和物种查找相关反应。
    
    :param enzyme_name: str
    :param organism: str
    :param max_results: int
    """
    if not DB: return "数据库未加载。"
    
    enzymes_df = DB.get('2_enzymes', pd.DataFrame())
    core_df = DB.get('1_reactions_core', pd.DataFrame())
    
    if enzymes_df.empty or core_df.empty:
        return "核心数据表未加载。"
    
    # 构建查询条件
    query_conditions = []
    if enzyme_name:
        query_conditions.append(_enzyme_name_or_synonym_match(enzymes_df, enzyme_name))
    if organism:
        query_conditions.append(enzymes_df['organism'].str.contains(organism, case=False, na=False))

    if not query_conditions:
        return "请提供酶名称或物种信息。"
    
    # 应用查询条件
    filtered_enzymes = enzymes_df[pd.concat(query_conditions, axis=1).all(axis=1)]
    
    if filtered_enzymes.empty:
        return f"未找到匹配酶 '{enzyme_name}' 和物种 '{organism}' 的反应。"
    
    # 合并反应信息
    merged_df = pd.merge(filtered_enzymes, core_df, on=['literature_id', 'reaction_id'])
    
    # 限制结果数量
    max_results = min(max_results, QUERY_CONFIG["max_results"])
    result_df = merged_df.head(max_results)
    
    # 格式化输出
    result = f"# 酶相关反应查询结果\n\n"
    result += f"**查询条件**: 酶={enzyme_name}, 物种={organism}\n"
    result += f"**找到反应数**: {len(result_df)} (共{len(merged_df)}个)\n\n"
    
    for _, row in result_df.iterrows():
        result += f"## {row['literature_id']}:{row['reaction_id']}\n"
        result += f"- **酶**: {row['enzyme_name']}\n"
        result += f"- **物种**: {row['organism']}\n"
        result += f"- **反应**: {row['reaction_equation']}\n"
        result += f"- **是否可逆**: {row['reaction_type_reversible']}\n\n"
        print(row['enzyme_synonyms'])
    return result

In [36]:
find_reactions_by_enzyme('AtPPPS',"Arabidopsis thaliana",5)

AtSPS3|AtPPPS


'# 酶相关反应查询结果\n\n**查询条件**: 酶=AtPPPS, 物种=Arabidopsis thaliana\n**找到反应数**: 1 (共1个)\n\n## PMID32034864:reaction_3\n- **酶**: Solanesyl diphosphate synthase 3\n- **物种**: Arabidopsis thaliana\n- **反应**: Geranylgeranyl diphosphate + 5 Isopentenyl diphosphate -> Solanesyl diphosphate + 5 Pyrophosphate\n- **是否可逆**: Not specified\n\n'

In [32]:
def smart_search_reactions(
    search_query: str,
    search_fields: List[str],
    max_results: int
) -> str:
    """
    智能搜索反应，支持多字段模糊匹配。
    
    :param search_query: str
    :param search_fields: List[str]
    :param max_results: int
    """
    if not DB: return "数据库未加载。"
    
    core_df = DB.get('1_reactions_core', pd.DataFrame())
    enzymes_df = DB.get('2_enzymes', pd.DataFrame())
    
    if core_df.empty or enzymes_df.empty:
        return "核心数据表未加载。"
    
    # 合并数据
    merged_df = pd.merge(core_df, enzymes_df, on=['literature_id', 'reaction_id'])
    
    # 构建搜索条件
    search_conditions = []
    for field in search_fields:
        if field in merged_df.columns:
            if field == "enzyme_name" or "enzyme_synonyms":
                search_conditions.append(_enzyme_name_or_synonym_match(merged_df, search_query))
            else:
                search_conditions.append(merged_df[field].str.contains(search_query, case=False, na=False))
    
    if not search_conditions:
        return "未找到有效的搜索字段。"
    
    # 应用搜索条件（OR逻辑）
    combined_condition = pd.concat(search_conditions, axis=1).any(axis=1)
    filtered_df = merged_df[combined_condition]
    
    if filtered_df.empty:
        return f"未找到匹配查询 '{search_query}' 的反应。"
    
    # 限制结果数量
    max_results = min(max_results, QUERY_CONFIG["max_results"])
    result_df = filtered_df.head(max_results)
    
    # 格式化输出
    result = f"# 智能搜索结果\n\n"
    result += f"**搜索查询**: {search_query}\n"
    result += f"**搜索字段**: {', '.join(search_fields)}\n"
    result += f"**找到反应数**: {len(result_df)} (共{len(filtered_df)}个)\n\n"
    
    for _, row in result_df.iterrows():
        result += f"## {row['literature_id']}:{row['reaction_id']}\n"
        result += f"- **酶**: {row['enzyme_name']}\n"
        result += f"- **物种**: {row['organism']}\n"
        result += f"- **反应**: {row['reaction_equation']}\n"
        result += f"- **反应是否可逆**: {row['reaction_type_reversible']}\n\n"
        result += "\n"
    return result

In [33]:
smart_search_reactions("AtPPPS",["enzyme_name"],5)

'# 智能搜索结果\n\n**搜索查询**: AtPPPS\n**搜索字段**: enzyme_name\n**找到反应数**: 1 (共1个)\n\n## PMID32034864:reaction_3\n- **酶**: Solanesyl diphosphate synthase 3\n- **物种**: Arabidopsis thaliana\n- **反应**: Geranylgeranyl diphosphate + 5 Isopentenyl diphosphate -> Solanesyl diphosphate + 5 Pyrophosphate\n- **反应是否可逆**: Not specified\n\n- **温度**: N/A°C\n- **pH**: N/A\n- **pH补充说明**: N/A\n- **实验类型**: N/A\n- **实验细节**: N/A\n- **缓冲液/溶剂**: N/A\n- **表达宿主**: N/A\n- **表达载体**: N/A\n- **诱导条件**: N/A\n\n'