# gapfilling


In [None]:
import cobra
import pandas as pd

def split_reversible_reactions(model):
    """
    Replace each reversible reaction with a forward/backward pair.

    A reaction counts as reversible when its lower bound is negative and its
    upper bound is positive. For each such reaction two new reactions named
    ``<id>_forward`` and ``<id>_backward`` are added and the original is
    removed.

    Parameters:
    model (cobra.Model): The metabolic model. It is modified in place.

    Returns:
    cobra.Model: The same model object that was passed in.
    """
    # Iterate over a snapshot: reactions are added and removed inside the loop.
    for original in list(model.reactions):
        reversible = original.lower_bound < 0 and original.upper_bound > 0
        if not reversible:
            continue

        fwd = cobra.Reaction(f"{original.id}_forward")
        rev = cobra.Reaction(f"{original.id}_backward")

        # The new reactions join the model before their attributes are set.
        model.add_reactions([fwd, rev])

        # Forward half: same equation, flux limited to [0, original upper bound].
        # The bounds are assigned after the equation string on purpose.
        fwd.reaction = original.reaction
        fwd.lower_bound = 0
        fwd.upper_bound = original.upper_bound

        # Backward half: every stoichiometric coefficient has its sign flipped,
        # flux limited to [0, |original lower bound|].
        flipped = {met: -coeff for met, coeff in original.metabolites.items()}
        rev.add_metabolites(flipped)
        rev.lower_bound = 0
        rev.upper_bound = abs(original.lower_bound)

        # The reversible original is no longer needed.
        model.remove_reactions([original])

    return model
def merge_and_gapfill(model, universal_model, target_met_id='biomass_c', delta=1, output_file='gapfilling_results.xlsx'):
    """
    Gapfill `model` for one target metabolite with reactions from a universal model.

    A scratch model (copy of `model` plus a copy of the universal reactions) is
    optimized for a demand on the target metabolite while a pseudo-reaction
    penalizes flux through universal reactions. If the target cannot be made in
    the original model but can be made in the scratch model, copies of the
    universal reactions that carry flux are added to `model`.

    Parameters:
    model (cobra.Model): The metabolic model to be gapfilled. It is modified in
        place: gapfilling reactions are added and its objective is changed.
    universal_model (cobra.Model): The universal metabolic model (e.g., ModelSEED).
        It is only read; copies of its reactions are used.
    target_met_id (str): The ID of the target metabolite to be gapfilled.
    delta (float): The weight for penalizing the use of universal reactions.
    output_file (str): The name of the Excel file that lists the added reactions.

    Returns:
    cobra.Model: The input `model` (not the scratch model), with any
    gapfilling reactions added.
    """
    # Build the scratch model from copies. add_reactions attaches the Reaction
    # objects it is given instead of copying them, so passing the universal
    # model's own reactions would let the pseudo-metabolite added below leak
    # into `universal_model` and, through it, into the returned model.
    combined_model = model.copy()
    combined_model.add_reactions(universal_model.copy().reactions)

    # Pseudo-reaction that drains the pseudo-metabolite produced by universal reactions
    num_universal_rxn = cobra.Reaction('num_universal_rxn')
    num_universal_rxn.lower_bound = 0
    combined_model.add_reactions([num_universal_rxn])

    # Tag every reaction that is not part of the original model with the pseudo-metabolite
    num_universal_met = cobra.Metabolite('num_universal_met', formula='')
    num_universal_met.compartment = 'c'
    # NOTE(review): this loop also visits num_universal_rxn itself (its id is
    # not in `model`), and the -1 coefficient accumulates once per visited
    # reaction, so the pseudo-reaction's final coefficient is far from -1.
    # It is kept unchanged because it sets the scale of the penalty relative
    # to the demand flux in the objective -- confirm that this is intended.
    for rxn in combined_model.reactions:
        if rxn.id not in model.reactions:
            rxn.add_metabolites({num_universal_met: delta})
            num_universal_rxn.add_metabolites({num_universal_met: -1})
    added_reactions = []

    # Get the metabolite object for the target metabolite
    target_met = model.metabolites.get_by_id(target_met_id)
    print(f"Processing metabolite {target_met_id}...")

    # Add a demand reaction for the target metabolite in the original model
    demand_rxn_orig = cobra.Reaction(f'DM_{target_met_id}_orig')
    demand_rxn_orig.add_metabolites({target_met: -1})
    demand_rxn_orig.bounds = (0, 10)
    model.add_reactions([demand_rxn_orig])
    model.objective = demand_rxn_orig
    orig_sol = model.optimize()

    # Add a demand reaction for the target metabolite in the combined model
    demand_rxn_combined = cobra.Reaction(f'DM_{target_met_id}')
    demand_rxn_combined.add_metabolites({target_met: -1})
    demand_rxn_combined.bounds = (0, 1000)
    combined_model.add_reactions([demand_rxn_combined])

    # Maximize the demand flux while penalizing the pseudo-reaction flux
    combined_model.objective = {demand_rxn_combined: 1, num_universal_rxn: -delta}

    # Perform optimization on the combined model
    combined_sol = combined_model.optimize()
    print(f"Original model flux: {orig_sol.objective_value}, Combined model flux: {combined_sol.fluxes[demand_rxn_combined.id]}")
    print(f"Number of universal reactions used: {combined_sol.fluxes[num_universal_rxn.id]}")

    if orig_sol.objective_value < 1e-6:
        print(f'Metabolite {target_met_id} cannot be synthesized in the original model.')

        if combined_sol.fluxes[demand_rxn_combined.id] >= 1e-6:
            print(f'Metabolite {target_met_id} can be synthesized after gapfilling.')
            # Copies of the untouched universal reactions that carry flux in the
            # scratch solution; copying keeps `universal_model` intact and keeps
            # the pseudo-metabolite out of `model`.
            active_universal_rxns = [
                rxn.copy() for rxn in universal_model.reactions
                if rxn.id in combined_sol.fluxes and abs(float(combined_sol.fluxes[rxn.id])) > 1e-6
            ]

            model.add_reactions(active_universal_rxns)
            # Record added reactions for output
            for rxn in active_universal_rxns:
                added_reactions.append({
                    'Metabolite': target_met_id,
                    'Reaction ID': rxn.id,
                    'Reaction Name': rxn.name,
                    'Reaction Formula': str(rxn.reaction),
                    'Lower Bound': rxn.lower_bound,
                    'Upper Bound': rxn.upper_bound,
                    'Flux Value': combined_sol.fluxes[rxn.id]
                })

            print(f'Identified {len(active_universal_rxns)} reactions from universal model for {target_met_id}.')
        else:
            print(f'Metabolite {target_met_id} cannot be synthesized even after gapfilling.')
    else:
        print(f'Metabolite {target_met_id} can be synthesized in the original model.')

    # Remove the temporary demand reaction from the original model. The scratch
    # model is local and discarded, so it needs no cleanup.
    model.remove_reactions([demand_rxn_orig])

    # Create a DataFrame of added reactions for output
    df_added_reactions = pd.DataFrame(added_reactions)

    # Write the added reactions to an Excel file
    df_added_reactions.to_excel(output_file, index=False)
    print(f"Added reactions saved to {output_file}")

    return model
# Load the draft model and the universal reaction database.
model = cobra.io.read_sbml_model('14067gem13.xml')  # Your metabolic model
universal_model = cobra.io.load_json_model('5. GAPFILLING/processed_universal_modelseed.json')  # Your universal model
# Split reversible reactions in a copy of the universal model
universal_model_split = split_reversible_reactions(universal_model.copy())
gapfilled_model = merge_and_gapfill(model, universal_model_split, target_met_id='biomass_c')
# Save the gapfilled model; the message below reports the path actually written
gapfilled_model_path = '14067gem13_gap.xml'
cobra.io.write_sbml_model(gapfilled_model, gapfilled_model_path)
print(f'Gapfilling complete for biomass_c. Gapfilled model saved as {gapfilled_model_path}')
# Perform FBA on the gapfilled model with the biomass reaction as objective
gapfilled_model.objective = gapfilled_model.reactions.get_by_id('Bio_cgATCC14067_c')
fba_solution = gapfilled_model.optimize()
if fba_solution.objective_value >= 1e-6:
    print('Gapfilled model can produce biomass.')
else:
    print('Gapfilled model cannot produce biomass. Please check the model.')

# 去除重复值

In [None]:
import pandas as pd

# Load the gapfilling results workbook.
df = pd.read_excel('gapfilling_results.xlsx')

# Keep only the first row for every distinct "Reaction ID".
df = df.drop_duplicates(subset='Reaction ID')

# Overwrite the workbook with the de-duplicated rows.
df.to_excel('gapfilling_results.xlsx', index=False)

# 去除已在模型中的反应

In [None]:
import pandas as pd

# Load the candidate reactions and the reaction table of the current model.
df1 = pd.read_excel('trans.xlsx')
df2 = pd.read_excel('14067gem13.xlsx')

# Candidate reaction ids ("Reaction ID" column of df1).
reaction_ids = df1['Reaction ID'].tolist()

# Ids already present in the model ("ID" column of df2).
ids = df2['ID'].tolist()

# Candidate ids that are absent from the model. A set makes each membership
# test constant-time instead of scanning the whole id list per candidate.
known_ids = set(ids)
missing_ids = [reaction_id for reaction_id in reaction_ids if reaction_id not in known_ids]

# Rows of df1 whose reaction id is missing from the model.
missing_rows = df1[df1['Reaction ID'].isin(missing_ids)]

# Write the remaining rows to the results workbook.
missing_rows.to_excel('gapfilling_results.xlsx', index=False)

# 修正反应式

In [None]:
import pandas as pd
import re

def process_files(xlsx_path, tsv_path, output_path):
    """
    Normalize reaction ids and refresh name/formula/definition from a TSV table.

    Every "Reaction ID" containing ``rxn`` followed by five digits is rewritten
    as ``rxnNNNNN_c0``. The columns "Reaction Name", "Reaction Formula" and
    "Definition" are reset to empty strings and then filled from the TSV
    columns ``name``, ``equation`` and ``definition`` of the row whose ``id``
    equals ``rxnNNNNN``.

    Parameters:
    xlsx_path (str): Input Excel file with a "Reaction ID" column.
    tsv_path (str): Tab-separated table with id/name/equation/definition columns.
    output_path (str): Excel file to write.
    """
    rxn_number_re = re.compile(r'rxn(\d{5})')

    reactions = pd.read_excel(xlsx_path)
    seed_table = pd.read_csv(tsv_path, sep='\t')

    def to_c0_id(raw_id):
        # Missing ids and ids without an rxnNNNNN part pass through unchanged.
        if pd.isna(raw_id):
            return raw_id
        found = rxn_number_re.search(raw_id)
        return f'rxn{found.group(1)}_c0' if found else raw_id

    reactions['Reaction ID'] = reactions['Reaction ID'].apply(to_c0_id)

    # id -> (name, equation, definition); a later duplicate id replaces an earlier one.
    seed_lookup = {
        seed_id: (name, equation, definition)
        for seed_id, name, equation, definition in zip(
            seed_table['id'],
            seed_table['name'],
            seed_table['equation'],
            seed_table['definition'],
        )
    }

    # Start from empty annotation columns.
    for column in ('Reaction Name', 'Reaction Formula', 'Definition'):
        reactions[column] = ''

    for row_label, reaction_id in reactions['Reaction ID'].items():
        if pd.isna(reaction_id):
            continue
        found = rxn_number_re.search(reaction_id)
        if not found:
            continue
        entry = seed_lookup.get(f'rxn{found.group(1)}')
        if entry is None:
            continue
        name, equation, definition = entry
        reactions.at[row_label, 'Reaction Name'] = name
        reactions.at[row_label, 'Reaction Formula'] = equation
        reactions.at[row_label, 'Definition'] = definition

    # Save the annotated table.
    reactions.to_excel(output_path, index=False)
    print(f'处理完成，文件已保存至: {output_path}')

# Example usage: annotate the reactions workbook from the ModelSEED reactions table.
if __name__ == "__main__":
    xlsx_path = "111111.xlsx"  # replace with the path to your Excel file
    tsv_path = "4. add linker/modelseed_reactions.tsv"  # path to the TSV file
    output_path = "r1.xlsx"  # path of the output file
    
    process_files(xlsx_path, tsv_path, output_path)

In [None]:
import pandas as pd
import os

def reverse_reaction(reaction):
    """
    Normalize compartment tags and turn a right-to-left equation around.

    Non-string input is returned untouched. In strings, ``[0]`` becomes ``_c``
    and ``[1]`` becomes ``_e``. An equation written with ``<=`` (but not
    ``<=>``) is rewritten as ``right side => left side`` with both sides
    stripped of surrounding whitespace.
    """
    if not isinstance(reaction, str):
        return reaction

    # The compartment rewrite applies to every string, whichever arrow it uses.
    normalized = reaction.replace('[0]', '_c').replace('[1]', '_e')

    right_to_left = '<=' in normalized and '<=>' not in normalized
    if not right_to_left:
        return normalized

    # What stands left of '<=' is produced, what stands right of it is consumed.
    left_side, right_side = normalized.split('<=')
    return f"{right_side.strip()} => {left_side.strip()}"

def process_excel(input_file, output_file):
    """
    Apply reverse_reaction to the equation columns of an Excel file.

    Reads `input_file`, rewrites the "Definition" and "Reaction Formula"
    columns with reverse_reaction and writes the result to `output_file`,
    creating the output directory first when it does not exist. Any exception
    is caught and reported with a printed message.
    """
    try:
        # Make sure the destination directory exists.
        target_dir = os.path.dirname(output_file)
        if target_dir and not os.path.exists(target_dir):
            os.makedirs(target_dir)

        print(f"正在读取文件: {input_file}")
        table = pd.read_excel(input_file)

        print("正在处理反应方程...")
        for column in ('Definition', 'Reaction Formula'):
            table[column] = table[column].apply(reverse_reaction)

        print(f"正在保存结果到: {output_file}")
        table.to_excel(output_file, index=False)
        print("处理完成！")

    except Exception as exc:
        print(f"处理过程中发生错误: {str(exc)}")

# Example usage: rewrite the equation columns of r1.xlsx into r2.xlsx.
if __name__ == "__main__":
    input_file = "r1.xlsx"
    output_file = "r2.xlsx"
    
    process_excel(input_file, output_file)

# 添加bigg linker

In [None]:
import pandas as pd
import re

# Input files
xlsx_file = 'r2.xlsx'  # replace with the path to your xlsx file
tsv_file = '4. add linker/modelseed_reactions.tsv'  # replace with the path to your tsv file
# Read the Excel and TSV files
df_xlsx = pd.read_excel(xlsx_file)
df_tsv = pd.read_csv(tsv_file, sep='\t')

# Add the seed.reaction column: the "rxn" + five digits part of "Reaction ID"
df_xlsx['seed.reaction'] = df_xlsx['Reaction ID'].str.extract(r'(rxn\d{5})')

# Helper: collect every BiGG and KEGG entry found in an aliases string.
def extract_bigg_kegg(aliases):
    """
    Return ``(bigg, kegg)`` extracted from a '|'-separated aliases string.

    Each element is the '; '-joined text of all "BiGG: ..." (respectively
    "KEGG: ...") sections, or an empty string when there is none. Non-string
    input yields two empty strings.
    """
    if not isinstance(aliases, str):
        return '', ''

    # Joining an empty list already gives '', so no special case is needed.
    bigg = '; '.join(re.findall(r'BiGG: ([^|]+)', aliases))
    kegg = '; '.join(re.findall(r'KEGG: ([^|]+)', aliases))
    return bigg, kegg

# Helper: reduce an ec_numbers field to its first EC number.
def extract_ec_numbers(ec_numbers):
    """
    Return the first entry of a '|'-separated EC number string.

    Null input (None/NaN) yields an empty string.
    """
    if pd.isnull(ec_numbers):
        return ''
    # Everything before the first '|' (the whole string when there is none).
    first_ec, _, _ = ec_numbers.partition('|')
    return first_ec

# Create the BiGG.reaction, KEGG.reaction and ec-code columns, initially empty.
for new_column in ('BiGG.reaction', 'KEGG.reaction', 'ec-code'):
    df_xlsx[new_column] = ''

# For every reaction, look up its seed.reaction id in the TSV table and copy
# the cross-references of the first matching row.
for row_label, seed_id in df_xlsx['seed.reaction'].items():
    hits = df_tsv[df_tsv['id'] == seed_id]
    if hits.empty:
        continue

    first_hit = hits.iloc[0]
    bigg_ids, kegg_ids = extract_bigg_kegg(first_hit['aliases'])

    df_xlsx.at[row_label, 'BiGG.reaction'] = bigg_ids
    df_xlsx.at[row_label, 'KEGG.reaction'] = kegg_ids
    df_xlsx.at[row_label, 'ec-code'] = extract_ec_numbers(first_hit['ec_numbers'])

# Save the annotated table.
df_xlsx.to_excel('r3.xlsx', index=False)

print("操作完成！")

In [None]:
import pandas as pd
import re

def process_files(xlsx_path, tsv_path, output_path):
    """
    Fill empty BiGG.reaction cells by searching the BiGG table for SEED links.

    For every row whose "BiGG.reaction" is empty and whose "Reaction ID"
    contains ``rxnNNNNN_c0``, the TSV rows whose ``database_links`` mention the
    corresponding SEED reaction URL are collected and their distinct
    ``bigg_id`` values are written to "BiGG.reaction", joined by "; ".

    Parameters:
    xlsx_path (str): Input xlsx file path.
    tsv_path (str): Input tsv file path (needs database_links and bigg_id columns).
    output_path (str): Output xlsx file path.
    """
    xlsx_df = pd.read_excel(xlsx_path)
    tsv_df = pd.read_csv(tsv_path, sep='\t')

    # An all-empty column is read as float; make it able to hold strings.
    xlsx_df['BiGG.reaction'] = xlsx_df['BiGG.reaction'].astype(object)

    # Collect (row label, rxnNNNNN) for rows that still lack a BiGG id.
    rxn_pattern = re.compile(r'rxn\d{5}_c0')
    rxn_ids = []
    for idx, row in xlsx_df.iterrows():
        if pd.isna(row['BiGG.reaction']):
            match = rxn_pattern.search(str(row['Reaction ID']))
            if match:
                rxn_ids.append((idx, match.group(0)[:8]))  # drop the "_c0" suffix

    matched_count = 0
    for idx, rxn_id in rxn_ids:
        search_pattern = f"SEED Reaction: http://identifiers.org/seed.reaction/{rxn_id}"

        # regex=False: the URL is a literal; as a regex its dots would match any character.
        matches = tsv_df[tsv_df['database_links'].str.contains(search_pattern, na=False, regex=False)]
        if matches.empty:
            continue

        # Distinct, non-missing BiGG ids in table order.
        bigg_ids = matches['bigg_id'].dropna().unique()
        if len(bigg_ids) == 0:
            continue
        bigg_ids_str = '; '.join(str(bigg_id) for bigg_id in bigg_ids)

        xlsx_df.at[idx, 'BiGG.reaction'] = bigg_ids_str
        matched_count += 1

        # Progress output, useful for debugging
        print(f"Found matches for {rxn_id}: {bigg_ids_str}")

    xlsx_df.to_excel(output_path, index=False)
    print(f"处理完成，结果已保存到: {output_path}")

    # Statistics: rows examined here versus rows that received a BiGG id here
    # (rows that already had an id before this step are not counted).
    total_processed = len(rxn_ids)
    total_matched = matched_count
    print(f"\n统计信息:")
    print(f"处理的rxn数量: {total_processed}")
    print(f"成功匹配的数量: {total_matched}")

# Example usage: fill missing BiGG ids in r3.xlsx and write r4.xlsx.
if __name__ == "__main__":
    # Replace with the actual file paths
    xlsx_file = "r3.xlsx"
    tsv_file = "4. add linker/bigg_models_reactions.tsv"
    output_file = "r4.xlsx"
    
    process_files(xlsx_file, tsv_file, output_file)

# 添加反应其他linker

In [None]:
import pandas as pd
import re

def extract_last_rhea(database_links):
    """Return the numeric id of the last RHEA link in *database_links*, or None."""
    if pd.isna(database_links):
        return None

    # With a single capture group, findall yields the captured ids in order.
    rhea_ids = re.findall(r'RHEA: http://identifiers\.org/rhea/(\d+)', database_links)
    return rhea_ids[-1] if rhea_ids else None

def extract_metanetx(database_links):
    """Return the first MetaNetX reaction id (MNXR...) in *database_links*, or None."""
    if pd.isna(database_links):
        return None

    found = re.search(
        r'MetaNetX \(MNX\) Equation: http://identifiers\.org/metanetx\.reaction/(MNXR\d+)',
        database_links,
    )
    if found is None:
        return None
    return found.group(1)

def extract_biocyc(database_links):
    """Return the first BioCyc id (META:...) in *database_links*, or None."""
    if pd.isna(database_links):
        return None

    # The id runs from "META:" up to the next ';' (or the end of the string).
    found = re.search(r'BioCyc: http://identifiers\.org/biocyc/(META:[^;]+)', database_links)
    if found is None:
        return None
    return found.group(1)

def process_files(xlsx_path, tsv_path, output_path):
    """
    Add RHEA, MetaNetX and BioCyc ids to each reaction via its BiGG id.

    "BiGG.reaction" may hold several ids separated by ';'. The ids are tried
    in the order listed, and the first one found in the TSV ``bigg_id`` column
    supplies the ``database_links`` text from which the three ids are
    extracted. The new columns "rhea", "MetanetX.reaction" and "biocyc" stay
    None when nothing is found.

    Parameters:
    xlsx_path (str): Input xlsx file path.
    tsv_path (str): Input tsv file path (needs bigg_id and database_links columns).
    output_path (str): Output xlsx file path.
    """
    xlsx_df = pd.read_excel(xlsx_path)
    tsv_df = pd.read_csv(tsv_path, sep='\t')

    # New annotation columns
    xlsx_df['rhea'] = None
    xlsx_df['MetanetX.reaction'] = None
    xlsx_df['biocyc'] = None

    for idx, row in xlsx_df.iterrows():
        bigg_reaction = row['BiGG.reaction']

        # Only rows that have a BiGG id can be looked up.
        if pd.isna(bigg_reaction):
            continue

        # A cell such as "PGI; PGI_1" never equals a single bigg_id, so each
        # listed id is looked up on its own.
        candidate_ids = [part.strip() for part in str(bigg_reaction).split(';')]
        for candidate in filter(None, candidate_ids):
            matches = tsv_df[tsv_df['bigg_id'] == candidate]
            if matches.empty:
                continue

            # Use the database_links of the first matching row.
            database_links = matches.iloc[0]['database_links']
            xlsx_df.at[idx, 'rhea'] = extract_last_rhea(database_links)
            xlsx_df.at[idx, 'MetanetX.reaction'] = extract_metanetx(database_links)
            xlsx_df.at[idx, 'biocyc'] = extract_biocyc(database_links)
            break

    xlsx_df.to_excel(output_path, index=False)
    print(f"处理完成，结果已保存到: {output_path}")

    # Statistics
    total_processed = xlsx_df['BiGG.reaction'].notna().sum()
    rhea_matched = xlsx_df['rhea'].notna().sum()
    metanetx_matched = xlsx_df['MetanetX.reaction'].notna().sum()
    biocyc_matched = xlsx_df['biocyc'].notna().sum()

    print(f"\n统计信息:")
    print(f"处理的BiGG.reaction数量: {total_processed}")
    print(f"成功匹配RHEA ID数量: {rhea_matched}")
    print(f"成功匹配MetaNetX ID数量: {metanetx_matched}")
    print(f"成功匹配BioCyc ID数量: {biocyc_matched}")

# Example usage: add RHEA/MetaNetX/BioCyc ids to r4.xlsx and write r5.xlsx.
if __name__ == "__main__":
    # Replace with the actual file paths
    xlsx_file = "r4.xlsx"
    tsv_file = "4. add linker/bigg_models_reactions.tsv"
    output_file = "r5.xlsx"
    
    process_files(xlsx_file, tsv_file, output_file)

# 提取代谢物

In [None]:
import pandas as pd
import re

def extract_metabolites(equation):
    """
    Return every metabolite id (with compartment suffix) found in an equation.

    An id is a run of word characters or hyphens followed by ``_c`` or ``_e``
    and optional digits, e.g. ``cpd00001_c`` or ``cpd00001_e0``. Ids are
    returned in order of appearance, duplicates included. Null input yields an
    empty list.
    """
    if pd.isna(equation):
        return []

    # findall yields (name, compartment) tuples; glue them back together.
    metabolite_re = r'([\w\-]+)_([ce]\d*)'
    return ['_'.join(parts) for parts in re.findall(metabolite_re, equation)]

def process_file(input_path, output_path):
    """
    Collect the distinct metabolites of all reaction formulas in an Excel file.

    Parameters:
    input_path (str): Input xlsx file with a "Reaction Formula" column.
    output_path (str): Output xlsx file; receives one sorted "metabolites_id" column.
    """
    reactions = pd.read_excel(input_path)

    # Union of the metabolites found in every formula.
    unique_mets = set()
    for equation in reactions['Reaction Formula']:
        unique_mets.update(extract_metabolites(equation))

    sorted_metabolites = sorted(unique_mets)

    pd.DataFrame({'metabolites_id': sorted_metabolites}).to_excel(output_path, index=False)

    # Summary
    print(f"处理完成，结果已保存到: {output_path}")
    print(f"找到的唯一代谢物数量: {len(sorted_metabolites)}")

    # A few samples per compartment for a quick visual check
    print("\n代谢物示例:")
    cytosol_samples = [met for met in sorted_metabolites if '_c' in met][:3]
    print("带[c0]的代谢物示例:", cytosol_samples)

    extracellular_samples = [met for met in sorted_metabolites if '_e' in met][:3]
    print("带[e0]的代谢物示例:", extracellular_samples)

# Example usage: extract the metabolites of r5.xlsx into m1.xlsx.
if __name__ == "__main__":
    input_file = "r5.xlsx"
    output_file = "m1.xlsx"
    
    process_file(input_file, output_file)

In [None]:
import pandas as pd
import re

# Read the metabolite list
df_xlsx = pd.read_excel('m1.xlsx')

# Make sure every annotation column exists; missing ones are created empty
columns_to_create = ['name', 'abbreviation', 'formula', 'mass', 'inchikey', 'charge', 'deltag', 'deltagerr', 'pka', 'pkb', 'smiles']
for col in columns_to_create:
    if col not in df_xlsx.columns:
        df_xlsx[col] = pd.NA  # create the column filled with missing values

# Read the ModelSEED compounds table
df_tsv = pd.read_csv('4. add linker/modelseed_compounds.tsv', sep='\t', low_memory=False)

# Look up one metabolite id in the compounds table (df_tsv) and return its properties.
def get_metabolite_data(metabolite_id, name):
    """
    Return the compound properties for *metabolite_id* as a pandas Series.

    A trailing ``_c`` or ``_e`` is removed from the id before it is compared
    with the ``id`` column of ``df_tsv``. The first matching row supplies the
    values; when nothing matches, every field is an empty string. The *name*
    argument is accepted but not used.
    """
    wanted = ['name', 'abbreviation', 'formula', 'mass', 'inchikey', 'charge', 'deltag', 'deltagerr', 'pka', 'pkb', 'smiles']

    # Strip the compartment suffix.
    compound_id = re.sub(r'_[ce]$', '', str(metabolite_id))
    hits = df_tsv[df_tsv['id'] == compound_id]

    if hits.empty:
        # No such compound: empty placeholders for all fields.
        return pd.Series([''] * len(wanted), index=wanted)

    return hits[wanted].iloc[0]

# Fetch the compound properties for every metabolite row.
metabolite_data = df_xlsx.apply(lambda x: get_metabolite_data(x['metabolites_id'], x['name']), axis=1)

# Keep values already present in the workbook; fill the gaps from the compounds table.
for col in ('name', 'abbreviation', 'formula', 'mass', 'charge', 'pka', 'pkb', 'smiles'):
    df_xlsx[col] = df_xlsx[col].where(pd.notnull(df_xlsx[col]), metabolite_data[col])

# inchikey is always taken from the compounds table.
df_xlsx['inchikey'] = metabolite_data['inchikey']

# deltaG is formatted only when a value exists. Unmatched metabolites carry
# '' placeholders, which must not be rendered as "± (kcal/mol)".
df_xlsx['deltaG'] = metabolite_data.apply(
    lambda x: f"{x['deltag']}±{x['deltagerr']} (kcal/mol)"
    if pd.notnull(x['deltag']) and x['deltag'] != '' else '',
    axis=1,
)

# Write the annotated metabolites and report the file actually written.
m2_path = 'm2.xlsx'
df_xlsx.to_excel(m2_path, index=False)
print(f'{m2_path} 生成成功!')


In [None]:
import pandas as pd
import re

# Read the metabolite workbook.
df = pd.read_excel('m2.xlsx')

# Read the compounds table; low_memory=False avoids mixed-type warnings.
modelseed_df = pd.read_csv('4. add linker/modelseed_compounds.tsv', sep='\t', low_memory=False)

# New column seed.compound: the "cpd" + digits part of the metabolite id.
df['seed.compound'] = df['metabolites_id'].str.extract(r'(cpd\d+)')

# New columns for the BiGG and KEGG identifiers, initially empty.
df['BiGG.compound'] = ''
df['KEGG.compound'] = ''

# Target column -> pattern that captures the first matching alias section.
alias_patterns = (
    ('BiGG.compound', r'BiGG: ([^|]+)'),
    ('KEGG.compound', r'KEGG: ([^|]+)'),
)

for row_label, cpd_id in df['seed.compound'].items():
    if pd.isna(cpd_id):
        continue

    # Rows of the compounds table with this compound id.
    hits = modelseed_df[modelseed_df['id'] == cpd_id]
    if hits.empty:
        continue

    alias_text = hits.iloc[0]['aliases']
    # Missing aliases are read as NaN, not as a string.
    if not isinstance(alias_text, str):
        continue

    for column, alias_re in alias_patterns:
        found = re.search(alias_re, alias_text)
        if found:
            df.at[row_label, column] = found.group(1)

# Save the result to a new Excel file.
df.to_excel('m3.xlsx', index=False)

In [None]:
import pandas as pd
import re

# Read the metabolite workbook and the BiGG metabolites table.
df = pd.read_excel('m3.xlsx')
bigg_df = pd.read_csv('4. add linker/bigg_models_metabolites.tsv', sep='\t', low_memory=False)

# New columns, initially empty.
df['MetaNetX.compound'] = ''
df['Biocyc'] = ''

# Target column -> pattern that captures the id up to the next ';'.
link_patterns = (
    ('MetaNetX.compound', r'MetaNetX \(MNX\) Chemical: http://identifiers\.org/metanetx\.chemical/([^;]+)'),
    ('Biocyc', r'BioCyc: http://identifiers\.org/biocyc/([^;]+)'),
)

for row_label, bigg_id in df['BiGG.compound'].items():
    # Only rows that have a BiGG id can be looked up.
    if pd.isna(bigg_id):
        continue

    hits = bigg_df[bigg_df['universal_bigg_id'] == bigg_id]
    if hits.empty:
        continue

    link_text = hits.iloc[0]['database_links']
    # Missing links are read as NaN, not as a string.
    if not isinstance(link_text, str):
        continue

    for column, link_re in link_patterns:
        found = re.search(link_re, link_text)
        if found:
            df.at[row_label, column] = found.group(1).strip()

# Save the result to a new Excel file.
df.to_excel('m4.xlsx', index=False)

In [None]:
import pandas as pd

# Workbook that is read and the workbook that is written.
source_workbook = '14067gem14.xlsx'
updated_workbook = 'model_history/14067gem14.xlsx'

# Annotated metabolites produced by the previous steps.
df_met_pro4 = pd.read_excel('m4.xlsx')

# Read every sheet of the model workbook once; the metabolites sheet is taken from the same read.
xlsx_14067gem2 = pd.read_excel(source_workbook, sheet_name=None)
df_14067gem2 = xlsx_14067gem2['metabolites']

# Metabolites present in m4.xlsx but absent from the model workbook.
new_metabolites = df_met_pro4[~df_met_pro4['metabolites_id'].isin(df_14067gem2['metabolites_id'])]

# Append the new metabolites to the model's metabolites sheet.
df_14067gem3 = pd.concat([df_14067gem2, new_metabolites], ignore_index=True)

# The context manager closes (and thereby saves) the workbook even if a write fails.
with pd.ExcelWriter(updated_workbook, engine='xlsxwriter') as writer:
    # Copy every sheet except metabolites unchanged.
    for sheet_name, df in xlsx_14067gem2.items():
        if sheet_name != 'metabolites':
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    # Write the extended metabolites sheet.
    df_14067gem3.to_excel(writer, sheet_name='metabolites', index=False)

# Report the file that was actually written.
print(f"任务完成。新文件 '{updated_workbook}' 已创建,包含更新后的 metabolites 工作簿以及原始文件的其他工作簿。")