# 1. 提取基因 id、功能注释、DNA 序列和蛋白质序列

In [1]:
import json
import pandas as pd
import os

# Load the ModelSEED genome export (a JSON document).
with open('0.Model_from_modelseed/genome.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Show the top-level keys so the layout of the export can be inspected.
print("数据的键：", data.keys())

# Per-gene details are read from the 'features' key (empty list when absent).
features = data.get('features', [])

# Keep only the four fields used downstream; entries that are not dicts are
# skipped, and a missing field becomes an empty string.
records = [
    {
        'id': feature.get('id', ''),
        'function': feature.get('function', ''),
        'dna_sequence': feature.get('dna_sequence', ''),
        'protein_translation': feature.get('protein_translation', ''),
    }
    for feature in features
    if isinstance(feature, dict)
]

df = pd.DataFrame(records)

# Write the workbook to the location the later "ModelSEED -> FASTA" cell reads
# it from. Saving into the working directory (as before) left that cell
# without its input on a fresh Restart & Run All.
output_path = '1.Replace genes in cg14067_rxn/modelseedgene_date.xlsx'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')

print(f"数据已成功保存到 {output_path} 文件。")


数据的键： dict_keys(['num_contigs', 'domain', 'close_genomes', 'source', 'contigs', 'source_id', 'scientific_name', 'contig_ids', 'taxonomy', 'id', 'contig_lengths', 'dna_size', 'features', 'genetic_code'])
数据已成功保存到 modelseedgene_date.xlsx 文件。


# 2. 通过 blastp 获取基因的 locus_tag

生成protein 数据库

In [122]:
import openpyxl
import re

# Read the whole GenBank flat file into memory.
txt_file = '1.Replace genes in cg14067_rxn/genomic.gbff'  # replace with the path to your own file
with open(txt_file, 'r') as file:
    genbank_record = file.read()

# Cut the text at every occurrence of the literal "gene" followed by 12 spaces.
genbank_sections = genbank_record.split("gene            ")

# Qualifier patterns: the quoted value of /locus_tag and of /translation.
# The translation value may span several lines; [^"] already matches newlines.
locus_tag_pattern = r'/locus_tag="([^"]+)"'
translation_pattern = r'/translation="([^"]+?)"'

# Parallel lists: locus_tags[i] belongs to translations[i].
locus_tags = []
translations = []

for section in genbank_sections:
    first_tag = re.search(locus_tag_pattern, section)
    first_translation = re.search(translation_pattern, section, flags=re.DOTALL)

    # Sections lacking either qualifier contribute nothing.
    if first_tag is None or first_translation is None:
        continue

    # Only the first occurrence of each qualifier in a section is kept.
    locus_tags.append(first_tag.group(1))
    translations.append(first_translation.group(1))

def clean_translation(translation):
    """Return ``translation`` with every whitespace character removed.

    Line breaks and indentation inside the text are dropped entirely (not
    collapsed to a single space), so the result is one contiguous string.
    """
    # Splitting on whitespace runs and re-joining with nothing removes leading,
    # trailing and interior whitespace in a single pass.
    return "".join(re.split(r'\s+', translation))

# Strip all whitespace from every extracted translation.
translations = [clean_translation(translation) for translation in translations]

# Debug aid: uncomment to inspect what was extracted.
# print("Locus Tags:", locus_tags)
# print("Translations:", translations)

# Build a two-column sheet: header row, then one row per locus_tag/translation pair.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "GenBank Data"
ws.append(["locus_tag", "translation"])

for locus_tag, translation in zip(locus_tags, translations):
    ws.append([locus_tag, translation])

# Save next to the GenBank input, which is where the following FASTA
# conversion cell reads the workbook from. Previously the file was written to
# the working directory and the message named a different file
# ("genbank_data_cleaned.xlsx") that was never created.
output_path = "1.Replace genes in cg14067_rxn/genbank_data.xlsx"
wb.save(output_path)

print(f"数据已成功保存到{output_path}文件中。")

Locus Tags: ['CEY17_RS00005', 'CEY17_RS00015', 'CEY17_RS00020', 'CEY17_RS00025', 'CEY17_RS00030', 'CEY17_RS00035', 'CEY17_RS00040', 'CEY17_RS00045', 'CEY17_RS00050', 'CEY17_RS00055', 'CEY17_RS00060', 'CEY17_RS00065', 'CEY17_RS00070', 'CEY17_RS00085', 'CEY17_RS00090', 'CEY17_RS00095', 'CEY17_RS00100', 'CEY17_RS00105', 'CEY17_RS00110', 'CEY17_RS00115', 'CEY17_RS00120', 'CEY17_RS00125', 'CEY17_RS16460', 'CEY17_RS16870', 'CEY17_RS16875', 'CEY17_RS16880', 'CEY17_RS00140', 'CEY17_RS00150', 'CEY17_RS00155', 'CEY17_RS00160', 'CEY17_RS00165', 'CEY17_RS00170', 'CEY17_RS00175', 'CEY17_RS00180', 'CEY17_RS00185', 'CEY17_RS00190', 'CEY17_RS00195', 'CEY17_RS00200', 'CEY17_RS00215', 'CEY17_RS00220', 'CEY17_RS00225', 'CEY17_RS00230', 'CEY17_RS00235', 'CEY17_RS00240', 'CEY17_RS00245', 'CEY17_RS00250', 'CEY17_RS00255', 'CEY17_RS00260', 'CEY17_RS00265', 'CEY17_RS00270', 'CEY17_RS00275', 'CEY17_RS00280', 'CEY17_RS00285', 'CEY17_RS00290', 'CEY17_RS00295', 'CEY17_RS00300', 'CEY17_RS00305', 'CEY17_RS00310', '

## 2.1 转化protein序列为fasta文件

转化数据库为fasta

In [123]:
import pandas as pd

# Workbook holding one locus_tag / translation pair per row.
file_path = '1.Replace genes in cg14067_rxn/genbank_data.xlsx'
df = pd.read_excel(file_path)

# Emit one FASTA record per row: a ">locus_tag" header line followed by the
# sequence on a single line.
with open('1.Replace genes in cg14067_rxn/proteindatabase.fasta', 'w') as fasta_file:
    for protein_id, protein_sequence in zip(df['locus_tag'], df['translation']):
        fasta_file.write(f">{protein_id}\n{protein_sequence}\n")

print("FASTA 文件已生成并保存为 'proteindatabase.fasta'")

FASTA 文件已生成并保存为 'proteindatabase.fasta'


转化modelseed的蛋白序列为fasta

In [125]:
# Workbook with the ModelSEED features (columns include 'id' and 'protein_translation').
file_path = '1.Replace genes in cg14067_rxn/modelseedgene_date.xlsx'
df = pd.read_excel(file_path)

# A feature without a protein is stored as an empty cell, which pandas reads
# back as NaN. Writing such a row would produce a record whose sequence is the
# literal text "nan", so rows missing an id or a sequence are left out.
protein_rows = df.dropna(subset=['id', 'protein_translation'])
skipped = len(df) - len(protein_rows)

# Emit one FASTA record per remaining row: ">id" header, then the sequence.
with open('1.Replace genes in cg14067_rxn/modelseed_protein_sequences.fasta', 'w') as fasta_file:
    for protein_id, protein_sequence in zip(protein_rows['id'], protein_rows['protein_translation']):
        fasta_file.write(f">{protein_id}\n{protein_sequence}\n")

if skipped:
    print(f"已跳过 {skipped} 条缺少 id 或蛋白质序列的记录")
print("FASTA 文件已生成并保存为 'modelseed_protein_sequences.fasta'")

FASTA 文件已生成并保存为 'modelseed_protein_sequences.fasta'


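进行blast：在命令行中运行 blastp，以 `modelseed_protein_sequences.fasta` 为查询序列、以 `proteindatabase.fasta` 为数据库进行比对，并将表格结果保存为 `blastp_results.tsv`（下一步读取该文件时需要带有 C1、C2、C3 等列名的表头）。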

## 2.2 在blast之后，提取相似度大于99%的对应关系

In [149]:
import pandas as pd
import re
from openpyxl import Workbook

# Tab-separated BLAST results; the file must carry a header row.
tsv_file = '1.Replace genes in cg14067_rxn/blastp_results.tsv'
df_tsv = pd.read_csv(tsv_file, sep='\t')

# FASTA file whose header lines carry "[key=value]" annotations.
fasta_file = '1.Replace genes in cg14067_rxn/14067_genesequence.txt'
fasta_info = {}

# One pattern per annotation to pull from a header line. Each captures
# everything after "key=" up to (not including) the next ']'.
field_patterns = {
    'gene': r'gene=([^]]+)',
    'locus_tag': r'locus_tag=([^]]+)',
    'protein': r'protein=([^]]+)',
    'protein_id': r'protein_id=([^]]+)',
    'location': r'location=([^]]+)',
}

with open(fasta_file, 'r') as fasta:
    for line in fasta:
        # Sequence lines carry no annotations; only ">" header lines matter.
        if not line.startswith(">"):
            continue

        # Missing annotations are recorded as None.
        annotations = {}
        for field, pattern in field_patterns.items():
            hit = re.search(pattern, line)
            annotations[field] = hit.group(1) if hit else None

        # Headers without a locus_tag are dropped; a repeated locus_tag
        # overwrites the earlier entry.
        if annotations['locus_tag']:
            fasta_info[annotations['locus_tag']] = annotations

# Show the parsed header table.
print("FASTA 文件解析结果：")
print(fasta_info)

# Join BLAST hits with the parsed FASTA annotations.
result = []

for modelseed_gene, query_id, identity in zip(df_tsv["C1"], df_tsv["C2"], df_tsv["C3"]):
    # Keep only hits whose C3 value is strictly above 99.
    if not identity > 99:
        continue

    # C2 is looked up among the locus_tag keys parsed from the FASTA headers.
    gene_info = fasta_info.get(query_id)
    if gene_info is None:
        print(f"未找到匹配的 ID: {query_id}")
        continue

    result.append({
        'modelseed_gene': modelseed_gene,
        'QueryID': query_id,
        'gene': gene_info['gene'],
        'locus_tag': gene_info['locus_tag'],
        'protein': gene_info['protein'],
        'protein_id': gene_info['protein_id'],
        'location': gene_info['location']
    })

df_result = pd.DataFrame(result)

# Show the joined table.
print("合并后的结果：")
print(df_result)

# Write the workbook only when at least one hit survived the filter.
if df_result.empty:
    print("没有匹配的数据，无法生成 Excel 文件")
else:
    output_file = '1.Replace genes in cg14067_rxn/gene_date.xlsx'
    df_result.to_excel(output_file, index=False)
    print(f"结果已保存到 {output_file}")


FASTA 文件解析结果：
{'CEY17_RS00005': {'gene': 'dnaA', 'locus_tag': 'CEY17_RS00005', 'protein': 'chromosomal replication initiator protein DnaA', 'protein_id': 'WP_003860980.1', 'location': '1..1575'}, 'CEY17_RS00015': {'gene': 'dnaN', 'locus_tag': 'CEY17_RS00015', 'protein': 'DNA polymerase III subunit beta', 'protein_id': 'WP_003855336.1', 'location': '2292..3476'}, 'CEY17_RS00020': {'gene': 'recF', 'locus_tag': 'CEY17_RS00020', 'protein': 'DNA replication/repair protein RecF', 'protein_id': 'WP_003860982.1', 'location': '3585..4769'}, 'CEY17_RS00025': {'gene': None, 'locus_tag': 'CEY17_RS00025', 'protein': 'DUF721 domain-containing protein', 'protein_id': 'WP_003860983.1', 'location': '4766..5302'}, 'CEY17_RS00030': {'gene': 'gyrB', 'locus_tag': 'CEY17_RS00030', 'protein': 'DNA topoisomerase (ATP-hydrolyzing) subunit B', 'protein_id': 'WP_003860984.1', 'location': '5435..7489'}, 'CEY17_RS00035': {'gene': None, 'locus_tag': 'CEY17_RS00035', 'protein': 'alpha/beta fold hydrolase', 'protein_

# 3. 对rxn中gene进行替换

In [155]:
import pandas as pd
import re

# Reaction table whose 'Genes' column is rewritten further down.
tsv_file = '0.Model_from_modelseed/cg14067_rxn.tsv'  # replace with the path to your TSV file
df_tsv = pd.read_csv(tsv_file, sep='\t')

# Gene correspondence table.
xlsx_file = '1.Replace genes in cg14067_rxn/gene_date.xlsx'  # replace with the path to your XLSX file
df_xlsx = pd.read_excel(xlsx_file)

# Map each value of the first column (gene id) to the set of values seen for it
# in the fourth column (locus_tag); a set keeps repeated pairs from piling up.
gene_column = df_xlsx.iloc[:, 0]
locus_column = df_xlsx.iloc[:, 3]

locus_tag_dict = {}
for gene_id, locus_tag in zip(gene_column, locus_column):
    locus_tag_dict.setdefault(gene_id, set()).add(locus_tag)

# 检查哪些 gene_id 对应多个 locus_tag
# for gene_id, locus_tags in locus_tag_dict.items():
#     if len(locus_tags) > 1:
#         print(f"警告：基因 ID {gene_id} 对应多个 locus_tag: {locus_tags}")

def replace_genes_with_locus_tag(genes, mapping=None):
    """Replace every ``fig|<n>.<n>.peg.<n>`` id in ``genes`` with its locus_tag(s).

    Parameters
    ----------
    genes : str
        Gene rule text containing zero or more gene ids. Non-string values
        (for example NaN from an empty cell) are returned unchanged.
    mapping : dict, optional
        Gene id -> collection of locus_tags. Defaults to the notebook-level
        ``locus_tag_dict``.

    Returns
    -------
    str
        ``genes`` with each known id replaced. An id with one locus_tag becomes
        that tag; an id with several becomes ``"(tagA or tagB ...)"`` with the
        tags in sorted order. Ids missing from the mapping are left as they are.
    """
    if not isinstance(genes, str):
        return genes
    if mapping is None:
        mapping = locus_tag_dict

    def _substitute(match):
        # Replacement for one complete gene id match.
        gene_id = match.group(0)
        locus_tags = mapping.get(gene_id)
        if not locus_tags:
            return gene_id

        # Sorting makes the output independent of set iteration order.
        ordered = sorted(str(tag) for tag in locus_tags)
        if len(ordered) == 1:
            return ordered[0]

        locus_tag_str = " or ".join(ordered)
        locus_tag_str = f"({locus_tag_str})"
        print(f"警告：基因 ID {gene_id} 对应多个 locus_tag: {locus_tag_str}")
        return locus_tag_str

    # Substituting per regex match (rather than str.replace on the id text)
    # keeps an id such as "...peg.1" from also rewriting the front of
    # "...peg.12".
    return re.sub(r'fig\|\d+\.\d+\.peg\.\d+', _substitute, genes)

# Rewrite every entry of the 'Genes' column through replace_genes_with_locus_tag.
df_tsv['Genes'] = df_tsv['Genes'].map(replace_genes_with_locus_tag)

# Persist the updated reaction table as a workbook (row index omitted).
output_file = '1.Replace genes in cg14067_rxn/14067_rxn.xlsx'  # replace with your preferred output path
df_tsv.to_excel(output_file, index=False)

print(f"替换后的文件已保存为 {output_file}")


替换后的文件已保存为 1.对cg14067_rxn中gene进行替换/14067_rxn.xlsx


手动处理gene列中的unknown

In [156]:
import pandas as pd
import re

# Load the reaction workbook whose 'Genes' column is cleaned up below.
xlsx_file = '1.Replace genes in cg14067_rxn/14067_rxn.xlsx'  # replace with the path to your XLSX file
df_xlsx = pd.read_excel(xlsx_file)

def remove_parentheses(gene):
    """Strip one pair of parentheses that encloses the whole of ``gene``.

    The pair is removed only when the leading '(' is closed by the trailing
    ')'. A rule such as ``"(A or B) and (C or D)"`` starts and ends with
    parentheses that belong to different groups, so it is returned unchanged;
    blindly dropping the first and last character would leave it unbalanced.

    Non-string values, strings not wrapped in parentheses, and strings whose
    leading '(' is never closed are returned as they are.
    """
    if not isinstance(gene, str):
        return gene
    if not (gene.startswith('(') and gene.endswith(')')):
        return gene

    depth = 0
    last = len(gene) - 1
    for pos, char in enumerate(gene):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                # The leading '(' is closed here. It wraps the whole string
                # only if this is also the final character.
                return gene[1:-1] if pos == last else gene

    # Leading '(' never closed: leave malformed input untouched.
    return gene

# Turn missing cells into empty strings and force text type, then strip the
# enclosing parentheses from each entry.
genes_as_text = df_xlsx['Genes'].fillna('').astype(str)
df_xlsx['Genes'] = genes_as_text.apply(remove_parentheses)

# Store the cleaned table in a new workbook (row index omitted).
output_file = '1.Replace genes in cg14067_rxn/14067_rxn_1.xlsx'  # replace with your preferred output path
df_xlsx.to_excel(output_file, index=False)

print(f"处理后的文件已保存为 {output_file}")


处理后的文件已保存为 1.对cg14067_rxn中gene进行替换/14067_rxn_1.xlsx
