# 从kegg获取nuiport、ncbi等linker

In [None]:
import re
from requests.adapters import HTTPAdapter
from urllib3 import Retry
import requests
import pandas as pd
import time
# 设置请求的重试机制
def create_session():
    session = requests.Session()
    retry = Retry(
        total=3,  # 设置最多重试次数
        backoff_factor=1,  # 设置退避因子，重试间隔会指数增长
        status_forcelist=[500, 502, 503, 504]  # 针对这些HTTP错误进行重试
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session

# 获取基因详细信息
def get_gene_info(gene_id, session):
    # 构建KEGG API的URL，使用get操作获取基因详细信息
    url = f"https://rest.kegg.jp/get/cgb:{gene_id}"
    # 发送GET请求
    response = session.get(url)
    # 检查响应状态码
    if response.status_code == 200:
        # 获取响应文本
        text = response.text
        # 按行分割文本，并取前50行
        lines = text.splitlines()
        # 只返回前50行数据
        return "\n".join(lines[:50])
    else:
        return "Failed to retrieve gene information."

# 提取EC号
def getNCBIProteinID(gene_info):
    NCBI_pattern = re.compile(r'NCBI-ProteinID:\s*([^\s]+)')
    matches = NCBI_pattern.findall(gene_info)
    return matches
def getUniProt(gene_info):
    NCBI_pattern = re.compile(r'UniProt:\s*([^\s]+)')
    matches = NCBI_pattern.findall(gene_info)
    return matches

# 读取Excel文件
df = pd.read_excel('14067map13032.xlsx')
# 创建一个会话对象
session = create_session()
# 新增列
df['NCBI-ProteinID'] = None
df['UniProt'] = None
# 遍历基因列，查询KEGG并添加EC号
for index, row in df.iterrows():
    gene = row['Predicted Genes']
    if pd.notnull(gene):
        gene_info = get_gene_info(gene, session)  # 修正为调用 get_gene_info
        if gene_info != "Failed to retrieve gene information.":
            time.sleep(2)  # 控制请求间隔为2秒，避免过于频繁
            ncbiprotid = getNCBIProteinID(gene_info)
            uniport_entry = getUniProt(gene_info)
            if ncbiprotid:
                df.at[index, 'NCBI-ProteinID'] = ', '.join(ncbiprotid)
                print(f"找到并添加 {gene} -> {ncbiprotid}")
            else:
                print(f"{gene} 无NCBIProteinID")
            if uniport_entry:
                df.at[index, 'UniProt'] = ', '.join(uniport_entry)
                print(f"找到并添加 {gene} -> {uniport_entry}")
            else:
                print(f"{gene} 无uniport_entry")
        else:
            print(f"{gene} 查询失败或没有结果")

# 保存修改后的Excel文件
df.to_excel('14067map13032_NU.xlsx', index=False)

print('EC号已成功添加并保存到 14067map13032_NU.xlsx')


# 提取genes

In [2]:
import pandas as pd
import re

# 读取Excel文件
df = pd.read_excel('14067gem20_CGXII.xlsx')

# 创建一个集合来存储所有唯一的基因ID
all_genes = set()

# 遍历Genes列的每个单元格
for cell in df['Genes'].dropna():  # dropna()跳过空单元格
    # 使用正则表达式找出所有匹配的基因ID
    genes = re.findall(r'CEY17_RS\d{5}', str(cell))
    # 将找到的基因ID添加到集合中（集合会自动去重）
    all_genes.update(genes)

# 将集合转换为列表并排序
sorted_genes = sorted(list(all_genes))

# 创建新的DataFrame，只包含一列基因ID
genes_df = pd.DataFrame(sorted_genes, columns=['GeneID'])

# 保存到新的Excel文件
genes_df.to_excel('14067_genes.xlsx', index=False)

# 打印找到的基因数量
print(f"找到 {len(sorted_genes)} 个唯一的基因ID")

找到 871 个唯一的基因ID


# 匹配相应的信息

In [3]:
import pandas as pd

# 读取两个Excel文件
genes_df = pd.read_excel('14067_genes.xlsx')
map_df = pd.read_excel('4. 添加linker/14067map13032_NU.xlsx')

# 创建新的列来存储匹配的信息
genes_df['Gene_name'] = ''
genes_df['EC_number'] = ''
genes_df['Product'] = ''
genes_df['Protein_id'] = ''
genes_df['Predicted_Genes'] = ''
genes_df['NCBI_ProteinID'] = ''
genes_df['UniProt'] = ''

# 遍历genes_df中的每一行
for index, row in genes_df.iterrows():
    # 在map_df中查找匹配的行
    match = map_df[map_df['locus_tag'] == row['GeneID']]
    
    if not match.empty:
        # 获取第一个匹配的行（如果有多个匹配的话）
        match_row = match.iloc[0]
        
        # 更新相应的列
        genes_df.at[index, 'Gene_name'] = match_row.get('Gene_name', '')
        genes_df.at[index, 'EC_number'] = match_row.get('EC_number', '')
        genes_df.at[index, 'Product'] = match_row.get('Product', '')
        genes_df.at[index, 'Protein_id'] = match_row.get('Protein_id', '')
        genes_df.at[index, 'Predicted_Genes'] = match_row.get('Predicted Genes', '')
        genes_df.at[index, 'NCBI_ProteinID'] = match_row.get('NCBI-ProteinID', '')
        genes_df.at[index, 'UniProt'] = match_row.get('UniProt', '')

# 保存结果到新的Excel文件
genes_df.to_excel('14067_genes_linker.xlsx', index=False)

# 打印匹配统计信息
total_genes = len(genes_df)
matched_genes = genes_df['Gene_name'].notna().sum()
print(f"总基因数: {total_genes}")
print(f"成功匹配的基因数: {matched_genes}")
print(f"未匹配的基因数: {total_genes - matched_genes}")

总基因数: 871
成功匹配的基因数: 372
未匹配的基因数: 499
