In [None]:
from Bio import pairwise2
import re
import pandas as pd

def read_fasta_faa_sequences2(file_path):
    """Parse a FASTA file into a dict keyed by the complete header line.

    The key is the whole header line, including the leading ">", with
    surrounding whitespace stripped. The value is the concatenation of the
    stripped sequence lines that follow it.

    Records with no sequence text are dropped, and any lines that appear
    before the first header are ignored. If a header occurs more than once,
    the last record with that header wins.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict mapping header line -> sequence string.
    """
    records = {}
    header = ""
    chunks = []

    with open(file_path, 'r') as handle:
        for raw in handle:
            if not raw.startswith(">"):
                chunks.append(raw.strip())
                continue

            # A new header closes the record collected so far.
            residues = "".join(chunks)
            if header and residues:
                records[header] = residues
            header = raw.strip()
            chunks = []

    # The final record has no following header to close it.
    residues = "".join(chunks)
    if header and residues:
        records[header] = residues
    return records

def read_fasta_faa_sequences1(file_path):
    """Parse a FASTA file into a dict keyed by the first header token.

    The key is the first whitespace-delimited token of the header line with
    its leading ">" removed. The value is the concatenation of the stripped
    sequence lines that follow.

    Records are dropped when the resulting ID is empty (for example a header
    written as "> name") or when they carry no sequence text. If an ID occurs
    more than once, the last record with that ID wins.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict mapping sequence ID -> sequence string.
    """
    parsed = {}

    def commit(record_id, pieces):
        # Only complete records (non-empty ID and non-empty sequence) are kept.
        body = "".join(pieces)
        if record_id and body:
            parsed[record_id] = body

    current_id = ""
    pieces = []
    with open(file_path, 'r') as fasta:
        for text in fasta:
            if text.startswith(">"):
                commit(current_id, pieces)
                # First token of the header, minus the ">" marker.
                current_id = text.split(maxsplit=1)[0][1:]
                pieces = []
            else:
                pieces.append(text.strip())
        commit(current_id, pieces)

    return parsed

def extract_uniprot_ids(uniprot_sequences):
    """Re-key header-indexed sequences by the token following "GN=".

    Each header is searched for "GN=" followed by a run of non-space
    characters; that run becomes the new key. Headers without such a field
    are skipped. When several headers share the same value, the one
    encountered last overwrites the earlier ones.

    Args:
        uniprot_sequences: dict mapping header string -> sequence string.

    Returns:
        dict mapping the captured "GN=" value -> sequence string.
    """
    gn_field = re.compile(r'GN=([^ ]+)')
    keyed_by_gn = {}
    for header, sequence in uniprot_sequences.items():
        found = gn_field.search(header)
        if found is None:
            continue
        keyed_by_gn[found.group(1)] = sequence
    return keyed_by_gn

def match_sequences(protein_sequences, uniprot_ids, start=19, end=70):
    """Pair every protein with the reference whose sequence window aligns best.

    Only the slice ``[start:end]`` of each sequence is compared. Windows are
    scored with ``pairwise2.align.globalxx`` (match = 1, no mismatch or gap
    penalties) and the reference with the highest score wins. On ties the
    reference seen first in ``uniprot_ids`` is kept.

    Args:
        protein_sequences: dict mapping protein ID -> sequence string.
        uniprot_ids: dict mapping reference key -> sequence string.
        start: Start index of the compared window (default 19).
        end: End index (exclusive) of the compared window (default 70).

    Returns:
        List of ``(protein_id, reference_key)`` tuples, one per entry of
        ``protein_sequences`` in iteration order. ``reference_key`` is the
        string ``"none"`` when no reference scores above 0, which includes
        the case where the protein's window is empty.
    """
    # Slice each reference once up front rather than once per query protein.
    # Empty windows can never score above 0, so they are dropped here.
    reference_windows = []
    for uniprot_id, uniprot_seq in uniprot_ids.items():
        window = uniprot_seq[start:end]
        if window:
            reference_windows.append((uniprot_id, window))

    matches = []
    for protein_id, protein_seq in protein_sequences.items():
        query_window = protein_seq[start:end]
        best_id, best_score = None, 0

        # An empty query window cannot align with anything; skip the aligner.
        if query_window:
            for uniprot_id, reference_window in reference_windows:
                # score_only=True returns just the best score and avoids
                # building alignment tracebacks that would be thrown away.
                # "or 0" normalises an empty result to a score of 0.
                score = pairwise2.align.globalxx(
                    query_window, reference_window, score_only=True
                ) or 0
                if score > best_score:
                    best_id, best_score = uniprot_id, score

        matches.append((protein_id, "none" if best_id is None else best_id))

    return matches

# Input FASTA files: the query proteins and the UniProt FASTA they are matched against.
protein_file_path = 'yuan3danbai.faa'
uniprot_file_path = 'C:/Users/1/Desktop/uniport基因数据/uniprotkb_proteome_UP000001551_2023_09_08.fasta'

# Query records are keyed by the first header token (without ">").
# UniProt records keep the full header line as key so that its "GN=" field
# can be parsed from it afterwards.
protein_sequences = read_fasta_faa_sequences1(protein_file_path)
uniprot_sequences = read_fasta_faa_sequences2(uniprot_file_path)




In [None]:

# Re-key the UniProt records by the token following "GN=" in each header.
uniprot_ids = extract_uniprot_ids(uniprot_sequences)

# One (protein ID, best-scoring UniProt key or "none") pair per input protein.
matched_results = match_sequences(protein_sequences, uniprot_ids)

# Write the pairs as a two-column CSV, omitting the DataFrame index.
output_csv_path = 'matched_results.csv'
result_df = pd.DataFrame(matched_results, columns=['Protein ID', 'Uniprot ID'])
result_df.to_csv(output_csv_path, index=False)

# The message previously printed only the path with a leading space; the label
# ("results saved as <path>") is restored to match the recorded cell output.
print(f"结果已保存为 {output_csv_path}")

结果已保存为 matched_results.csv
