In [20]:
from datetime import datetime
import os

class FastaProcessor:
    """Split a FASTA file into Ref and Var records and renumber their IDs.

    Records whose header starts with ``>Ref`` or ``>Var`` are collected,
    given new IDs of the form ``>{Ref|Var}{yymmdd}{NNNN}``, and written to
    two FASTA files plus a text file listing the old-to-new ID mapping.
    Records with any other header are ignored.
    """

    def __init__(self, fasta_file, outpath, prefix):
        """Store the input path, output directory and output file prefix.

        Args:
            fasta_file: Path of the FASTA file to read.
            outpath: Directory the output files are written to. It is not
                created here; it must exist before writing.
            prefix: Prefix used for every output file name.
        """
        self.fasta_file = fasta_file
        self.outpath = outpath
        self.prefix = prefix
        # Date stamp (yymmdd) embedded in every generated ID.
        self.today = datetime.today().strftime('%y%m%d')
        # Filled by read_fasta(); lists of (header, sequence) tuples.
        self.ref_sequences = []
        self.var_sequences = []

    def read_fasta(self):
        """Read the FASTA file and collect the Ref and Var records.

        Sequences wrapped over several lines are joined into one string.
        Blank lines are skipped. Lines belonging to a record whose header
        starts with neither ``>Ref`` nor ``>Var`` are dropped, as are
        sequence lines that appear before the first header.

        Returns:
            A ``(ref_sequences, var_sequences)`` pair; each is a list of
            ``(header, sequence)`` tuples in file order. The same lists are
            stored on ``self.ref_sequences`` and ``self.var_sequences``.
        """
        ref_records = []
        var_records = []
        # Record currently receiving sequence lines, as [header, parts];
        # None while no Ref/Var record is open.
        current = None

        with open(self.fasta_file, 'r') as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith('>'):
                    if line.startswith('>Ref'):
                        current = [line, []]
                        ref_records.append(current)
                    elif line.startswith('>Var'):
                        current = [line, []]
                        var_records.append(current)
                    else:
                        # Unrelated record: stop collecting so its sequence
                        # is not attached to the previous Ref/Var record.
                        current = None
                elif current is not None:
                    current[1].append(line)

        self.ref_sequences = [(header, ''.join(parts)) for header, parts in ref_records]
        self.var_sequences = [(header, ''.join(parts)) for header, parts in var_records]
        return self.ref_sequences, self.var_sequences

    def transform_ids(self, sequences, type_):
        """Assign new sequential IDs and record the old-to-new mapping.

        Args:
            sequences: Iterable of ``(old_id, sequence)`` tuples.
            type_: Label placed at the start of each new ID (``process``
                passes ``'Ref'`` or ``'Var'``).

        Returns:
            A ``(transformed_sequences, id_transformations)`` pair: the
            ``(new_id, sequence)`` tuples, and one ``"old -> new"`` string
            per record. Numbering starts at 1 and is zero-padded to four
            digits.
        """
        transformed_sequences = []
        id_transformations = []
        for count, (old_id, seq) in enumerate(sequences, start=1):
            new_id = f">{type_}{self.today}{count:04d}"
            transformed_sequences.append((new_id, seq))
            id_transformations.append(f"{old_id} -> {new_id}")
        return transformed_sequences, id_transformations

    def write_fasta(self, sequences, filename):
        """Write ``(header, sequence)`` tuples to a FASTA file in ``outpath``.

        Each record is written as a header line followed by one sequence
        line.

        Args:
            sequences: Iterable of ``(header, sequence)`` tuples.
            filename: Name of the file to create inside ``self.outpath``.
        """
        outfile = os.path.join(self.outpath, filename)
        with open(outfile, 'w') as file:
            file.writelines(f"{header}\n{seq}\n" for header, seq in sequences)

    def write_transformations(self, transformations):
        """Write the ID mapping lines to ``{prefix}_id_transformation.txt``.

        Args:
            transformations: Iterable of mapping strings, one per line.
        """
        outfile = os.path.join(self.outpath, f"{self.prefix}_id_transformation.txt")
        with open(outfile, 'w') as file:
            file.writelines(f"{transformation}\n" for transformation in transformations)

    def process(self):
        """Run the whole pipeline: read, renumber, and write all outputs.

        Writes ``{prefix}_ref_seq.faa``, ``{prefix}_var_seq.faa`` and
        ``{prefix}_id_transformation.txt`` into ``self.outpath``.
        """
        self.read_fasta()

        # Ref and Var records are numbered independently, each from 0001.
        transformed_ref_sequences, ref_id_transformations = self.transform_ids(self.ref_sequences, 'Ref')
        transformed_var_sequences, var_id_transformations = self.transform_ids(self.var_sequences, 'Var')

        self.write_fasta(transformed_ref_sequences, f"{self.prefix}_ref_seq.faa")
        self.write_fasta(transformed_var_sequences, f"{self.prefix}_var_seq.faa")

        # The mapping file lists all Ref entries first, then all Var entries.
        self.write_transformations(ref_id_transformations + var_id_transformations)


In [19]:
# Usage example: build the input and output paths for one sample from its
# shared results directory, then run the full processing pipeline.
samplename = "0009689008"
results_dir = f'/home/jovyan/work/10.data_CODA_ahslyy/03.Result.SAKit2/{samplename}_CA/results'
fasta_processor = FastaProcessor(
    f'{results_dir}/08.SvInDelSnvCalling/COAD_{samplename}.ref_variants.faa',
    f'{results_dir}/10.EpitopePrediction_snv',
    samplename,
)
fasta_processor.process()