In [1]:
from Bio import SeqIO
import pandas as pd

# Collect every reference protein sequence as a plain `str`.
# SimpleFastaParser (used below) yields sequences as `str`, so storing `str`
# here -- rather than Bio.Seq.Seq objects -- makes the membership test an
# ordinary string lookup that does not depend on how the installed Biopython
# version hashes and compares Seq objects.
aset = set()
for file in ['reference.fasta', 'protein_fasta_protein_homolog_model.fasta', 'proteins.faa']:
    with open(f'reference/{file}') as handle:
        aset.update(str(record.seq) for record in SeqIO.parse(handle, 'fasta'))

# Stream the `env_nr` and `nr` FASTA files and keep only the entries whose
# sequence is an exact match to one of the reference sequences. The full
# header line (title) is preserved as-is in the output.
with open('tmp/sarg2nr.fa', 'w') as output_handle:
    for file in ['env_nr', 'nr']:
        with open(f'tmp/{file}.fa') as handle:
            for title, seq in SeqIO.FastaIO.SimpleFastaParser(handle):
                if seq in aset:
                    output_handle.write('>%s\n%s\n' % (title, seq))

In [2]:
# Map each reference sequence to its record id.
# Keys are plain `str` (not Bio.Seq.Seq) so that dictionary lookups do not
# depend on the Seq hashing/equality rules of the installed Biopython version.
# If several records share a sequence, the last one read wins.
seq2ref = dict()
with open('reference/sequence.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        seq2ref[str(record.seq)] = record.id

# Map each matched sequence to its header text, truncated at the first ' >'.
# NOTE(review): ' >' presumably separates merged headers of identical
# sequences in the nr dump -- confirm against the actual input format.
seq2desc = dict()
with open('tmp/sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        seq2desc[str(record.seq)] = record.description.split(' >')[0]

# Re-label every record of misc/init.fa that has an exact match in seq2desc;
# records without a match are dropped.
records = []
lines = []
with open('misc/init.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        sequence = str(record.seq)
        desc = seq2desc.get(sequence)
        if not desc:
            continue

        # Split the header once: first token vs. the remainder. When the header
        # contains no space, both values are the whole header (same as taking
        # [0] and [-1] of the split result).
        desc_parts = desc.split(' ', 1)
        matched_accession, matched_title = desc_parts[0], desc_parts[-1]

        # Short description: drop the trailing ' [...]' part and any
        # 'MULTISPECIES: ' prefix.
        short_desc = matched_title.split(' [')[0].split('MULTISPECIES: ')[-1]

        # Keep the first three '|'-separated fields of the original id and
        # append the first token of the matched header as the last field.
        original_id = record.id
        record.id = '|'.join(original_id.split('|')[:3] + [matched_accession])
        record.description = matched_title
        records.append(record)

        # Row layout: original id, reference id (None if absent), short
        # description, new id.
        lines.append([original_id, seq2ref.get(sequence), short_desc, record.id])

with open('sarg_ref.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

summary = pd.DataFrame(lines, columns = ['id', 'ref', 'desc', 'sarg'])

# Split the original id once and pick the individual '|'-separated fields.
id_fields = summary['id'].str.split('|')
summary['type'] = id_fields.str.get(1)
summary['subtype'] = id_fields.str.get(2)
summary['accession'] = id_fields.str.get(3)

# Only the columns in `cols` are written; rows are sorted by those same columns.
cols = ['type', 'subtype', 'ref', 'sarg', 'desc']
summary[cols].sort_values(cols).to_csv('misc/summary.tsv', index=False, sep='\t')