In [1]:
from Bio import SeqIO
from collections import defaultdict
import pandas as pd
import re

# Gather the upper-cased sequences of misc/init.fa for exact-match lookups.
# NOTE(review): only this side is upper-cased; the candidates below are
# compared as read -- confirm the nr files never contain lower-case residues.
with open('misc/init.fa') as handle:
    aset = {record.seq.upper() for record in SeqIO.parse(handle, 'fasta')}

# Keep every env_nr entry whose sequence is identical to one in the set.
with open('tmp/env_nr.fa') as handle:
    records = [record for record in SeqIO.parse(handle, 'fasta') if record.seq in aset]

# Apply the same filter to nr, reporting the running total (env_nr hits
# included) each time it sits on a non-zero multiple of 1000.
with open('tmp/nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq not in aset:
            continue
        if records and not len(records) % 1000:
            print('done:', len(records))
        records.append(record)

# Write all matched entries from both files out as a single FASTA.
with open('tmp/sarg2nr.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

done: 1000
done: 2000
done: 3000
done: 4000
done: 5000
done: 6000
done: 7000


In [2]:
# Map each sequence in reference/sequence.fasta to its record id.
seq2ref = dict()
with open('reference/sequence.fasta') as f:
    for record in SeqIO.parse(f, 'fasta'):
        seq2ref[record.seq] = record.id

# Map each sequence in tmp/sarg2nr.fa to its header text, keeping only the
# part before the first ' >' separator.
seq2desc = dict()
with open('tmp/sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        seq2desc[record.seq] = record.description.split(' >')[0]

# Re-label every misc/init.fa record that has a matching header, and collect
# one summary row per re-labelled record.
records = []
lines = []
with open('misc/init.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # NOTE(review): lookup uses the sequence as read (no case folding).
        desc = seq2desc.get(record.seq)

        # Guard BEFORE any use of `desc`: a missing sequence yields None, and
        # calling .split() on it would raise AttributeError instead of
        # reporting the record and moving on.
        if not desc:
            print('unfound:', record.id)
            continue

        # First whitespace-delimited token vs. the remainder of the header
        # (the remainder equals the whole header when it has no space).
        desc_parts = desc.split(' ', 1)
        desc_head, desc_tail = desc_parts[0], desc_parts[-1]

        # Summary row starts with the ORIGINAL id; the description column drops
        # everything from the first ' [' and any leading 'MULTISPECIES: ' tag.
        line = [
            record.id,
            seq2ref.get(record.seq),
            desc_tail.split(' [')[0].split('MULTISPECIES: ')[-1],
        ]

        # New id: first three '|' fields of the old id plus the header's first token.
        record.id = '|'.join(record.id.split('|')[:3] + [desc_head])
        record.description = desc_tail
        records.append(record)

        # Last column carries the rewritten id.
        lines.append(line + [record.id])

with open('sarg_ref.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

# Tabulate the rows and pull fields 1-3 out of the original '|'-delimited id.
summary = pd.DataFrame(lines, columns = ['id', 'ref', 'desc', 'sarg'])
id_fields = summary['id'].str.split('|')
summary['type'] = id_fields.str.get(1)
summary['subtype'] = id_fields.str.get(2)
summary['accession'] = id_fields.str.get(3)

# Only these columns are exported (tab-separated, sorted by the same columns).
cols = ['type', 'subtype', 'ref', 'sarg', 'desc']
summary[cols].sort_values(cols).to_csv('misc/summary.tsv', index=False, sep='\t')