In [None]:
%%bash
# Sort the reference FASTA by sequence (-s) and replace it with the sorted copy.
# The sorted output goes to a temporary path first: pointing -o at the input
# file is unsafe if the tool opens (and truncates) the output before it has
# finished reading the input. The original is only replaced when seqkit succeeds.
set -euo pipefail
sorted_tmp="reference/reference.fasta.sorted.tmp"
seqkit sort -s reference/reference.fasta -o "$sorted_tmp"
mv "$sorted_tmp" reference/reference.fasta

In [None]:
import copy
import glob
import subprocess
from collections import defaultdict

import json5
import pandas as pd
import regex as re
from Bio import SeqIO

# Curation tables filled from sarg.json and consumed by the later cells.
added = {}           # record id -> 'REF|<section>|<value>|<record id>' header
discarded = set()    # entries listed under any 'discarded' key
replaced_seqs = {}   # list member -> the key it was listed under ('changed')
replaced_genes = {}  # key -> non-list value from 'changed' (used as re.sub pattern/replacement later)

with open('sarg.json', 'r') as config_file:
    curation = json5.load(config_file)

for section, payload in curation.items():
    # A top-level 'discarded' entry is a flat collection, not a nested mapping.
    if section == 'discarded':
        discarded.update(payload)
        continue

    for action, entries in payload.items():
        if action == 'discarded':
            discarded.update(entries)
        elif action == 'changed':
            # List values: every list member maps back to the key it sits under.
            replaced_seqs.update({
                member: target
                for target, value in entries.items()
                for member in value
                if isinstance(value, list)
            })
            # Non-list values: stored as key -> value.
            replaced_genes.update({
                target: value
                for target, value in entries.items()
                if not isinstance(value, list)
            })
        elif action == 'added':
            for record_id, label in entries.items():
                added[record_id] = '|'.join(['REF', section, label, record_id])

# Collect the records of reference/sequence.fasta that are neither discarded
# nor duplicates (by sequence) of a record already kept.
records = []
dups = set()  # sequences seen so far
with open('reference/sequence.fasta') as fasta_in:
    for entry in SeqIO.parse(fasta_in, 'fasta'):
        if entry.id in discarded or entry.seq in dups:
            continue
        dups.add(entry.seq)
        records.append(entry)

In [None]:
# Merge reference/reference.fasta into `records`.
# Only sequences not seen before are considered. A record whose id is listed in
# `added` is renamed to its configured 'REF|...' header and kept; every other
# new sequence is reported as unused and not kept.
id2refseq = dict()  # not populated in this cell
with open('reference/reference.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq in dups:
            continue
        dups.add(record.seq)

        if record.id in added:
            record.id = added[record.id]
            records.append(record)
            continue

        # Derive a gene label for the report line only.
        if 'GN=' in record.description:
            gene = record.description.split(' GN=')[-1].split(' ')[0]
        else:
            gene = record.description.split(' [')[0].split(' (')[0].split(' ')[-1]
            # Slice instead of indexing: the token is empty when the header is
            # blank or has a double space before ' (' / ' [', and gene[0]
            # would raise IndexError there.
            gene = gene[:1].lower() + gene[1:]

        record.id = 'REF|' + added.get(gene, 'NA') + '|' + gene + '|' + record.id
        print(f'unused: {record.id}')

In [None]:
# Build the final 'SARG|<type>|<gene>|<name>' headers.
# Each record is renamed on a shallow copy so that `records` keeps its original
# ids. Mutating the originals made this cell non-idempotent: on a second run the
# ids already start with 'SARG|', match neither branch below, and the cell
# silently produced an empty `records_renamed`.
records_renamed = []
for original in records:
    record = copy.copy(original)

    if 'REF|' in record.id:
        # 'REF|<type>|<gene>|<id>' -> '<type>@<gene>'
        record.id = '@'.join(record.id.split('|')[1:3])
    elif 'NCBI|' in record.id or 'CARD|' in record.id:
        # Drop the leading id token from the description.
        record.description = record.description.split(' ', 1)[-1]
        if record.id in replaced_seqs:
            record.id = replaced_seqs[record.id]
        elif '|stress:biocide|' in record.name:
            record.id = 'biocide@' + record.id.split('|')[3]
        else:
            record.id = '@'.join(record.id.split('|')[2:4])
    else:
        continue

    # `gene` is the label before substitutions; `new_id` is the id after the
    # replaced_genes patterns are applied. Both are checked against `discarded`.
    gene = record.id.split('@')[-1]
    new_id = record.id
    for pattern, replacement in replaced_genes.items():
        new_id = re.sub(pattern, replacement, new_id)

    # record.name still holds the id as parsed from the FASTA header.
    if "sp|" in record.name:
        name = record.name.split('@')[-1].split('|')[-2]
    else:
        name = record.name.split('@')[-1].split('|')[-1]

    record.id = f"SARG|{new_id.replace('@', '|')}|{name}"
    record.description = record.description.split(' >')[0]
    if new_id.split('@')[-1] not in discarded and gene not in discarded:
        records_renamed.append(record)

In [None]:
# Persist the renamed records as FASTA.
with open('init.fasta', mode='w') as fasta_out:
    SeqIO.write(records_renamed, fasta_out, 'fasta')

In [None]:
%%bash
# Sort init.fasta by sequence (-s) and replace it with the sorted copy.
# The sorted output goes to a temporary path first: pointing -o at the input
# file is unsafe if the tool opens (and truncates) the output before it has
# finished reading the input. The original is only replaced when seqkit succeeds.
set -euo pipefail
sorted_tmp="init.fasta.sorted.tmp"
seqkit sort -s init.fasta -o "$sorted_tmp"
mv "$sorted_tmp" init.fasta