In [1]:
import json5
import glob
import pandas as pd
import regex as re
from Bio import SeqIO
from collections import defaultdict

# Curation rules loaded from sarg.json; later cells read all four containers.
added = {}               # accession -> 'REF|<top-level key>|<name>|<accession>' header
discarded = set()        # identifiers to drop
replaced_seqs = dict()   # list member -> the 'changed' key it was listed under
replaced_genes = dict()  # 'changed' key -> its non-list replacement value
with open('sarg.json', 'r') as f:
    for category, rules in json5.load(f).items():
        # The top-level 'discarded' entry is a plain collection of identifiers,
        # not a dict of per-category sections.
        if category == 'discarded':
            discarded.update(rules)
            continue

        for section, entries in rules.items():
            if section == 'discarded':
                discarded.update(entries)
            elif section == 'changed':
                for target, source in entries.items():
                    # Check the type *before* iterating: a list names the
                    # entries that should be rewritten to `target`; anything
                    # else is a direct target -> replacement mapping. Testing
                    # first avoids walking a string character by character and
                    # avoids a TypeError on non-iterable values.
                    if isinstance(source, list):
                        replaced_seqs.update({member: target for member in source})
                    else:
                        replaced_genes[target] = source
            elif section == 'added':
                for accession, name in entries.items():
                    added[accession] = 'REF|' + category + '|' + name + '|' + accession

# Collect the records to keep from the main FASTA file.
# `dups` holds every sequence seen so far, so only the first record carrying a
# given sequence is retained; records whose id is in `discarded` are skipped
# before their sequence is ever registered.
records = []
dups = set()
with open('reference/sequence.fasta') as fasta_in:
    for record in SeqIO.parse(fasta_in, 'fasta'):
        if record.id in discarded or record.seq in dups:
            continue
        dups.add(record.seq)
        records.append(record)

In [2]:
%%bash
# Sort the reference records by sequence (-s) and write the result back over
# the same file, so the next cell reads them in a fixed order.
# NOTE(review): input and output are the same path; the recorded run loaded
# all records, but confirm seqkit reads the whole input before opening -o.
seqkit sort -s reference/reference.fasta -o reference/reference.fasta

[INFO][0m read sequences ...
[INFO][0m 488 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...


In [3]:
# Merge the manually curated reference sequences into `records`.
# `dups` is shared with the previous cell, so any sequence already seen in
# sequence.fasta (or earlier in this file) is skipped here.
id2refseq = dict()  # created here but not populated in this cell
with open('reference/reference.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq in dups:
            continue
        dups.add(record.seq)

        # Accessions listed under 'added' in sarg.json get their prepared
        # 'REF|...' header and are kept.
        if record.id in added:
            record.id = added.get(record.id)
            records.append(record)
            continue

        # Everything else is only reported, never appended. Guess a gene name
        # for the report: prefer the token after ' GN=', otherwise take the
        # last word before any ' [' / ' (' suffix and lower-case its first
        # letter.
        if 'GN=' in record.description:
            gene = record.description.split(' GN=')[-1].split(' ')[0]
        else:
            gene = record.description.split(' [')[0].split(' (')[0].split(' ')[-1]
            gene = gene[0].lower() + gene[1:]

        record.id = 'REF|' + added.get(gene, 'NA') + '|' + gene + '|' + record.id
        print(f'unused: {record.id}')

unused: REF|NA|Rv1877|sp|P9WG85|Y1877_MYCTU
unused: REF|NA|vraF|WP_001637630.1
unused: REF|NA|nicT|sp|I6YEJ7|NICT_MYCTU
unused: REF|NA|Krm|sp|Q2J7L5|KRM_FRACC
unused: REF|NA|protein|WP_003722374.1
unused: REF|NA|transporter|WP_000473297.1
unused: REF|NA|mmpS5|sp|P9WJS7|MMPS5_MYCTU
unused: REF|NA|SAV1866|sp|Q99T13|Y1866_STAAM
unused: REF|NA|fmrO|sp|Q08325|FMRO_MICOL
unused: REF|NA|transporter|WP_000905722.1
unused: REF|NA|Rv1273c|sp|P9WQJ1|Y1273_MYCTU
unused: REF|NA|kamC|sp|P25919|KAMC_SACHI
unused: REF|NA|Rv1410c|sp|P9WJY3|MFS55_MYCTU
unused: REF|NA|Rv2686c|sp|P9WJB3|FLQE2_MYCTU
unused: REF|NA|Rv0194|sp|O53645|MDREP_MYCTU
unused: REF|NA|Rv1218c|sp|O86311|MEATP_MYCTU
unused: REF|NA|vraG|WP_002457535.1
unused: REF|NA|vraF|WP_020363729.1
unused: REF|NA|Rv1217c|sp|O05318|MEPRM_MYCTU
unused: REF|NA|Rv2688c|sp|P9WQL7|FLQE1_MYCTU
unused: REF|NA|Rv0191|sp|P9WJX7|CHLEP_MYCTU
unused: REF|NA|Rv1634|sp|P9WJX3|Y1634_MYCTU
unused: REF|NA|vraG|WP_000143623.1
unused: REF|NA|transporter|WP_003119853.1


In [4]:
# Rewrite every kept record's header to 'SARG|<type>|<gene>|<accession>'.
# NOTE(review): records are modified in place, so re-running this cell on
# already-renamed records will most likely skip them all (their ids no longer
# match either branch below) -- re-run from the loading cells instead.
records_renamed = []
for record in records:
    # Step 1: reduce the incoming id to the intermediate form '<type>@<gene>'.
    if 'REF|' in record.id:
        # 'REF|<type>|<gene>|<accession>' -> '<type>@<gene>'
        record.id = '@'.join(record.id.split('|')[1:3])
    elif 'NCBI|' in record.id or 'CARD|' in record.id:
        # Drop the leading id token from the description.
        record.description = record.description.split(' ', 1)[-1]
        if record.id in replaced_seqs:
            # Per-sequence override taken from the 'changed' lists in sarg.json.
            record.id = replaced_seqs.get(record.id)
        else:
            # record.name was not rewritten by the earlier cells, so it is used
            # here (and below) to inspect the original header fields.
            if '|stress:biocide|' in record.name:
                record.id = 'biocide@' + record.id.split('|')[3]
            else:
                record.id = '@'.join(record.id.split('|')[2:4])
    else:
        continue

    # Step 2: apply the gene-level replacements. `new_id` is used instead of
    # `id` so the builtin id() is not shadowed for the rest of the kernel.
    # Keys are passed to re.sub (the `regex` module) as patterns.
    # NOTE(review): confirm the keys are meant as regular expressions; a literal
    # gene name containing metacharacters would need re.escape.
    gene, new_id = record.id.split('@')[-1], record.id
    for k, v in replaced_genes.items():
        new_id = re.sub(k, v, new_id)

    # Step 3: pick the accession from the original header. For 'sp|ACC|ENTRY'
    # style names the accession is the second-to-last field, otherwise the last.
    if "sp|" in record.name:
        name = record.name.split('@')[-1].split('|')[-2]
    else:
        name = record.name.split('@')[-1].split('|')[-1]

    record.id = f"SARG|{new_id.replace('@', '|')}|{name}"
    # Keep only the first title when the description holds several ' >' joined ones.
    record.description = record.description.split(' >')[0]
    # Drop the record if either the replaced or the original gene name is discarded.
    if new_id.split('@')[-1] not in discarded and gene not in discarded:
        records_renamed.append(record)

In [5]:
# Persist the renamed records; the next cell sorts this file with seqkit.
with open('misc/init.fasta', mode='w') as fasta_out:
    SeqIO.write(records_renamed, fasta_out, 'fasta')

In [6]:
%%bash
# Sort the freshly written records by sequence (-s), overwriting the same file.
# NOTE(review): input and output are the same path; the recorded run loaded
# all records, but confirm seqkit reads the whole input before opening -o.
seqkit sort -s misc/init.fasta -o misc/init.fasta

[INFO][0m read sequences ...
[INFO][0m 7387 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...
