This notebook demonstrates how to parse genomes downloaded from [PATRIC](https://www.patricbrc.org/), including reading the genome files, running a conserved domain (CD) search, and printing the results.

[PATRIC USER GUIDES/FTP](https://docs.patricbrc.org/user_guides/ftp.html):
> Each genome directory provides the following data files for PATRIC and RefSeq annotations (when available).
- fna: FASTA contig sequences
- faa: FASTA protein sequence file
- features.tab: All genomic features and related information in tab-delimited format
- ffn: FASTA nucleotide sequences for genomic features, i.e. genes, RNAs, and other misc features
- frn: FASTA nucleotide sequences for RNAs
- gff: Genome annotations in GFF file format
- pathway.tab: Metabolic pathway assignments in tab-delimited format
- spgene.tab: Specialty gene assignments (i.e. AMR genes, virulence factors, essential genes, etc.) in tab-delimited format
- subsystem.tab: Subsystem assignments in tab-delimited format


In [16]:
import os
import json
import time

import pandas as pd
from Bio import SeqIO
from Bio.Blast.Applications import NcbirpsblastCommandline

# Directory holding every genome downloaded directly from PATRIC
# (one sub-directory per genome).
GENOME_PARENT_PATH = os.path.abspath('./genome_parsing_examples')

# Location of the CDD (conserved domain database) used by RPS-BLAST.
CDD_PATH = os.path.abspath('../data/interim/CDD/Cdd')

# Keyword arguments shared by every RPS-BLAST (conserved domain search) call;
# `outfmt=5` asks for XML output.
RPSBLAST_KWARGS = dict(
    db=CDD_PATH,
    seg='no',
    comp_based_stats='1',
    evalue=0.01,
    outfmt=5,
)


In [18]:
# Discover the genomes to process: every immediate sub-directory of
# GENOME_PARENT_PATH is named after a genome (PATRIC ID).
# Hidden directories (e.g. `.ipynb_checkpoints`, which Jupyter may create) are
# not genomes; they are skipped so the loop below does not fail while looking
# for FASTA files that do not exist.
genome_ids = [
    _dir_name for _dir_name in next(os.walk(GENOME_PARENT_PATH))[1]
    if not _dir_name.startswith('.')
]

# For each genome:
#   1. read the contig and feature FASTA files and report their sizes,
#   2. write every contig to `<genome>/contigs/<contig id>.fa`,
#   3. write the per-accession feature tables to `<genome>/features/<accession>.tsv`,
#   4. write a small `<genome>/info.json` summary.
for _genome_id in genome_ids:
    _genome_path = os.path.join(GENOME_PARENT_PATH, _genome_id)

    # get the nucleotide sequences (whole contigs and the feature-only ones)
    _cntg_seq_path = os.path.join(_genome_path, _genome_id + '.fna')
    _cntg_seq_records = list(SeqIO.parse(_cntg_seq_path, 'fasta'))
    _feat_seq_path = os.path.join(_genome_path, _genome_id + '.PATRIC.ffn')
    _feat_seq_records = list(SeqIO.parse(_feat_seq_path, 'fasta'))

    print(f'Genome {_genome_id} contains '
          f'{len(_cntg_seq_records)} contig(s) and '
          f'{len(_feat_seq_records)} features (genes, RNAs, etc.)')

    # make the directory for the split contigs; `exist_ok=True` keeps the cell
    # re-runnable without a separate (racy) existence check
    _cntg_path = os.path.join(_genome_path, 'contigs')
    os.makedirs(_cntg_path, exist_ok=True)

    # split contigs and write each into its own file named with its ID,
    # recording every contig length along the way
    _num_cntg = len(_cntg_seq_records)
    _cntg_len_dict = {}
    for _cntg_seq_record in _cntg_seq_records:
        _cntg_len_dict[_cntg_seq_record.id] = len(_cntg_seq_record)
        # write-only access is enough here, hence 'w' rather than 'w+'
        with open(os.path.join(_cntg_path, _cntg_seq_record.id + '.fa'), 'w') as f:
            SeqIO.write(_cntg_seq_record, f, 'fasta')

    # split features for each and every contig the same way;
    # features are saved in TSV format
    _feat_path = os.path.join(_genome_path, 'features')
    os.makedirs(_feat_path, exist_ok=True)
    # NOTE(review): pandas infers column dtypes here, so ID-like columns that
    # look numeric are parsed as numbers and re-serialised by `to_csv`; pass an
    # explicit `dtype=` if exact text round-tripping matters — TODO confirm
    # against the columns of the features.tab file.
    _feat_df = pd.read_table(os.path.join(_genome_path, _genome_id + '.PATRIC.features.tab'))
    for _accession, _feat_by_accession_df in _feat_df.groupby('accession'):
        # rows whose feature type is 'source' are excluded from the per-accession table
        _source_mask = _feat_by_accession_df['feature_type'] == 'source'
        _feat_by_accession_df[~_source_mask].to_csv(
            os.path.join(_feat_path, f'{_accession}.tsv'),
            index=False,  # documented boolean flag: do not write the row index
            sep='\t',
        )

    # create an information file that contains
    # - genome name
    # - number of contigs
    # - each contig accession and its length
    _genome_info = {
        # the first entry of the column is the same value `.unique()[0]` would
        # return, without scanning the whole column
        'genome_name': _feat_df['genome_name'].iloc[0],
        'number_of_contigs': _num_cntg,
        'lengths_of_contigs': _cntg_len_dict,
    }

    with open(os.path.join(_genome_path, 'info.json'), 'w') as f:
        json.dump(_genome_info, f, indent=4)


Genome 83333.84 contains 1 contig(s) and 4357 features (genes, RNAs, etc.)
Genome 562.2282 contains 248 contig(s) and 4196 features (genes, RNAs, etc.)
Genome 37762.5 contains 1 contig(s) and 4477 features (genes, RNAs, etc.)


In [None]:
# Conserved domain (CD) search with RPS-BLAST -- intentionally disabled.
# Each genome takes 2.5 - 4 hours on the remote host (lambda), and fewer
# contigs = longer computation time.
# The code below is commented out because of its extremely long execution time;
# the results are retrieved directly from the remote host (lambda) instead.
# The cell therefore only executes `pass`.
#
# NOTE(review): the elapsed time is formatted with `time.strftime` +
# `time.gmtime` because `time.time() - _start_time` is a plain float and has no
# `.strftime` method (calling it would raise AttributeError). `%H` wraps around
# after 24 hours.
pass

# for _genome_id in genome_ids:
#     _genome_path = os.path.join(GENOME_PARENT_PATH, _genome_id)
#
#     # get the nucleotide sequences (whole contig and the one with features only)
#     _cntg_seq_path = os.path.join(_genome_path, _genome_id + '.fna')
#     _feat_seq_path = os.path.join(_genome_path, _genome_id + '.PATRIC.ffn')
#
#     # perform CD search for the contigs with measured time
#     _start_time = time.time()
#     _cntg_rpsblast_cmd = NcbirpsblastCommandline(query=_cntg_seq_path, **RPSBLAST_KWARGS)
#     _cntg_rpsblast_xml_result, _cntg_rpsblast_cmd_error_msg = _cntg_rpsblast_cmd()
#     _exe_time = time.strftime('%H hours, %M minutes, and %S seconds',
#                               time.gmtime(time.time() - _start_time))
#     print(f'The execution time for RPSBLAST (conserved domain search) on all genome contigs is {_exe_time}')
#     with open(os.path.join(_genome_path, _genome_id + '.contig_rpsblast.xml'), 'w+') as f:
#         f.write(_cntg_rpsblast_xml_result)
#
#     # perform CD search for the features only with measured time
#     _start_time = time.time()
#     _feat_rpsblast_cmd = NcbirpsblastCommandline(query=_feat_seq_path, **RPSBLAST_KWARGS)
#     _feat_rpsblast_xml_result, _feat_rpsblast_cmd_error_msg = _feat_rpsblast_cmd()
#     _exe_time = time.strftime('%H hours, %M minutes, and %S seconds',
#                               time.gmtime(time.time() - _start_time))
#     print(f'The execution time for RPSBLAST (conserved domain search) on genome features is {_exe_time}')
#     with open(os.path.join(_genome_path, _genome_id + '.feature_rpsblast.xml'), 'w+') as f:
#         f.write(_feat_rpsblast_xml_result)
