This notebook demonstrates how to process genomes downloaded from [PATRIC](https://www.patricbrc.org/), including reading the genome files, searching for conserved domains (CDs), and printing the results.

[PATRIC USER GUIDES/FTP](https://docs.patricbrc.org/user_guides/ftp.html):
> each genome directory provides the following data files for PATRIC and RefSeq annotations
- fna: FASTA contig sequences
- features.tab: All genomic features and related information in tab-delimited format
- ...


In [1]:
import os
import json
import time

import pandas as pd
from Bio import SeqIO
from Bio.Blast.Applications import NcbirpstblastnCommandline

# Conserved domain database (CDD) location, resolved to an absolute path.
CDD_PATH = os.path.abspath(os.path.join('..', 'data', 'interim', 'CDD', 'Cdd'))

# Keyword arguments shared by every rpstblastn invocation in this notebook.
RPSBLAST_KWARGS = dict(
    db=CDD_PATH,
    seg='no',
    comp_based_stats='1',
    evalue=0.01,
    outfmt=5,
    num_threads=16,
)

# Parent directory holding one sub-directory per genome (downloaded from PATRIC directly).
GENOME_PARENT_PATH = os.path.abspath(os.path.join('examples', 'genome_processing'))


In [2]:
# Collect the genome names (PATRIC IDs): every sub-directory of
# GENOME_PARENT_PATH is treated as one genome.
#   - sorted, so the processing order and the printed log are reproducible
#   - hidden folders (e.g. Jupyter's `.ipynb_checkpoints`) are skipped, since
#     they are not genome downloads and would fail on the missing *.fna file
genome_ids = sorted(
    _entry.name
    for _entry in os.scandir(GENOME_PARENT_PATH)
    if _entry.is_dir() and not _entry.name.startswith('.')
)

# For every genome this loop produces, inside the genome directory:
#   contigs/<contig id>.fna    one FASTA file per contig
#   features/<accession>.tsv   the non-'source' features of each accession
#   info.json                  genome name, number of contigs, contig lengths
for _genome_id in genome_ids:
    _genome_path = os.path.join(GENOME_PARENT_PATH, _genome_id)

    # read the nucleotide sequences of the whole contigs
    # (the feature-only sequences in <id>.PATRIC.ffn are not needed here)
    _cntg_seq_path = os.path.join(_genome_path, _genome_id + '.fna')
    _cntg_seq_records = list(SeqIO.parse(_cntg_seq_path, 'fasta'))
    _num_cntg = len(_cntg_seq_records)

    print(f'Genome {_genome_id} contains {_num_cntg} contig(s)')

    # make directory for contigs; exist_ok keeps the cell re-runnable
    _cntg_path = os.path.join(_genome_path, 'contigs')
    os.makedirs(_cntg_path, exist_ok=True)

    # split contigs and write into different files named with their IDs,
    # recording the length of each contig for info.json along the way
    _cntg_dict = {}
    for _cntg_seq_record in _cntg_seq_records:
        _cntg_dict[_cntg_seq_record.id] = {'length': len(_cntg_seq_record)}
        with open(os.path.join(_cntg_path, f'{_cntg_seq_record.id}.fna'), 'w') as f:
            SeqIO.write(_cntg_seq_record, f, 'fasta')

    # split features for each and every contig the same way
    # features are saved in TSV format
    _feat_path = os.path.join(_genome_path, 'features')
    os.makedirs(_feat_path, exist_ok=True)

    # Read every column as text: the table is only split and written back, so
    # values must survive verbatim. With inferred dtypes an ID such as
    # '83333.110' would be parsed as a float and written back as '83333.11',
    # and an integer column containing blanks would come back as '123.0'.
    _feat_df = pd.read_table(
        os.path.join(_genome_path, f'{_genome_id}.PATRIC.features.tab'),
        dtype=str,
    )
    for _accession, _feat_by_accession_df in _feat_df.groupby('accession'):
        # rows of type 'source' are left out of the per-accession feature files
        _source_mask = _feat_by_accession_df['feature_type'] == 'source'
        _feat_by_accession_df[~_source_mask].to_csv(
            os.path.join(_feat_path, f'{_accession}.tsv'),
            index=False,
            sep='\t',
        )

    # create an information file that contains
    # - genome name
    # - number of contigs
    # - each contig accession and its length (maybe information for genes and CDs?)
    # Missing names are ignored; an empty feature table yields null instead of
    # raising IndexError or writing a non-standard NaN into the JSON file.
    _genome_names = _feat_df['genome_name'].dropna().unique()
    _genome_info = {
        'genome_name': _genome_names[0] if len(_genome_names) else None,
        'number_of_contigs': _num_cntg,
        'contigs': _cntg_dict,
    }

    with open(os.path.join(_genome_path, 'info.json'), 'w') as f:
        json.dump(_genome_info, f, indent=4)


Genome 562.2282 contains 3 contig(s)


In [3]:
# Conserved domain search: run rpstblastn (translated nucleotide query against
# the CDD) on every contig of every genome and store the raw XML report in
# <genome>/conserved_domains/<contig id>.xml.
#
# make sure of the following before proceeding to the conserved domain search:
#   - the features in *.ffn and *.features.tab are directly sampled from the contigs
#       - start, end, and name should be matched both ...
#   - the protein sequences in *.faa file belong to a subset of the features in *.ffn and *.features.tab
#   - every protein in *.faa file could be translated from a feature nucleotide in *.ffn and *.features.tab
#       - use Biopython Seq.translate() for verification ...
#       - what about the introns ...
#       - strand ...

for _genome_id in genome_ids:
    _genome_path = os.path.join(GENOME_PARENT_PATH, _genome_id)
    _cntg_cd_path = os.path.join(_genome_path, 'conserved_domains')
    os.makedirs(_cntg_cd_path, exist_ok=True)

    # get all the contigs (keys of 'contigs' in info.json) and iterate
    # through them for the conserved domain search
    with open(os.path.join(_genome_path, 'info.json'), 'r') as f:
        _genome_info = json.load(f)
    _cntg_ids = _genome_info['contigs'].keys()

    _start_time = time.time()
    for _cntg_id in _cntg_ids:
        _cntg_path = os.path.join(_genome_path, 'contigs', f'{_cntg_id}.fna')
        _cntg_rpsblast_cmd = NcbirpstblastnCommandline(query=_cntg_path, **RPSBLAST_KWARGS)
        # calling the command object runs the search; the report (XML, as
        # requested by outfmt=5) and the error stream are returned as a pair
        _cntg_cd_result, _cntg_rpsblast_cmd_error_msg = _cntg_rpsblast_cmd()
        # surface whatever the tool wrote to its error stream instead of
        # silently discarding it
        if _cntg_rpsblast_cmd_error_msg:
            print(f'rpstblastn messages for contig {_cntg_id}:\n{_cntg_rpsblast_cmd_error_msg}')
        _cntg_cd_result_path = os.path.join(_cntg_cd_path, f'{_cntg_id}.xml')
        with open(_cntg_cd_result_path, 'w') as f:
            f.write(_cntg_cd_result)

    # Format the elapsed time by hand: going through time.gmtime()/'%H' wraps
    # around at 24 hours, which under-reports long runs (25 h would print as 01).
    _elapsed_seconds = int(time.time() - _start_time)
    _hours, _remainder = divmod(_elapsed_seconds, 3600)
    _minutes, _seconds = divmod(_remainder, 60)
    _exe_time = f'{_hours:02d} hours, {_minutes:02d} minutes, and {_seconds:02d} seconds'
    print(f'The execution time for rpstblastn on all contigs of {_genome_id} is {_exe_time}')


The execution time for rpstblastn on all contigs of 562.2282 is 00 hours, 04 minutes, and 50 seconds
