In [1]:
# Location of the CDD (Conserved Domain Database) files searched by rpsblast.
CDD_PATH = "~/data/CDD/cdd"

# Myo7b serves as the worked example; it is available both as a gene
# (RNA-encoding) sequence and as a protein sequence.
#   gene source:    http://www.informatics.jax.org/sequence/marker/MGI:107709
#   protein source: https://www.uniprot.org/uniprot/Q99MZ6
GENE_SEQ_PATH = "./conserved_domain_search_examples/ENSMUST00000134663.fasta"
PROT_SEQ_PATH = "./conserved_domain_search_examples/Q99MZ6.fasta"

# Set to True to query with the gene sequence rather than the protein
# sequence. The gene search is expected to be slower, while the results
# should be the same.
USE_GENE_SEQ = False


In [2]:
import io
import pandas as pd

from Bio.Blast import NCBIXML
from Bio.Blast.Record import Alignment, HSP
from Bio.Blast.Record import Blast as BlastResult
from Bio.Blast.Applications import NcbirpsblastCommandline
from Bio.Blast.Applications import NcbirpstblastnCommandline

# Ask the installed rpsblast binary for its version and then for its usage
# text. Calling a command-line wrapper yields a (stdout, stderr) pair; only
# the stdout half is shown here.
version_cmd = NcbirpsblastCommandline(version=True)
version_info, _version_stderr = version_cmd()
print(version_info)

help_cmd = NcbirpsblastCommandline(help=True)
help_msg, _help_stderr = help_cmd()
print(help_msg)


rpsblast: 2.10.1+
 Package: blast 2.10.1, build May 12 2020 13:06:02

USAGE
  rpsblast [-h] [-help] [-import_search_strategy filename]
    [-export_search_strategy filename] [-db database_name]
    [-dbsize num_letters] [-entrez_query entrez_query] [-query input_file]
    [-out output_file] [-evalue evalue] [-qcov_hsp_perc float_value]
    [-max_hsps int_value] [-xdrop_ungap float_value] [-xdrop_gap float_value]
    [-xdrop_gap_final float_value] [-searchsp int_value]
    [-sum_stats bool_value] [-seg SEG_options] [-soft_masking soft_masking]
    [-culling_limit int_value] [-best_hit_overhang float_value]
    [-best_hit_score_edge float_value] [-subject_besthit]
    [-window_size int_value] [-lcase_masking] [-query_loc range]
    [-parse_deflines] [-outfmt format] [-show_gis]
    [-num_descriptions int_value] [-num_alignments int_value]
    [-line_length line_length] [-html] [-sorthits sort_hits]
    [-sorthsps sort_hsps] [-max_target_seqs num_sequences]
    [-num_threads int_value] [-

In [3]:
# Honour USE_GENE_SEQ when choosing the query. rpsblast expects a protein
# query, so the gene (nucleotide) sequence has to be searched with
# rpstblastn, which translates the query before comparing it against the
# same conserved-domain database.
if USE_GENE_SEQ:
    _query_path = GENE_SEQ_PATH
    _search_cmd_cls = NcbirpstblastnCommandline
else:
    _query_path = PROT_SEQ_PATH
    _search_cmd_cls = NcbirpsblastCommandline

rpsblast_cmd = _search_cmd_cls(
    query=_query_path,
    db=CDD_PATH,
    outfmt=5,  # BLAST XML output
    # TODO: more options like e-value restriction ...
)

# Calling the wrapper runs the program and returns its (stdout, stderr).
rpsblast_xml_result, rpsblast_cmd_error_msg = rpsblast_cmd()

# Show anything the tool wrote to stderr (e.g. warnings) instead of
# silently discarding it.
if rpsblast_cmd_error_msg:
    print(rpsblast_cmd_error_msg)


In [4]:
# Parse the single-query XML report produced by the search.
rpsblast_result: BlastResult = NCBIXML.read(io.StringIO(rpsblast_xml_result))
print(f'There are {len(rpsblast_result.alignments)} rpsblast alignments ...')

# Column names of the summary table; the order must match the per-HSP rows
# built below. The `def` (hit_def) column is currently disabled.
alignment_header = (
    'title',
    'id',
    'score',
    'e-value',
    'start',
    'end',
    'length',
)

# One row per HSP: the hit-level fields (title, id) are repeated for every
# HSP belonging to that hit, followed by the HSP's score, e-value, query
# start/end and alignment length.
alignment_list = [
    [
        hit.title,
        hit.hit_id,
        hsp.score,
        hsp.expect,
        hsp.query_start,
        hsp.query_end,
        hsp.align_length,
    ]
    for hit in rpsblast_result.alignments
    for hsp in hit.hsps
]

alignment_table = pd.DataFrame(alignment_list, columns=alignment_header)
print(alignment_table.to_markdown())


There are 172 rpsblast alignments ...
|     | title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     