This notebook demonstrates how to search for conserved domains and replicate the search results from the [NCBI/CDD](https://www.ncbi.nlm.nih.gov/cdd/) website.

In [1]:
# Locations of the local CDD (conserved domain database) files
CDD_PATH = '../data/interim/CDD/cdd'
CD_TRACK_PATH = '../data/interim/CDD/cd_track.txt'
# Column names assigned to the CD track table, in file order
CD_TRACK_HEADER = [
    'accession', 'short_name', '(pssm_)id', 'parent', 'root', 'version',
    'live', 'release', 'redundant', 'date', 'time',
]

# Myo7b serves as the example query, as a gene sequence and as a protein sequence
# gene source: http://www.informatics.jax.org/sequence/marker/MGI:107709
GENE_SEQ_PATH = './conserved_domain_search_examples/ENSMUST00000134663.fasta'
# protein source: https://www.uniprot.org/uniprot/Q99MZ6
PROT_SEQ_PATH = './conserved_domain_search_examples/Q99MZ6.fasta'

# True  -> search with the gene sequence (may take longer, results
#          should be the same)
# False -> search with the protein sequence
USE_GENE_SEQ = False

# The result file names carry the kind of query that was selected above
_USE_GENE_STR = 'protein' if not USE_GENE_SEQ else 'gene'
RPSBLAST_XML_RESULT_PATH = (
    './conserved_domain_search_examples/'
    f'rpsblast_result_{_USE_GENE_STR}.xml'
)
RPSBLAST_CSV_RESULT_PATH = (
    './conserved_domain_search_examples/'
    f'rpsblast_result_{_USE_GENE_STR}.csv'
)


In [2]:
import io

import pandas as pd

from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbirpsblastCommandline
from Bio.Blast.Applications import NcbirpstblastnCommandline
from Bio.Blast.Record import Alignment, HSP
from Bio.Blast.Record import Blast as BlastResult

# Show which rpsblast build is installed; the first element of the
# command's return value is the text that gets printed below
_rpsblast_version_cmd = NcbirpsblastCommandline(version=True)
version_info = _rpsblast_version_cmd()[0]
print(version_info)
# The usage text can be printed the same way:
# help_msg = NcbirpsblastCommandline(help=True)()[0]
# print(help_msg)


rpsblast: 2.10.1+
 Package: blast 2.10.1, build May 12 2020 13:06:02



In [3]:
# same rpsblast config as the online conserved domain search
# based on the README file in CDD FTP server
# https://ftp.ncbi.nih.gov/pub/mmdb/cdd/README

# Honor USE_GENE_SEQ: the result paths are already named after the flag, so
# the query has to follow it as well. rpsblast takes a protein query; a
# nucleotide (gene) query has to go through the translated search, rpstblastn.
# NOTE(review): the same search options are reused for rpstblastn -- confirm
# against the CDD README that this matches the online search for nucleotides.
if USE_GENE_SEQ:
    _query_path, _rps_cmd_cls = GENE_SEQ_PATH, NcbirpstblastnCommandline
else:
    _query_path, _rps_cmd_cls = PROT_SEQ_PATH, NcbirpsblastCommandline

rpsblast_cmd = _rps_cmd_cls(
    query=_query_path,
    db=CDD_PATH,
    seg='no',
    comp_based_stats='1',
    evalue=0.01,
    outfmt=5,  # XML output, parsed with NCBIXML afterwards
)
rpsblast_xml_result, rpsblast_cmd_error_msg = rpsblast_cmd()

# surface anything the program wrote to stderr instead of dropping it
if rpsblast_cmd_error_msg:
    print(rpsblast_cmd_error_msg)

# keep a copy of the raw XML report next to the example sequences
with open(RPSBLAST_XML_RESULT_PATH, 'w') as f:
    f.write(rpsblast_xml_result)

In [4]:
# Parse the rpsblast XML report into a Biopython BLAST record
rpsblast_result: BlastResult = NCBIXML.read(io.StringIO(rpsblast_xml_result))
print(f'There are {len(rpsblast_result.alignments)} rpsblast alignments ...')

alignment_header = (
    'title',
    'id',
    'def',
    'score',
    'e-value',
    'start',
    'end',
    'length',
)

# One row per HSP; the alignment-level fields are repeated for each of its
# HSPs. The full title/definition strings are stored so that the CSV file
# and any later parsing of 'def' do not work on truncated text.
alignment_list = []
_alignment: Alignment
_hsp: HSP
for _alignment in rpsblast_result.alignments:
    for _hsp in _alignment.hsps:
        alignment_list.append([
            _alignment.title,
            _alignment.hit_id,
            _alignment.hit_def,
            _hsp.score,
            _hsp.expect,
            _hsp.query_start,
            _hsp.query_end,
            _hsp.align_length,
        ])

alignment_df = pd.DataFrame(alignment_list, columns=alignment_header)
alignment_df.to_csv(RPSBLAST_CSV_RESULT_PATH)

# Long text columns are shortened for the printed preview only
_PREVIEW_TEXT_WIDTH = 32


def _shorten(text: str, width: int = _PREVIEW_TEXT_WIDTH) -> str:
    """Return `text` cut to `width` characters plus ' ...' if it is longer."""
    return (text[:width] + " ...") if len(text) > width else text


_alignment_preview_df = alignment_df.head(10).copy()
for _text_col in ('title', 'def'):
    _alignment_preview_df[_text_col] = \
        _alignment_preview_df[_text_col].map(_shorten)
print(_alignment_preview_df.to_markdown())


There are 103 rpsblast alignments ...
|    | title                                | id             | def                                  |   score |   e-value |   start |   end |   length |
|---:|:-------------------------------------|:---------------|:-------------------------------------|--------:|----------:|--------:|------:|---------:|
|  0 | gnl|CDD|276832 cd01381, MYSc_Myo ... | gnl|CDD|276832 | cd01381, MYSc_Myo7, class VII my ... |    3333 |         0 |      79 |   748 |      670 |
|  1 | gnl|CDD|214580 smart00242, MYSc, ... | gnl|CDD|214580 | smart00242, MYSc, Myosin. Large  ... |    2491 |         0 |      60 |   759 |      707 |
|  2 | gnl|CDD|276950 cd00124, MYSc, My ... | gnl|CDD|276950 | cd00124, MYSc, Myosin motor doma ... |    2260 |         0 |      79 |   748 |      683 |
|  3 | gnl|CDD|365845 pfam00063, Myosin ... | gnl|CDD|365845 | pfam00063, Myosin_head, Myosin h ... |    2131 |         0 |      67 |   748 |      687 |
|  4 | gnl|CDD|276849 cd14883, MYSc_Myo ... 

In [5]:
'''
Column descriptions:

Acc = conserved domain model accession number (e.g., pfam09006)

ShortName = first 10 characters of domain model's short name,
        in this case, Surfac_D-t, for Surfac_D-trimer.

PSSMID = unique identifier for the position specific scoring matrix
        (e.g., as the pfam09006 domain model has evolved, it has had
        three PSSMs, with IDs 72424, 87766, and 90442, respectively).

        If there are any changes in the protein sequence alignment
        of a domain model (for example, the addition/deletion of
        member protein sequences or changes in the span of aligned residues),
        or if there are changes in the interpretation of the alignment,
        a new PSSM will be calculated. In that case, it will receive
        a new PSSM ID, although the accession number of the conserved
        domain model will remain the same.

        If only the domain model description or other annotations have
        changed, but the PSSM did not change, the version of the model
        will be incremented but the PSSM ID will remain the same,
        as it did for version 1 and 2 of pfam09006, both of which had
        the PSSM ID 72424.

Root =  if the domain model is NCBI-curated, the "Root" column will
        show the accession number of the parent node of the curated
        domain hierarchy.  If the domain hierarchy contains only a
        single node, the value in the "Root" column will be the same
        as that in the "Acc" column.  The values will also be the same
        if the accession listed in the first column is the parent node
        of a multi-level hierarchy.

Version = version number of that particular domain model

Lv =         indicates the current live version of the record:
        1 = live status;
        0 = dead, earlier version.

Rl =         indicates whether the domain model version has been
             released into the public database. This is a flag
             NCBI uses for internal data tracking.
             For most domain models, the value will be
             1= released, which means at some point the model was
             live in the database. Occasionally a value of "0" might
             appear, primarily for ncbi-curated models.  This indicates
             a newer version of a model is in preparation at NCBI and
             will be released in the future.

ER =         Expendable or redundant models; value in this column can be:
             0 = non-expendable or not redundant
             1 = expendable or redundant; indicates a model that has been
             removed from the default "cdd" search set because the
             information in it is represented in another domain model.

Time =         date and time on which the model was last updated in the
        internal conserved domain tracking database.
'''

# NOTE(review): the names in CD_TRACK_HEADER presumably map onto the columns
# described above (Acc -> accession, PSSMID -> (pssm_)id, Lv -> live,
# Rl -> release, ER -> redundant, Time -> date + time); 'parent' is not
# covered by the description -- confirm against the CDD README.

# Load the whitespace-separated CD track table. The pattern is a raw string:
# '\s' inside a normal string literal is an invalid escape sequence.
cd_track_df = pd.read_csv(
    CD_TRACK_PATH,
    sep=r'\s+',
    header=None,
    index_col=None,
    skiprows=4,  # first 4 lines are skipped (presumably a preamble -- confirm)
    names=CD_TRACK_HEADER,
)

# row count before filtering, for the live/total report below
_cd_track_cd_len = len(cd_track_df)

# drop the "dead" rows (live != 1); all columns are kept
cd_track_df = cd_track_df.loc[cd_track_df['live'] == 1]

print(f'There are {len(cd_track_df)}/{_cd_track_cd_len} (live/total) PSSMs in CDD ...')
print(cd_track_df.head(10).to_markdown())

There are 16181/16181 (live/total) PSSMs in CDD ...
|    | accession   | short_name     |   (pssm_)id | parent   | root    |   version |   live |   release |   redundant | date     | time     |
|---:|:------------|:---------------|------------:|:---------|:--------|----------:|-------:|----------:|------------:|:---------|:---------|
|  0 | cd00001     | PTS_IIB_man    |      237975 | nan      | cd00001 |         6 |      1 |         1 |           0 | 01/17/13 | 11:10:00 |
|  1 | cd00002     | YbaK_deacylase |      237976 | cd04332  | cd04332 |         6 |      1 |         1 |           0 | 01/17/13 | 11:10:00 |
|  2 | cd00003     | PNPsynthase    |      237977 | nan      | cd00003 |         6 |      1 |         1 |           0 | 01/17/13 | 11:10:00 |
|  3 | cd00004     | Sortase        |      320674 | nan      | cd00004 |         7 |      1 |         1 |           0 | 08/18/16 | 17:14:00 |
|  4 | cd00005     | CBM9_like_1    |      187674 | cd00241  | cd00241 |         5 |      1 |   

In [6]:
# number of distinct root accessions among the CD track rows
_unique_cd_root = cd_track_df['root'].unique()
print(f'There are {len(_unique_cd_root)} unique CD roots in CDD ...')

# merge the CD track info into (rpsblast) alignment dataframe
# Hit ids look like 'gnl|CDD|276832'. The prefix is removed as a literal
# string (regex=False): '|' is a regex metacharacter, and the default value
# of `regex` differs between pandas versions, so it is stated explicitly.
alignment_df['(pssm_)id'] = \
    alignment_df['id'].str.replace('gnl|CDD|', '', regex=False).astype(int)
alignment_cd_track_df = alignment_df.merge(
    right=cd_track_df[['(pssm_)id', 'root']],
    how='left',  # keep every alignment row, even without a CD track entry
    on='(pssm_)id',
)

# fill in the short names and accession
# 'def' starts with '<accession>, <short name>, ...'; the pieces are stripped
# because splitting on ',' leaves the blank that follows each comma
_def = alignment_cd_track_df['def'].str.split(',', n=2, expand=True)
_accessions, _short_names = _def[0].str.strip(), _def[1].str.strip()
alignment_cd_track_df['accession'] = _accessions
alignment_cd_track_df['short_name'] = _short_names

# fill in the root if it does not exist
# for non-NCBI-curated conserved domains, there is no root in CD track
# (assigned back instead of `inplace=True` on a selected column, which is
# chained assignment and not guaranteed to update the dataframe)
alignment_cd_track_df['root'] = alignment_cd_track_df['root'].fillna(
    alignment_cd_track_df['accession'],
)

print(alignment_cd_track_df[
          ['(pssm_)id', 'short_name', 'accession', 'root', 'start', 'end', 'length']].to_markdown())


There are 1270 unique CD roots in CDD ...
|     |   (pssm_)id | short_name                  | accession   | root       |   start |   end |   length |
|----:|------------:|:----------------------------|:------------|:-----------|--------:|------:|---------:|
|   0 |      276832 | MYSc_Myo7                   | cd01381     | cd01363    |      79 |   748 |      670 |
|   1 |      214580 | MYSc                        | smart00242  | smart00242 |      60 |   759 |      707 |
|   2 |      276950 | MYSc                        | cd00124     | cd01363    |      79 |   748 |      683 |
|   3 |      365845 | Myosin_head                 | pfam00063   | pfam00063  |      67 |   748 |      687 |
|   4 |      276849 | MYSc_Myo22                  | cd14883     | cd01363    |      80 |   748 |      676 |
|   5 |      227355 | COG5022                     | COG5022     | COG5022    |      59 |   875 |      832 |
|   6 |      276831 | MYSc_Myo5                   | cd01380     | cd01363    |      81 |   748

In [None]:
# The binding sites and other annotations are in cddannot.dat from the CDD FTP server
# Note that this information is only available for NCBI-curated domains
