This notebook experiments with aligning the sequences of two conserved-domain FASTA files.


In [46]:
import os

import numpy as np
import pandas as pd
from Bio import SeqIO
from skbio.sequence import Protein
from skbio.alignment import local_pairwise_align_ssw

# Directory holding the example conserved-domain FASTA files
# (downloaded from PATRIC directly).
SEQUENCE_PARENT_PATH = os.path.abspath('./examples/conserved_domain_alignment/')

# NOTE(review): both paths resolve to the same file ('cd00001.FASTA'), so the
# notebook currently compares that domain against itself — confirm whether the
# second entry was meant to name a different conserved-domain file.
CD1_SEQUENCE_PATH, CD2_SEQUENCE_PATH = (
    os.path.join(SEQUENCE_PARENT_PATH, _fasta_name)
    for _fasta_name in ('cd00001.FASTA', 'cd00001.FASTA')
)


In [47]:
# Load the BLOSUM62 substitution matrix from NCBI as a square, label-indexed
# table (row labels come from the first column, column labels from the header).
blosum62_df = pd.read_csv(
    'https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt',
    # raw string: the regex `\s+` must reach pandas untouched; in a plain
    # literal `\s` is an invalid escape sequence and triggers a warning
    sep=r'\s+',
    index_col=0,
    header=0,
    # skip the first six lines of the file (presumably its comment preamble —
    # verify if the upstream file ever changes)
    skiprows=6,
)
# add a column and a row for gap, scoring every pairing with '-' as 0
blosum62_df['-'] = 0
blosum62_df.loc['-'] = 0
# nested dict keyed by column label, then row label, holding the score
blosum62_dict = blosum62_df.to_dict()


In [48]:
# Read every record of both FASTA files into memory.
cd1_seq_records = [*SeqIO.parse(CD1_SEQUENCE_PATH, 'fasta')]
cd2_seq_records = [*SeqIO.parse(CD2_SEQUENCE_PATH, 'fasta')]

# The first record of each file is used as that domain's consensus sequence.
cd1_consensus_seq, cd2_consensus_seq = (
    str(_records[0].seq) for _records in (cd1_seq_records, cd2_seq_records)
)

# Smith-Waterman (SSW) local alignment of the two consensus sequences, scored
# with the BLOSUM62 dictionary; yields the aligned sequences, the alignment
# score and the aligned position range within each input.
tabular_msa, alignment_score, position_list = local_pairwise_align_ssw(
    Protein(cd1_consensus_seq),
    Protein(cd2_consensus_seq),
    substitution_matrix=blosum62_dict,
    protein=True,
)


In [49]:
# Slice bounds of the region matched by the consensus alignment, one pair per
# input file. The `+ 1` turns the reported end position into an exclusive slice
# bound (assumes the aligner reports inclusive end positions — TODO confirm
# against the scikit-bio documentation).
cd1_seq_start, cd1_seq_end = position_list[0][0], position_list[0][1] + 1
cd2_seq_start, cd2_seq_end = position_list[1][0], position_list[1][1] + 1

# Cut the same window out of every non-consensus record (index 0 is skipped)
# and wrap it as a Protein once, up front, instead of rebuilding both objects
# for every pair inside the nested loop.
# NOTE(review): this applies consensus coordinates directly to the member
# sequences — presumably they share the consensus coordinate system; verify.
_cd1_windows = [
    Protein(str(_record.seq)[cd1_seq_start:cd1_seq_end])
    for _record in cd1_seq_records[1:]
]
_cd2_windows = [
    Protein(str(_record.seq)[cd2_seq_start:cd2_seq_end])
    for _record in cd2_seq_records[1:]
]

# Align every cd1 window against every cd2 window and keep only the score.
scores = []
for _cd1_window in _cd1_windows:
    for _cd2_window in _cd2_windows:
        _, _score, _ = local_pairwise_align_ssw(
            sequence1=_cd1_window,
            sequence2=_cd2_window,
            substitution_matrix=blosum62_dict,
            protein=True,
        )
        scores.append(_score)

# Mean pairwise score divided by the second dimension of the consensus
# alignment's shape (its number of positions), i.e. an average score per
# aligned position. Evaluates to nan (with a NumPy warning) when either file
# holds no records beyond the consensus.
print(np.average(scores) / tabular_msa.shape[1])

1.5517301204819276
