# 4. Pairwise alignments to reference, Li et al.
___
Dr. Raffael lab <br>
2024

In [1]:
import os
import re
import datetime
from pathlib import Path
import multiprocessing as mp

import numpy as np
import pandas as pd

# from Bio.Alphabet import Gapped
from Bio.Align import MultipleSeqAlignment
from Bio.Align import substitution_matrices as matlist
from Bio import Align
from Bio import SeqIO, AlignIO, Seq, Align, SeqRecord, pairwise2

from tqdm import tqdm



In [2]:
def performPairwiseAlns(ref_seq_record, target_fasta_handle):
    """Performs pairwise alignment of each record in a fasta against
    a reference sequence. The reference itself is still included in the alignment.
    Uses the following default alignment parameters:
        open_gap_score = -10
        extend_gap_score = -0.5
    
    Arguments:
        (Bio.SeqRecord) ref_seq_record: a record corresponding to the reference
            sequence.
        (str|pathlib.Path) target_fasta_handle: handle of per-enzyme fasta against
            which to perform pairwise alignments
    
    Returns:
        (list) a list of SeqRecord object corresponding to each of the fasta
            seqs aligned to the reference sequence.
    
    """
    aln_results = []
    
    for target_seq in SeqIO.parse(target_fasta_handle, 'fasta'):
        
        # Define pairwise alignment parameters
        aligner = Align.PairwiseAligner()
        aligner.open_gap_score = -10
        aligner.extend_gap_score = -0.5
        aligner.substitution_matrix = matlist.load('BLOSUM62')
        
        # Execute alignment
        alignments = aligner.align(ref_seq_record.seq, target_seq.seq)
        best_aln = alignments[0]
        aln_score = best_aln.score
        
        # Extract aligned sequences from results
#         split_best_aln = str(alignments[0]).split('\n')
        aligned_reference = best_aln[0]#split_best_aln[0]
        aligned_target = best_aln[1]#split_best_aln[2]

        # Mask the target by the non-gapped reference positions
        aligned_reference_arr = np.array(list(aligned_reference), dtype = 'str')
        aligned_target_arr = np.array(list(aligned_target), dtype = 'str')
        trimmed_aligned_target = ''.join(aligned_target_arr[(aligned_reference_arr != '-')].tolist())
        
        # Append the record to our results list
        aln_results.append(SeqRecord.SeqRecord(Seq.Seq(trimmed_aligned_target), 
                                               id = '', 
                                               name = '', 
                                               description = target_seq.description+'|'+str(aln_score)
                                              )
                          )

    return aln_results

def alignAndExport(fh, rep_db, alignment_root):
    """Perform parwise alignment of each record in a fasta against a 
    reference sequence, and export the resulting multiple sequence alignment
    as a fasta with the reference information embedded in the header as a comment
    escaped with a semicolon (";")
    
    Arguments:
        (pathlib.Path) fh: file handle of fasta containing per-enzyme
            NR.gz query records
        (dict) rep_db: dictionary mapping from enzyme name to a tuple of
            a Bio.SeqRecord object corresponding to the determined reference sequence
            and the mean+3SD alignment threshold. This will be written into the header
            of the alignment fasta as a two line comment.
        (pathlib.Path) alignment_root: root folder to contain the pairwise alignment
            results.
    
    Returns:
        None

    """
    if fh.stem in rep_db.keys():
        ref_record, aln_thresh = rep_db[fh.stem] # get the reference sequence record
    else:
        print('Enzyme not in simulation keys: {}'.format(fh.stem))
    #i think here it should return pass or just return to stop here if the enzyme not there.So i am writing return here
        return
    
    alignments = performPairwiseAlns(ref_record, fh)
    
    out_aln_handle = alignment_root.joinpath(fh.stem+'.aln')
    with open(out_aln_handle, 'a+') as alignment_file:
        alignment_file.write(';{}|{}\n;{}\n'.format(ref_record.description, 
                                                    str(aln_thresh), 
                                                    str(ref_record.seq)))
        SeqIO.write(alignments, alignment_file, 'fasta')


def alignAndExportChunked(fh_recs_chunks, rep_db, alignment_root):
    """A wrapper for performing pairwise alignments of a iterable collection of fastas
    against their dtermined representatives.
    
    
    Arguments:
        (pd.DataFrame) fh_recs_chunks: a dataframe in which each row is a record
            containing the path of a fasta to be aligned.
        (dict) rep_db: dictionary mapping from enzyme name to a tuple of
            a Bio.SeqRecord object corresponding to the determined reference sequence
            and the mean+3SD alignment threshold. This will be written into the header
            of the alignment fasta as a two line comment.
        (pathlib.Path) alignment_root: root folder to contain the pairwise alignment
            results.
    
    Returns:
        None
    
    """
    results = []
    for i, rec in fh_recs_chunks.iterrows():
        results.append(alignAndExport(rec.fasta_path, rep_db, alignment_root))

## 1. Parse the alignment simulation results

In [3]:
#Use this commented line when working with multiple enzymes
#references_thresholds = pd.read_csv('Lietal_BlastReps/Lietal_200128_RepSummary_AlnSims_Summary.csv', compression = 'bz2').set_index('name')
references_thresholds = pd.read_csv('Lietal_BlastReps/Lietal_200128_RepSummary_AlnSims_Summary.csv').set_index('name')
rep_db = {}
for name, reference_record in references_thresholds.iterrows():
    ref_seq_record = SeqRecord.SeqRecord(reference_record.rep_seq, 
                                         id = '', 
                                         name = '', 
                                         description = reference_record.rep_desc)
    rep_db[name] = (ref_seq_record, reference_record['mean+3sd'])



## 2. Parse the cleaned fastas

In [4]:
fasta_root = Path('Lietal_FetchedSeqs/parsed_fastas_cleaned_temped/MeetingOrgCountCrit/')

fasta_records = pd.DataFrame([{'fasta_path': Path(fh),'fasta_size':fh.stat().st_size} for fh in fasta_root.iterdir()]).sort_values('fasta_size')
fasta_records.head()

Unnamed: 0,fasta_path,fasta_size
0,Lietal_FetchedSeqs/parsed_fastas_cleaned_tempe...,334216


### 3. Execute pairwise alignments

In [None]:
# numchunks = 80
# alignment_root = Path('Lietal_PairwiseAlns/')

# for recordSet in tqdm(np.array_split(fasta_records, numchunks), total = numchunks, desc = 'Aligning to sequences'):

#     numthreads = 24
#     pool = mp.Pool(numthreads)
#     results = []

#     result_objects = [pool.apply_async(alignAndExportChunked, args=(chunk, rep_db, alignment_root)) 
#                       for chunk in np.array_split(recordSet.reset_index(drop = True), numthreads)]
        
#     pool.close()
#     pool.join()

Aligning to sequences:   1%|▏         | 1/80 [00:00<00:42,  1.86it/s]

Enzyme not in simulation keys: molybdate-transporting_ATPase


Aligning to sequences:  12%|█▎        | 10/80 [00:08<01:15,  1.09s/it]

Enzyme not in simulation keys: cyanide_hydratase


Aligning to sequences:  99%|█████████▉| 79/80 [3:25:23<18:15, 1095.82s/it]

In [5]:
#For single enzyme processing use this. For multiple use above cell.
alignment_root = Path('Lietal_PairwiseAlns/')
alignAndExportChunked(fasta_records, rep_db, alignment_root)   

### 4. Check results of first alignment. Large fastas could not align in bulk due to RAM constraints, so perform those alignments separately

In [9]:
alignment_root = Path('Lietal_PairwiseAlns/')
complete_alns = pd.DataFrame([{'enzyme_name': Path(fh).stem} for fh in alignment_root.iterdir() if not '.ipynb_checkpoints' in str(fh)])

In [10]:
complete_alns

Unnamed: 0,enzyme_name
0,steroid_DELTA-isomerase


In [11]:
missing_enzymes = set([fh.stem for fh in fasta_records.fasta_path]).difference(set(complete_alns.enzyme_name))
missing_recs = fasta_records.loc[fasta_records.fasta_path.apply(lambda s: s.stem).isin(missing_enzymes)]
missing_recs

Unnamed: 0,fasta_path,fasta_size


In [None]:
#Not executing below two as they are for processing the missing_recs again but we dont have any missing_recs.

In [None]:
numchunks = 5
alignment_root = Path('Lietal_PairwiseAlns/')

for recordSet in tqdm(np.array_split(missing_recs, numchunks), total = numchunks, desc = 'Aligning missings to sequences'):

    numthreads = 2
    pool = mp.Pool(numthreads)
    results = []

    result_objects = [pool.apply_async(alignAndExportChunked, args=(chunk, rep_db, alignment_root)) 
                      for chunk in np.array_split(recordSet.reset_index(drop = True), numthreads)]

    pool.close()
    pool.join()

Aligning missings to sequences:   0%|          | 0/5 [00:00<?, ?it/s]

Enzyme not in simulation keys: molybdate-transporting_ATPase


Aligning missings to sequences:  20%|██        | 1/5 [00:00<00:01,  2.39it/s]

### 5. Check final records (noting that 2 records had no representatives, so we expect two to have failed)

In [9]:
alignment_root = Path('/media/dmokhtari/DataStore2/Collaborations/MMP/Thermoadaptation_Revisions/\
SequenceFetching/1912_SequenceFetching/Lietal_PairwiseAlns/')
complete_alns = pd.DataFrame([{'enzyme_name': Path(fh).stem} for fh in alignment_root.iterdir()])

missing_enzymes = set([fh.stem for fh in fasta_records.fasta_path]).difference(set(complete_alns.enzyme_name))
missing_recs = fasta_records.loc[fasta_records.fasta_path.apply(lambda s: s.stem).isin(missing_enzymes)]
missing_recs

Unnamed: 0,fasta_path,fasta_size
53,/media/dmokhtari/DataStore2/Collaborations/MMP...,19512
10,/media/dmokhtari/DataStore2/Collaborations/MMP...,66815
