In [1]:
from Bio import AlignIO
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

In [2]:
# Locations of the two per-gene alignment files.
aln_file1 = '/home/sur/micropopgen/exp/2018/today9/filtered/dnaG.aln'
aln_file2 = '/home/sur/micropopgen/exp/2018/today9/filtered/nusA.aln'

# Each file is parsed as a single FASTA-formatted alignment.
aln1 = AlignIO.read(aln_file1, format='fasta')
aln2 = AlignIO.read(aln_file2, format='fasta')

# Keep only the first 10 columns of each alignment as a small example input.
alns = [aln1[:, :10], aln2[:, :10]]

In [9]:
def reorder_alignment(aln, specs, species, alphabet=single_letter_alphabet, gap='-'):
    """Reorder an alignment according to a species list.

    Rows of ``aln`` are emitted in the order given by ``species``. A species
    that has no row in ``aln`` gets a placeholder record made only of ``gap``
    characters, as long as the alignment.

    Parameters
    ----------
    aln : alignment
        Alignment to reorder. Must provide ``get_alignment_length()`` and
        integer row indexing (``aln[i]``).
    specs : list
        Identifier of every row of ``aln``, in row order (``specs[i]``
        labels ``aln[i]``). Identifiers must be hashable. If an identifier
        is repeated, its first row is used.
    species : iterable
        Identifiers in the desired output order.
    alphabet : alphabet object
        Alphabet passed to ``Seq`` for the placeholder records.
    gap : str
        Gap symbol used to fill placeholder records.

    Returns
    -------
    MultipleSeqAlignment
        One row per entry of ``species``, in that order.
    """
    # Build the identifier -> row lookup once instead of scanning ``specs``
    # for every species. ``setdefault`` keeps the first occurrence, which is
    # the row ``list.index`` would have returned.
    row_of = {}
    for row, name in enumerate(specs):
        row_of.setdefault(name, row)

    missing_seq = gap * aln.get_alignment_length()

    records = []
    for s in species:
        row = row_of.get(s)
        if row is None:
            # Species absent from this alignment: fill with gaps.
            records.append(SeqRecord(Seq(missing_seq, alphabet), id=s))
        else:
            records.append(aln[row])

    return MultipleSeqAlignment(records)

In [10]:
def concatenate_alignments(alns, alphabet=single_letter_alphabet, gap='-'):
    """Concatenate a list of multiple sequence alignments.

    The result has one row per distinct record id found across ``alns``.
    Rows are ordered by first appearance (scanning the alignments in the
    order given), so the output is the same on every run. Alignments are
    joined with ``+`` in list order; an alignment that lacks a species
    contributes a run of ``gap`` characters for that row.

    Parameters
    ----------
    alns : sequence of alignments
        Alignments to concatenate. Each must be iterable over records that
        carry an ``id`` attribute. The sequence is traversed twice.
    alphabet : alphabet object
        Alphabet used for the empty seed records and for gap-only filler
        records.
    gap : str
        Gap symbol used where a species is missing from an alignment.

    Returns
    -------
    MultipleSeqAlignment
        The concatenated alignment.
    """
    # Record ids of every alignment, in row order.
    species_per_aln = [[r.id for r in a] for a in alns]

    # De-duplicate while preserving first-seen order. A plain ``set`` would
    # give an ordering that depends on string hash randomisation and thus
    # changes between interpreter sessions.
    species = []
    seen = set()
    for specs in species_per_aln:
        for s in specs:
            if s not in seen:
                seen.add(s)
                species.append(s)

    # Seed alignment: one zero-length record per species.
    new_aln = MultipleSeqAlignment([SeqRecord(Seq('', alphabet), id=s) for s in species])

    # Append each alignment, reordering/padding it first when its rows do
    # not already match the global species order.
    for a, specs in zip(alns, species_per_aln):
        if specs != species:
            a = reorder_alignment(aln=a, specs=specs,
                                  species=species, alphabet=alphabet,
                                  gap=gap)
        new_aln = new_aln + a

    return new_aln
    

In [11]:
# Concatenate whatever alignments are currently bound to `alns` and print the result.
# NOTE(review): the recorded output reports 945 columns, which does not match the
# 10-column slices built in the loading cell; `alns` was presumably rebound to the
# full alignments (cell In [7]) before this cell ran -- confirm with Restart & Run All.
aln = concatenate_alignments(alns=alns)
print(aln)

SingleLetterAlphabet() alignment with 3 rows and 945 columns
MYKLSEDEEKIINNIDIVDLIGQYVDNKAVSYKGYSKSENTSHP...--- Streptobacillus_moniliformis_61686.CDS_0
-VIDQQTRQRILDSAQILDVVSDFISRRQVSYVGLCHSDRNYNP...--- Porphyromonas_sp_57899.CDS_0
HFLPPEFLDELRQRITLSSLIGRTVKTRAREFRACCHNEKTYND...EKA Zymomonas_mobilis_57566.CDS_0


In [6]:
# Print the first two input alignments so they can be compared with the concatenated result.
print(alns[0])
print(alns[1])

SingleLetterAlphabet() alignment with 3 rows and 10 columns
MYKLSEDEEK Streptobacillus_moniliformis_61686.CDS_0
-VIDQQTRQR Porphyromonas_sp_57899.CDS_0
HFLPPEFLDE Zymomonas_mobilis_57566.CDS_0
SingleLetterAlphabet() alignment with 2 rows and 10 columns
-----MKKTT Porphyromonas_sp_57899.CDS_0
MASAISNALL Zymomonas_mobilis_57566.CDS_0


In [7]:
# Time the concatenation on the full-length alignments.
# NOTE(review): this rebinds `alns`, replacing the 10-column slices from the loading
# cell; any cell executed afterwards that reads `alns` sees the full alignments.
alns = [aln1, aln2]
%timeit concatenate_alignments(alns=alns)

231 µs ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
# Profile the concatenation call on the current `alns` with IPython's %prun magic.
%prun concatenate_alignments(alns=alns)

 