In [2]:
import sys
import re
from copy import deepcopy
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import Gapped
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA, ExtendedIUPACProtein

In [3]:
# Input
# All tunable parameters for the notebook live in this cell.
fn = "NSP1Arra.fas.nonn.orf.revalign.dedup"  # input alignment, read as FASTA below
ofn = "NSP1Arra.fas.nonn.orf.revalign.dedup.trim5"  # output path, written as FASTA at the end
thresh = 100  # a motif match is only considered when it ends at or before this position
xthresh = 2  # sequences whose translation has more than this many 'X' are dropped
# NOTE(review): gapthresh is not referenced by any cell visible in this notebook;
# presumably it was meant for the gap-count step -- TODO confirm or remove.
gapthresh = 10

In [4]:
# Read alignment
# Parse the single FASTA alignment in `fn`, declaring it as ambiguous DNA
# with '-' as the gap character.
dna_alphabet = Gapped(IUPACAmbiguousDNA(), "-")
align = AlignIO.read(fn, "fasta", alphabet=dna_alphabet)

In [5]:
# Define sequence
# Case-insensitive motif: one of A/G, then "CC", then "ATGG" (7 characters).
# A longer, GCC-prefixed variant is kept below for reference:
# kozak = re.compile(r"GCC[AG]CCATGG",re.IGNORECASE)
kozak = re.compile(r"[AG]CCATGG", flags=re.I)

In [6]:
# Find end of regexp
# Search the string form of every aligned sequence for the motif.
# re.search returns None for sequences without a hit, so kozaklist stays
# index-aligned with the rows of `align`.
kozaklist = [kozak.search(str(record.seq)) for record in align]
# Drop the misses. None is a singleton, so test identity rather than equality.
kozakmatches = [match for match in kozaklist if match is not None]
# Exclusive end offset of each hit within the searched string
# (same value as match.span()[1]).
kend = [match.end() for match in kozakmatches]

In [7]:
# Make a list of hits before threshold
# klist: the distinct match-end positions that are <= thresh.
# kfreq: for each entry of klist (same index), how many matches end there.
klist = [pos for pos in set(kend) if pos <= thresh]
kfreq = [kend.count(pos) for pos in klist]

In [8]:
# Make a decision
# Majority vote: choose the match-end position shared by the most sequences.
# kfreq and klist are index-aligned, so the index of the highest count in
# kfreq selects the winning position in klist.
if not kfreq:
    # Without this guard an empty candidate list surfaces later as an
    # unexplained "list index out of range".
    raise ValueError(
        "no motif match ends at or before thresh=%r; cannot choose a trim position"
        % (thresh,)
    )
# list.index returns the first occurrence, so ties resolve to the earliest
# entry of klist with the maximal count.
kpos = kfreq.index(max(kfreq))
kvote = klist[kpos]

In [10]:
# Trim alignment
# kvote is the exclusive end of the 7-character motif "[AG]CCATGG"; stepping
# back 4 lands on the "A" of its "ATG", which becomes the first kept column.
first_col = kvote - 4
trimalign = align[:, first_col:]

In [11]:
# Build a second alignment holding the translation of every trimmed record.
pralign = MultipleSeqAlignment(
    records=[],
    alphabet=Gapped(ExtendedIUPACProtein(), '-'),
    annotations=None
)
for dna_record in trimalign:
    # Copy first so the record stored in trimalign keeps its original sequence.
    translated_record = deepcopy(dna_record)
    translated_record.seq = translated_record.seq.translate()
    pralign.append(translated_record)

In [12]:
# Number of 'X' characters in each translated sequence (one entry per row of
# pralign), followed by a small histogram: distinct counts and how many rows
# have each of them.
xcount = [str(translated.seq).count('X') for translated in pralign]
xlist = list(set(xcount))
xfreq = list(map(xcount.count, xlist))
print([xlist, xfreq])

In [32]:
# Exclude sequences
# xcount was built row by row from pralign, which was built row by row from
# trimalign, so index i refers to the same sequence in all three.
# Keep the DNA rows whose translation has at most xthresh 'X' characters.
kept_rows = [i for i in range(len(pralign)) if xcount[i] <= xthresh]
kept_records = [trimalign[i, :] for i in kept_rows]
trimalign2 = MultipleSeqAlignment(
    records=kept_records,
    alphabet=Gapped(IUPACAmbiguousDNA(), "-")
)

In [35]:
# Count the '-' characters in every column of the filtered alignment.
# NOTE(review): the earlier comment on this cell also described walking the
# counts in reverse until a threshold is first crossed; this cell only
# computes the counts and does not perform that step.
n_columns = trimalign2.get_alignment_length()
gapcount = [trimalign2[:, col].count("-") for col in range(n_columns)]


In [36]:
# Display the full list of per-column gap counts as the cell output.
gapcount

[68,
 68,
 68,
 64,
 64,
 64,
 62,
 62,
 62,
 61,
 61,
 61,
 58,
 58,
 58,
 51,
 51,
 51,
 46,
 46,
 46,
 38,
 38,
 38,
 31,
 31,
 31,
 30,
 30,
 30,
 30,
 30,
 30,
 29,
 29,
 29,
 27,
 27,
 27,
 26,
 26,
 26,
 26,
 26,
 26,
 22,
 22,
 22,
 22,
 22,
 22,
 21,
 21,
 21,
 19,
 19,
 19,
 19,
 19,
 19,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 15,
 15,
 15,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,


In [13]:
# Write the filtered DNA alignment (trimalign2) to `ofn` in FASTA format.
# The call is the last expression of the cell, so its return value is what
# the notebook displays underneath.
# NOTE(review): this cell's execution count (13) is lower than that of the
# cells which build trimalign2 and gapcount (32, 35, 36), i.e. it was last run
# before them. Re-run top to bottom to make sure the file on disk matches the
# current trimalign2.
AlignIO.write(trimalign2,ofn,"fasta")

1