# INFO-F-208

## Partie 1.1

Avant d'implémenter un algorithme qui calcule l’alignement entre deux séquences, vous implémentez deux ADT (Abstract Data Type):    

* Un ADT séquence qui représente une séquence d’acides aminés et tous les opérations qu’on peut exécuter sur une séquence.

In [6]:
class Sequence(str):
    """
    ADT séquence qui représente une séquence d’acides aminés
    et tous les opérations qu’on peut exécuter sur une séquence.
    """
    def __init__(self, sequence):
        self.sequence = sequence
        
    def __getitem__(self, index):
        """
        """
        return self.sequence[index]

    @staticmethod
    def load(filename):
        """
        """
        f = open(filename)
        sequences = []
        seq = ""
        for line in f:
            if line[0] != '>':
                seq += line[:-1]
            else:
                if (seq != ""):
                    sequences.append(Sequence(seq))
                seq = ""
        if (seq != ""):
            sequences.append(Sequence(seq))

        f.close()
        return sequences

* Un ADT score qui représente une matrice de substitution et les opérations qu’on peut exécuter sur cette matrice.

In [7]:
class Score:
    INDEXES = "ARNDCQEGHILKMFPSTWYV"
    def __init__(self, matrix = [[]], indexes = INDEXES):
        self.matrix = matrix
        self.indexes = indexes
        
    def __getitem__(self, acids): 
        """
        @desc: Get the score for a tuple of Acids
        
        @param{acide}: Tuple of acids.
        @ex: Score[('A', 'Q')]
        """
        i = self.indexes.index(acids[0])
        j = self.indexes.index(acids[1])
        return self.matrix[i][j]

    def __repr__(self):
        ret = "  "
        for char in self.indexes:
            ret += ' ' + char + ' '
        ret += '\n'
        i = 0
        for line in self.matrix:
            ret += self.indexes[i] + ' '
            i+=1
            for char in line:
                if (char >= 0):
                    ret += ' '
                ret += str(char) + ' '
            ret += '\n'

            return ret
    
    @staticmethod
    def load(filename):
        f = open(filename)
        matrix = []
        indexes = Score.INDEXES
        for line in f:
            if line[0] == '#':
                pass
            elif line[0] == ' ' or line[0] == '\t':
                indexes = line.split()
            elif line[0].isalpha() or line[0] == '*' or line[0] == '-':
                l = line[1:]
                matrix.append(list(map(int, l.split())))
        f.close()
        assert(len(indexes) == 24)
        assert(len(matrix) == len(indexes))
        return Score(matrix, indexes)

# Implémentation de l'algorithme de Needleman-Wunsch

En utilisant les ADT construits pendant l'étape précédente, vous implémentez en Python l’algorithme Needleman‐Wunsch qui calcule l’alignement global en utilisant la pénalité affine.

In [8]:
# Score returned for matching letters.
MATCH_SCORE = 10
# Penality for mismatching letters
MISMATCH_PENALITY = -5
# When aligning sequences there are often gaps (i.e. indels), sometimes large ones.
# Biologically, a large gap is more likely to occur as one large deletion as opposed
# to multiple single deletions. Hence we should score two small indels to be worse
# than one large one. The simple and common way to do this is via a large gap-start
# score for a new indel and a smaller gap-extension score for every letter which 
# extends the indel. For example, new-indel may cost -5 and extend-indel may cost -1.
#
# Source: Wikipedia
GAP_PENALITY = -5

GAP_START = 5

In [9]:
def init_matrix(x, y):
    """
    Initialize the matrix with zeros and a column of base score.
    """
    return [[(GAP_PENALITY * i + GAP_PENALITY * j) if (i == 0 or j == 0) else 0 for j in range(y + 1)] for i in range(x + 1)]

In [10]:
score = Score.load('./blosum80.txt')
print(score)

   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  * 
A  5 -2 -2 -2 -1 -1 -1  0 -2 -2 -2 -1 -1 -3 -1  1  0 -3 -2  0 -2 -1 -1 -6 



In [26]:
class NeedlemanWunsch:
    def __init__(self, seq1, seq2, scoringMatrix):
        self.seq1 = seq1
        self.seq2 = seq2
        self.scoringMatrix = scoringMatrix
        self.result = []
        
        self.m = len(seq1)
        self.n = len(seq2)
        
        self.S = init_matrix(self.m, self.n)
        self.V = init_matrix(self.m, self.n)
        self.W = init_matrix(self.m, self.n)
        
        # Remplit les matrices en utilisant les formules vue dans les slides du cour.
        for i in range(1, self.m + 1):
            for j in range(1, self.n + 1):
                # Voir slide 63 (L3 alignement de séquences)
                self.V[i][j] = max(
                    self.S[i - 1][j] - 1,          # before = alignment and now = gap
                    self.V[i - 1][j] + GAP_PENALITY # before = gap and now = gap
                )

                self.W[i][j] = max(
                    self.S[i][j - 1] - 1,          # before = alignment and now = gap
                    self.W[i][j - 1] + GAP_PENALITY # before = gap and now = gap
                )
            
                self.S[i][j] = max(
                    self.S[i - 1][j - 1] + scoringMatrix[self.seq1[i - 1], self.seq2[j - 1]], # alignment = diagonal
                    self.V[i][j], # gap in Sequence A = top
                    self.W[i][j]  # gap in Sequence B = left
                ) 
    
        self._NW("", "", self.m, self.n)
        
    def __repr__(self):
        ret = ""
        for i in range(0, len(self.result)):
            ret += "%s\n" % (self.result[i][0]) # seq1
            identity = 0
            similarity = 0
            gap = 0
            # lalign style notation ":" for identity, "." for similarity, " " for a gap
            for j in range(0, len(self.result[i][0])):
                if (self.result[i][0][j] == self.result[i][1][j]): # identiy
                    ret += ':'
                    identity += 1
                    similarity += 1
                elif (self.result[i][0][j] == '-' or self.result[i][1][j] == '-'): # gap
                    ret += ' '
                    gap += 1
                else: # similarity
                    ret += '.'
                    similarity += 1
            ret += "\n"
            ret += "%s\n" % (self.result[i][1]) # seq2
            ret += "%3.3f%% identity\n" % (100 * identity / len(self.result[i][0]))
            ret += "%3.3f%% similarity\n" % (100 * similarity / len(self.result[i][0]))
            ret += "%3.3f%% gap\n"  % (100 * gap / len(self.result[i][0]))
            ret += "Length : %i\n" % (len(self.result[i][0]))
            ret += "Global score : %s\n" % (self.S[self.m][self.n])
        return ret

        
    def _NW(self, align1, align2, i, j):
        """
        """
        # print("---\nALIGN1: %s\nALIGN2: %s\n---" % (align1, align2))
        if i > 0 and j > 0:
            if self.S[i][j] == self.S[i - 1][j - 1] + self.scoringMatrix[self.seq1[i - 1], self.seq2[j - 1]]:
                self._NW(self.seq1[i - 1] + align1, self.seq2[j - 1] + align2, i - 1, j - 1)
            elif self.S[i][j] == self.V[i][j]:
                self._NW(self.seq1[i - 1] + align1, "-" + align2, i - 1, j)
            elif self.S[i][j] == self.W[i][j]:
                self._NW("-" + align1, self.seq2[j - 1] + align2, i, j - 1)

        # end of backtracking : we are back in S[0][0]
        else:
            self.result.append((align1, align2))

In [27]:
print(NeedlemanWunsch('GGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGDWWLAHSLSTGQTGYIPSNYVAPS',
        'MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELNGKDGFIPKNYIEMKPH',
        score
))

GGVTTFVA-LYDYESRT-ETD-LSFKKGE-RLQIVNNTE-GD--WW-LAHSLSTGQTGYIPSNYV--APS
.  .. .:  ::... : . : ::::.:.  :... :.:  :  :.  : .: .:..:.::.::.  .:.
M--EA-IAK-YDFKA-TAD-DELSFKRGDI-LKVL-NEEC-DQNWYK-A-EL-NGKDGFIPKNYIEMKPH
34.286% identity
68.571% similarity
31.429% gap
Length : 70
Global score : 137
GGVTTFVAL-YDYESRT-ETD-LSFKKGE-RLQIVNNTE-GD--WW-LAHSLSTGQTGYIPSNYV--APS
.  .. .:  ::... : . : ::::.:.  :... :.:  :  :.  : .: .:..:.::.::.  .:.
M--EA-IA-KYDFKA-TAD-DELSFKRGDI-LKVL-NEEC-DQNWYK-A-EL-NGKDGFIPKNYIEMKPH
34.286% identity
68.571% similarity
31.429% gap
Length : 70
Global score : 137
GGVTTFVA-LYDYESRT-ETD-LSFKKGER-LQIVNNTE-GD--WW-LAHSLSTGQTGYIPSNYV--APS
.  .. .:  ::... : . : ::::.:.  :... :.:  :  :.  : .: .:..:.::.::.  .:.
M--EA-IAK-YDFKA-TAD-DELSFKRGD-ILKVL-NEEC-DQNWYK-A-EL-NGKDGFIPKNYIEMKPH
34.286% identity
68.571% similarity
31.429% gap
Length : 70
Global score : 137
GGVTTFVAL-YDYESRT-ETD-LSFKKGER-LQIVNNTE-GD--WW-LAHSLSTGQTGYIPSNYV--APS
.  .. .:  ::... : . : ::::.:.  :... :.:  :  :.  : .: 

## Partie 1.3

Modifiez le logiciel de la partie 1.2 de sorte que on peut faire un alignement local (Smith‐Waterman). Utilisez les séquences dans le fichier maguk-sequences.fasta sur le site web. Retrouvez les similarités entre les 4 séquences.
Expliquez les similarités. Plus d’informations concernant les protéines dans le fichier .fasta peuvent être trouvées sur le site UniProt (http://www.uniprot.org/).

In [24]:
class SmithWaterman:
    def __init__(self, seq1, seq2, scoringMatrix):
        self.seq1 = seq1
        self.seq2 = seq2
        self.scoringMatrix = scoringMatrix
        self.result = []
        
        self.m = len(seq1)
        self.n = len(seq2)
        
        self.max = [self.m, self.n]
        
        self.S = init_matrix(self.m, self.n)
        self.V = init_matrix(self.m, self.n)
        self.W = init_matrix(self.m, self.n)
        
        # Remplit les matrices en utilisant les formules vue dans les slides du cour.
        for i in range(1, self.m + 1):
            for j in range(1, self.n + 1):
                # Voir slide 63 (L3 alignement de séquences)
                self.V[i][j] = max(
                    self.S[i - 1][j] - 1,          # before = alignment and now = gap
                    self.V[i - 1][j] + GAP_PENALITY # before = gap and now = gap
                )

                self.W[i][j] = max(
                    self.S[i][j - 1] - 1,          # before = alignment and now = gap
                    self.W[i][j - 1] + GAP_PENALITY # before = gap and now = gap
                )
            
                self.S[i][j] = max(
                    self.S[i - 1][j - 1] + scoringMatrix[self.seq1[i - 1], self.seq2[j - 1]], # alignment = diagonal
                    self.V[i][j], # gap in Sequence A = top
                    self.W[i][j],  # gap in Sequence B = left
                    0 # Toutes les valeurs négatives sont mise à 0.
                ) 

                if (self.S[i][j] > self.S[self.max[0]][self.max[1]]):
                    self.max = [i, j]
    
        self._SW("", "", self.m, self.n)
        
    def __repr__(self):
        ret = ""
        for i in range(0, len(self.result)):
            ret += "%s\n" % (self.result[i][0]) # seq1
            identity = 0
            similarity = 0
            gap = 0
            for j in range(0, len(self.result[i][0])):
                if (self.result[i][0][j] == self.result[i][1][j]): # identiy
                    ret += ':'
                    identity += 1
                    similarity += 1
                elif (self.result[i][0][j] == '-' or self.result[i][1][j] == '-'): # gap
                    ret += ' '
                    gap += 1
                else:
                    ret += '.'
                    similarity += 1
            ret += "\n"
            ret += "%s\n" % (self.result[i][1]) # seq2
            ret += "%3.3f%% identity\n" % (100 * identity / len(self.result[i][0]))
            ret += "%3.3f%% similarity\n" % (100 * similarity / len(self.result[i][0]))
            ret += "%3.3f%% gap\n"  % (100 * gap / len(self.result[i][0]))
            ret += "Length : %i\n" % (len(self.result[i][0]))
            ret += "Global score : %s\n" % (self.S[self.max[0]][self.max[1]])
        return ret

        
    def _SW(self, align1, align2, i, j):
        """
        """
        # print("---\nALIGN1: %s\nALIGN2: %s\n---" % (align1, align2))
        if (i > 0 and j > 0) and (self.S[i][j] > 0):
            if self.S[i][j] == self.S[i - 1][j - 1] + self.scoringMatrix[self.seq1[i - 1], self.seq2[j - 1]]:
                # Vérification par apport à la diagonale.
                self._SW(self.seq1[i - 1] + align1, self.seq2[j - 1] + align2, i - 1, j - 1)
            elif self.S[i][j] == self.V[i][j]:
                # Vérification par apport à la gauche: trous dans seq2
                self._SW(self.seq1[i - 1] + align1, "-" + align2, i - 1, j)
            elif self.S[i][j] == self.W[i][j]:
                # Vérification par apport au dessus: trous sequ1
                self._SW("-" + align1, self.seq2[j - 1] + align2, i, j - 1)

        else:
            self.result.append((align1, align2))

In [25]:
print(SmithWaterman('GGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGDWWLAHSLSTGQTGYIPSNYVAPS',
                    'MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELNGKDGFIPKNYIEMKPH',
                    score
      )
)

VA-LYDYESRT-ETD-LSFKKGE-RLQIVNNTE-GD--WW-LAHSLSTGQTGYIPSNYV--APS
.:  ::... : . : ::::.:.  :... :.:  :  :.  : .: .:..:.::.::.  .:.
IAK-YDFKA-TAD-DELSFKRGDI-LKVL-NEEC-DQNWYK-A-EL-NGKDGFIPKNYIEMKPH
37.500% identity
70.312% similarity
29.688% gap
Length : 64
Global score : 146

