# INFO-F-208

## Partie 1.1

Avant d'implémenter un algorithme qui calcule l’alignement entre deux séquences, vous implémentez deux ADT (Abstract Data Type):    

* Un ADT séquence qui représente une séquence d’acides aminés et tous les opérations qu’on peut exécuter sur une séquence.

In [18]:
class Sequence(str):
    """
    ADT séquence qui représente une séquence d’acides aminés
    et tous les opérations qu’on peut exécuter sur une séquence.
    """
    def __init__(self, sequence):
        self.sequence = sequence
    
    def __repr__(self):
        return self.sequence
    
    def __getitem__(self, index):
        """
        @desc: Permet d'interpreter la classe comme une "String".
        
        @param{index}: Index de la lettre qu'on veut consulter.
        """
        return self.sequence[index]

    @staticmethod
    def load(filename):
        """
        """
        with open(filename) as f:
            seq = ""
            for line in f:
                if line[0] == ">":
                    if seq:
                        yield Sequence(seq)
                    seq = ""
                else:
                    seq += line[:-1] # Not including '\n'
                    
            if (seq):
                yield Sequence(seq)

In [19]:
sh3sequence = [seq for seq in Sequence.load("./SH3-sequence.fasta")]
maguk = [seq for seq in Sequence.load("./maguk-sequences.fasta")]

* Un ADT score qui représente une matrice de substitution et les opérations qu’on peut exécuter sur cette matrice.

In [5]:
class _ScoreLine:
    """
    ADT utilisée pour pouvoir utiliser l'ADT Score comme une matrice.
    """
    def __init__(self, line, indexes):
        self.line = line
        self.indexes = indexes
    
    def __getitem__(self, letter):
        j = self.indexes.index(letter)
        return self.line[j]

class Score:
    INDEXES = "ARNDCQEGHILKMFPSTWYV"
    def __init__(self, matrix = [[]], indexes = INDEXES):
        self.matrix = matrix
        self.indexes = indexes
        
    def __getitem__(self, letter): 
        """
        @desc: Renvoie le score d'une combinaison de deux acides
        
        @param{letter}: Le premier acide.
        @ex: Score['A']['Q']
        """
        i = self.indexes.index(letter)
        return _ScoreLine(self.matrix[i], self.indexes)

    def __repr__(self):
        ret = "  "
        for char in self.indexes:
            ret += " %3s " % (char)
        ret += '\n'
        i = 0
        for line in self.matrix:
            ret += "%3s " % (self.indexes[i])
            i += 1
            for char in line:
                if (char >= 0):
                    ret += ' '
                ret += "%3s " % (char)
            ret += '\n'

        return ret
    
    @staticmethod
    def load(filename):
        f = open(filename)
        matrix = []
        indexes = Score.INDEXES
        for line in f:
            if line[0] == '#':
                pass
            elif line[0] == ' ' or line[0] == '\t':
                indexes = line.split()
            elif line[0].isalpha() or line[0] == '*' or line[0] == '-':
                l = line[1:]
                matrix.append(list(map(int, l.split())))
        f.close()
        assert(len(indexes) == 24)
        assert(len(matrix) == len(indexes))
        return Score(matrix, indexes)

# Implémentation de l'algorithme de Needleman-Wunsch

En utilisant les ADT construits pendant l'étape précédente, vous implémentez en Python l’algorithme Needleman‐Wunsch qui calcule l’alignement global en utilisant la pénalité affine.

In [25]:
OPENING_GAP_PENALTY = -12
EXTENDING_GAP_PENALTY = -2

In [87]:
def init_matrix(x, y):
    """
    Initialize the matrix with zeros and a column of base score.
    """
    return [[(i * (OPENING_GAP_PENALTY + (j - 1) * EXTENDING_GAP_PENALTY)) + (j * (OPENING_GAP_PENALTY + (i - 1) * EXTENDING_GAP_PENALTY)) if (i == 0 or j == 0) else 0 for j in range(y + 1)] for i in range(x + 1)]

In [88]:
blosum80 = Score.load('./blosum80.txt')
print(blosum80)
blosum62 = Score.load('./blosum62.txt')
print(blosum62)
pam120 = Score.load('./pam120.txt')
print(pam120)
pam60 = Score.load('./pam60.txt')
print(pam60)

     A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V    B    Z    X    * 
  A    5  -2  -2  -2  -1  -1  -1    0  -2  -2  -2  -1  -1  -3  -1    1    0  -3  -2    0  -2  -1  -1  -6 
  R  -2    6  -1  -2  -4    1  -1  -3    0  -3  -3    2  -2  -4  -2  -1  -1  -4  -3  -3  -2    0  -1  -6 
  N  -2  -1    6    1  -3    0  -1  -1    0  -4  -4    0  -3  -4  -3    0    0  -4  -3  -4    4    0  -1  -6 
  D  -2  -2    1    6  -4  -1    1  -2  -2  -4  -5  -1  -4  -4  -2  -1  -1  -6  -4  -4    4    1  -1  -6 
  C  -1  -4  -3  -4    9  -4  -5  -4  -4  -2  -2  -4  -2  -3  -4  -2  -1  -3  -3  -1  -4  -4  -1  -6 
  Q  -1    1    0  -1  -4    6    2  -2    1  -3  -3    1    0  -4  -2    0  -1  -3  -2  -3    0    3  -1  -6 
  E  -1  -1  -1    1  -5    2    6  -3    0  -4  -4    1  -2  -4  -2    0  -1  -4  -3  -3    1    4  -1  -6 
  G    0  -3  -1  -2  -4  -2  -3    6  -3  -5  -4  -2  -4  -4  -3  -1  -2  -4  -4  -4  -1  -3  -1  -6 
  H  -2    0    0  -2  -

In [89]:
class NeedlemanWunsch:
    def __init__(self, seq1, seq2, scoringMatrix):
        self.seq1 = seq1
        self.seq2 = seq2
        self.scoringMatrix = scoringMatrix
        self.result = []
        
        self.m = len(seq1)
        self.n = len(seq2)
        
        self.S = [[(i * (OPENING_GAP_PENALTY + (j - 1) * EXTENDING_GAP_PENALTY)) + (j * (OPENING_GAP_PENALTY + (i - 1) * EXTENDING_GAP_PENALTY)) if (i == 0 or j == 0) else 0 for j in range(self.n + 1)] for i in range(self.m + 1)]
        self.V = [[(i * (OPENING_GAP_PENALTY + (j - 1) * EXTENDING_GAP_PENALTY)) + (j * (OPENING_GAP_PENALTY + (i - 1) * EXTENDING_GAP_PENALTY)) if (i == 0 or j == 0) else 0 for j in range(self.n + 1)] for i in range(self.m + 1)]
        self.W = [[(i * (OPENING_GAP_PENALTY + (j - 1) * EXTENDING_GAP_PENALTY)) + (j * (OPENING_GAP_PENALTY + (i - 1) * EXTENDING_GAP_PENALTY)) if (i == 0 or j == 0) else 0 for j in range(self.n + 1)] for i in range(self.m + 1)]
        
        for i in range(1, self.m + 1):
            for j in range(1, self.n + 1):
                # Voir slide 63 (L3 alignement de séquences)
                self.V[i][j] = max(
                    self.S[i - 1][j] + OPENING_GAP_PENALTY + EXTENDING_GAP_PENALTY, # before = alignment and now = gap
                    # self.S[i - 1][j] + OPENING_GAP_PENALTY, # before = alignment and now = gap
                    # self.S[i - 1][j] - 1, # before = alignment and now = gap
                    self.V[i - 1][j] + EXTENDING_GAP_PENALTY # before = gap and now = gap
                )

                self.W[i][j] = max(
                    self.S[i][j - 1] + OPENING_GAP_PENALTY + EXTENDING_GAP_PENALTY, # before = alignment and now = gap
                    # self.S[i][j - 1] + OPENING_GAP_PENALTY, # before = alignment and now = gap
                    # self.S[i][j - 1] - 1, # before = alignment and now = gap
                    self.W[i][j - 1] + EXTENDING_GAP_PENALTY # before = gap and now = gap
                )
            
                self.S[i][j] = max(
                    self.S[i - 1][j - 1] + scoringMatrix[self.seq1[i - 1]][self.seq2[j - 1]], # alignment = diagonal
                    self.V[i][j], # gap in Sequence A = top
                    self.W[i][j]  # gap in Sequence B = left
                ) 
    
        self._NW("", "", self.m, self.n)
        
    def __repr__(self):
        ret = ""
        for i in range(0, len(self.result)):
            ret += "%s\n" % (self.result[i][0]) # seq1
            identity = 0
            similarity = 0
            gap = 0
            
            for j in range(0, len(self.result[i][0])):
                if (self.result[i][0][j] == self.result[i][1][j]): # identiy
                    ret += ':'
                    identity += 1
                    similarity += 1
                elif (self.result[i][0][j] == '-' or self.result[i][1][j] == '-'): # gap
                    ret += ' '
                    gap += 1
                else: # similarity
                    ret += '.'
                    similarity += 1
            ret += "\n"
            ret += "%s\n" % (self.result[i][1]) # seq2
            ret += "%3.3f%% identity\n" % (100 * identity / len(self.result[i][0]))
            ret += "%3.3f%% similarity\n" % (100 * similarity / len(self.result[i][0]))
            ret += "%3.3f%% gap\n"  % (100 * gap / len(self.result[i][0]))
            ret += "Length : %i\n" % (len(self.result[i][0]))
            ret += "Global score : %s\n" % (self.S[self.m][self.n])
        return ret

        
    def _NW(self, align1, align2, i, j):
        """
        """
        if (i > 0 and j > 0) and (self.S[i][j] == self.S[i - 1][j - 1] + self.scoringMatrix[self.seq1[i - 1]][self.seq2[j - 1]]):
            self._NW(self.seq1[i - 1] + align1, self.seq2[j - 1] + align2, i - 1, j - 1)
        elif (i > 0) and (self.S[i][j] == self.V[i][j]):
            self._NW(self.seq1[i - 1] + align1, "-" + align2, i - 1, j)
        elif (j > 0) and (self.S[i][j] == self.W[i][j]):
            self._NW("-" + align1, self.seq2[j - 1] + align2, i, j - 1)
        else:
            # end of backtracking : we are back in S[0][0]
            self.result.append((align1, align2))

Vérifiez si votre programme donne le même résultat que l’outil LALIGN http://www.ch.embnet.org/software/LALIGN_form.html

Utilisez les séquences dans le fichier SH3-sequences.fasta sur le site web.        

In [90]:
scoringMatrix = blosum80
print(NeedlemanWunsch(sh3sequence[0], sh3sequence[1],scoringMatrix))

GGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGDWWLAHSLSTGQTGYIPSNYVAPSDS
 .  ...:.::........::::.:..:...:......:....: .:..:.::.::......
-M--EAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAEL-NGKDGFIPKNYIEMKPH
29.032% identity
93.548% similarity
6.452% gap
Length : 62
Global score : 59



Le résultat avec _LALIGN_ est le suivant:

```
lobal/global (N-W) score: 114; 30.2% identity (61.9% similar) in 63 aa overlap (1-60:1-58)

               10        20        30         40        50          60         
unknow GGVTTFVALYDYESRTETDLSFKKGERLQIVNNT-EGDWWLAHSLSTGQTGYIPSNYV--APS
             .: ::... .. .::::.:. :...:   . .:. :.   .:. :.:: ::.   :
unknow ---MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAEL--NGKDGFIPKNYIEMKPH
                  10        20        30        40          50     
```

In [94]:
scoringMatrix = pam120
print(NeedlemanWunsch(sh3sequence[0], sh3sequence[1], scoringMatrix))

GGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGDWWLAHSLSTGQTGYIPSNYVAPSDS
 . .. .:.::........::::.:..:...:......:....:. :..:.::.::......
-M-EA-IAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELN-GKDGFIPKNYIEMKPH
29.032% identity
93.548% similarity
6.452% gap
Length : 62
Global score : 57



Résultat de _LALIGN_

```
 n-w opt:  57  Z-score: 157.8  bits: 32.6 E(1): 2.1e-27
global/global (N-W) score: 57; 29.0% identity (61.3% similar) in 62 aa overlap (1-62:1-58)

               10        20        30        40        50        60
unknow GGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGDWWLAHSLSTGQTGYIPSNYVAPSDS
         .   .: ::. . .. .::::.:. :...:.  .. :    :. :. :.:: ::..  
unknow MEA---IAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELN-GKDGFIPKNYIEMKPH
                  10        20        30        40         50               
```

## Partie 1.3

Modifiez le logiciel de la partie 1.2 de sorte que on peut faire un alignement local (Smith‐Waterman). Utilisez les séquences dans le fichier maguk-sequences.fasta sur le site web. Retrouvez les similarités entre les 4 séquences.
Expliquez les similarités. Plus d’informations concernant les protéines dans le fichier .fasta peuvent être trouvées sur le site UniProt (http://www.uniprot.org/).

In [91]:
class SmithWaterman:
    def __init__(self, seq1, seq2, scoringMatrix):
        self.seq1 = seq1
        self.seq2 = seq2
        self.scoringMatrix = scoringMatrix
        self.result = []
        
        self.m = len(seq1)
        self.n = len(seq2)
        
        self.max = [self.m, self.n]
        
        self.S = init_matrix(self.m, self.n)
        self.V = init_matrix(self.m, self.n)
        self.W = init_matrix(self.m, self.n)
        
        # Remplit les matrices en utilisant les formules vue dans les slides du cour.
        for i in range(1, self.m + 1):
            for j in range(1, self.n + 1):
                # Voir slide 63 (L3 alignement de séquences)
                self.V[i][j] = max(
                    self.S[i - 1][j] + OPENING_GAP_PENALTY + EXTENDING_GAP_PENALTY, # before = alignment and now = gap
                    self.V[i - 1][j] + EXTENDING_GAP_PENALTY # before = gap and now = gap
                )

                self.W[i][j] = max(
                    self.S[i][j - 1] + OPENING_GAP_PENALTY + EXTENDING_GAP_PENALTY, # before = alignment and now = gap
                    self.W[i][j - 1] + EXTENDING_GAP_PENALTY # before = gap and now = gap
                )
            
                self.S[i][j] = max(
                    self.S[i - 1][j - 1] + scoringMatrix[self.seq1[i - 1]][self.seq2[j - 1]], # alignment = diagonal
                    self.V[i][j], # gap in Sequence A = top
                    self.W[i][j],  # gap in Sequence B = left
                    0 # Toutes les valeurs négatives sont mise à 0.
                ) 

                if (self.S[i][j] > self.S[self.max[0]][self.max[1]]):
                    self.max = [i, j]
    
        self._SW("", "", self.m, self.n)
        
    def __repr__(self):
        ret = ""
        for i in range(0, len(self.result)):
            ret += "%s\n" % (self.result[i][0]) # seq1
            identity = 0
            similarity = 0
            gap = 0
            for j in range(0, len(self.result[i][0])):
                if (self.result[i][0][j] == self.result[i][1][j]): # identiy
                    ret += ':'
                    identity += 1
                    similarity += 1
                elif (self.result[i][0][j] == '-' or self.result[i][1][j] == '-'): # gap
                    ret += ' '
                    gap += 1
                else:
                    ret += '.'
                    similarity += 1
            ret += "\n"
            ret += "%s\n" % (self.result[i][1]) # seq2
            ret += "%3.3f%% identity\n" % (100 * identity / len(self.result[i][0]))
            ret += "%3.3f%% similarity\n" % (100 * similarity / len(self.result[i][0]))
            ret += "%3.3f%% gap\n"  % (100 * gap / len(self.result[i][0]))
            ret += "Length : %i\n" % (len(self.result[i][0]))
            ret += "Global score : %s\n" % (self.S[self.max[0]][self.max[1]])
        return ret

        
    def _SW(self, align1, align2, i, j):
        """
        """
        # print("---\nALIGN1: %s\nALIGN2: %s\n---" % (align1, align2))
        if (i > 0 and j > 0) and (self.S[i][j] > 0):
            if self.S[i][j] == self.S[i - 1][j - 1] + self.scoringMatrix[self.seq1[i - 1]][self.seq2[j - 1]]:
                # Vérification par apport à la diagonale.
                self._SW(self.seq1[i - 1] + align1, self.seq2[j - 1] + align2, i - 1, j - 1)
            elif self.S[i][j] == self.V[i][j]:
                # Vérification par apport à la gauche: trous dans seq2
                self._SW(self.seq1[i - 1] + align1, "-" + align2, i - 1, j)
            elif self.S[i][j] == self.W[i][j]:
                # Vérification par apport au dessus: trous sequ1
                self._SW("-" + align1, self.seq2[j - 1] + align2, i, j - 1)

        else:
            self.result.append((align1, align2))

In [92]:
import sys
sys.setrecursionlimit(10000)

scoringMatrix = blosum80

print(SmithWaterman(maguk[0], maguk[1], scoringMatrix))

#for seq1 in maguk:
#    for seq2 in maguk:
#        if seq1 != seq2:
#            print(SmithWaterman(seq1, seq2, scoringMatrix))

SHSHISPIKPTEA-VLPSPPTVPVIPVLPVPAENTVIL-PTIPQANPPPVLVNTDSLETPTYVNGTDADYEYEEITLERGNSGLGFSIAGGTDNPHIGDDSSIFITKIITGGAAAQDGRLRVNDCILRVNEVDVRDVTHSKAVEALKEAGSIVRLYVKRRKPVSEKIMEIKLIKGPKGLGFSIAGGVGNQHIPGDNSIYVTKIIEGGAAHKDGKLQIGDKLLAVNNVCLEEVTHEEAVTALKNTSDFVYLKVAKPTSMYMNDGYAPPDITNSSSQPVDNHVS-PSS--FLG---------QTPA-SPARYSPVSKAVLGDDEITREPRKVVLHRGSTGLGFNIVGGEDGEGIFISFILAGGPADLSGELRKGDRIISVNSVDLRAASHEQAAAALKNAGQAVTIVAQYRPEEYSRFEAKIHDLREQMMNSSISSGSGSLRTSQKRSLYVRALFDYDKTKDSGLPSQGLNFKFGDILHVINASDDEWWQARQVTPDGESDEVGVIPSKRRVEKKERARLKTVKFNSKTRDKGEIPDDMGSKGLKHVTSNASDSESSYRGQEEYVLSYEPVNQQEVNYTRPVIILGPMKDRINDDLISEFPDKFGSCVPHTTRPKRDYEVDGRDYHFVTSREQMEKDIQEHKFIEAGQYNNHLYGTSVQSVREVAEKGKHCILDVSGNAIKRLQIAQLYPISIFIKPKSMENIMEMNKRLTEEQARKTFERAMKLEQEFTEHFTAIVQGDTLEDIYNQVKQIIEEQSGSYIWVPAKEKL
:.....:...:.: ..:....:...:..:::...:..: ...:...:.....:.:..:  . :::.:....::::.:::::::::::::::.::::..::..:::::::.:::::.::::.::::.::::::::..:.::.:::::::::..:::.:.::.:..:.:::..:.:::::::::::::.::::::::::::.:::::::::.:::.:::::.::::::..:..: