## Жадный алгоритм множественного выравнивания
Реализуйте алгоритм, который принимает на вход массив строк, штрафы за удаление, вставку и несовпадение, а также цену совпадения и возвращает множественное выравнивание. На первом шаге алгоритм должен выбрать две самые близкие по расстоянию Левенштейна строки и заменить их консенсусной строкой. На следующих шагах между собой могут выравниваться также и консенсусные строки. При этом для каждой строки стоит хранить не только её саму, но и профиль множественного выравнивания, чтобы в итоге правильно пересчитывать консенсус.
Результат работы алгоритма — массив строк, соответствующий некоторому множественному выравниванию.

In [131]:
import numpy as np

def levenstein_distance_matrix(seq1, seq2, cost_del, cost_ins, cost_match, cost_mismatch):
    """Fill the global-alignment score table for two sequences.

    Despite the name, the table holds alignment *scores* that are maximised,
    not edit distances that are minimised.

    Args:
        seq1: sequence indexed by the table rows.
        seq2: sequence indexed by the table columns.
        cost_del: score added for one step along a row (consumes a ``seq2`` symbol only).
        cost_ins: score added for one step down a column (consumes a ``seq1`` symbol only).
        cost_match: score added when the two aligned symbols are equal.
        cost_mismatch: score added when the two aligned symbols differ.

    Returns:
        A ``(len(seq1) + 1) x (len(seq2) + 1)`` list of lists; cell ``[i][j]``
        is the best score for aligning ``seq1[:i]`` with ``seq2[:j]``.
    """
    height = len(seq1) + 1
    width = len(seq2) + 1

    # First column: prefixes of seq1 aligned against nothing.
    table = [[row * cost_ins] + [0] * (width - 1) for row in range(height)]
    # First row: prefixes of seq2 aligned against nothing.
    table[0] = [col * cost_del for col in range(width)]

    for row, symbol1 in enumerate(seq1, start=1):
        previous = table[row - 1]
        current = table[row]
        for col, symbol2 in enumerate(seq2, start=1):
            pair_score = cost_match if symbol1 == symbol2 else cost_mismatch
            from_diagonal = previous[col - 1] + pair_score
            from_above = previous[col] + cost_ins
            from_left = current[col - 1] + cost_del
            current[col] = max(from_diagonal, from_above, from_left)
    return table


def alignments(seq1, seq2, dp, cost_del, cost_ins, cost_match, cost_mismatch):
    """Trace one optimal alignment back through a filled score table.

    The walk starts at ``dp[len(seq1)][len(seq2)]`` and moves towards
    ``dp[0][0]``. At every cell the diagonal move is tried first, then the
    move that consumes a ``seq1`` symbol, then the move that consumes a
    ``seq2`` symbol.

    Args:
        seq1: first sequence (table rows).
        seq2: second sequence (table columns).
        dp: score table indexed as ``dp[i][j]`` for ``seq1[:i]`` / ``seq2[:j]``;
            it must have been built with the same sequences and costs.
        cost_del: score of a gap placed in ``seq1`` (step along a row).
        cost_ins: score of a gap placed in ``seq2`` (step down a column).
        cost_match: score of aligning two equal symbols.
        cost_mismatch: score of aligning two different symbols.

    Returns:
        Tuple ``(alignment1, alignment2)`` of equal-length strings in which
        ``'-'`` marks a gap.

    Raises:
        ValueError: if no move explains the current cell, i.e. ``dp`` does not
            match the given sequences and costs. Without this check the walk
            would never terminate.
    """
    i = len(seq1)
    j = len(seq2)
    # Symbols are collected back-to-front and reversed once at the end,
    # which avoids rebuilding the strings on every step.
    reversed1 = []
    reversed2 = []
    while i > 0 or j > 0:
        here = dp[i][j]
        if i > 0 and j > 0 and here == dp[i-1][j-1] + (cost_match if seq1[i-1] == seq2[j-1] else cost_mismatch):
            reversed1.append(seq1[i-1])
            reversed2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif i > 0 and here == dp[i-1][j] + cost_ins:
            reversed1.append(seq1[i-1])
            reversed2.append('-')
            i -= 1
        elif j > 0 and here == dp[i][j-1] + cost_del:
            reversed1.append('-')
            reversed2.append(seq2[j-1])
            j -= 1
        else:
            raise ValueError(
                'score table is inconsistent with the sequences/costs '
                'at cell ({}, {})'.format(i, j)
            )
    return ''.join(reversed(reversed1)), ''.join(reversed(reversed2))

class consensuns_string:
    """Consensus string bundled with the alignment profile it summarises.

    Attributes:
        msa_profile: NumPy array of symbols; one row per aligned sequence,
            one column per alignment position.
        consensuns_string: the current consensus, one symbol per column.
    """

    # Order used to break ties between equally frequent symbols in a column.
    # Symbols that are not listed here rank after these, in sorted order.
    _SYMBOL_PRIORITY = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-': 4}

    def __init__(self, string=None, msa_profile=None, is_msa = False):
        """Build the profile and compute its consensus.

        Args:
            string: a single sequence; used when ``is_msa`` is false.
            msa_profile: 2-D collection of symbols; used only when ``is_msa``
                is true and ignored otherwise.
            is_msa: selects which of the two arguments above is the source
                of the profile.
        """
        if not is_msa:
            # dtype=str keeps the one-row profile a character array even for
            # an empty string, so gap columns can still be inserted later.
            self.msa_profile = np.array([list(string)], dtype=str)
        else:
            self.msa_profile = np.array(msa_profile)
        self.consensuns_string = self.get_consensuns()

    def update_msa_profile(self, aligned_old_consensuns):
        """Insert gap columns so the profile matches a gapped consensus.

        Args:
            aligned_old_consensuns: the current consensus with additional
                ``'-'`` symbols inserted by a pairwise alignment.

        The stored consensus is replaced by ``aligned_old_consensuns`` as is;
        it is not recomputed from the profile.
        """
        # The '*' padding keeps the index lookups below in range when the
        # aligned string is longer than the old consensus.
        old_consensuns = self.consensuns_string + '*'*len(aligned_old_consensuns)
        self.consensuns_string = aligned_old_consensuns
        for chr_idx, aligned_symbol in enumerate(aligned_old_consensuns):
            # A '-' that lines up with a '-' already present in the old
            # consensus is taken to be that old symbol; any other '-' is a
            # new gap and gets its own all-gap column.
            if aligned_symbol == '-' and old_consensuns[chr_idx] != '-':
                self.msa_profile = np.insert(self.msa_profile, chr_idx, '-', axis=1)
                old_consensuns = old_consensuns[:chr_idx] + '-' + old_consensuns[chr_idx:]

    def get_consensuns(self):
        """Return the most frequent symbol of every profile column as a string.

        Ties are resolved in the order ``A, C, G, T, -``. Symbols outside that
        set are accepted as well; among themselves they are ordered by their
        sorted value and they lose ties against the five listed symbols.
        """
        unknown_rank = len(self._SYMBOL_PRIORITY)
        consensuns = []
        for column in self.msa_profile.T:
            symbols, counts = np.unique(column, return_counts=True)
            best_symbol, _ = min(
                zip(symbols.tolist(), counts.tolist()),
                key=lambda pair: (
                    -pair[1],
                    self._SYMBOL_PRIORITY.get(pair[0], unknown_rank),
                    pair[0],
                ),
            )
            consensuns.append(str(best_symbol))
        return ''.join(consensuns)


def msa(sequences, cost_del, cost_ins, cost_match, cost_mismatch):
    """Greedy progressive multiple sequence alignment.

    Every input string starts as its own one-row profile. While more than one
    profile is left, the pair whose consensus strings have the highest
    pairwise alignment score is aligned, gap columns are inserted into both
    profiles, and the two profiles are stacked into a new one whose consensus
    is recomputed.

    Args:
        sequences: iterable of strings to align.
        cost_del: gap score passed through to the pairwise alignment.
        cost_ins: gap score passed through to the pairwise alignment.
        cost_match: score for two equal symbols.
        cost_mismatch: score for two different symbols.

    Returns:
        NumPy array of symbols with one row per input sequence. Rows follow
        the order in which profiles were merged, not the input order. An
        empty input yields an empty array of shape ``(0, 0)``.
    """
    profiles = [consensuns_string(sequence) for sequence in sequences]
    if not profiles:
        # Nothing to align; previously this fell through to an IndexError.
        return np.empty((0, 0), dtype=str)

    while len(profiles) > 1:
        best_score = -np.inf
        best_matrix = None
        best_pair = None
        # Visit every unordered pair; the first pair reaching the top score
        # is kept because the comparison below is strict.
        for later_idx, later in enumerate(profiles):
            for earlier in profiles[:later_idx]:
                matrix = levenstein_distance_matrix(
                    later.consensuns_string, earlier.consensuns_string,
                    cost_del, cost_ins, cost_match, cost_mismatch,
                )
                score = matrix[-1][-1]
                if score > best_score:
                    best_score = score
                    best_matrix = matrix
                    best_pair = (later, earlier)

        sequence1, sequence2 = best_pair
        alignment1, alignment2 = alignments(
            sequence1.consensuns_string, sequence2.consensuns_string,
            best_matrix, cost_del, cost_ins, cost_match, cost_mismatch,
        )
        # Bring both profiles to the common aligned width before stacking.
        sequence1.update_msa_profile(alignment1)
        sequence2.update_msa_profile(alignment2)
        merged_profile = np.concatenate((sequence1.msa_profile, sequence2.msa_profile), axis=0)
        merged = consensuns_string(None, merged_profile, True)

        profiles.remove(sequence1)
        profiles.remove(sequence2)
        profiles.append(merged)
    return profiles[0].msa_profile


In [132]:
# Scratch instance. is_msa is left at its default (False), so the constructor
# builds the profile from the string 'AGTC' only; the list passed as
# msa_profile is not used.
# NOTE(review): if a three-row profile was intended, is_msa=True is needed - confirm.
a = consensuns_string('AGTC', [['A', 'G', 'T', 'C'], ['A', 'T', 'C', 'G'], ['A', 'T', 'C', 'G']])

In [135]:
# Sample inputs for the alignment: a larger set and a three-sequence set.
secs1 = [
    'TCGGGGTTTTT',
    'CCTGACTTAC',
    'ACGGGATTTTC',
    'AGTC',
    'TTGGGGACTTCC',
    'TCGGATTCAT',
    'GGGATTCC',
    'TAGGGGAACC',
    'TCGGGTATAACC',
]

secs2 = [
    'ATGTTATA',
    'AGCGATCA',
    'ATCGTCTC',
]

In [140]:
# Align the secs2 sequences and print the resulting profile array.
# Positional costs: cost_del=-2, cost_ins=-2, cost_match=1, cost_mismatch=-1.
print(msa(secs2, -2, -2, 1, -1))

[['A' 'T' 'C' 'G' 'T' 'C' 'T' '-' 'C']
 ['A' 'T' 'G' 'T' 'T' 'A' 'T' '-' 'A']
 ['A' 'G' 'C' 'G' '-' 'A' 'T' 'C' 'A']]
