In [1]:
import Bio
from Bio import pairwise2
from Bio import SeqIO
from Bio.Align import substitution_matrices
import random



In [2]:
# Load every record of the F8 FASTA file and keep the raw sequences.
# NOTE(review): the absolute path below only exists on the author's machine;
# adjust it (or make it relative to the notebook) before re-running.
fasta_sequences = SeqIO.parse(r"D:\HSE\BioinformaticsCourse2024\homework\1_1\data\f8.fasta", "fasta")
f8_sequences = []
# Iterate over the parser object itself: that is the documented SeqIO API,
# whereas the `.records` attribute is an implementation detail.
for record in fasta_sequences:
    print("Read sequence from file:")
    print("ID:", record.id)
    print("Description:", record.description)
    print("Sequence:", record.seq)
    f8_sequences.append(record.seq)

Read sequence from file:
ID: NC_000023
Description: NC_000023
Sequence: AAGTGCTTGGGGCGGCGAGCATGGCGGCAGCGGCTGCAGGCCTGGGCGGCGGCGGCGCCGGCCCGGGACCCGAGGCCGGGGACTTCCTGGCCCGCTACCGGCTGGTATCGAACAAGCTGAAGAAGCGGTTCCTGCGGAAGCCGAACGTGGCGGAGGCCGGCGAGCAGTTCGGACAGCTGGGCCGGGAGCTGCGCGCCCAGGAGTGTCTGCCCTACGCGGCCTGGTGCCAGCTGGCGGTGGCGCGCTGCCAGCAGGCGCTCTTCCACGGGCCCGGGGAGGCGCTGGCCCTCACCGAGGCCGCCCGCCTCTTCCTGCGGCAGGAGCGCGACGCGCGCCAGCGCCTGGTCTGCCCCGCCGCCTACGGGGAGCCGCTGCAGGCCGCCGCCAGCGCCCTGGGCGCCGCGGTGCGTCTGCACCTCGAGCTGGGCCAGCCGGCCGCCGCCGCCGCCCTCTGCCTCGAGCTGGCCGCCGCCCTGCGCGACCTGGGCCAGCCGGCCGCCGCCGCCGGTCACTTCCAGCGCGCCGCCCAGCTCCAGCTGCCCCAGCTGCCCCTGGCCGCGCTGCAGGCGCTTGGCGAGGCCGCCTCCTGCCAGCTGCTGGCGCGCGACTACACCGGCGCCCTGGCGGTCTTCACGCGCATGCAGCGCCTGGCGCGGGAGCACGGCAGCCACCCGGTGCAGTCACTGCCGCCGCCCCCGCCGCCGGCACCCCAGCCCGGGCCCGGGGCGACGCCCGCCCTACCGGCCGCGCTGCTTCCTCCGAACTCCGGCTCGGCGGCGCCCTCTCCCGCCGCCCTGGGCGCCTTCTCGGACGTGCTGGTCCGCTGCGAGGTGTCCCGCGTGCTGCTGCTGCTCCTCCTGCAACCACCGCCCGCCAAGCTGCTGCCGGAGCACGCCCAGACCCTGGAGAAGTACTCCTGGGAGGCTTT

In [3]:
# Load the two short example sequences from the "gattaca" FASTA file.
# NOTE(review): the absolute path below only exists on the author's machine;
# adjust it (or make it relative to the notebook) before re-running.
fasta_sequences = SeqIO.parse(r"D:\HSE\BioinformaticsCourse2024\homework\1_1\data\gattaca.fasta", "fasta")
gattaca_sequences = []
# Iterate over the parser object itself: that is the documented SeqIO API,
# whereas the `.records` attribute is an implementation detail.
for record in fasta_sequences:
    print("Read sequence from file:")
    print("ID:", record.id)
    print("Description:", record.description)
    print("Sequence:", record.seq)
    gattaca_sequences.append(record.seq)

Read sequence from file:
ID: first
Description: first
Sequence: GATTACA
Read sequence from file:
ID: second
Description: second
Sequence: CATTAGA


In [4]:
def generate_seq(len1, len2, alphabet='ARNDCQEGHILKMFPSTWYVBZX'):
    """Generate two independent, uniformly random sequences.

    Args:
        len1: number of symbols in the first sequence.
        len2: number of symbols in the second sequence.
        alphabet: non-empty string of symbols to draw from. The default is
            the 23-letter set that used to be hard-coded here (the 20
            standard amino-acid codes plus B, Z and X).

    Returns:
        A tuple ``(seq1, seq2)`` of strings with lengths ``len1`` and ``len2``.
        A non-positive length yields an empty string.
    """
    # The first sequence is drawn completely before the second one, so a seeded
    # `random` module produces the same pair as the previous implementation.
    seq1 = ''.join(random.choice(alphabet) for _ in range(len1))
    seq2 = ''.join(random.choice(alphabet) for _ in range(len2))
    return seq1, seq2

# Нидлман Вунш

Реализуйте алгоритм Нидлмана — Вунша для выравнивания последовательностей. На вход принимаются две строки, матрица замен и стоимость гэпа. В результате верните оптимальное выравнивание и его вес. При проверке помните, что оптимальных выравниваний может быть несколько, но вес у них должен совпадать.

In [97]:
def NeedlemanWunsch(seq1, seq2, subsistution_matrix, gap_penalty):
    """Globally align two sequences with a linear gap cost (Needleman-Wunsch).

    Args:
        seq1: first sequence of single-character symbols (may be empty).
        seq2: second sequence of single-character symbols (may be empty).
        subsistution_matrix: scoring table exposing an ``alphabet`` string and
            integer indexing ``matrix[i][j]``, where ``i`` and ``j`` are the
            positions of the two symbols in ``alphabet``.
        gap_penalty: score added for every gap position (normally negative).

    Returns:
        A tuple ``(score, alignment1, alignment2)``: the optimal score and the
        two aligned strings of equal length, with ``'-'`` marking gaps. When
        several alignments are optimal, a substitution is preferred over a
        gap in ``seq2``, which is preferred over a gap in ``seq1``.

    Raises:
        ValueError: if a symbol is missing from the matrix alphabet. Without
            this check ``str.find`` returns -1 and the symbol would silently
            be scored with the last row/column of the matrix.
    """
    alphabet = subsistution_matrix.alphabet

    def to_indices(seq):
        # Translate symbols to matrix indices once, instead of on every DP cell.
        indices = []
        for symbol in seq:
            idx = alphabet.find(symbol)
            if idx < 0:
                raise ValueError(
                    f'symbol {symbol!r} is not in the substitution matrix alphabet'
                )
            indices.append(idx)
        return indices

    rows = to_indices(seq1)
    cols = to_indices(seq2)
    n = len(rows)
    m = len(cols)

    # dp[i][j] is the best score of aligning seq1[:i] with seq2[:j].
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i * gap_penalty
    for j in range(m + 1):
        dp[0][j] = j * gap_penalty
    for i in range(1, n + 1):
        score_row = subsistution_matrix[rows[i - 1]]
        for j in range(1, m + 1):
            dp[i][j] = max(
                dp[i - 1][j - 1] + score_row[cols[j - 1]],
                dp[i - 1][j] + gap_penalty,
                dp[i][j - 1] + gap_penalty,
            )
    score = dp[n][m]

    # Trace back from the bottom-right corner. Columns are collected in
    # reverse order and flipped at the end, which avoids quadratic string
    # prepending.
    aligned1 = []
    aligned2 = []
    i, j = n, m
    while i > 0 or j > 0:
        if (i > 0 and j > 0
                and dp[i][j] == dp[i - 1][j - 1]
                + subsistution_matrix[rows[i - 1]][cols[j - 1]]):
            aligned1.append(seq1[i - 1])
            aligned2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or dp[i][j] == dp[i - 1][j] + gap_penalty):
            # On the left border (j == 0) only a gap in seq2 is possible, so
            # no score comparison is needed there.
            aligned1.append(seq1[i - 1])
            aligned2.append('-')
            i -= 1
        else:
            # Reached only with j > 0: gap in seq1.
            aligned1.append('-')
            aligned2.append(seq2[j - 1])
            j -= 1
    alignment1 = ''.join(reversed(aligned1))
    alignment2 = ''.join(reversed(aligned2))
    return (score, alignment1, alignment2)

def generate_comparison(my_alignment1, my_alignment2):
    """Build the marker line shown between two aligned strings.

    For every column of ``my_alignment1`` the result contains ``'|'`` when
    both symbols are equal, ``'.'`` when they differ and neither is a gap,
    and ``'-'`` when they differ and at least one of them is ``'-'``.
    """
    marks = []
    for pos, upper in enumerate(my_alignment1):
        lower = my_alignment2[pos]
        if upper == lower:
            marks.append('|')
        elif '-' in (upper, lower):
            marks.append('-')
        else:
            marks.append('.')
    return ''.join(marks)



matrix = substitution_matrices.load("BLOSUM62")
# Single source of truth for the linear gap cost, used both by our
# implementation and by the Biopython reference aligner below.
gap_score = -4

# Letters produced by generate_seq, handy when typing test sequences by hand:
#  ARNDCQEGHILKMFPSTWYVBZX
# seq1, seq2 = gattaca_sequences
seq1 = 'YAFDLGYTCMFPVLLGGGELHIVQKETYTAPDEI'
seq2 = 'AFDVSAGDFARALLTGGQLIVCPNEVKMDPASLYAII'
# seq1, seq2 = generate_seq(10, 20)  # absolute random sequences

my_score, my_alignment1, my_alignment2 = NeedlemanWunsch(seq1, seq2, matrix, gap_score)
comparison_str = generate_comparison(my_alignment1, my_alignment2)

print(f'My solution:\nScore = {my_score}:')
print(f'target\t\t  0 {my_alignment1} {len(seq1)}')
print(f'\t\t  0 {comparison_str} {len(comparison_str)}')
print(f'query\t\t  0 {my_alignment2} {len(seq2)}\n')

# Reference: Biopython's aligner with identical open/extend scores behaves
# like a linear gap cost.
aligner = Bio.Align.PairwiseAligner()
aligner.substitution_matrix = matrix
aligner.open_gap_score = gap_score
aligner.extend_gap_score = gap_score
print('Test solution:')
for alignment in aligner.align(seq1, seq2):
    print("Score = %.1f:" % alignment.score)
    print(alignment)
    # Optimal alignments may differ, but their score must not.
    assert abs(alignment.score - my_score) < 1e-9, "score differs from Biopython reference"

My solution:
Score = 21.0:
target		  0 YAFDLGYTCMFP-VLLGGGELHIV-QKETYTAP-D--E-I 34
		  0 -|||..-...|.-.||.||.|-||-..|....|-.--.-| 40
query		  0 -AFDVS-AGDFARALLTGGQL-IVCPNEVKMDPASLYAII 37

Test solution:
Score = 21.0:
target            0 YAFDLGYTCMFP-VLLGGGELHIV-QKETYTAP-D--EI- 34
                  0 -|||..-...|.-.||.||.|-||-..|....|-.--.|- 40
query             0 -AFDVS-AGDFARALLTGGQL-IVCPNEVKMDPASLYAII 37

Score = 21.0:
target            0 YAFDLGYTCMFP-VLLGGGELHIV-QKETYTAP-D--E-I 34
                  0 -|||..-...|.-.||.||.|-||-..|....|-.--.-| 40
query             0 -AFDVS-AGDFARALLTGGQL-IVCPNEVKMDPASLYAII 37



# Аффинные гэпы

Реализуйте выравнивание с аффинными гэпами. Алгоритм принимает на вход две строки, матрицу замен, штраф за начало гэпа α и штраф за его продолжение β, а возвращает выравнивание и его вес. Сложность алгоритма квадратичная по памяти и по времени.

In [101]:
def AfinityGaps(seq1, seq2, subsistution_matrix, open_gap_penalty, extend_gap_penalty):
    """Globally align two sequences with affine gap costs.

    Three score tables are filled in O(len(seq1) * len(seq2)) time and memory:
    ``dp`` (best score overall), ``dp_insertion`` (best score ending with a
    symbol of ``seq1`` against a gap) and ``dp_deletion`` (best score ending
    with a symbol of ``seq2`` against a gap).

    Args:
        seq1: first sequence of single-character symbols (may be empty).
        seq2: second sequence of single-character symbols (may be empty).
        subsistution_matrix: scoring table exposing an ``alphabet`` string and
            integer indexing ``matrix[i][j]``, where ``i`` and ``j`` are the
            positions of the two symbols in ``alphabet``.
        open_gap_penalty: score of the first position of a gap.
        extend_gap_penalty: score of every further position of the same gap.

    Returns:
        A tuple ``(score, alignment1, alignment2)``: the optimal score and the
        two aligned strings of equal length, with ``'-'`` marking gaps.

    Raises:
        ValueError: if a symbol is missing from the matrix alphabet. Without
            this check ``str.find`` returns -1 and the symbol would silently
            be scored with the last row/column of the matrix.
    """
    alphabet = subsistution_matrix.alphabet

    def to_indices(seq):
        # Translate symbols to matrix indices once, instead of on every DP cell.
        indices = []
        for symbol in seq:
            idx = alphabet.find(symbol)
            if idx < 0:
                raise ValueError(
                    f'symbol {symbol!r} is not in the substitution matrix alphabet'
                )
            indices.append(idx)
        return indices

    rows = to_indices(seq1)
    cols = to_indices(seq2)
    n = len(rows)
    m = len(cols)

    # A true -inf marks impossible states; unlike a large finite sentinel it
    # can never be beaten by a legitimately very low score.
    neg_inf = float('-inf')
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    dp_insertion = [[0] * (m + 1) for _ in range(n + 1)]
    dp_deletion = [[0] * (m + 1) for _ in range(n + 1)]
    for j in range(1, m + 1):
        # Top border: a single gap in seq1 of length j.
        dp[0][j] = open_gap_penalty + (j - 1) * extend_gap_penalty
        dp_deletion[0][j] = dp[0][j]
        dp_insertion[0][j] = neg_inf
    for i in range(1, n + 1):
        # Left border: a single gap in seq2 of length i.
        dp[i][0] = open_gap_penalty + (i - 1) * extend_gap_penalty
        dp_insertion[i][0] = dp[i][0]
        dp_deletion[i][0] = neg_inf

    for i in range(1, n + 1):
        score_row = subsistution_matrix[rows[i - 1]]
        for j in range(1, m + 1):
            dp_insertion[i][j] = max(
                dp_insertion[i - 1][j] + extend_gap_penalty,
                dp[i - 1][j] + open_gap_penalty,
            )
            dp_deletion[i][j] = max(
                dp_deletion[i][j - 1] + extend_gap_penalty,
                dp[i][j - 1] + open_gap_penalty,
            )
            dp[i][j] = max(
                dp[i - 1][j - 1] + score_row[cols[j - 1]],
                dp_insertion[i][j],
                dp_deletion[i][j],
            )
    score = dp[n][m]

    # Trace back as a three-state machine. Columns are collected in reverse
    # order and flipped at the end.
    aligned1 = []
    aligned2 = []
    current_matrix = 'substitution'
    i, j = n, m
    while i > 0 or j > 0:
        # On a border only one kind of gap is possible.
        if i == 0:
            current_matrix = 'deletion'
        elif j == 0:
            current_matrix = 'insertion'

        if current_matrix == 'substitution':
            if dp[i][j] == dp[i - 1][j - 1] + subsistution_matrix[rows[i - 1]][cols[j - 1]]:
                aligned1.append(seq1[i - 1])
                aligned2.append(seq2[j - 1])
                i -= 1
                j -= 1
            elif dp[i][j] == dp_insertion[i][j]:
                current_matrix = 'insertion'
            else:
                # dp[i][j] is the maximum of the three candidates, so this is
                # the only one left; switching unconditionally guarantees progress.
                current_matrix = 'deletion'
        elif current_matrix == 'insertion':
            # If this gap position was an opening, the preceding column is
            # chosen from the main table again.
            if dp_insertion[i][j] == dp[i - 1][j] + open_gap_penalty:
                current_matrix = 'substitution'
            aligned1.append(seq1[i - 1])
            aligned2.append('-')
            i -= 1
        else:
            if dp_deletion[i][j] == dp[i][j - 1] + open_gap_penalty:
                current_matrix = 'substitution'
            aligned1.append('-')
            aligned2.append(seq2[j - 1])
            j -= 1
    alignment1 = ''.join(reversed(aligned1))
    alignment2 = ''.join(reversed(aligned2))
    return (score, alignment1, alignment2)

def generate_comparison(my_alignment1, my_alignment2):
    """Return the per-column marker string for a pair of aligned sequences.

    ``'|'`` marks equal symbols, ``'.'`` marks two different non-gap symbols,
    and ``'-'`` marks a column where the symbols differ and one of them is a
    gap (``'-'``). The result has one marker per position of
    ``my_alignment1``.

    Note: a function with this name is also defined earlier in the notebook;
    this definition shadows it.
    """
    def marker(first, second):
        if first == second:
            return '|'
        return '.' if first != '-' and second != '-' else '-'

    return ''.join(
        marker(my_alignment1[k], my_alignment2[k])
        for k in range(len(my_alignment1))
    )



matrix = substitution_matrices.load("BLOSUM62")

# Letters produced by generate_seq, handy when typing test sequences by hand:
#  ARNDCQEGHILKMFPSTWYVBZX
# seq1, seq2 = gattaca_sequences
seq1 = 'YAFDLGYTCMFPVLLGGGELHIVQKETYTAPDEI'
seq2 = 'AFDVSAGDFARALLTGGQLIVCPNEVKMDPASLYAII'
# seq1, seq2 = generate_seq(10, 20)  # absolute random sequences

# Score of the first gap position and of every following position of a gap.
open_gap_score = -3
extend_gap_score = -2

my_score, my_alignment1, my_alignment2 = AfinityGaps(seq1, seq2, matrix, open_gap_score, extend_gap_score)
comparison_str = generate_comparison(my_alignment1, my_alignment2)

print(f'My solution:\nScore = {my_score}:')
print(f'target\t\t  0 {my_alignment1} {len(seq1)}')
print(f'\t\t  0 {comparison_str} {len(comparison_str)}')
print(f'query\t\t  0 {my_alignment2} {len(seq2)}\n')

# Reference: Biopython's aligner configured with the same affine gap scores.
aligner = Bio.Align.PairwiseAligner()
aligner.substitution_matrix = matrix
aligner.open_gap_score = open_gap_score
aligner.extend_gap_score = extend_gap_score
print('Test solution:')
for alignment in aligner.align(seq1, seq2):
    print("Score = %.1f:" % alignment.score)
    print(alignment)
    # Optimal alignments may differ, but their score must not.
    assert abs(alignment.score - my_score) < 1e-9, "score differs from Biopython reference"

My solution:
Score = 33.0:
target		  0 YAFDL--GYTCMFP-VLLGGGELHIV-QKETYTAPDE----I 34
		  0 -|||.--|-.--|.-.||.||.|-||-..|....|..----| 42
query		  0 -AFDVSAG-D--FARALLTGGQL-IVCPNEVKMDPASLYAII 37

Test solution:
Score = 33.0:
target            0 YAFDL--GYTCMFP-VLLGGGELHIV-QKETYTAPDE----I 34
                  0 -|||.--|-.--|.-.||.||.|-||-..|....|..----| 42
query             0 -AFDVSAG-D--FARALLTGGQL-IVCPNEVKMDPASLYAII 37

