# TASK 1 - Implementation of Sequence Alignment Algorithms
Zofia Łągiewka 313096

In [2]:
import csv
import numpy as np

In [3]:
def load_matrix(path):
    """
    Loads a substitution matrix from a CSV file into a nested dictionary

    The first row of the file is the header: its first cell is ignored and the
    remaining cells are the column labels. Every following row starts with a
    row label followed by one integer score per column. Whitespace around
    labels and scores is stripped. Rows that contain no content at all (for
    example a trailing empty line) are skipped.

        Parameters:
        - path (str): filepath to the substitution matrix in CSV format

        Returns:
        - matrix_ (dict): nested dictionary where matrix_[row_label][column_label] is the integer score read from the file

        Raises:
        - StopIteration: if the file has no header row
        - ValueError: if a score cell cannot be converted to int
    """
    matrix_ = {}
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(path, 'r', newline='') as file:
        r = csv.reader(file)
        column_labels = [header.strip() for header in next(r)[1:]]
        for row in r:
            # csv.reader yields [] for blank lines; indexing row[0] on such a
            # row would raise IndexError, so skip rows without any content.
            if not any(cell.strip() for cell in row):
                continue
            row_label = row[0].strip()
            values = [int(cell.strip()) for cell in row[1:]]
            # zip pairs scores with column labels positionally and stops at
            # the shorter of the two.
            matrix_[row_label] = dict(zip(column_labels, values))
    return matrix_

In [337]:
def fill_matrices(sequence1, sequence2, substitution_matrix, GP):
    """
    Builds the score table and the traceback table for two sequences

    Rows correspond to positions in sequence1 and columns to positions in
    sequence2; row 0 and column 0 hold the all-gap prefixes. Each inner cell
    takes the best of three candidates: the diagonal neighbour plus the
    substitution score, the upper neighbour plus GP, and the left neighbour
    plus GP. Every candidate that reaches the best score is recorded, so a
    cell may carry more than one direction.

        Parameters:
        - sequence1 (str): first sequence (indexes the rows)
        - sequence2 (str): second sequence (indexes the columns)
        - substitution_matrix (dict): nested dictionary read as substitution_matrix[a][b], giving the score for pairing symbol a of sequence1 with symbol b of sequence2
        - GP (int): gap penalty added for every 'u' or 'l' step

        Returns:
        - scoring_matrix (np.ndarray): integer array of shape (len(sequence1) + 1, len(sequence2) + 1) with the best score of each cell
        - direction_matrix (np.ndarray): object array of the same shape; each filled cell holds a list with any of 'd' (diagonal), 'u' (up), 'l' (left), always in that order. Cell (0, 0) is never assigned and stays None
    """
    n_rows = len(sequence1) + 1
    n_cols = len(sequence2) + 1
    scores = np.zeros((n_rows, n_cols), dtype=int)
    moves = np.empty((n_rows, n_cols), dtype=object)

    # First column: only vertical steps are possible.
    for row in range(1, n_rows):
        scores[row, 0] = scores[row - 1, 0] + GP
        moves[row, 0] = ['u']

    # First row: only horizontal steps are possible.
    for col in range(1, n_cols):
        scores[0, col] = scores[0, col - 1] + GP
        moves[0, col] = ['l']

    for row, symbol1 in enumerate(sequence1, start=1):
        for col, symbol2 in enumerate(sequence2, start=1):
            # Order matters: ties are reported as 'd', then 'u', then 'l'.
            candidates = (
                ("d", scores[row - 1, col - 1] + substitution_matrix[symbol1][symbol2]),
                ("u", scores[row - 1, col] + GP),
                ("l", scores[row, col - 1] + GP),
            )
            best = max(score for _, score in candidates)
            scores[row, col] = best
            moves[row, col] = [tag for tag, score in candidates if score == best]

    return scores, moves