# Needleman-Wunsch with Affine Scoring

### FASTA FILE INTO LIST

In [2]:
# fasta reader
def read_fasta(file):
    with open(file, 'r') as f:
        lines = f.readlines()
    return lines

# fasta parser
def parse_fasta(lines):
    seq_list = []
    seq=''
    for line,index in zip(lines,range(len(lines))):
        if index == len(lines)-1:
            seq += line.strip()
            seq_list.append(seq)
        if line.startswith('>'):
            seq_list.append(seq)
            seq = ''
            continue
        else:
            seq += line.strip()
    for i in seq_list:
        if i == '':
            seq_list.remove(i)
    return seq_list


### SUBSTITUTION MATRIX INTO DICTIONARY

In [3]:
# matrix txt reader
def read_matrix(file):
    with open(file, 'r') as f:
        lines = f.readlines()
    return lines


# matrix parser
def parse_matrix(lines):
    matrix = []
    for line in lines:
        if line.startswith(' '):
            item = line.strip().split()
            item.insert(0, ' ')
            matrix.append(item)
        else:
            matrix.append(line.strip().split())

    return matrix

# matrix to dictionary
def matrix_to_dict(matrix):
    matrix_dict = {}
    for i in range(1, len(matrix)):
        for j in range(1, len(matrix[i])):
            matrix_dict[matrix[i][0] + matrix[0][j]] = int(matrix[i][j])
    return matrix_dict

In [4]:
# get score
def score(term_1, term_2, matrix_dict):
    term = term_1 + term_2
    return matrix_dict[term]


In [5]:
def output(alignment_seq1, alignment_seq2, alignment_num, final, filepath,
           seqname1, seqname2, len_seq1, len_seq2):
    interval = ''
    matches = 0
    indels = 0
    indel_length = 0
    alignment_length = len(alignment_seq1)
    average_alignment_length = (len_seq1 + len_seq2) / 2
    gap_recorder = []

    for k in range(0, len(alignment_seq1)):
        if alignment_seq1[k] == alignment_seq2[
                k] and alignment_seq1[k] != '-' and alignment_seq2[k] != '-':
            gap_recorder = []
            interval += '|'
            matches += 1

        elif alignment_seq1[k] != alignment_seq2[k] and alignment_seq1[
                k] != '-' and alignment_seq2[k] != '-':
            gap_recorder = []
            interval += '*'

        else:
            interval += ' '
            indel_length += 1
            if alignment_seq1[k] == '-':
                if gap_recorder == []:
                    gap_recorder.append(1)
                    indels += 1

                else:
                    if gap_recorder[-1] == 1:
                        continue
                    else:
                        gap_recorder.append(1)
                        indels += 1

            if alignment_seq2[k] == '-':
                if gap_recorder == []:
                    gap_recorder.append(2)
                    indels += 1
                else:
                    if gap_recorder[-1] == 2:
                        continue
                    else:
                        gap_recorder.append(2)
                        indels += 1

    percentidentity = matches / average_alignment_length
    mean_indel_length = round(indel_length / indels, 1)
    score = final

    print('\n', 'Alignment #', alignment_num, '\n', '\n', 'Matches: ', matches,
          '\n', 'Percent Identity: ', percentidentity, '\n', 'Indels: number=',
          indels, '  mean length=', mean_indel_length, '\n',
          'Alighment length: ', alignment_length, '\n', 'Score=', score, '\n',
          '\n')

    f = open(filepath, 'a')
    f.write('\n')
    f.write('Alignment #' + str(alignment_num))
    f.write('\n')
    f.write('\n')
    f.write('Sequence #1 ' + seqname1)
    f.write('\n')
    f.write('Sequence #2 ' + seqname2)
    f.write('\n')
    f.write('Matches: ' + str(matches))
    f.write('\n')
    f.write('Percent Identity: ' + str(percentidentity))
    f.write('\n')
    f.write('Indels: number=' + str(indels) + '  ' + 'mean length=' +
            str(mean_indel_length))
    f.write('\n')
    f.write('Alignment length:' + str(alignment_length))
    f.write('\n')
    f.write('Score=' + str(score))
    f.write('\n')
    f.write('\n')

    for k in range(1, alignment_length // 60 + 1):
        print('\n' + alignment_seq1[60 * (k - 1):60 * k] + '\n' +
              interval[60 * (k - 1):60 * k] + '\n' +
              alignment_seq2[60 * (k - 1):60 * k])
        f.write('\n')
        f.write('\n')
        f.write(alignment_seq1[60 * (k - 1):60 * k])
        f.write('\n')
        f.write(interval[60 * (k - 1):60 * k])
        f.write('\n')
        f.write(alignment_seq2[60 * (k - 1):60 * k])

    if alignment_length % 60 != 0:
        print('\n' + alignment_seq1[60 * k:] + '\n' + interval[60 * k:] +
              '\n' + alignment_seq2[60 * k:] + '\n' + '\n')
        f.write('\n')
        f.write('\n')
        f.write(alignment_seq1[60 * k:])
        f.write('\n')

        f.write(interval[60 * k:])
        f.write('\n')

        f.write(alignment_seq2[60 * k:])
        f.write('\n')


## AFFINE SCORING

In [6]:
g = 4      #gap open panelty
e = 0.1    #gap extend panelty

In [7]:
def initialize(nrows, ncols):

    ret = []

    for x in range(nrows):
        # For each row, add an empty list
        ret.append([])

        for y in range(ncols):
            # Add a zero to each column in each row
            ret[-1].append(0)
    # make the (0,0) cell 0
    ret[0][0] = 0
    # Return the matrix
    return ret

In [67]:
def needleman_wunsch_affinescoring(seq1, seq2, score_matrix,g ,e):
    # initialize matrices
    match_matrix = initialize(len(seq1)+1, len(seq2)+1)   #!!
    insertion_matrix = initialize(len(seq1)+1, len(seq2)+1)
    deletion_matrix = initialize(len(seq1)+1, len(seq2)+1)
    max_matrix = initialize(len(seq1) + 1, len(seq2) + 1)
    
    

    # fill first row and column
    for i in range(1, len(seq1)+1):
        match_matrix[i][0] = -10000
        insertion_matrix[i][0] =  -10000
        deletion_matrix[i][0] = - g - e * (i - 1)
        

    for j in range(1, len(seq2)+1):
        match_matrix[0][j] = -10000
        insertion_matrix[0][j] =  - g - e * (j - 1)
        deletion_matrix[0][j] = -10000
        
        

    for i in range(1, len(seq1) + 1):
        for j in range(1, len(seq2) + 1):
            match_matrix[i][j] = max(match_matrix[i - 1][j - 1] + score(seq1[i - 1], seq2[j - 1], score_matrix),
                                         insertion_matrix[i - 1][j - 1]+ score(seq1[i - 1], seq2[j - 1], score_matrix),
                                         deletion_matrix[i - 1][j - 1]+ score(seq1[i - 1], seq2[j - 1], score_matrix))

            insertion_matrix[i][j] = max(match_matrix[i][j-1] - g ,
                                             insertion_matrix[i][j-1] - e, deletion_matrix[i][j-1] - g )
            #deletion_matrix[i][j-1] - g ))

            deletion_matrix[i][j] = max(match_matrix[i-1][j] - g,
                                            deletion_matrix[i-1][j] - e, insertion_matrix[i-1][j] - g)
            #insertion_matrix[i-1][j] - g ))
            
            max_matrix[i][j] = max(match_matrix[i][j],insertion_matrix[i][j],deletion_matrix[i][j])




    #return (match_matrix[len(seq1)][len(seq2)],insertion_matrix[len(seq1)][len(seq2)])
    # Traceback
    i = len(seq1)
    j = len(seq2)
    alignment_seq1 = ''
    alignment_seq2 = ''
    while i > 0 and j > 0:
        if (max_matrix[i][j] == match_matrix[i][j]):
            alignment_seq1 = seq1[i - 1] + alignment_seq1
            alignment_seq2 = seq2[j - 1] + alignment_seq2
            i -= 1
            j -= 1
        elif (max_matrix[i][j] == deletion_matrix[i][j]):
            alignment_seq1 = seq1[i - 1] + alignment_seq1
            alignment_seq2 = '-' + alignment_seq2
            i -= 1
        elif (max_matrix[i][j] == insertion_matrix[i][j]):
            alignment_seq1 = '-' + alignment_seq1
            alignment_seq2 = seq2[j - 1] + alignment_seq2
            j -= 1

    while i > 0:
        alignment_seq1 = seq1[i - 1] + alignment_seq1
        alignment_seq2 = '-' + alignment_seq2
        i -= 1
        
    while j > 0:
        alignment_seq1 = '-' + alignment_seq1
        alignment_seq2 = seq2[j - 1] + alignment_seq2
        j -= 1
        

    #print(alignment_seq1)
    #print(alignment_seq2)
    #return traceback_matrix
    return(alignment_seq1, alignment_seq2, max_matrix[len(seq1)][len(seq2)], len(seq1), len(seq2))

In [9]:
# Loading

matrix_dict = matrix_to_dict(parse_matrix(read_matrix('matrix.txt')))
close_first_seq = parse_fasta(read_fasta('close-first.fasta'))
close_second_seq = parse_fasta(read_fasta('close-second.fasta'))

In [65]:
for k in range(len(close_first_seq)):

    alignment_seq1, alignment_seq2, final, len_seq_1, len_seq_2 = needleman_wunsch_affinescoring(
        close_first_seq[k], close_second_seq[k], matrix_dict, g, e)
    output(alignment_seq1, alignment_seq2, k+1,final,'affine-close.txt','close seq1','close seq2', len_seq_1, len_seq_2)



 Alignment # 1 
 
 Matches:  247 
 Percent Identity:  0.9097605893186004 
 Indels: number= 26   mean length= 1.4 
 Alighment length:  290 
 Score= 1111.8999999999999 
 


AGACGGAGTTCCT--TCTTGAGGCA-ATCTGCGTCTC---A---ATCATAGCCCTCTCTG
|||||||| ||||  |||||||||| ||**|||||||   |   ||||||||||||||| 
AGACGGAG-TCCTGCTCTTGAGGCACATTCGCGTCTCTAGACCCATCATAGCCCTCTCT-

GCTACCCGCCCGATATCAATC-CTGTTGATTA-TTTCACAGCCCACAAGCCGGCCTG-AG
|||||||||||||| | |||| |||||||||| |||||||||||||||||||| ||| ||
GCTACCCGCCCGAT-T-AATCTCTGTTGATTATTTTCACAGCCCACAAGCCGG-CTGAAG

CAAG-AACGG--AGCGCCTACCTACTTTTATGATTGGGATTACAGATTACAGACG-TTTG
|||| |||||  |||| ||||||| ||||||| |||||||||||||||||||||| ||||
CAAGCAACGGACAGCG-CTACCTA-TTTTATG-TTGGGATTACAGATTACAGACGTTTTG

G-TATCTG---AG-CCATGTCTGCCT-ATAATGATTCGGGCTGGG-CCGAAGTGCTACAG
| ||||*|   || |||||||||||| || ||||||||||||||| ||||||||||*|||
GCTATCGGCCTAGCCCATGTCTGCCTAAT-ATGATTCGGGCTGGGACCGAAGTGCTCCAG

TAGACATTAGACAT----CCAAAGCGACGGTGGATGATCTAAATACTTGG
||||||||||||||    |||||||||*||||||*| ||||

In [66]:
distant_first_seq = parse_fasta(read_fasta('distant-first.fasta'))
distant_second_seq = parse_fasta(read_fasta('distant-second.fasta'))
for k in range(len(distant_first_seq)):

    alignment_seq1, alignment_seq2, final, len_seq_1, len_seq_2 = needleman_wunsch_affinescoring(
        distant_first_seq[k], distant_second_seq[k], matrix_dict, g, e)
    output(alignment_seq1, alignment_seq2, k + 1, final, 'affine-distant.txt',
           'distant seq1', 'distant seq2', len_seq_1, len_seq_2)



 Alignment # 1 
 
 Matches:  177 
 Percent Identity:  0.6232394366197183 
 Indels: number= 68   mean length= 2.4 
 Alighment length:  365 
 Score= 617.3999999999991 
 


-----C--------ACTCCTTTGGCTCCCGATTTAATACAATCCGC-GTACAGTGAGGCG
     |        || ||* | ||||    ||| | || |   || ||||  |   |||
GGGTGCATGAATAAAC-CCC-T-GCTC----TTT-A-AC-A---GCCGTAC--T---GCG

GCG---GGC-C-C--------GATTACAGATTAC-----A-GG------GG----GC---
|*|     | | |        |||||||||||||     | ||      ||    *|   
GAGAAC--CTCGCAGTTAAAAGATTACAGATTACATTTAACGGCTCTCTGGTACCACTAA

--CGG--T-T--TTAGTAC-GATATCC-T--C---GA-AGCCGCAT-G--C--GG-CCGG
  |||  | |  ||| *|| | | ||| |  |   |* ||* |||| |  |  || ||||
AACGGCTTCTGCTTA-CACTG-T-TCCGTCCCGGGGTTAGA-GCATGGATCAGGGTCCGG

-CCTATGCTGAACCCCGGCTTCTTTAAGCCCTAGACATTAGACATGGGAGGCCTATAAAC
 ||||  |||      ||         |   ||||||||||||||**    |||*|*|*|
CCCTA--CTG------GG---------G---TAGACATTAGACATCC----CCTTTGACC

T-GTAATCTAT-CAGATC--TCTATCTGCTAAT-TT----TATCATCGTAGACCTTT-TT
| ||**||||| ||*|||  |||||*|||  |