# Needleman-Wunsch with Linear Scoring

### FASTA FILE INTO LIST

In [301]:
# fasta reader
def read_fasta(file):
    """Return the raw lines of the FASTA file at *file*.

    Each element keeps its trailing newline; no parsing is done here.
    """
    with open(file, 'r') as handle:
        # Iterating a text file yields exactly what readlines() returns.
        return list(handle)

# fasta parser
def parse_fasta(lines):
    """Collect the sequences found in FASTA-formatted *lines*.

    Lines starting with '>' are treated as record separators and their text
    is discarded. All other lines are stripped of surrounding whitespace and
    concatenated into the current sequence.

    Args:
        lines: iterable of strings, e.g. the result of ``read_fasta``.

    Returns:
        A list of sequence strings in file order. Records that contain no
        residues (consecutive headers, blank lines only) are skipped.
    """
    seq_list = []
    pieces = []

    def flush():
        # Emit the sequence gathered so far, ignoring empty records.
        seq = ''.join(pieces)
        if seq:
            seq_list.append(seq)
        del pieces[:]

    for line in lines:
        if line.startswith('>'):
            # A header closes the previous record and opens a new one.
            flush()
        else:
            pieces.append(line.strip())

    # The final record has no header after it, so close it explicitly.
    flush()
    return seq_list


### SUBSTITUTION MATRIX INTO DICTIONARY

In [302]:
# matrix txt reader
def read_matrix(file):
    """Return the raw lines of the substitution-matrix text file at *file*.

    Trailing newlines are preserved; the lines are meant to be handed to
    ``parse_matrix``.
    """
    with open(file, 'r') as handle:
        raw_rows = [row for row in handle]
    return raw_rows


# matrix parser
def parse_matrix(lines):
    """Split each matrix line into whitespace-separated cells.

    A line that begins with a space (the column-label row, which has no row
    label of its own) gets a single-space placeholder prepended so that its
    cells line up with the data rows below it.

    Returns:
        A list of rows, each a list of strings.
    """
    rows = []
    for raw in lines:
        cells = raw.strip().split()
        if raw.startswith(' '):
            # Placeholder for the missing row label in the header row.
            cells = [' '] + cells
        rows.append(cells)
    return rows

# matrix to dictionary
def matrix_to_dict(matrix):
    """Flatten a parsed substitution matrix into a pair -> score lookup.

    Row 0 supplies the column labels and column 0 of every later row supplies
    the row label. Each key is ``row_label + column_label`` and each value is
    the corresponding cell converted to ``int``.
    """
    lookup = {}
    for row in matrix[1:]:
        for col, cell in enumerate(row[1:], start=1):
            lookup[row[0] + matrix[0][col]] = int(cell)
    return lookup

In [303]:
# get score
def score(term_1, term_2, matrix_dict):
    """Look up the substitution score for the ordered pair (term_1, term_2).

    The key is the concatenation ``term_1 + term_2``; a missing pair raises
    ``KeyError``.
    """
    return matrix_dict[term_1 + term_2]


## LINEAR

In [304]:
def initialize(nrows, ncols):
    """Build an nrows x ncols grid for the dynamic-programming table.

    Every cell starts at -10000 (a large negative placeholder) except the
    origin cell (0, 0), which is set to 0. Each row is an independent list.
    """
    grid = [[-10000] * ncols for _ in range(nrows)]
    # The alignment of two empty prefixes scores zero.
    grid[0][0] = 0
    return grid

In [305]:
def needleman_wunsch(seq1, seq2, score_matrix, gap_penalty=1):
    """Globally align two sequences with a linear gap penalty.

    Args:
        seq1: first sequence (string).
        seq2: second sequence (string).
        score_matrix: dict mapping the two-character key ``a + b`` (a residue
            of seq1 followed by a residue of seq2) to a substitution score.
        gap_penalty: cost subtracted for every gap column. Defaults to 1,
            the value this function has always used.

    Returns:
        A tuple ``(aligned_seq1, aligned_seq2, score, len(seq1), len(seq2))``
        where the aligned strings have equal length and use '-' for gaps.

    Raises:
        KeyError: if a residue pair is missing from ``score_matrix``.
    """
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1

    # table[i][j] is the best score for aligning seq1[:i] with seq2[:j].
    # The table is built inline so this function is self-contained; every
    # cell is overwritten below, so no placeholder value is needed.
    table = [[0] * n_cols for _ in range(n_rows)]
    for i in range(1, n_rows):
        table[i][0] = table[i - 1][0] - gap_penalty
    for j in range(1, n_cols):
        table[0][j] = table[0][j - 1] - gap_penalty

    for i in range(1, n_rows):
        current, above = table[i], table[i - 1]
        residue1 = seq1[i - 1]
        for j in range(1, n_cols):
            diagonal = above[j - 1] + score_matrix[residue1 + seq2[j - 1]]
            up = above[j] - gap_penalty
            left = current[j - 1] - gap_penalty
            current[j] = max(diagonal, up, left)

    # Traceback. Columns are collected back-to-front and reversed once at
    # the end, which avoids repeated string prepending.
    column1 = []
    column2 = []
    i, j = len(seq1), len(seq2)
    while i > 0 or j > 0:
        here = table[i][j]
        # Bounds are tested BEFORE indexing so that an exhausted sequence is
        # never read through a negative index.
        if (i > 0 and j > 0 and
                here == table[i - 1][j - 1] + score_matrix[seq1[i - 1] + seq2[j - 1]]):
            column1.append(seq1[i - 1])
            column2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or here == table[i - 1][j] - gap_penalty):
            # Gap in seq2. Forced once seq2 is exhausted.
            column1.append(seq1[i - 1])
            column2.append('-')
            i -= 1
        else:
            # Gap in seq1. j > 0 is guaranteed here, so the loop always
            # makes progress and the alignment is never cut short.
            column1.append('-')
            column2.append(seq2[j - 1])
            j -= 1

    alignment_seq1 = ''.join(reversed(column1))
    alignment_seq2 = ''.join(reversed(column2))
    return (alignment_seq1, alignment_seq2, table[len(seq1)][len(seq2)], len(seq1), len(seq2))

In [306]:
def _summarize_alignment(alignment_seq1, alignment_seq2):
    """Return (interval, matches, indels, indel_length) for two aligned strings.

    ``interval`` is the marker row drawn between the sequences: '|' for a
    match, '*' for a mismatch and ' ' for a gap column. ``indels`` counts gap
    runs (a run ends at a non-gap column or when the gap switches to the
    other sequence); ``indel_length`` counts gap columns.
    """
    marks = []
    matches = 0
    indels = 0
    indel_length = 0
    open_gap = None  # 1 or 2: the sequence holding the gap run in progress
    # The two strings are expected to have equal length; a column is assumed
    # to contain at most one '-'.
    for a, b in zip(alignment_seq1, alignment_seq2):
        if a != '-' and b != '-':
            open_gap = None
            if a == b:
                marks.append('|')
                matches += 1
            else:
                marks.append('*')
            continue
        marks.append(' ')
        indel_length += 1
        owner = 1 if a == '-' else 2
        if owner != open_gap:
            indels += 1
            open_gap = owner
    return ''.join(marks), matches, indels, indel_length


def output(alignment_seq1, alignment_seq2, alignment_num,final,filepath,seqname1,seqname2, len_seq1, len_seq2):
    """Print an alignment report and append the same report to *filepath*.

    Args:
        alignment_seq1, alignment_seq2: the gapped, aligned sequences.
        alignment_num: ordinal shown in the "Alignment #" heading.
        final: alignment score to report.
        filepath: text file the report is appended to (opened in 'a' mode).
        seqname1, seqname2: labels written to the file for each sequence.
        len_seq1, len_seq2: ungapped sequence lengths; percent identity is
            ``matches`` divided by their mean.

    The alignment itself is emitted in rows of 60 columns. When there are no
    indels the mean indel length is reported as 0.0, and when both sequence
    lengths are 0 the percent identity is reported as 0.0.
    """
    line_width = 60

    interval, matches, indels, indel_length = _summarize_alignment(
        alignment_seq1, alignment_seq2)
    alignment_length = len(alignment_seq1)
    average_alignment_length = (len_seq1 + len_seq2) / 2

    # Guard both ratios: a gap-free alignment has no indels, and two empty
    # sequences have a mean length of zero.
    if average_alignment_length:
        percentidentity = matches / average_alignment_length
    else:
        percentidentity = 0.0
    if indels:
        mean_indel_length = round(indel_length / indels, 1)
    else:
        mean_indel_length = 0.0

    print('\n','Alignment #',alignment_num, '\n',
          '\n',
          'Matches: ', matches,'\n',
          'Percent Identity: ', percentidentity,'\n',
          'Indels: number=', indels, '  mean length=', mean_indel_length,'\n',
          'Alignment length: ', alignment_length,'\n',
          'Score=',final,'\n','\n')

    header = ''.join([
        '\n',
        'Alignment #' + str(alignment_num), '\n',
        '\n',
        'Sequence #1 ' + seqname1, '\n',
        'Sequence #2 ' + seqname2, '\n',
        'Matches: ' + str(matches), '\n',
        'Percent Identity: ' + str(percentidentity), '\n',
        'Indels: number=' + str(indels) + '  ' + 'mean length=' + str(mean_indel_length), '\n',
        'Alignment length:' + str(alignment_length), '\n',
        'Score=' + str(final), '\n',
        '\n',
    ])

    # Offset where the last complete 60-column row ends. The remainder (if
    # any) starts here, independent of any loop variable.
    full_end = alignment_length - alignment_length % line_width

    # The context manager closes the file even if a write fails.
    with open(filepath, 'a') as f:
        f.write(header)

        for start in range(0, full_end, line_width):
            stop = start + line_width
            rows = '\n'.join((alignment_seq1[start:stop],
                              interval[start:stop],
                              alignment_seq2[start:stop]))
            print('\n' + rows)
            f.write('\n\n' + rows)

        if full_end != alignment_length:
            rows = '\n'.join((alignment_seq1[full_end:],
                              interval[full_end:],
                              alignment_seq2[full_end:]))
            print('\n' + rows + '\n' + '\n')
            f.write('\n\n' + rows + '\n')


In [307]:
# Loading

# Substitution scores read from 'matrix.txt', keyed by the two-character
# string row_label + column_label (see matrix_to_dict).
matrix_dict = matrix_to_dict(parse_matrix(read_matrix('matrix.txt')))
# Sequences of the "close" data set, one list per FASTA file; entries with the
# same index are aligned against each other in the loop below.
close_first_seq = parse_fasta(read_fasta('close-first.fasta'))
close_second_seq = parse_fasta(read_fasta('close-second.fasta'))

In [309]:
# Align the k-th sequence of each "close" file; every report is printed and
# appended to 'linear-close.txt', numbered from 1.
for k, first_seq in enumerate(close_first_seq):
    alignment_seq1, alignment_seq2, final, len_seq_1, len_seq_2 = needleman_wunsch(
        first_seq, close_second_seq[k], matrix_dict)
    output(alignment_seq1, alignment_seq2, k + 1, final, 'linear-close.txt',
           'close seq1', 'close seq2', len_seq_1, len_seq_2)
    


 Alignment # 1 
 
 Matches:  248 
 Percent Identity:  0.9134438305709024 
 Indels: number= 35   mean length= 1.3 
 Alighment length:  295 
 Score= 1193 
 


AGACGGAGTTCCT--TCTTGAGGCA-A-TCTGCGTCTC---A---ATCATAGCCCTCTCT
|||||||| ||||  |||||||||| | || |||||||   |   |||||||||||||||
AGACGGAG-TCCTGCTCTTGAGGCACATTC-GCGTCTCTAGACCCATCATAGCCCTCTCT

GGCTACCCGCCCGATATCAATC-CTGTTGATTA-TTTCACAGCCCACAAGCCGGCCTG-A
 |||||||||||||| | |||| |||||||||| |||||||||||||||||||| ||| |
-GCTACCCGCCCGAT-T-AATCTCTGTTGATTATTTTCACAGCCCACAAGCCGG-CTGAA

GCAAG-AACGG--AGCGCCTACCTACTTTTATGATTGGGATTACAGATTACAGACG-TTT
||||| |||||  |||| ||||||| ||||||| |||||||||||||||||||||| |||
GCAAGCAACGGACAGCG-CTACCTA-TTTTATG-TTGGGATTACAGATTACAGACGTTTT

GG-TAT----CTGAG-CCATGTCTGCCT-ATAATGATTCGGGCTGGG-CCGAAGTGCT-A
|| |||    || || |||||||||||| || ||||||||||||||| ||||||||||  
GGCTATCGGCCT-AGCCCATGTCTGCCTAAT-ATGATTCGGGCTGGGACCGAAGTGCTC-

CAGTAGACATTAGACAT----CCAAAGCGA-CGGTGG-ATGATCTAAATACTTGG
|||||||||||||||||    |||||||||  ||||| | | ||||||||

In [310]:
# Sequences of the "distant" data set, one list per FASTA file; entries with
# the same index are aligned against each other in the loop below.
distant_first_seq = parse_fasta(read_fasta('distant-first.fasta'))
distant_second_seq = parse_fasta(read_fasta('distant-second.fasta'))

In [311]:
# Align the k-th sequence of each "distant" file; every report is printed and
# appended to 'linear-distant.txt', numbered from 1.
for k, first_seq in enumerate(distant_first_seq):
    alignment_seq1, alignment_seq2, final, len_seq_1, len_seq_2 = needleman_wunsch(
        first_seq, distant_second_seq[k], matrix_dict)
    output(alignment_seq1, alignment_seq2, k + 1, final, 'linear-distant.txt',
           'distant seq1', 'distant seq2', len_seq_1, len_seq_2)



 Alignment # 1 
 
 Matches:  180 
 Percent Identity:  0.6338028169014085 
 Indels: number= 86   mean length= 1.7 
 Alighment length:  356 
 Score= 724 
 


-----CAC---T---CCTTTGGCTCCCGATTTAATACAATCCGCGTACAGTGAGGCGG-C
     ||*   |   ||**| |||  |  ||| | |||*  | |||||  |   ||||  
GGGTGCATGAATAAACCCCT-GCT--C--TTT-A-ACAG--C-CGTAC--T---GCGGA-

GGGCC-CG-A-TT--ACAGATTACAGG--GGGC------CGG-T-TTTAGTA-CGA-T--
|**|| || | ||  | |||||||||*    *|      ||| | |*|*||| | | |  
GAACCTCGCAGTTAAA-AGATTACAGATT--ACATTTAACGGCTCTCTGGTACC-ACTAA

-ATC--C-TC-G---A-A--G--CCG-CATGCGGCCGGCCT--ATGC-TGAA-C----CC
 | |  | || |   | |  |  ||| | * |||  || *|  | || ||*| |    *|
AA-CGGCTTCTGCTTACACTGTTCCGTC-C-CGG--GG-TTAGA-GCATGGATCAGGGTC

CGGCTTCT-TT-AAGCCCTAGACATTAGACAT--GGGAGGCC-TATAA-ACT-GTAATCT
|||| *|| *| **|   ||||||||||||||        || | |*|  || ||**|||
CGGC-CCTACTGGGG---TAGACATTAGACATCC------CCTT-TGAC-CTAGTGGTCT

AT-CA-GATC--TCTAT-CTGCTAAT-TT----TATCATCGTAGACCTTT-TTCAAAGGC
|| ||  |||  |||||  |||  || ||    ||  ||  || | 