# Test new scoring alg with updated kmer extension
So the old algorithm worked something like this:
```
MER: 4
MERS: 4
MERSV: 4
MERSVT: 3.2
MERSVTP: 3.3
...
```
So the score did not increase with the length of the kmer. We would like the score to grow as the kmer is extended, so that a change in score tells us something about each extension.

## New scoring algorithm
Surprise: it's the same one as the April 6 one, but the normalizing factor is the length of the spectrum, since that remains constant across all kmer extensions.

In [22]:
def compare_masses(spectrum: list, reference: list) -> float:
    '''
    CREATED APRIL 27 2020
    Score two spectra against each other. Simple additive scoring with a bonus
    for the longest streak of matches, divided by the length of the spectrum.

    Note:   the difference between this one and the other April one is this one divides 
            by the length of the spectrum. This is because all extended kmers will 
            be getting longer, so we want the score to increase with the length, not stay the same

    Scoring:
        * +1 for every reference mass that is present in the spectrum (exact equality,
          no tolerance window)
        * + the longest streak of matches. A streak is only incremented when the
          previous reference mass also matched (the first reference mass counts as
          following a match), so a match directly after a miss is NOT added to the streak
        * the sum is divided by len(spectrum)

    Inputs:
        spectrum:   list of floats (from mass spectra)
        reference:  list of floats (calculated from protein sequence)
    Outputs:
        score:      float score. 0.0 if either input is empty
    '''
    if len(spectrum) == 0 or len(reference) == 0:
        return 0.0

    # Build the lookup once so each membership test is O(1) instead of a scan of
    # the whole spectrum. Fall back to the sequence itself if it holds unhashable items.
    try:
        observed = set(spectrum)
    except TypeError:
        observed = spectrum

    hits = 0
    streak = 0
    max_streak = 0
    previous_matched = True
    for refmass in reference:
        if refmass in observed:
            # only grow the streak when the previous reference mass matched too
            if previous_matched:
                streak += 1
                max_streak = max(streak, max_streak)
            hits += 1
            previous_matched = True
        else:
            streak = 0
            previous_matched = False

    # normalize by the spectrum length, which is constant across kmer extensions
    return (hits + max_streak) / float(len(spectrum))


### Test it and see if the score increases as the kmer gets longer

In [23]:
import os
import sys
# Make the parent and grandparent directories importable so that the
# `src` package can be found from wherever this notebook lives.
for module_path in map(os.path.abspath, ('..', '../..')):
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.spectra import gen_spectra

# Sequences used for the check: a protein, the peptide of interest inside it,
# and two prefixes of that peptide.
protseq = 'WQTSMALWARMVPYA'
fullseq = 'MALWARM'
mer3 = 'MAL'
mer5 = 'MALWA'

# Generate a spectrum for the full peptide and for each prefix
fullspec, spec3, spec5 = (
    gen_spectra.gen_spectrum(seq)['spectrum'] for seq in (fullseq, mer3, mer5)
)

# Score each prefix (and the peptide itself) against the full peptide's spectrum
score3, score5, fullscore = (
    compare_masses(fullspec, candidate) for candidate in (spec3, spec5, fullspec)
)

print('3mer score: {}'.format(score3))
print('5mer score: {}'.format(score5))
print('full score: {}'.format(fullscore))


3mer score: 0.25
5mer score: 0.39285714285714285
full score: 2.0


## New kmer extension logic
So now that we know that the score DOES increase as a function of length, let's work on the kmer extension part of it.

In [24]:
def new_entry(old_entry: dict, prot: str, spectrum: list, ion='b') -> dict:
    '''
    Grow an entry by one amino acid and rescore it against the spectrum

    The window is widened by one position: to the right (ending position + 1)
    when ion is 'b', otherwise to the left (starting position - 1). The old entry
    is returned untouched when it cannot be grown.

    Input:
        old_entry:   dict entry with k, b and y scores, start and end positions
        prot:        str sequence of the protein
        spectrum:    list spectrum to score against
    kwargs:
        ion:         str ion type deciding which side is extended. 'b' extends the end,
                     any other value is handled as 'y' and extends the start. Default='b'
    Output:
        new_entry:   dict entry with the new k, new sequence, new start and end positions
                     (both inclusive) and new b and y scores. old_entry itself is returned if any
                     required value is None, prot or spectrum is None, or the widened
                     window falls outside the protein
    '''
    # every one of these must be present and not None for the entry to be usable
    required = ('k', 'starting_position', 'ending_position', 'b_score', 'y_score')
    has_empty_field = any([old_entry[field] is None for field in required])
    if has_empty_field or prot is None or spectrum is None:
        return old_entry

    if ion == 'b':
        # b side: keep the start, push the end one residue to the right
        left = old_entry['starting_position']
        right = old_entry['ending_position'] + 1
    else:
        # y side: pull the start one residue to the left, keep the end
        left = old_entry['starting_position'] - 1
        right = old_entry['ending_position']

    # the widened window has to stay inside the protein
    if left < 0 or right > len(prot) - 1:
        return old_entry

    # an inverted window would give a negative length
    if left > right:
        return old_entry

    # both positions are inclusive, hence the + 1 on the slice end
    mer_seq = prot[left:right + 1]
    b_reference = gen_spectra.gen_spectrum(mer_seq, ion='b')['spectrum']
    y_reference = gen_spectra.gen_spectrum(mer_seq, ion='y')['spectrum']

    grown = dict(
        k=old_entry['k'] + 1,
        sequence=mer_seq,
        starting_position=left,
        ending_position=right,
    )
    grown['b_score'] = compare_masses(spectrum, b_reference)
    grown['y_score'] = compare_masses(spectrum, y_reference)
    return grown

def extend_kmer(spectrum: list, sequence: str, kmer: dict, ion: str, stall_length=3) -> dict:
    '''
    Extend a kmer until the score tells us that adding amino acids doesn't make it a better alignment
    
    Inputs:
        spectrum:       list of floats. The mass spectrum in question
        sequence:       str The full protein sequence we are pulling amino acids from 
        kmer:           dict of the form
                        {
                            b_score: float, 
                            y_score: float,
                            k: int, 
                            starting_position: int,
                            ending_position: int,
                        }
        ion:            str the ion type we are looking at. Should be 'b' or 'y' (case insensitive).
                        Any other value returns kmer unchanged
    kwargs:
        stall_length:   int the number of iterations a subsequence is allowed to go with 
                        no increase in score before finishing kmer growth on a certain kmer. 
                        The budget is not reset when the score increases again. Default=3
    Outputs
        dict: the most recent entry whose score (for the chosen ion) was higher than the
        entry before it, or the input kmer itself if no extension ever increased the score.
        Extended entries are whatever new_entry returns, of the form
            {
                k: int, 
                sequence: str,
                starting_position: int,
                ending_position: int,
                b_score: float,
                y_score: float, 
            }
    '''
    # Normalize once so that the validation, the score key and the direction chosen
    # by new_entry all agree. new_entry compares the ion case-sensitively, so passing
    # an un-normalized 'B' would track b_score while extending on the y side.
    ion = ion.lower()
    if ion not in ['b', 'y']:
        return kmer
    score_key = 'b_score' if ion == 'b' else 'y_score'
    # keep track of the last time a score increased
    last_maintenance = kmer
    # keep going until we run out of extension
    while stall_length > 0:
        updated = new_entry(kmer, sequence, spectrum, ion=ion)
        # an unchanged k means new_entry could not grow the kmer any further
        improved = (
            updated[score_key] > kmer[score_key]
            and updated[score_key] > 0
            and updated['k'] != kmer['k']
        )
        if improved:
            last_maintenance = updated
        else:
            stall_length -= 1
        # always continue from the newest entry, even when it did not improve
        kmer = updated
    return last_maintenance

In [25]:
# Start from the 3-mer 'MAL' (positions 4-6 of the protein) with no score yet
# and let the extension grow it along the b side against the full peptide's spectrum.
spectrum = fullspec
sequence = protseq
kmer = dict(
    b_score=0,
    y_score=0,
    k=3,
    starting_position=4,
    ending_position=6,
)
ion = 'b'
extended = extend_kmer(spectrum, sequence, kmer, ion=ion)
print(extended)

{'k': 7, 'sequence': 'MALWARM', 'starting_position': 4, 'ending_position': 10, 'b_score': 1.0, 'y_score': 1.0}
