# Optimizing the scoring algorithm
Since the bulk of the program's time is spent on k-mer extension and scoring, any speedup of the scoring algorithm translates directly into a reduction of the total run time.

__NOTE__: these optimizations take the mass error tolerance into account, since it will be a major component of the algorithm.

## Current iteration (plus mass tolerance)

In [1]:
def ppm(observed: float, actual: float) -> float:
    '''
    Return the absolute error of observed relative to actual, expressed in
    parts per million (ppm)

    Inputs:
        observed:   float measured value
        actual:     float expected value; the error is taken relative to it,
                    so a value of 0 raises ZeroDivisionError
    Outputs:
        float non-negative ppm difference
    '''
    relative_error = (observed - actual) / actual
    return abs(relative_error * 1000000)

def compare_masses(spectrum: list, reference: list, ppm_tolerance: float) -> float:
    '''
    CREATED APRIL 27 2020
    Score a reference spectrum against an observed spectrum. Every reference mass
    that has at least one observed mass within the ppm tolerance adds 1 to the
    score, and the longest streak of hits is added on top as a bonus. The total
    is divided by the number of masses in the observed spectrum.

    Note:   a hit that comes directly after a miss does not advance the streak
            counter; only a hit that follows another hit (or is the very first
            reference mass) does

    Inputs:
        spectrum:       list of floats (from mass spectra)
        reference:      list of floats (calculated from protein sequence)
        ppm_tolerance:  float of the mass error tolerance allowed
    Outputs:
        score:      float score (0.0 when either list is empty)
    '''
    if min(len(spectrum), len(reference)) == 0:
        return 0.0

    hits = 0
    current_run = 0
    longest_run = 0
    previous_hit = True

    for refmass in reference:
        # linear scan: stop at the first observed mass inside the tolerance
        matched = any(ppm(observedmass, refmass) < ppm_tolerance for observedmass in spectrum)

        if not matched:
            current_run = 0
            previous_hit = False
            continue

        if previous_hit:
            current_run += 1
            longest_run = max(current_run, longest_run)
        hits += 1
        previous_hit = True

    return (hits + longest_run) / float(len(spectrum))

## Testing optimization iteration

In [2]:
from bisect import bisect

def ppm_opt(reference: float, ppm_tolerance: float) -> float:
    '''
    Convert a ppm tolerance into an absolute tolerance for the given mass

    Inputs:
        reference:      float mass the tolerance is scaled by
        ppm_tolerance:  float tolerance in parts per million
    Outputs:
        float non-negative absolute tolerance (ppm_tolerance millionths of reference)
    '''
    fraction = ppm_tolerance / 1000000
    return abs(fraction * reference)

def optimized_compare_masses(observed: list, reference: list, ppm_tolerance: float) -> float:
    '''
    CREATED MAY 19 2020
    Score two spectra against each other. Every reference mass that falls inside
    the tolerance window of an observed mass adds 1 to the score, and the longest
    streak of hits is added on top as a bonus. The total is divided by the number
    of observed masses.

    Note:   the difference between this one and the April 27 one is this one
            replaces the linear scan over the observed masses with a binary
            search. The tolerance window is built around each observed mass
            (same formula as ppm_opt), so it scales with the observed mass.

    Note:   a hit that comes directly after a miss does not advance the streak
            counter; only a hit that follows another hit (or is the very first
            reference mass) does

    Inputs:
        observed:       list of floats (from mass spectra). Not modified.
        reference:      list of floats (calculated from protein sequence)
        ppm_tolerance:  float of the mass error tolerance allowed
    Outputs:
        score:      float score (0.0 when either list is empty)
    '''
    if len(observed) == 0 or len(reference) == 0:
        return 0.0

    # Sort a copy: sorting in place would silently reorder the caller's list
    ordered = sorted(observed)

    # Flat list [low_0, high_0, low_1, high_1, ...] of the window edges around
    # each observed mass, in ascending order of observed mass
    observed_boundaries = []
    for obs in ordered:
        tol = abs((ppm_tolerance / 1000000) * obs)
        observed_boundaries += [obs - tol, obs + tol]

    # local variables for score
    streak = 0
    last = True
    score = 0
    max_streak = 0

    for ref in reference:
        # bisect returns the insertion point to the right of any equal entry.
        # An odd insertion point means ref sits at or above a window's low edge
        # and below its high edge, i.e. inside the half-open window [low, high)
        found = bisect(observed_boundaries, ref) % 2

        # increment score if found
        if found:
            if last:
                streak += 1
                max_streak = max(streak, max_streak)
            score += 1
            last = True
        else:
            streak = 0
            last = False

    score += max_streak
    score /= float(len(observed))
    return score
    
# Sanity check: score the same toy input with the new and then the old implementation
for scorer in (optimized_compare_masses, compare_masses):
    print(scorer([100, 200, 300, 350], [100, 200, 300, 400], 20))

1.5
1.5


## Create some data to run through it
### 1. Load the database

In [3]:
import os
import sys
# Make the parent and grandparent directories importable (each added at most once)
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.file_io import fasta

fasta_file = '../../testing framework/data/databases/4prots.fasta'

# Index the entries returned by fasta.read by their 'name' field
database = {entry['name']: entry for entry in fasta.read(fasta_file, True)}

### 2. Generate the peptides, hybrid proteins and hybrid peptides

In [4]:
import modules
# The generator module is imported under an alias because the name `peptides`
# is bound to a dict at the end of this cell. Without the alias the dict shadows
# the module and running the cell a second time fails on `peptides.gen_peptides`.
from modules.sequence_generation import proteins, peptides as peptide_generation

num_hybs = 5
min_length = 5
max_length = 35
num_peptides = 500
min_cont = 3 #min contribution for each side of a hybrid

# make hybrid proteins
# NOTE(review): min_contribution is max_length here rather than min_cont -- confirm this is intended
hyb_prots = proteins.generate_hybrids(list(database.values()), num_hybs, min_contribution=max_length)
# create peptides
non_hybrid_peps = peptide_generation.gen_peptides(list(database.values()), num_peptides, min_length=min_length, max_length=max_length, digest='random', dist='beta')
# create hybrid peptides
hyb_peps = peptide_generation.gen_peptides(hyb_prots, num_hybs, min_length=min_length, max_length=max_length, digest='random', min_contribution=min_cont, hybrid_list=True)

all_proteins_raw = list(database.values()) + hyb_prots
all_peptides_raw = non_hybrid_peps + hyb_peps

# Key every peptide by its position in the combined list and record that
# position on the peptide itself as its scan number
peptides = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peptides[i] = pep

Generating hybrid protein 0/5[0%]Generating hybrid protein 1/5[20%]Generating hybrid protein 2/5[40%]Generating hybrid protein 3/5[60%]Generating hybrid protein 4/5[80%]
Finished generating hybrid proteins


### 3. Generate spectra

In [5]:
from src.spectra import gen_spectra
from src.utils import utils

# Walk the peptides in ascending key order so spectra[i] belongs to peptide i
sorted_keys = sorted(int(c) for c in peptides)

spectra = []
for scan_key in sorted_keys:
    generated = gen_spectra.gen_spectrum(peptides[scan_key]['sequence'])
    # keep only the two fields used downstream
    spectra.append({
        'spectrum': generated['spectrum'],
        'precursor_mass': generated['precursor_mass'],
    })


## Run data through old one and time it

In [6]:
# iterate through all the spectra and all the 3-mers of the database
from src.file_io import fasta
import time

db = fasta.read(fasta_file)


def make_mers(x, k):
    # every contiguous length-k slice of x, left to right
    return [x[i:i+k] for i in range(len(x)-k+1)]


# all 3-mers of every database sequence, in database order
kmers = [mer for prot in db for mer in make_mers(prot['sequence'], 3)]

old_scored = []
st = time.time()
speclen = len(spectra)
for i, spec in enumerate(spectra):
    print('On spectrum {}/{}\r'.format(i, speclen), end='')
    observed_masses = spec['spectrum']
    for kmer in kmers:
        kmer_masses = gen_spectra.gen_spectrum(kmer)['spectrum']
        old_scored.append((kmer, compare_masses(observed_masses, kmer_masses, 10)))
print('Time taken for old version: {}'.format(time.time() - st))

Time taken for old version: 170.37341785430908


## Run data through new one and time it

In [7]:
# iterate through all the spectra and all the 3-mers of the database
from src.file_io import fasta
import time

db = fasta.read(fasta_file)


def make_mers(x, k):
    # every contiguous length-k slice of x, left to right
    return [x[i:i+k] for i in range(len(x)-k+1)]


# all 3-mers of every database sequence, in database order
kmers = [mer for prot in db for mer in make_mers(prot['sequence'], 3)]

new_scored = []

st = time.time()
speclen = len(spectra)
for i, spec in enumerate(spectra):
    print('On spectrum {}/{}\r'.format(i, speclen), end='')
    observed_masses = spec['spectrum']
    for kmer in kmers:
        kmer_masses = gen_spectra.gen_spectrum(kmer)['spectrum']
        new_scored.append((kmer, optimized_compare_masses(observed_masses, kmer_masses, 10)))
print('Time taken for new version: {}'.format(time.time() - st))

Time taken for new version: 38.86096000671387


## Make sure the results are the same

In [8]:
# Split the result positions into those where both versions agree (yes_in)
# and those where they differ (not_in)
yes_in = []
not_in = []
for i, new_entry in enumerate(new_scored):
    bucket = yes_in if new_entry == old_scored[i] else not_in
    bucket.append(i)

In [9]:
# number of matching results, then number of mismatching results
print(len(yes_in), len(not_in), sep='\n')

1297045
0


In [13]:
# Show one mismatching pair (new result first, then old). The position is
# guarded because not_in is empty when both versions agree, in which case
# indexing it directly raises IndexError.
mismatch_position = 500
if mismatch_position < len(not_in):
    print(new_scored[not_in[mismatch_position]])
    print(old_scored[not_in[mismatch_position]])
else:
    print('No mismatch at position {} ({} mismatches in total)'.format(mismatch_position, len(not_in)))

('LLD', 0.038461538461538464)
('LLD', 0.07692307692307693)


In [35]:
# Score one short sequence against one short protein with both implementations
testseq, testprot = 'LL', 'LQVT'

testseqspec = gen_spectra.gen_spectrum(testseq)['spectrum']
testprotspec = gen_spectra.gen_spectrum(testprot)['spectrum']

# old implementation first, then the optimized one
for scorer in (compare_masses, optimized_compare_masses):
    print(scorer(testseqspec, testprotspec, 20))

spectrum:
[57.549308435, 66.554590785, 114.09134043499999, 114.091340435, 123.096622785, 132.101905135, 227.175404435, 245.185969135]
ppm for 57.549308435 compared to reference 57.549308435: 0.0
hit for 57.549308435
ppm for 57.549308435 compared to reference 60.536398285000004: 49343.69956958869
ppm for 66.554590785 compared to reference 60.536398285000004: 99414.4460274442
ppm for 114.09134043499999 compared to reference 60.536398285000004: 884673.4141312481
ppm for 114.091340435 compared to reference 60.536398285000004: 884673.4141312484
ppm for 123.096622785 compared to reference 60.536398285000004: 1033431.559728281
ppm for 132.101905135 compared to reference 60.536398285000004: 1182189.705325314
ppm for 227.175404435 compared to reference 60.536398285000004: 2752707.641532922
ppm for 245.185969135 compared to reference 60.536398285000004: 3050223.9327269876
ppm for 57.549308435 compared to reference 110.070605285: 477160.06207115314
ppm for 66.554590785 compared to reference 110.0