# Backbone coverage algorithm

The current scoring algorithm simply counts how many ions in one spectrum match ions in another; it makes no attempt to identify amino acids by their position along the peptide backbone. Instead, let's build an algorithm that scores a spectrum by how much of a theoretical spectrum's backbone it covers.

In [2]:
import os
import sys

# Put the parent and the grandparent directory on the import path.
# NOTE(review): presumably the `src` package lives one or two levels above
# this notebook - confirm against the repository layout.
for module_path in (
    os.path.abspath(os.path.join('..')),
    os.path.abspath(os.path.join('../..')),
):
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.objects import Spectrum
from src.spectra.gen_spectra import gen_spectrum
from src.utils import ppm_to_da

In [15]:
def backbone_scoring_alg(observed: Spectrum, reference: str, ppm_tolerance: int) -> float:
    '''
    Scoring algorithm based on backbone coverage of the reference.

    A reference of length n has n-1 junctions (bond sites). Each junction can be
    described by up to four theoretical ions: b+, b++, y+ and y++. The score is
    a number between 0 and 100 + 3*(len(reference)-1), calculated as follows:

        1. The (truncated) percentage of junctions described by at least one
           observed ion
        2. For each junction described by more than one ion, one extra point
           per additional ion beyond the first

    Example:
        reference:   ABCDE, 4 junctions to describe
        observed:    ions: b1+, y1++, y2+, b4+

        ions for A, E, DE, ABCD found. Coverage = A**DE with DE described by both E and ABCD

        Score = %(3/4) + 1 = 75 + 1 = 76

    Inputs:
        observed:       (Spectrum) spectrum being scored against
        reference:      (str) reference amino acid sequence being scored against the spectrum
        ppm_tolerance:  (int) tolerance to allow in ppm for each peak
    Outputs:
        (float per the signature; the value is always a whole number) the score.
        0 is returned when the reference has fewer than 2 amino acids, since
        there are no junctions to describe.
    '''
    junction_total = len(reference) - 1

    # Without at least one junction the coverage percentage is undefined
    # (it would divide by zero), so report "nothing covered".
    if junction_total < 1:
        return 0

    jcount = [0] * junction_total
    observed_peaks = observed.spectrum

    for ion in ['b', 'y']:
        # Drop the residue that would turn the longest fragment into the full
        # peptide: the last one for b ions, the first one for y ions.
        singled_seq = reference[:-1] if ion == 'b' else reference[1:]

        for charge in [1, 2]:
            peaks = gen_spectrum(singled_seq, charge=charge, ion=ion)['spectrum']

            # Reverse the y peaks so that index i lines up with junction i.
            # NOTE(review): assumes gen_spectrum lists y ions from shortest to
            # longest fragment - confirm against gen_spectrum.
            if ion != 'b':
                peaks = peaks[::-1]

            for i, peak in enumerate(peaks):
                da_tol = ppm_to_da(peak, ppm_tolerance)
                lower, upper = peak - da_tol, peak + da_tol
                if any(lower <= obs_peak <= upper for obs_peak in observed_peaks):
                    jcount[i] += 1

    jcoverage = int(100 * sum(1 for jc in jcount if jc > 0) / junction_total)
    extrapoints = sum(jc - 1 for jc in jcount if jc > 1)

    return jcoverage + extrapoints
    

## Test it

### Situation 1: full coverage of 1 ion type

In [16]:
# Observed spectrum = every singly charged b ion of the peptide itself.
# The notebook output for this cell is 100 (all junctions, no extra points).
seq = 'MALWARM'
b_peaks = gen_spectrum(seq, ion='b', charge=1)['spectrum']
score = backbone_scoring_alg(Spectrum(b_peaks), seq, 10)
print(score)


100


### Situation 2: full coverage by 2 ion types

In [18]:
# b ions from the first four residues plus y ions from the last two:
# between them the two ion types describe the whole backbone (output: 100).
seq = 'MALWARM'
b_peaks = gen_spectrum(seq[:4], ion='b', charge=1)['spectrum']
y_peaks = gen_spectrum(seq[5:], ion='y', charge=1)['spectrum']
score = backbone_scoring_alg(Spectrum(b_peaks + y_peaks), seq, 10)
print(score)


100


### Situation 3: Partial coverage by 1 ion type

In [19]:
# Only b ions from the first four residues are observed, so part of the
# backbone stays undescribed (output: 66, the truncated percentage).
seq = 'MALWARM'
b_peaks = gen_spectrum(seq[:4], ion='b', charge=1)['spectrum']
score = backbone_scoring_alg(Spectrum(b_peaks), seq, 10)
print(score)


66


### Situation 4: Partial coverage by 2 ion types

In [20]:
# b ions from the first two residues and y ions from the last two leave the
# middle of the backbone undescribed (output: 66).
seq = 'MALWARM'
b_peaks = gen_spectrum(seq[:2], ion='b', charge=1)['spectrum']
y_peaks = gen_spectrum(seq[5:], ion='y', charge=1)['spectrum']
score = backbone_scoring_alg(Spectrum(b_peaks + y_peaks), seq, 10)
print(score)


66


### Situation 5: Overlapping coverage of 2 ion types

In [21]:
# Full singly charged b and y series: every junction is described twice,
# so the score is 100 plus one extra point per junction (output: 106).
seq = 'MALWARM'
b_peaks = gen_spectrum(seq, ion='b', charge=1)['spectrum']
y_peaks = gen_spectrum(seq, ion='y', charge=1)['spectrum']
score = backbone_scoring_alg(Spectrum(b_peaks + y_peaks), seq, 10)
print(score)


106


### Situation 6: Full coverage by all ions

In [22]:
# gen_spectrum with its default arguments; the notebook output of 118 is the
# maximum for this peptide (100 + 3 extra points for each of the 6 junctions).
seq = 'MALWARM'
all_peaks = gen_spectrum(seq)['spectrum']
score = backbone_scoring_alg(Spectrum(all_peaks), seq, 10)
print(score)


118


### Situation 7: No coverage

In [23]:
# Define the reference in this cell too, like every other situation, so the
# cell does not depend on `seq` left over from a previously executed cell.
seq = 'MALWARM'
# A default-constructed Spectrum is scored; the notebook output is 0.
print(backbone_scoring_alg(Spectrum(), seq, 10))

0


# Ion specific backbone coverage scoring algorithm
Instead of either JUST blindly looking for ions or JUST looking for backbone coverage, try to find a middle ground: score backbone coverage using a single ion type.

In [None]:
def ion_backbone_score(observed: Spectrum, reference: str, ion: str, ppm_tolerance: int) -> float:
    '''
    Scoring algorithm based on backbone coverage of the reference by a single ion type.

    A reference of length n has n-1 junctions (bond sites). Only the requested
    ion type is considered, at charges 1 and 2, so each junction can be described
    by at most two ions. The score is a number between 0 and
    100 + (len(reference)-1), calculated as follows:

        1. The (truncated) percentage of junctions described by at least one
           observed ion of the requested type
        2. One extra point for each junction described at both charge states

    Example:
        reference:   ABCDE, 4 junctions to describe
        ion:         b
        observed:    ions: b1+, b1++, b4+

        ions for A (twice) and ABCD found. Coverage = A***E with A described
        by both b1+ and b1++

        Score = %(2/4) + 1 = 50 + 1 = 51

    Inputs:
        observed:       (Spectrum) spectrum being scored against
        reference:      (str) reference amino acid sequence being scored against the spectrum
        ion:            (str) the ion type to focus on. Options are 'b' or 'y'. Any value
                        other than 'b' is sliced and ordered the way 'y' is, and is
                        passed to gen_spectrum unchanged
        ppm_tolerance:  (int) tolerance to allow in ppm for each peak
    Outputs:
        (float per the signature; the value is always a whole number) the score.
        0 is returned when the reference has fewer than 2 amino acids, since
        there are no junctions to describe.
    '''
    junction_total = len(reference) - 1

    # Without at least one junction the coverage percentage is undefined
    # (it would divide by zero), so report "nothing covered".
    if junction_total < 1:
        return 0

    jcount = [0] * junction_total
    observed_peaks = observed.spectrum
    is_b_ion = ion == 'b'

    # Drop the residue that would turn the longest fragment into the full
    # peptide: the last one for b ions, the first one otherwise.
    singled_seq = reference[:-1] if is_b_ion else reference[1:]

    for charge in [1, 2]:
        peaks = gen_spectrum(singled_seq, charge=charge, ion=ion)['spectrum']

        # Reverse non-b peaks so that index i lines up with junction i.
        # NOTE(review): assumes gen_spectrum lists y ions from shortest to
        # longest fragment - confirm against gen_spectrum.
        if not is_b_ion:
            peaks = peaks[::-1]

        for i, peak in enumerate(peaks):
            da_tol = ppm_to_da(peak, ppm_tolerance)
            lower, upper = peak - da_tol, peak + da_tol
            if any(lower <= obs_peak <= upper for obs_peak in observed_peaks):
                jcount[i] += 1

    jcoverage = int(100 * sum(1 for jc in jcount if jc > 0) / junction_total)
    extrapoints = sum(jc - 1 for jc in jcount if jc > 1)

    return jcoverage + extrapoints