# intensity ion backbone score
A variant of the ion backbone score that also takes into account the fraction of the spectrum's total intensity that is accounted for by the identified peaks.

In [1]:
import os
import sys
# put the parent and grandparent directories on the import path
# (presumably so that the `src` package can be imported from this notebook)
for relative_root in ('..', '../..'):
    module_path = os.path.abspath(relative_root)
    if module_path not in sys.path:
        sys.path.append(module_path)
    
from src.objects import Spectrum
from src.spectra.gen_spectra import gen_spectrum
from src.utils import ppm_to_da

In [2]:
# scratch check: count the junctions that were never hit.
# NOTE: `jcount` only exists as a local variable inside the scoring functions, so
# evaluating this at the top level raises NameError (see the cell output).
sum([1 if jc == 0 else 0 for jc in jcount])

NameError: name 'jcount' is not defined

In [3]:
def intensity_ion_backbone_score(observed: Spectrum, reference: str, ion: str, ppm_tolerance: int) -> float:
    '''
    Scoring algorithm that factors in how much of the backbone is identified along with the share of the 
    observed intensity that the identified peaks make up. The formula is:

        (# junctions hit at least once + (# repeat hits // 3)) * (matched abundance / total intensity)

    Theoretical peaks of the requested ion type are generated at charges 1 and 2, so a junction can be 
    hit at most twice. A "repeat hit" is a hit on a junction beyond its first. Every observed peak 
    contributes its abundance at most once, even if it falls within tolerance of several theoretical peaks.

    Example:
        reference:   4 amino acids, 3 junctions to describe
        observed:    28 peaks, each with abundance 1 (total intensity 28)
        
        every junction is matched at both charge states, each time by one distinct observed peak
        
        hits per junction = [2, 2, 2] -> coverage = 3, repeat hits = 3 -> 3 // 3 = 1 extra point
        Score = (3 + 1) * (6 / 28) = 0.857...

    Inputs:
        observed:       (Spectrum) spectrum being scored against. Its spectrum, abundance and 
                                   total_intensity attributes are read
        reference:      (str) reference amino acid sequence being scored against the spectrum
        ion:            (str) the ion type to focus on. Options are 'b' or 'y'
        ppm_tolerance:  (int) tolerance to allow in ppm for each peak
    Outputs:
        (float) score according to the formula. 0.0 if the spectrum is empty or has no total intensity
    '''
    # nothing observed means nothing can be matched
    if len(observed.spectrum) == 0:
        return 0.0

    # without any intensity the matched fraction is undefined, so score it as no evidence
    total_intensity = observed.total_intensity
    if total_intensity == 0:
        return 0.0

    # keep track of how many times each junction (bond) site was found
    jcount = [0] * (len(reference) - 1)

    # indices of the observed peaks that matched something. A set is used so that a peak 
    # hit by several theoretical peaks only contributes its abundance once
    ided_peaks = set()

    # take off the trailing or leading amino acid from the reference according to ion type
    singled_seq = reference[:-1] if ion == 'b' else reference[1:]

    for charge in [1, 2]:

        # get the theoretical m/z peaks for this charge state
        peaks = gen_spectrum(singled_seq, charge=charge, ion=ion)['spectrum']

        # go through each theoretical peak and try and match it to observed ones
        for i, peak in enumerate(peaks):

            # take tolerance into account
            da_tol = ppm_to_da(peak, ppm_tolerance)
            lower, upper = peak - da_tol, peak + da_tol

            # indices of all observed peaks that fall inside the tolerance window
            peak_hits = [idx for idx, mz in enumerate(observed.spectrum) if lower <= mz <= upper]

            # at least one hit means this junction was identified at this charge
            if peak_hits:
                jcount[i] += 1
                ided_peaks.update(peak_hits)

    # summed in index order so the floating point result is reproducible
    ided_abundances = sum(observed.abundance[idx] for idx in sorted(ided_peaks))

    # if any entry has at least 1, that bond has been identified at least once
    jcoverage = sum(1 for jc in jcount if jc > 0)

    # hits beyond the first on a junction earn one extra point per three repeat hits
    extrapoints = sum(jc - 1 for jc in jcount if jc > 1) // 3

    return (jcoverage + extrapoints) * (ided_abundances / total_intensity)

In [4]:
def intensity_ion_backbone_score2(observed: Spectrum, reference: str, ion: str, ppm_tolerance: int) -> float:
    '''
    Scoring algorithm that factors in how much of the backbone is identified along with the share of the 
    observed intensity that the identified peaks make up, and that subtracts a point for every junction 
    that was never identified. The formula is:

        (# junctions hit - # junctions missed + (# repeat hits // 3)) * (matched abundance / total intensity)

    Theoretical peaks of the requested ion type are generated at charges 1 and 2, so a junction can be 
    hit at most twice. A "repeat hit" is a hit on a junction beyond its first. Every observed peak 
    contributes its abundance at most once, even if it falls within tolerance of several theoretical peaks.

    Unless the spectrum is empty, a line of the form 'Missing <missed>/<junctions> bonds' is printed.

    Example:
        reference:   4 amino acids, 3 junctions to describe
        observed:    28 peaks, each with abundance 1 (total intensity 28)
        
        every junction is matched at both charge states, each time by one distinct observed peak
        
        hits per junction = [2, 2, 2] -> coverage = 3, missed = 0, repeat hits = 3 -> 3 // 3 = 1 extra point
        Score = (3 - 0 + 1) * (6 / 28) = 0.857...

    Inputs:
        observed:       (Spectrum) spectrum being scored against. Its spectrum, abundance and 
                                   total_intensity attributes are read
        reference:      (str) reference amino acid sequence being scored against the spectrum
        ion:            (str) the ion type to focus on. Options are 'b' or 'y'
        ppm_tolerance:  (int) tolerance to allow in ppm for each peak
    Outputs:
        (float) score according to the formula. 0.0 if the spectrum is empty or has no total intensity
    '''
    # nothing observed means nothing can be matched
    if len(observed.spectrum) == 0:
        return 0.0

    # keep track of how many times each junction (bond) site was found
    jcount = [0] * (len(reference) - 1)

    # indices of the observed peaks that matched something. A set is used so that a peak 
    # hit by several theoretical peaks only contributes its abundance once
    ided_peaks = set()

    # take off the trailing or leading amino acid from the reference according to ion type
    singled_seq = reference[:-1] if ion == 'b' else reference[1:]

    for charge in [1, 2]:

        # get the theoretical m/z peaks for this charge state
        peaks = gen_spectrum(singled_seq, charge=charge, ion=ion)['spectrum']

        # go through each theoretical peak and try and match it to observed ones
        for i, peak in enumerate(peaks):

            # take tolerance into account
            da_tol = ppm_to_da(peak, ppm_tolerance)
            lower, upper = peak - da_tol, peak + da_tol

            # indices of all observed peaks that fall inside the tolerance window
            peak_hits = [idx for idx, mz in enumerate(observed.spectrum) if lower <= mz <= upper]

            # at least one hit means this junction was identified at this charge
            if peak_hits:
                jcount[i] += 1
                ided_peaks.update(peak_hits)

    # summed in index order so the floating point result is reproducible
    ided_abundances = sum(observed.abundance[idx] for idx in sorted(ided_peaks))

    # if any entry has at least 1, that bond has been identified at least once
    jcoverage = sum(1 for jc in jcount if jc > 0)

    # subtract points for every missing bond
    missing_bonds = len(jcount) - jcoverage
    print(f'Missing {missing_bonds}/{len(jcount)} bonds')
    jcoverage -= missing_bonds

    # hits beyond the first on a junction earn one extra point per three repeat hits
    extrapoints = sum(jc - 1 for jc in jcount if jc > 1) // 3

    # without any intensity the matched fraction is undefined, so score it as no evidence
    total_intensity = observed.total_intensity
    if total_intensity == 0:
        return 0.0

    return (jcoverage + extrapoints) * (ided_abundances / total_intensity)

In [5]:
# test sequences: seq2 is the start of seq1, seq3 is the end of seq1, seq4 shares nothing with it
seq1, seq2, seq3, seq4 = 'MALWARM', 'MALW', 'WARM', 'QQQPS'

# synthetic "observed" spectrum: the generated peaks of seq1, each given an abundance of 1
observed_spec = gen_spectrum(seq1)['spectrum']
abundances = [1] * len(observed_spec)
observed = Spectrum(observed_spec, abundances, sum(abundances))


In [6]:
# seq2 is the start of seq1, scored here with b ions; compare both scoring variants
print(intensity_ion_backbone_score(observed, seq2, 'b', 20))
print(intensity_ion_backbone_score2(observed, seq2, 'b', 20))

0.8571428571428571
Missing 0/3 bonds
0.8571428571428571


In [8]:
# seq3 is the end of seq1, scored here with y ions; compare both scoring variants
print(intensity_ion_backbone_score(observed, seq3, 'y', 20))
print(intensity_ion_backbone_score2(observed, seq3, 'y', 20))

0.8571428571428571
Missing 0/3 bonds
0.8571428571428571


In [9]:
# negative control: seq2 is the start of seq1 but is scored with y ions instead of b ions
print(intensity_ion_backbone_score(observed, seq2, 'y', 20))
print(intensity_ion_backbone_score2(observed, seq2, 'y', 20))

0.0
Missing 3/3 bonds
-0.0


In [7]:
# negative control: seq3 is the end of seq1 but is scored with b ions instead of y ions
intensity_ion_backbone_score(observed, seq3, 'b', 20)

j coverage: 0
extra points: 0
observed / total 0/28 [0%]


0.0

In [8]:
# negative control: seq4 is not part of seq1 at all (b ions)
intensity_ion_backbone_score(observed, seq4, 'b', 20)

j coverage: 0
extra points: 0
observed / total 0/28 [0%]


0.0

In [9]:
# negative control: seq4 is not part of seq1 at all (y ions)
intensity_ion_backbone_score(observed, seq4, 'y', 20)

j coverage: 0
extra points: 0
observed / total 0/28 [0%]


0.0

In [10]:
from src.file_io.mzML import read

mzmlfile = '/Users/zacharymcgrath/Desktop/nod2 data/single/singleRealSpectrum.mzml'

# only the first spectrum in the file is needed; stays None if the file yields nothing
spec = next(iter(read(mzmlfile)), None)
    
print(spec)

Spectrum(spectrum=[70.06517028808594, 74.06005859375, 84.04444122314453, 87.05548095703125, 88.03839111328125, 102.05506896972656, 120.08072662353516, 129.10128784179688, 156.0776824951172, 169.09774780273438, 181.06008911132812, 184.10662841796875, 199.0702667236328, 201.12315368652344, 212.1022186279297, 213.10667419433594, 217.08192443847656, 218.07821655273438, 245.07611083984375, 246.07858276367188, 277.1161193847656, 283.1413879394531, 316.1118469238281, 331.17498779296875, 402.1655578613281, 439.4844665527344, 449.7722473144531, 527.2117309570312, 550.8241577148438, 585.966796875, 591.6425170898438, 591.9791870117188, 592.3104248046875, 607.8389282226562, 608.3428955078125, 615.3208618164062, 615.6561279296875, 615.9962158203125, 634.8439331054688, 643.3582763671875, 643.8055419921875, 643.8550415039062, 644.3562622070312, 674.2733764648438, 707.87841796875, 708.3804931640625, 708.8803100585938, 781.4142456054688, 838.4376220703125, 838.9308471679688], abundance=[2547.6833496093

In [15]:
# scale the score by the sequence length; evaluates left to right as
# (score * len(sequence)) // 2, so the floor division applies to the product
intensity_ion_backbone_score(spec, 'NFEANTTIGRIRFH', 'y', 20) * len('NFEANTTIGRIRFH') // 2

j coverage: 53
extra points: 0
observed / total 7131.206878662109/58779.7287902832 [12%]


45.0

In [16]:
# same length scaling applied to a short sequence; evaluates as (score * len(sequence)) // 2
intensity_ion_backbone_score(spec, 'LPS', 'y', 20) * len('LPS') // 2

j coverage: 50
extra points: 0
observed / total 3846.709716796875/58779.7287902832 [6%]


4.0

In [13]:
# unscaled b ion score of a short sequence against the spectrum read from the mzML file
intensity_ion_backbone_score(spec, 'TDS', 'b', 20)

j coverage: 100
extra points: 0
observed / total 7733.043212890625/58779.7287902832 [13%]


13.1559695358257

In [14]:
# unscaled b ion score of a second short sequence against the same spectrum, for comparison
intensity_ion_backbone_score(spec, 'DEA', 'b', 20)

j coverage: 50
extra points: 0
observed / total 6083.39990234375/58779.7287902832 [10%]


5.174743085365672

In [20]:
# generated doubly charged spectrum for the candidate that ends in ...RFHD
gen_spectrum('EAPNFEANTTIGRIRFHD', charge=2)

{'spectrum': [65.528572935,
  101.047129935,
  149.573511935,
  206.594975435,
  280.129182435,
  344.650478935,
  380.169035935,
  437.190499435,
  487.714338935,
  538.238178435,
  594.780210435,
  623.2909424349999,
  701.3414979349999,
  757.8835299349998,
  835.9340854349998,
  909.4682924349999,
  977.9977484349998,
  1035.5112199349999,
  67.526030285,
  136.055486285,
  209.589693285,
  287.64024878500004,
  344.18228078500005,
  422.2328362850001,
  450.7435682850001,
  507.2856002850001,
  557.8094397850001,
  608.3332792850001,
  665.3547427850001,
  700.8732997850001,
  765.394596285,
  838.9288032850001,
  895.950266785,
  944.4766487850001,
  979.995205785,
  1044.5165022850001],
 'precursor_mass': 1044.5165022499998}

In [21]:
# same residues as 'EAPNFEANTTIGRIRFHD' with the D moved from the end to the front
gen_spectrum('DEAPNFEANTTIGRIRFH', charge=2)

{'spectrum': [58.520747935,
  123.04204443500001,
  158.56060143500002,
  207.08698343500004,
  264.108446935,
  337.642653935,
  402.163950435,
  437.682507435,
  494.703970935,
  545.2278104349999,
  595.7516499349999,
  652.2936819349999,
  680.8044139349998,
  758.8549694349998,
  815.3970014349998,
  893.4475569349997,
  966.9817639349998,
  1035.5112199349999,
  78.54201478499999,
  152.07622178499997,
  230.12677728499997,
  286.66880928499995,
  364.7193647849999,
  393.23009678499994,
  449.77212878499995,
  500.29596828499996,
  550.819807785,
  607.841271285,
  643.3598282849999,
  707.8811247849999,
  781.4153317849999,
  838.4367952849999,
  886.9631772849999,
  922.4817342849999,
  987.003030785,
  1044.516502285],
 'precursor_mass': 1044.5165022499998}