# Pre-processing spectra to make them easier to search

Idea: build one big spectrum, then tag its masses with the amino acid sequences that can produce them

Flow: 
1. Load all spectra
2. Make a single large spectrum called `S`
3. Find the maximum possible peptide length, `max_length`, from the maximum mass
4. Load the database
5. For each protein `P` of the database
    1. For each kmer `k` of length `max_length`
        1. Calculate each individual spectrum for `(b+, b++, y+, y++)` (we call `ts`)
        2. For each `ts`:
            1. For each mass `m` of this `ts`:
                1. Binary search `S` for `m` plus/minus tolerance
                2. If the mass is found, add `k(m)` (the part of `k` that produces `m`) to a dictionary for later use, keeping a separate dictionary per ion type
6. Build a MassDawg for both `b` and `y` kmers taken from the search
7. For each input spectrum:
    1. Search both the `b` and `y` MassDawgs for sequences 
    2. Make an alignment
                

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pyteomics import fasta
from collections import defaultdict
from src.sequence.gen_spectra import gen_spectrum, gen_min_ordering
from src.file_io import mzML
from src.objects import Spectrum
from src.utils import ppm_to_da
from src.scoring import scoring

import bisect

from mass_dawg import PyMassDawg

import numpy as np

ppm_tol = 20

In [2]:
def longest_array(window, length):
    """Return how many bins of width ``window`` a sparse spectrum array needs.

    The size is ``length * 186.079313 / window`` rounded up to an integer.
    186.079313 is presumably the heaviest amino acid residue mass
    (tryptophan), making this an upper bound for a peptide of ``length``
    residues -- TODO confirm.

    Args:
        window: Width of one bin, in the same units as the masses.
        length: Number of residues to size the array for.

    Returns:
        The number of bins as a Python ``int``.
    """
    # Keep the operand order so floating-point rounding stays the same.
    bins_needed = 1/window * length * 186.079313
    return int(np.ceil(bins_needed))

def sparse_it(spectrum, window, length):
    """Bin a spectrum into a 0/1 occupancy array.

    Args:
        spectrum: Iterable of m/z values.
        window: Width of one bin; a value ``mz`` lands in bin ``int(mz / window)``.
        length: Passed to ``longest_array`` to size the output array.

    Returns:
        A NumPy float array of zeros with a 1 in every bin that contains at
        least one value from ``spectrum``.
    """
    occupancy = np.zeros(longest_array(window, length))
    for peak in spectrum:
        # int() truncates toward zero, mapping the value to its bin index.
        occupancy[int(peak/window)] = 1

    return occupancy

## 1. Load all spectra

In [3]:
# Input locations. These are machine-specific absolute paths; point them at
# local copies of the data before running.
# mzML file holding the spectra to search.
spectra_file = '/Users/zacharymcgrath/Desktop/nod2 data/single/singleRealSpectrum.mzml'
# FASTA protein database that the spectra are searched against.
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'

In [4]:
# Read the spectra from the mzML file. peak_filter=25 is forwarded to
# mzML.read (presumably keeps the 25 most abundant peaks per spectrum -- TODO confirm).
spectra = mzML.read(spectra_file, peak_filter=25)

## 2. Make the single large spectrum

In [5]:
# Pool every peak of every loaded spectrum into one ascending list so it can
# be binary-searched later.
all_spectra = sorted(
    peak
    for spectrum in spectra
    for peak in spectrum.spectrum
)

## 3. Find the maximum possible peptide length

In [6]:
# Upper bound on peptide length: the largest pooled mass (all_spectra is sorted
# ascending, so it is the last element) divided by 57.021464 and rounded up.
# 57.021464 is presumably the glycine residue mass, i.e. the lightest amino
# acid -- TODO confirm. Requires at least one peak: [-1] raises IndexError on
# an empty list.
max_len = int(np.ceil(all_spectra[-1]/ 57.021464))

## 4. Load the database into RAM

In [7]:
# Hold the whole FASTA database in memory, keyed by each entry's 0-based
# position in the file.
prots = dict(enumerate(fasta.read(fasta_file)))

## 5. The search

In [8]:
# b_hits / y_hits are dicts used as insertion-ordered sets (every value is
# None). Their keys are the kmer prefixes (b ions) or suffixes (y ions) that
# have at least one theoretical mass within ppm_tol of a pooled observed mass.
b_hits = {}
y_hits = {}
plen = len(prots)
num_observed = len(all_spectra)  # loop-invariant, so computed once

for i, prot_entry in prots.items():

    print(f'\rOn protein {i+1}/{plen} [{int((i+1) * 100 / plen)}%]', end='')

    sequence = prot_entry.sequence
    if not sequence:
        continue

    # A sequence of length n has n - max_len + 1 windows of length max_len;
    # without the "+ 1" the final window is never searched. max(..., 1) makes
    # a protein shorter than max_len contribute one (shorter) window instead
    # of being skipped entirely.
    # NOTE(review): windows truncated at the protein termini are still not
    # generated, so b subsequences starting in the last max_len - 1 residues
    # and y subsequences ending in the first max_len - 1 residues are not
    # searched -- confirm whether that is intended.
    for j in range(max(len(sequence) - max_len + 1, 1)):
        kmer = sequence[j:j+max_len]

        for ion in 'by':
            for charge in [1, 2]:
                spec = gen_spectrum(kmer, ion=ion, charge=charge)['spectrum']

                # Assumes the c-th mass of spec belongs to the ion made of
                # c + 1 residues -- TODO confirm against gen_spectrum.
                for c, mass in enumerate(spec):

                    da_tol = ppm_to_da(mass, ppm_tol)
                    lb = mass - da_tol
                    ub = mass + da_tol

                    # Index of the first observed mass >= lb. That element is
                    # the smallest candidate, so a match exists exactly when
                    # it exists and is <= ub.
                    beginning_entry = bisect.bisect_left(all_spectra, lb)

                    # Bound check is "index < length" (not "index + 1 <
                    # length") so a match on the largest observed mass counts.
                    if beginning_entry < num_observed and all_spectra[beginning_entry] <= ub:
                        if ion == 'b':
                            b_hits[kmer[:c+1]] = None
                        else:
                            y_hits[kmer[-c-1:]] = None
                    

On protein 279/279 [100%]

## 6. Build the MassDawgs for the different ions

In [9]:
# One empty MassDawg per ion series; they are populated from b_hits and y_hits
# in the cells that follow.
b_dog = PyMassDawg()
y_dog = PyMassDawg()

In [10]:
# Populate the b-ion MassDawg. Hits are inserted in the order given by
# gen_min_ordering (presumably the insertion order the DAWG construction
# requires -- TODO confirm), each with its singly and doubly charged b spectra.
for kmer in sorted(b_hits, key=gen_min_ordering):
    singly_charged = gen_spectrum(kmer, ion='b', charge=1)['spectrum']
    doubly_charged = gen_spectrum(kmer, ion='b', charge=2)['spectrum']
    b_dog.insert(singly_charged, doubly_charged, kmer)

# No more insertions after this point.
b_dog.finish()

In [11]:
# Populate the y-ion MassDawg. Each hit is stored reversed (kmer[::-1]), so
# the sort key is computed on the reversed sequence as well; the spectra are
# generated from the sequence in its original orientation.
for kmer in sorted(y_hits, key=lambda hit: gen_min_ordering(hit[::-1])):
    singly_charged = gen_spectrum(kmer, ion='y', charge=1)['spectrum']
    doubly_charged = gen_spectrum(kmer, ion='y', charge=2)['spectrum']
    y_dog.insert(singly_charged, doubly_charged, kmer[::-1])

# No more insertions after this point.
y_dog.finish()

## 7. For each spectrum, search the graphs and build an alignment

In [20]:
# NOTE(review): only the spectrum at index 1 is searched, although the section
# heading says "for each spectrum" -- confirm this is intentional.
query_spectrum = spectra[1].spectrum

# Candidate sequences from the b-ion MassDawg. The meaning of the second
# argument (1) is not visible here -- see PyMassDawg.fuzzy_search.
b_search_results = b_dog.fuzzy_search(query_spectrum, 1, ppm_tol)

# Pair each candidate with element [0] of score_subsequence's result
# (presumably the b-ion score -- TODO confirm), highest score first.
sorted_b_results = sorted(
    [
        (kmer, scoring.score_subsequence(query_spectrum, kmer, ppm_tol)[0])
        for kmer in b_search_results
    ],
    key=lambda x: x[1],
    reverse=True,
)

# Keep only the top-scoring candidates. When the search returns nothing,
# indexing [0] would raise IndexError, so fall back to None and an empty list.
max_score = sorted_b_results[0][1] if sorted_b_results else None
filtered_b_results = [x for x in sorted_b_results if x[1] == max_score]
print(filtered_b_results)

[('SAGT', 3), ('SATG', 3), ('GEAD', 3), ('EGAD', 3), ('ADAD', 3), ('ADAL', 3), ('GEAI', 3), ('SSAA', 3), ('DAAL', 3), ('EGAL', 3), ('SAAS', 3), ('SSVP', 3), ('GEAL', 3), ('DAAI', 3), ('SASA', 3), ('ADAI', 3), ('EGAI', 3), ('DAAD', 3), ('SSPV', 3)]


In [21]:
# NOTE(review): only the spectrum at index 1 is searched, although the section
# heading says "for each spectrum" -- confirm this is intentional.
query_spectrum = spectra[1].spectrum

# Sequences were inserted into the y-ion MassDawg reversed, so each search
# result is reversed again to restore the original orientation. The meaning
# of the second argument (1) is not visible here -- see PyMassDawg.fuzzy_search.
y_search_results = [x[::-1] for x in y_dog.fuzzy_search(query_spectrum, 1, ppm_tol)]

# Pair each candidate with element [1] of score_subsequence's result
# (presumably the y-ion score -- TODO confirm), highest score first.
sorted_y_results = sorted(
    [
        (kmer, scoring.score_subsequence(query_spectrum, kmer, ppm_tol)[1])
        for kmer in y_search_results
    ],
    key=lambda x: x[1],
    reverse=True,
)

# Keep only the top-scoring candidates. When the search returns nothing,
# indexing [0] would raise IndexError, so fall back to None and an empty list.
max_score = sorted_y_results[0][1] if sorted_y_results else None
filtered_y_results = [x for x in sorted_y_results if x[1] == max_score]
print(filtered_y_results)

[('AIVGYK', 6)]
