# Pre-processing spectra to make them easier to search

Idea: make one big spectrum, then tag each of its masses with the amino acid sequences that could produce it

Flow: 
1. Load all spectra
2. Make a single large spectrum called `S`
3. Find the longest possible peptide length, `max_length`, from the maximum mass
4. Load the database
5. For each protein `P` of the database
    1. For each kmer `k` of length `max_length`
        1. Calculate each individual spectrum for `(b+, b++, y+, y++)` (we call `ts`)
        2. For each `ts`:
            1. For each mass `m` of this `ts`:
                1. Binary search `S` for `m` plus/minus tolerance
                2. If the mass is found, add `k(m)` to a dictionary, split by ion type, to keep for later
6. Build a MassDawg for both `b` and `y` kmers taken from the search
7. For each input spectrum:
    1. Search both the `b` and `y` MassDawgs for sequences 
    2. Make an alignment
                

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pyteomics import fasta
from collections import defaultdict
from src.sequence.gen_spectra import gen_spectrum, gen_min_ordering
from src.file_io import mzML
from src.objects import Spectrum, Database
from src.utils import ppm_to_da
from src.scoring import scoring
from src.tree import Tree
from src.identfication import alignment

from src.database import extract_protein_name

import bisect

from mass_dawg import PyMassDawg

import numpy as np

ppm_tol = 20

In [2]:
def longest_array(window, length):
    """Return the number of bins needed for a binned spectrum.

    The count is ``ceil(1/window * length * 186.079313)``: ``length`` residues
    budgeted at 186.079313 mass units each, split into bins ``window`` wide.
    NOTE(review): 186.079313 looks like the tryptophan residue mass, i.e. the
    heaviest standard residue - confirm.
    """
    bins_per_unit = 1 / window
    n_bins = np.ceil(bins_per_unit * length * 186.079313)
    return int(n_bins)

def sparse_it(spectrum, window, length):
    """Bin a collection of m/z values into a dense, fixed-length vector.

    Each value ``mz`` lands in bin ``int(mz / window)``, which is set to 1;
    the two neighbouring bins receive a "shoulder" weight of 0.5.

    Args:
        spectrum: iterable of m/z values.
        window: bin width, in the same units as the m/z values.
        length: peptide length used to size the vector via ``longest_array``.

    Returns:
        A 1-D numpy array of ``longest_array(window, length)`` floats.

    Notes:
        * A bin keeps the largest weight written to it, so a shoulder never
          downgrades a real peak that sits in the adjacent bin and the result
          does not depend on the order of ``spectrum``.
        * Bins outside ``[0, len(vector))`` are ignored. Previously a peak in
          bin 0 wrote its left shoulder to the *last* bin (negative index
          wrap-around) and a peak in the last bin raised ``IndexError``.
    """
    sparse = np.zeros(longest_array(window, length))
    n_bins = len(sparse)

    for mz in spectrum:
        mz_direct = int(mz/window)
        weighted_bins = (
            (mz_direct - 1, .5),
            (mz_direct, 1),
            (mz_direct + 1, .5),
        )
        for idx, weight in weighted_bins:
            # skip anything that falls off either end of the vector
            if 0 <= idx < n_bins:
                sparse[idx] = max(sparse[idx], weight)

    return sparse

def to_percent(index, total):
    """Return the whole-number percentage done after finishing item ``index``.

    ``index`` is zero-based, so ``index + 1`` items are complete; the result
    is truncated by ``int``.
    """
    completed = index + 1
    return int(100 * completed / total)

## 1. Load all spectra

In [3]:
# Absolute paths on the original author's machine: point these at your own
# mzML spectra file and FASTA protein database before running the notebook.
spectra_file = '/Users/zacharymcgrath/Desktop/nod2 data/single/singleRealSpectrum.mzml'
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'

In [4]:
# Read the spectra from the mzML file.
# NOTE(review): peak_filter=25 presumably keeps the 25 most abundant peaks per
# spectrum - confirm against mzML.read.
spectra = mzML.read(spectra_file, peak_filter=25)

## 2. Make the single large spectrum

In [5]:
# Pool the peaks of every spectrum into one ascending list so that it can be
# binary-searched later.
all_spectra = [mz for spectrum in spectra for mz in spectrum.spectrum]
all_spectra.sort()

## 3. Find the max length possible peptide

In [6]:
# Upper bound on peptide length: the largest pooled mass divided by 57.021464,
# rounded up. all_spectra is sorted, so [-1] is its maximum.
# NOTE(review): 57.021464 looks like the glycine residue mass (the lightest
# residue) - confirm.
max_len = int(np.ceil(all_spectra[-1]/ 57.021464))

## 4. Load the database into RAM

In [7]:
# Map protein name -> FASTA entry; a later entry with the same name replaces
# an earlier one.
prots = {
    extract_protein_name(entry): entry
    for entry in fasta.read(fasta_file)
}

## 5. The search

In [8]:
%%time
def find_kmer_hits(kmer: str, prot_name: str) -> None:
    """Record the sub-sequences of ``kmer`` whose ion masses occur in the pooled spectrum.

    For every ion type (b, y) and charge (1, 2) the theoretical spectrum of
    ``kmer`` is generated, and each of its masses is looked up in the
    module-level, ascending ``all_spectra`` list within ``ppm_tol`` ppm.
    On a match for the mass at index ``c``:

    * b ions: the prefix ``kmer[:c+1]`` is stored as a key of ``b_hits``;
    * y ions: the suffix ``kmer[-c-1:]`` is stored as a key of ``y_hits``;

    and in both cases the sub-sequence is inserted into the tree ``t`` under
    ``prot_name``.
    NOTE(review): this assumes gen_spectrum returns masses ordered by
    fragment length, shortest first - confirm.

    Args:
        kmer: amino acid sequence to fragment.
        prot_name: name of the protein ``kmer`` was taken from.
    """
    n_peaks = len(all_spectra)

    for ion in 'by':
        for charge in [1, 2]:
            spec = gen_spectrum(kmer, ion=ion, charge=charge)['spectrum']

            for c, mass in enumerate(spec):

                da_tol = ppm_to_da(mass, ppm_tol)
                lb = mass - da_tol
                ub = mass + da_tol

                # Index of the first pooled peak that is >= lb. It is a match
                # only if it exists and is also <= ub. The bound must be
                # `< n_peaks` (not `+ 1 <`), otherwise a match on the very
                # last pooled peak is silently dropped.
                beginning_entry = bisect.bisect_left(all_spectra, lb)
                if beginning_entry >= n_peaks or all_spectra[beginning_entry] > ub:
                    continue

                if ion == 'b':
                    hit = kmer[:c+1]
                    b_hits[hit] = None
                else:
                    hit = kmer[-c-1:]
                    y_hits[hit] = None
                t.insert(prot_name, hit)

# Dicts used as insertion-ordered sets of matching sub-sequences (values unused).
b_hits = {}
y_hits = {}
plen = len(prots)

# Tree that find_kmer_hits fills with (protein name, sub-sequence) pairs.
t = Tree()

for i, (name, prot_entry) in enumerate(prots.items()):

    print(f'\rOn protein {i+1}/{plen} [{int((i+1) * 100 / plen)}%]', end='')

    sequence = prot_entry.sequence
    seq_len = len(sequence)
    # Index at which a full max_len window stops fitting. Clamped at 0 so that
    # proteins shorter than max_len do not produce negative slice starts,
    # which wrap around and re-search the same suffixes.
    tail_start = max(0, seq_len - max_len)

    # Leading prefixes shorter than max_len. The upper bound is capped at the
    # sequence length so the whole sequence is not searched repeatedly.
    for j in range(1, min(max_len, seq_len + 1)):
        kmer = sequence[:j]
        find_kmer_hits(kmer, name)

    # Every full-length window except those handled by the suffix loop.
    for j in range(tail_start):
        kmer = sequence[j:j+max_len]
        find_kmer_hits(kmer, name)

    # Trailing suffixes, from max_len residues long down to a single residue.
    for j in range(tail_start, seq_len):
        kmer = sequence[j:]
        find_kmer_hits(kmer, name)
                    

On protein 279/279 [100%]CPU times: user 15 s, sys: 183 ms, total: 15.2 s
Wall time: 15.2 s


## 6. Score all of the remaining sequences against each spectrum

In [9]:
%%time

# spectrum index -> list of (kmer, score) pairs, one dict per ion type
b_results = defaultdict(list)
y_results = defaultdict(list)

n_b_hits = len(b_hits)
for i, b_hit in enumerate(b_hits):
    print(f'\rOn b-ion kmer {i+1}/{n_b_hits} [{to_percent(i, n_b_hits)}%]', end='')

    for j, spectrum in enumerate(spectra):
        # element 0 of score_subsequence's result is taken as the b score
        b_score = scoring.score_subsequence(spectrum.spectrum, b_hit, ppm_tol)[0]
        b_results[j].append((b_hit, b_score))

# move to a fresh line between the two progress displays
print()

n_y_hits = len(y_hits)
for i, y_hit in enumerate(y_hits):
    print(f'\rOn y-ion kmer {i+1}/{n_y_hits} [{to_percent(i, n_y_hits)}%]', end='')

    for j, spectrum in enumerate(spectra):
        # element 1 of score_subsequence's result is taken as the y score
        y_score = scoring.score_subsequence(spectrum.spectrum, y_hit, ppm_tol)[1]
        y_results[j].append((y_hit, y_score))

On b-ion kmer 1/28587 [0%]On b-ion kmer 2/28587 [0%]On b-ion kmer 3/28587 [0%]On b-ion kmer 4/28587 [0%]On b-ion kmer 5/28587 [0%]On b-ion kmer 6/28587 [0%]On b-ion kmer 7/28587 [0%]On b-ion kmer 8/28587 [0%]On b-ion kmer 9/28587 [0%]On b-ion kmer 10/28587 [0%]On b-ion kmer 11/28587 [0%]On b-ion kmer 12/28587 [0%]On b-ion kmer 13/28587 [0%]On b-ion kmer 14/28587 [0%]On b-ion kmer 15/28587 [0%]On b-ion kmer 16/28587 [0%]On b-ion kmer 17/28587 [0%]On b-ion kmer 18/28587 [0%]On b-ion kmer 19/28587 [0%]On b-ion kmer 20/28587 [0%]On b-ion kmer 21/28587 [0%]On b-ion kmer 22/28587 [0%]On b-ion kmer 23/28587 [0%]On b-ion kmer 24/28587 [0%]On b-ion kmer 25/28587 [0%]On b-ion kmer 26/28587 [0%]On b-ion kmer 27/28587 [0%]On b-ion kmer 28/28587 [0%]On b-ion kmer 29/28587 [0%]On b-ion kmer 30/28587 [0%]On b-ion kmer 31/28587 [0%]On b-ion kmer 32/28587 [0%]On b-ion kmer 33/28587 [0%]On b-ion kmer 34/28587 [0%]On b-ion kmer 35/28587 [0%]On b-ion kmer 36/28587 [0%]

On b-ion kmer 28587/28587 [100%]
On y-ion kmer 26679/26679 [100%]CPU times: user 25.7 s, sys: 1.96 s, total: 27.7 s
Wall time: 29.5 s


In [10]:
# number of best-scoring kmers shown per ion type (also read by later cells)
x = 10
for i, _spectrum in enumerate(spectra):
    # rank each spectrum's candidates in place, best score first
    ranked_b = b_results[i]
    ranked_y = y_results[i]
    ranked_b.sort(key=lambda pair: pair[1], reverse=True)
    ranked_y.sort(key=lambda pair: pair[1], reverse=True)

    print(f'============================ spectrum {i}')
    print(f'b results: \n{ranked_b[:x]}')
    print(f'y results: \n{ranked_y[:x]}')

b results: 
[('EVDICTVGL', 5), ('KKLDLNCD', 4), ('LSEGLTTLD', 4), ('HRYVEVF', 4), ('EVVEEAENGRDAPA', 4), ('KTFSHELS', 4), ('LSISADIET', 4), ('LSISADIETIGE', 4), ('KAAGWSELS', 4), ('KKDRESGE', 4)]
y results: 
[('PVNSPMTKG', 5), ('RAAQGRAYGNLGNTHYL', 4), ('SRPEDKVT', 3), ('GDLGNVTAGK', 3), ('NSPMTKG', 3), ('SDLGTKLQDPRVMTTLS', 3), ('DLTPKDIE', 3), ('SFVHLESL', 3), ('AVFSPSRSFVHLESL', 3), ('SSMAYPNLVAMASQ', 3)]
b results: 
[('SSPENIL', 4), ('ADAL', 3), ('DAAD', 3), ('ADALASAAGHL', 3), ('DPVPLPN', 3), ('ADALQAGAS', 3), ('ADALQAGASQ', 3), ('ASHVPTLQVLRP', 3), ('SASA', 3), ('SASADLSRSKTTSA', 3)]
y results: 
[('AIVGYK', 6), ('IVGYK', 5), ('VGYK', 4), ('ALTFAK', 4), ('VQLYK', 3), ('GYK', 3), ('GVYK', 3), ('SGVYK', 3), ('LAAFTK', 3), ('HFHAGYK', 3)]
b results: 
[('TVAGLSTHALCHTRL', 4), ('TVPPAAPAGEGGPPAPPPN', 4), ('TTTTTFKG', 4), ('SSPNGILLFREA', 4), ('TTALLKIDITDTE', 4), ('SLTVDVTSPASKV', 4), ('TDKQEKKEVPKC', 4), ('TTWLQWASLLFVDN', 4), ('SSLISALLGQMQ', 4), ('SIQRSAPGGGGKRY', 4)]
y results: 
[(

In [11]:
%%time 

# Bin width used by sparse_it when turning m/z lists into dense vectors.
window = .02

# spectrum index -> list of (kmer, score) pairs, filled through add_hit.
b_results_sparse = defaultdict(list)
y_results_sparse = defaultdict(list)

def add_hit(kmer, result, ion):
    """Store one kmer's per-spectrum scores in the sparse result dicts.

    ``result`` holds one score per spectrum, in spectrum order. Each score is
    appended as ``(kmer, score)`` under its spectrum index, to
    ``b_results_sparse`` when ``ion == 'b'`` and to ``y_results_sparse``
    for any other value of ``ion``.
    """
    if ion == 'b':
        res_dict = b_results_sparse
    else:
        res_dict = y_results_sparse

    for spectrum_idx, score in enumerate(result):
        res_dict[spectrum_idx].append((kmer, score))

# One binned row per observed spectrum, all rows the same width.
n_bins = longest_array(window, max_len)
sparsified_spectra = np.zeros((len(spectra), n_bins))
for i, spectrum in enumerate(spectra):
    sparsified_spectra[i] = sparse_it(spectrum.spectrum, window, max_len)

n_b_hits = len(b_hits)
for i, b_hit in enumerate(b_hits):
    print(f'\rOn b-ion kmer {i+1}/{n_b_hits} [{to_percent(i, n_b_hits)}%]', end='')
    b_spectrum = gen_spectrum(b_hit, ion='b')['spectrum']
    sparse_b = sparse_it(b_spectrum, window, max_len)
    # dot product of every observed row with the theoretical vector:
    # one score per spectrum
    res = sparsified_spectra.dot(sparse_b)
    add_hit(b_hit, res, 'b')

# move to a fresh line between the two progress displays
print()

n_y_hits = len(y_hits)
for i, y_hit in enumerate(y_hits):
    print(f'\rOn y-ion kmer {i+1}/{n_y_hits} [{to_percent(i, n_y_hits)}%]', end='')
    y_spectrum = gen_spectrum(y_hit, ion='y')['spectrum']
    sparse_y = sparse_it(y_spectrum, window, max_len)
    res = sparsified_spectra.dot(sparse_y)
    add_hit(y_hit, res, 'y')


On b-ion kmer 28587/28587 [100%]
On y-ion kmer 26679/26679 [100%]CPU times: user 1min 53s, sys: 13.5 s, total: 2min 6s
Wall time: 40.3 s


In [12]:
# number of best-scoring kmers shown per ion type (also read by later cells)
x = 10
for i, _spectrum in enumerate(spectra):
    # rank each spectrum's sparse-scored candidates in place, best first
    ranked_b = b_results_sparse[i]
    ranked_y = y_results_sparse[i]
    ranked_b.sort(key=lambda pair: pair[1], reverse=True)
    ranked_y.sort(key=lambda pair: pair[1], reverse=True)

    print(f'============================ spectrum {i}')
    print(f'b results: \n{ranked_b[:x]}')
    print(f'y results: \n{ranked_y[:x]}')

b results: 
[('KTFSHELS', 6.0), ('KAAGWSELS', 6.0), ('EVDICTVGL', 6.0), ('NWRENEYLTLQVPAF', 5.5), ('HRYVEVF', 5.5), ('YHRFTLDEKNY', 5.5), ('WRNVNGVNYASVTRNQ', 5.5), ('KYTGSHKE', 5.5), ('KDGLEMEK', 5.5), ('LSEGLTTLD', 5.0)]
y results: 
[('PVNSPMTKG', 7.5), ('HFNVSQVT', 5.5), ('YDRPGAG', 5.25), ('PTFQFYK', 5.0), ('SSQSRRLDDQRASVGSL', 5.0), ('SNQPEGVSI', 5.0), ('ECGHLRAQLEEQG', 4.75), ('NSPMTKG', 4.5), ('SSMAYPNLVAMASQ', 4.5), ('NLGNTHYL', 4.5)]
b results: 
[('SAPLPSATAH', 5.5), ('ADAL', 4.75), ('ADALASAAGHL', 4.75), ('ADALQAGAS', 4.75), ('ADALQAGASQ', 4.75), ('ASHVPTLQVLRP', 4.75), ('QELAKY', 4.75), ('ADAI', 4.75), ('ADAINT', 4.75), ('ADAINTE', 4.75)]
y results: 
[('AIVGYK', 9.0), ('IVGYK', 7.5), ('VGYK', 6.0), ('ALTFAK', 6.0), ('NFELRYK', 5.5), ('VQLYK', 4.5), ('GYK', 4.5), ('GVYK', 4.5), ('SGVYK', 4.5), ('NEETIK', 4.5)]
b results: 
[('TVPPAAPAGEGGPPAPPPN', 6.0), ('SLTVDVTSPASKV', 6.0), ('TDKQEKKEVPKC', 6.0), ('TVAGLSTHALCHTRL', 5.5), ('SELACIYSALILH', 5.5), ('SDKEGHKYVTVVANF', 5.5), ('

## 7. For each spectrum, search the graphs and build an alignment

In [11]:
# spectrum index -> whatever alignment.attempt_alignment returns for it
alignments = {}
gap = 2
db = Database(fasta_file, prots, t)

# NOTE: an earlier, now-disabled variant of this cell took its candidates from
# db.b_dawg / db.y_dawg via fuzzy_search(spectrum.spectrum, gap, ppm_tol) and
# kept only the kmers tied for the best score_subsequence score. `gap` is only
# used by that variant.
for spec_c, spectrum in enumerate(spectra):
    # Candidates are the first `x` entries of each per-spectrum result list.
    # This relies on `x` and on the lists having been sorted by an earlier cell.
    top_b_kmers = [pair[0] for pair in b_results[spec_c][:x]]
    top_y_kmers = [pair[0] for pair in y_results[spec_c][:x]]

    alignments[spec_c] = alignment.attempt_alignment(
        spectrum,
        db,
        top_b_kmers,
        top_y_kmers,
        ppm_tolerance=ppm_tol,
        n=3
    )

In [12]:
# Header row, then one line per alignment, grouped by spectrum index.
print('Sequence \t | \t scores \t | \t precursor distance')
for i in alignments:
    print(f'Alignments for sequence {i}')
    for a in alignments[i]:
        scores = f'b: {a.b_score}, y: {a.y_score}'
        print(f'{a.sequence} \t | \t {scores} \t | \t {a.precursor_distance}')

Sequence 	 | 	 scores 	 | 	 precursor distance
Alignments for sequence 0
LPVNSPMTKGD 	 | 	 b: 0, y: 0 	 | 	 0.00045174999991104414
EEAENGRDAPA 	 | 	 b: 0, y: 0 	 | 	 0.040534250000064276
Alignments for sequence 1
DAAIVGYK 	 | 	 b: 3, y: 6 	 | 	 0.0012217500000701875
ADALVGYK 	 | 	 b: 3, y: 6 	 | 	 0.0012217500000701875
DAADVGYK 	 | 	 b: 3, y: 4 	 | 	 0.9726612500000442
Alignments for sequence 2
Alignments for sequence 3
DPQVEQLEL 	 | 	 b: 6, y: 2 	 | 	 0.000672749999921507
DFLDSFSEI 	 | 	 b: 0, y: 4 	 | 	 0.9727772500000356
DFLESYGLE 	 | 	 b: 0, y: 3 	 | 	 0.9727732500000457
Alignments for sequence 4
DLQTLALWSRM 	 | 	 b: 4, y: 8 	 | 	 0.0001922500000546279
DLQTLAWSRMD 	 | 	 b: 5, y: 0 	 | 	 0.9716317500000287
DLQTLADKRWS 	 | 	 b: 4, y: 0 	 | 	 0.5011292499999627
Alignments for sequence 5
DLTEYLSRFGEVV 	 | 	 b: 5, y: 5 	 | 	 0.000567749999845546
TVEEYLSRFGEVV 	 | 	 b: 5, y: 5 	 | 	 0.000567749999845546
LSEEELKQLEPDV 	 | 	 b: 4, y: 4 	 | 	 0.5020087500001864
Alignments for sequence 6
ELT