# Pre-processing spectra to make them easier to search

Idea: make one big spectrum whose masses we will then tag with amino acid sequences

Flow: 
1. Load all spectra
2. Make a single large spectrum called `S`
3. Find the `max_length` peptide from the maximum mass
4. Load the database
5. For each protein `P` of the database
    1. For each kmer `k` of length `max_length`
        1. Calculate each individual spectrum for `(b+, b++, y+, y++)` (we call `ts`)
        2. For each `ts`:
            1. For each mass `m` of this `ts`:
                1. Binary search `S` for `m` plus/minus tolerance
                2. If the mass is found, add `k(m)` to a dictionary (kept separately per ion type) for later use
6. Build a MassDawg for both `b` and `y` kmers taken from the search
7. For each input spectrum:
    1. Search both the `b` and `y` MassDawgs for sequences 
    2. Make an alignment
                

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pyteomics import fasta
from collections import defaultdict
from src.sequence.gen_spectra import gen_spectrum, gen_min_ordering
from src.file_io import mzML
from src.objects import Spectrum, Database
from src.utils import ppm_to_da
from src.scoring import scoring, mass_comparisons
from src.tree import Tree
from src.identfication import alignment
from src.database import extract_protein_name

import bisect

from mass_dawg import PyMassDawg
from math import ceil

import numpy as np

ppm_tol = 20

In [2]:
def longest_array(window, length):
    """Return how many bins a binned (sparse) spectrum needs.

    Args:
        window: width of one bin; the mass axis is divided by this value.
        length: number of residues the longest sequence may have.

    Returns:
        ``ceil((1 / window) * length * 186.079313)`` as a plain ``int``.
    """
    # NOTE(review): 186.079313 looks like the monoisotopic residue mass of
    # tryptophan (the heaviest standard amino acid), i.e. an upper bound on the
    # mass contributed per residue -- confirm.
    bins_per_mass_unit = 1 / window
    bin_count = bins_per_mass_unit * length * 186.079313
    return int(np.ceil(bin_count))

def sparse_it(spectrum, window, length):
    """Turn a list of m/z values into a binned 0/1 vector.

    Args:
        spectrum: iterable of m/z values.
        window: bin width; an m/z value lands in bin ``int(mz / window)``.
        length: passed to ``longest_array`` to size the output vector.

    Returns:
        A 1-D numpy array of zeros with a 1 in every bin that holds a peak.
    """
    binned = np.zeros(longest_array(window, length))
    # Bin indices are computed lazily so each peak is binned and written in turn.
    bin_indices = (int(peak / window) for peak in spectrum)
    for bin_index in bin_indices:
        binned[bin_index] = 1
        # Giving the two neighbouring bins a weight of .5 was tried earlier
        # and is intentionally left disabled.

    return binned

def to_percent(index, total):
    """Convert a zero-based loop index into a whole-number percent complete.

    Args:
        index: zero-based position of the item being processed.
        total: number of items overall.

    Returns:
        ``(index + 1) / total`` expressed as a percentage, truncated to ``int``.
    """
    completed = index + 1
    return int(completed * 100 / total)

## 1. Load all spectra

In [3]:
# Input files for this run. These are absolute paths on the original author's
# machine; edit DATA_DIR to point at a local copy of the data.
DATA_DIR = '/Users/zacharymcgrath/Desktop/nod2 data'
spectra_file = DATA_DIR + '/single/singleRealSpectrum.mzml'
fasta_file = DATA_DIR + '/filteredNOD2.fasta'

In [4]:
# Read every spectrum from the mzML file.
# NOTE(review): peak_filter=30 presumably keeps only 30 peaks per spectrum
# (the reported average below is exactly 30.0) -- confirm against mzML.read.
spectra = mzML.read(spectra_file, peak_filter=30)
peak_counts = [len(loaded.spectrum) for loaded in spectra]
print(f'Average length of spectra: {np.mean(peak_counts)}')

Average length of spectra: 30.0


## 2. Make the single large spectrum

In [5]:
# Pool the peaks of every input spectrum into one ascending list so it can be
# binary-searched later.
all_spectra = [mz for spectrum in spectra for mz in spectrum.spectrum]
all_spectra.sort()

## 3. Find the max length possible peptide

In [6]:
# all_spectra is sorted ascending, so its last entry is the largest observed mass.
# NOTE(review): 57.021464 looks like the monoisotopic residue mass of glycine
# (the lightest amino acid), which would make this an upper bound on peptide
# length -- confirm.
largest_observed_mass = all_spectra[-1]
max_len = int(np.ceil(largest_observed_mass / 57.021464))

## 4. Load the database into RAM

In [7]:
# Map protein name -> FASTA entry for every record in the database file.
# The reader is used as a context manager so the underlying file handle is
# closed as soon as the entries have been read.
with fasta.read(fasta_file) as fasta_entries:
    prots = {extract_protein_name(entry): entry for entry in fasta_entries}

## 5. The search

In [8]:
%%time
def find_kmer_hits(kmer: str, prot_name: str) -> None:
    """Record every prefix/suffix of ``kmer`` whose theoretical ion mass was observed.

    For each ion type (b, y) and charge (1, 2) the theoretical spectrum of
    ``kmer`` is generated. Each theoretical mass is looked up in the sorted
    global ``all_spectra`` within ``ppm_tol``. On a match, the sub-sequence
    for that mass is stored in the global ``b_hits`` or ``y_hits`` dict and
    inserted into the global tree ``t`` under ``prot_name``. The sub-sequence
    is the prefix ``kmer[:c+1]`` for b ions and the suffix ``kmer[-c-1:]``
    for y ions, where ``c`` is the position of the mass in the spectrum.

    Args:
        kmer: amino acid sequence to generate theoretical spectra for.
        prot_name: name of the protein the kmer was taken from.
    """
    for ion in 'by':
        for charge in [1, 2]:
            spec = gen_spectrum(kmer, ion=ion, charge=charge)['spectrum']

            for c, mass in enumerate(spec):
                da_tol = ppm_to_da(mass, ppm_tol)
                lb = mass - da_tol
                ub = mass + da_tol

                # Index of the first observed mass that is >= lb. It is a hit
                # only if that index exists and the value there is also <= ub.
                # The bound must be `< len(...)`: requiring `index + 1 < len`
                # would make the largest observed mass impossible to match.
                first_candidate = bisect.bisect_left(all_spectra, lb)
                if first_candidate >= len(all_spectra) or all_spectra[first_candidate] > ub:
                    continue

                if ion == 'b':
                    hit = kmer[:c+1]
                    b_hits[hit] = None
                else:
                    hit = kmer[-c-1:]
                    y_hits[hit] = None
                t.insert(prot_name, hit)

# Sequences whose theoretical masses matched an observed mass. Dicts are used
# as sets here; the values are always None.
b_hits = {}
y_hits = {}
plen = len(prots)

# Tree that find_kmer_hits fills with (protein name, matched sequence) pairs.
t = Tree()

for i, (name, prot_entry) in enumerate(prots.items()):
    print(f'\rOn protein {i+1}/{plen} [{int((i+1) * 100 / plen)}%]', end='')

    sequence = prot_entry.sequence
    seq_len = len(sequence)
    # Start of the last full-length window. Clamped at 0 so that proteins
    # shorter than max_len do not produce negative slice starts, which would
    # wrap around and search the same suffixes twice.
    last_window_start = max(seq_len - max_len, 0)

    # Prefixes shorter than a full window. The upper bound is clamped to the
    # sequence length so a short protein is not re-searched once per extra j.
    for j in range(1, min(max_len, seq_len + 1)):
        find_kmer_hits(sequence[:j], name)

    # Every full-length window except the last one.
    for j in range(last_window_start):
        find_kmer_hits(sequence[j:j+max_len], name)

    # The last full window and every shorter suffix after it.
    for j in range(last_window_start, seq_len):
        find_kmer_hits(sequence[j:], name)
                    

On protein 279/279 [100%]CPU times: user 14.1 s, sys: 117 ms, total: 14.2 s
Wall time: 14.2 s


## 6. Score all of the remaining sequences against each spectrum

In [9]:
# %%time

# bt = MassTree(ppm_tol)
# yt = MassTree(ppm_tol)

# for i, b_hit in enumerate(b_hits):
#     print(f'\rOn b-ion kmer {i+1}/{len(b_hits)} [{to_percent(i, len(b_hits))}%]', end='')
    
#     bs = gen_spectrum(b_hit, ion='b', charge=1)['spectrum']
#     bd = gen_spectrum(b_hit, ion='b', charge=2)['spectrum']
#     bt.add_sequence(b_hit, bs, bd)
        
# print()
# for i, y_hit in enumerate(y_hits):
#     print(f'\rOn y-ion kmer {i+1}/{len(y_hits)} [{to_percent(i, len(y_hits))}%]', end='')
    
#     ys = gen_spectrum(y_hit, ion='y', charge=1)['spectrum']
#     yd = gen_spectrum(y_hit, ion='y', charge=2)['spectrum']
#     yt.add_sequence(y_hit[::-1], ys, yd)
    

In [10]:
# x = 10
# for i in range(len(spectra)):
#     b_results[i].sort(key=lambda x: x[1], reverse=True)
#     y_results[i].sort(key=lambda x: x[1], reverse=True)
#     print(f'============================ spectrum {i}')
#     print(f'b results: \n{b_results[i][:x]}')
#     print(f'y results: \n{y_results[i][:x]}')
    

In [11]:
# # do it the np dot product way
# window = 1
# sparse_spectra = np.zeros((len(spectra), longest_array(window, max_len)))

# for i, spectrum in enumerate(spectra):
#     sparse_spectra[i] = sparse_it(spectrum.spectrum, window, max_len)

In [12]:
# Score tables filled below: spectrum index -> list of (kmer, score) pairs,
# one table per ion type.
scored_b_hits = defaultdict(list)
scored_y_hits = defaultdict(list)

def add_hit(results, kmer, r_d):
    """File one kmer's scores under the spectrum each score belongs to.

    Args:
        results: iterable of scores; the n-th score belongs to spectrum n.
        kmer: the sequence that produced the scores.
        r_d: mapping from spectrum index to a list; ``(kmer, score)`` is
            appended to ``r_d[n]`` for every score.
    """
    position = 0
    for score in results:
        r_d[position].append((kmer, score))
        position += 1
        
# Score every b hit, then every y hit, against every input spectrum and file
# the scores per spectrum. Both ion types share the same loop; a newline is
# printed before the y pass so the two progress lines do not overwrite each other.
for ion, hits, scored in (('b', b_hits, scored_b_hits), ('y', y_hits, scored_y_hits)):
    if ion == 'y':
        print()
    hit_count = len(hits)
    for i, hit in enumerate(hits):
        print(f'\rScoring {ion}_hit {i+1}/{hit_count} [{to_percent(i, hit_count)}%]', end='')
        results = [
            scoring.intensity_ion_backbone_score(spectrum, hit, ion, ppm_tol)
            for spectrum in spectra
        ]
        add_hit(results, hit, scored)

Scoring b_hit 33352/33352 [100%]
Scoring y_hit 31324/31324 [100%]

In [13]:
# Bundle the FASTA path, the loaded protein entries and the tree filled during
# the kmer search into the Database object that is handed to the alignment step.
db = Database(fasta_file, prots, t)

## 7. For each spectrum, search the graphs and build an alignment

In [14]:
# spectrum index -> result of alignment.attempt_alignment for that spectrum
alignments = {}

# satisfied_percent and max_gap are only referenced by the disabled
# tree/graph search kept in comments below.
satisfied_percent = .2
max_gap = 2
# number of longest b and y candidates handed to the aligner per spectrum
TOP_X = 10

for i, spectrum in enumerate(spectra):
    print(f'Creating an alignment for spectrum {i+1}/{len(spectra)} [{to_percent(i, len(spectra))}%]\r', end='')
    
# TREE/GRAPH SITUATION
#     max_len = int(ceil(spectrum.spectrum[-1]/ 57.021464))
#     b_satisfied, y_satisfied, satisfied = False, False, False
#     b_hits, y_hits = [], []
#     gap = 2
    

#     while gap <= max_gap and not satisfied:
#         print('\nIN THE WHILE LOOP')
#         print(f'Current gap is {gap}')
        
#         if not b_satisfied:
#             b_hits = bt.fuzzy_search(spectrum.spectrum, gap)
#             b_satisfied = (len(b_hits) > 0 and max(list(map(len, b_hits))) >= max_len * satisfied_percent) or gap >= max_len
            
#         if not y_satisfied:
#             y_hits = yt.fuzzy_search(spectrum.spectrum, gap)
#             y_satisfied = (len(y_hits) > 0 and max(list(map(len, y_hits))) >= max_len * satisfied_percent) or gap >= max_len
            
#         satisfied = b_satisfied and y_satisfied
        
#         gap += 1
        
#     scored_b_hits[i].sort(key=lambda x: x[1], reverse=True)
#     scored_y_hits[i].sort(key=lambda x: x[1], reverse=True)
    
    # Build one b and one y MassDawg per spectrum, containing only the kmers
    # that scored above zero against this particular spectrum.
    this_b_dawg = PyMassDawg()
    this_y_dawg = PyMassDawg()
    
    for b_hit, score in scored_b_hits[i]:
        if score > 0:
            bs = gen_spectrum(b_hit, ion='b', charge=1)['spectrum']
            bd = gen_spectrum(b_hit, ion='b', charge=2)['spectrum']
            this_b_dawg.insert(bs, bd, b_hit)
    this_b_dawg.finish()
    
    for y_hit, score in scored_y_hits[i]:
        if score > 0:
            ys = gen_spectrum(y_hit, ion='y', charge=1)['spectrum']
            yd = gen_spectrum(y_hit, ion='y', charge=2)['spectrum']
            # y kmers are inserted reversed.
            # NOTE(review): confirm fuzzy_search returns them in the
            # orientation attempt_alignment expects.
            this_y_dawg.insert(ys, yd, y_hit[::-1])
    this_y_dawg.finish()
    
    # The search results get their own names. Rebinding the global b_hits /
    # y_hits dicts to these lists would make a re-run of the scoring cell
    # silently score the wrong data.
    # NOTE(review): the literal 2 is presumably the allowed gap (cf. max_gap) -- confirm.
    b_candidates = this_b_dawg.fuzzy_search(spectrum.spectrum, 2, ppm_tol)
    y_candidates = this_y_dawg.fuzzy_search(spectrum.spectrum, 2, ppm_tol)
    
    # longest candidates first, so the TOP_X slice below keeps the longest ones
    b_candidates.sort(key=len, reverse=True)
    y_candidates.sort(key=len, reverse=True)
    

#     refined_b_hits = sorted(
#         [(x, mass_comparisons.optimized_compare_masses(spectrum.spectrum, gen_spectrum(x, ion='b')['spectrum'])) for x in b_hits], 
#         key=lambda x: x[1], 
#         reverse=True
#     )[:TOP_X]

#     refined_y_hits = sorted(
#         [(x, mass_comparisons.optimized_compare_masses(spectrum.spectrum, gen_spectrum(x, ion='y')['spectrum'])) for x in y_hits], 
#         key=lambda x: x[1], 
#         reverse=True
#     )[:TOP_X]
    
    alignments[i] = alignment.attempt_alignment(
        spectrum, 
        db, 
#         [x[0] for x in scored_b_hits[i][:TOP_X] if x[1] > 0], 
#         [x[0] for x in scored_y_hits[i][:TOP_X] if x[1] > 0], 
        b_candidates[:TOP_X],
        y_candidates[:TOP_X],
        ppm_tolerance=ppm_tol, 
        n=3, 
        scoring_alg='ion'
    )
    


Creating an alignment for spectrum 9/9 [100%]

In [15]:
# Print one table row per alignment, grouped by the index of the spectrum the
# alignments were built for.
print('Sequence \t | \t scores \t | \t precursor distance')
for spectrum_index in alignments:
    print(f'Alignments for sequence {spectrum_index}')
    for aligned in alignments[spectrum_index].alignments:
        score_column = f'b: {aligned.b_score}, y: {aligned.y_score}'
        print(f'{aligned.sequence} \t | \t {score_column} \t | \t {aligned.precursor_distance}')

Sequence 	 | 	 scores 	 | 	 precursor distance
Alignments for sequence 0
Alignments for sequence 1
SILASFNS 	 | 	 b: 3, y: 1 	 | 	 0.9908572499999764
SKVFPVSA 	 | 	 b: 1, y: 2 	 | 	 0.9884067499999674
ILASFNSS 	 | 	 b: 1, y: 1 	 | 	 0.9908572499999764
Alignments for sequence 2
Alignments for sequence 3
AGGATVEPAGGAL 	 | 	 b: 4, y: 1 	 | 	 0.00494475000004968
TVTDGLHSGAL 	 | 	 b: 3, y: 1 	 | 	 0.004944249999994099
GSGPPSGTGLGAL 	 | 	 b: 3, y: 1 	 | 	 0.004944250000107786
Alignments for sequence 4
Alignments for sequence 5
Alignments for sequence 6
Alignments for sequence 7
Alignments for sequence 8
STGEKGFGTGSPL 	 | 	 b: 2, y: 1 	 | 	 0.055824250000000575
SAAFSSVGSATVGP 	 | 	 b: 2, y: 1 	 | 	 0.055824750000056156
SAAFSSVGSALPSG 	 | 	 b: 2, y: 1 	 | 	 0.05582525000011174
