# Reverse "indexing"
Instead of trying to index all of the k-mers in a database and store them in RAM, let's pre-process the spectra and make a single sweep through the database, finding the best-scoring k-mers in the database for each spectrum.

### Steps
1. Load spectra into memory
2. Turn each spectrum into a sparse vector of 1s and 0s
3. Load the database into RAM
4. For each protein
    1. For each MAX_LENGTH-mer
        1. Generate the spectrum for this k-mer
        2. Sparsify the spectrum
        3. Calculate the dot product of this sparse k-mer against the input
        4. For each spectrum
            1. Keep track of the top X scoring peptides by (protein, position)
    2. Do for both b and y ions (separately)

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pyteomics import fasta
from collections import defaultdict
from src.sequence.gen_spectra import gen_spectrum
from src.file_io import mzML
from src.objects import Spectrum

import ms_deisotope
from ms_deisotope.test.common import datafile

import numpy as np

In [2]:
def longest_array(window, length):
    """Return the number of bins a sparse spectrum vector needs.

    window: width of one bin.
    length: number of residues the vector has to accommodate.

    Every residue is budgeted 186.079313 mass units (presumably the
    monoisotopic mass of tryptophan, the heaviest standard residue --
    TODO confirm); the total is split into bins of width ``window`` and
    rounded up to a whole number of bins.
    """
    bins_per_unit = 1 / window
    total_bins = bins_per_unit * length * 186.079313
    return int(np.ceil(total_bins))

In [3]:
def longest_peptide(max_mass):
    """Return the largest residue count that could add up to ``max_mass``.

    The mass is divided by 57.021464 (presumably the monoisotopic mass of
    glycine, the lightest standard residue -- TODO confirm) and rounded up.
    """
    residue_count = max_mass / 57.021464
    return int(np.ceil(residue_count))

In [4]:
def sparse_it(spectrum, window, length):
    """Convert a list of peak positions into a 0/1 occupancy vector.

    spectrum: iterable of peak positions (numbers).
    window: bin width; a peak at ``mz`` lands in bin ``int(mz / window)``.
    length: peptide length used to size the vector via ``longest_array``.

    Returns an ``np.int8`` array of ``longest_array(window, length)`` entries
    with a 1 in every bin that holds at least one peak.

    Peaks whose bin falls outside the vector are skipped. Previously a bin
    past the end raised IndexError and a negative bin silently wrapped
    around to the end of the vector, marking a position no peak occupied.
    """
    sparse = np.zeros(longest_array(window, length), dtype=np.int8)
    n_bins = len(sparse)

    for mz in spectrum:
        mz_direct = int(mz / window)
        if 0 <= mz_direct < n_bins:
            sparse[mz_direct] = 1

    return sparse

In [5]:
# Width of one bin when spectra are turned into sparse vectors.
window_size = 0.01

# Input files for this run: the protein database and the spectra to search.
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
spectra_file = '/Users/zacharymcgrath/Desktop/nod2 data/single/singleRealSpectrum.mzml'

## 1. Load spectra into memory

In [6]:
# Load the spectra from the mzML file into memory.
# NOTE(review): peak_filter=20 presumably limits each spectrum to its 20
# strongest peaks -- confirm against src.file_io.mzML.read.
spectra = mzML.read(spectra_file, peak_filter=20)

In [7]:
# Gather every peak of every spectrum once; both statistics use this list.
all_peaks = [x for spec in spectra for x in spec.spectrum]

# The largest peak bounds how long a matching peptide can be, which in turn
# sizes the sparse vectors built later.
max_mass = max(all_peaks)
print(f'Max mass: {max_mass}')
print(f'Max peptide length: {longest_peptide(max_mass)}')
max_length = longest_peptide(max_mass)

# The average peak gives a typical length, used later when printing results.
avg_mass = np.mean(all_peaks)
avg_len = longest_peptide(avg_mass)
print(f'Average mass: {avg_mass}')
print(f'Average peptide length: {avg_len}')

Max mass: 1311.6202392578125
Max peptide length: 24
Average mass: 368.8827159881592
Average peptide length: 7


## 2. Sparsify the spectra

In [8]:
# One row per input spectrum, one column per bin; rows are filled in by the
# sparsifying loop that follows.
sparsified_spectra = np.zeros(
    shape=(len(spectra), longest_array(window_size, max_length)),
    dtype=np.int8,
)

In [9]:
# Fill row i of the matrix with the 0/1 vector of the i-th spectrum.
for i, spec in enumerate(spectra):
    progress = f'\rSparsifying spectrum {i}/{len(spectra)}'
    print(progress, end='')

    row = sparse_it(spec.spectrum, window_size, max_length)
    sparsified_spectra[i, :] = row

Sparsifying spectrum 0/9Sparsifying spectrum 1/9Sparsifying spectrum 2/9Sparsifying spectrum 3/9Sparsifying spectrum 4/9Sparsifying spectrum 5/9Sparsifying spectrum 6/9Sparsifying spectrum 7/9Sparsifying spectrum 8/9

## 3. Load database into RAM

In [10]:
# Map each protein's short name to its sequence. Entries that share a name
# overwrite earlier ones.
prots = {}
for prot in fasta.read(fasta_file):
    # The name is the last '|'-separated field of the FASTA description,
    # cut at its first space.
    prot_name = prot.description.split('|')[-1].split(' ')[0]
    prots[prot_name] = prot.sequence
    

## 4. Go through each protein and each MAX_LENGTH-mer and calculate the scores

In [11]:
# Number of best hits kept per spectrum, and the rankings themselves:
# spectrum index -> list of (score, protein name, index) sorted best-first.
top_x = 10
top_b = defaultdict(list)
top_y = defaultdict(list)
top_by = defaultdict(list)

def add_score(ion, spec_idx, score, prot, start_idx):
    """Record a hit in the ranking of one spectrum, keeping the best top_x.

    ion: 'b' or 'y' selects ``top_b`` / ``top_y``; anything else selects
        ``top_by``.
    spec_idx: key of the spectrum whose ranking is updated.
    score: score of the hit; higher is better.
    prot: protein name stored with the hit.
    start_idx: position stored with the hit.

    The ranking is kept sorted by descending score and never holds more than
    ``top_x`` entries. While it has fewer than ``top_x`` entries every hit is
    accepted (zero scores included); once full, a hit is only inserted when it
    strictly beats the current worst entry, so on ties the earlier hit stays.

    Previously every hit was appended and the list was never trimmed, so each
    ranking grew by one entry per scored k-mer and was fully re-sorted on
    every call.
    """
    d = top_b if ion == 'b' else (top_y if ion == 'y' else top_by)
    ranked = d[spec_idx]

    # Full ranking and the hit cannot displace the worst entry: nothing to do.
    if ranked and len(ranked) >= top_x and score <= ranked[-1][0]:
        return

    ranked.append((score, prot, start_idx))
    # list.sort is stable, also with reverse=True, so equal scores keep their
    # insertion order.
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    del ranked[top_x:]
        
def score_kmer(kmer, prot_name, start_idx):
    """Score one k-mer against every input spectrum and record the results.

    kmer: amino-acid subsequence to score.
    prot_name: name of the protein the k-mer comes from.
    start_idx: index of the k-mer's first residue within that protein.

    Three theoretical spectra are generated for the k-mer (``ion='b'``,
    ``ion='y'`` and the default call, presumably both ion types -- TODO
    confirm against gen_spectrum), sparsified, and multiplied against
    ``sparsified_spectra``; each product entry is the number of bins the
    k-mer shares with one input spectrum.

    b and combined hits are stored at ``start_idx``; y hits are stored at the
    index of the k-mer's last residue. The y ranking previously received the
    b scores; it now receives the y scores.
    """
    b_spec = gen_spectrum(kmer, ion='b')['spectrum']
    y_spec = gen_spectrum(kmer, ion='y')['spectrum']
    spec = gen_spectrum(kmer)['spectrum']

    sparse_b = sparse_it(b_spec, window_size, max_length)
    sparse_y = sparse_it(y_spec, window_size, max_length)
    sparse = sparse_it(spec, window_size, max_length)

    # NOTE(review): both operands are int8, so the products are int8 as well
    # and would wrap if more than 127 bins matched -- confirm the peak count
    # per spectrum stays below that.
    b_scores = np.dot(sparsified_spectra, sparse_b)
    y_scores = np.dot(sparsified_spectra, sparse_y)
    scores = np.dot(sparsified_spectra, sparse)

    end_idx = start_idx + len(kmer) - 1

    for i, score in enumerate(b_scores):
        add_score('b', i, score, prot_name, start_idx)
    for i, score in enumerate(y_scores):
        add_score('y', i, score, prot_name, end_idx)
    for i, score in enumerate(scores):
        add_score('by', i, score, prot_name, start_idx)
    

In [12]:
plen = len(prots)

# Single sweep over the database: every protein contributes its short
# prefixes, all of its full max_length windows, and its trailing suffixes.
for i, (prot_name, prot_seq) in enumerate(prots.items()):
    seq_len = len(prot_seq)
    # Number of windows that are followed by at least one more residue.
    # Clamped at zero so proteins shorter than max_length do not produce
    # negative slice starts below.
    n_inner = max(seq_len - max_length, 0)

    # Prefixes shorter than a full window. They all start at residue 0, so 0
    # is the start index (it used to be passed as j). The upper bound stops
    # short of the whole sequence, which the suffix loop scores at j == 0.
    for j in range(1, min(max_length, seq_len)):
        score_kmer(prot_seq[:j], prot_name, 0)

    # Full max_length windows.
    for j in range(n_inner):
        print(f'\rOn protein {i}/{plen} [{int(100 * (i+1) / plen)}%]     ({int(100 * (j+1)/(len(prot_seq)-max_length))}% Done with protein)', end='')

        score_kmer(prot_seq[j:j+max_length], prot_name, j)

    # Last full window plus the ever shorter suffixes after it. The bound is
    # the length of this protein's sequence; it used to be len(prot), the
    # FASTA record left over from loading the database.
    for j in range(n_inner, seq_len):
        score_kmer(prot_seq[j:], prot_name, j)
        

On protein 55/279 [20%]     (4% Done with protein)n)

KeyboardInterrupt: 

In [None]:
# Print the best hits of every spectrum, one row per rank with the b, y and
# combined rankings side by side.
for spec_count in top_b:
    print(f'Spectrum {spec_count}')

    b_hits = top_b[spec_count]
    # .get avoids creating empty rankings in the defaultdicts as a side effect.
    y_hits = top_y.get(spec_count, [])
    by_hits = top_by.get(spec_count, [])

    # Only print ranks present in all three rankings; indexing a fixed top_x
    # rows raised IndexError when fewer hits had been recorded.
    n_rows = min(top_x, len(b_hits), len(y_hits), len(by_hits))

    for i in range(n_rows):
        b_score, b_name, b_start_idx = b_hits[i]
        y_score, y_name, y_start_idx = y_hits[i]
        by_score, by_name, by_start_idx = by_hits[i]

        b_prot = prots[b_name]
        y_prot = prots[y_name]
        by_prot = prots[by_name]

        # b and combined hits are stored by first residue, so read forwards.
        b_seq = b_prot[b_start_idx:b_start_idx+avg_len]
        by_seq = by_prot[by_start_idx:by_start_idx+avg_len]
        # y hits are stored by last residue, so read backwards from it. The
        # start is clamped at 0: a negative start would count from the end of
        # the protein and print an empty or unrelated sequence.
        # NOTE(review): this window is avg_len + 1 residues wide while the b
        # window is avg_len -- confirm which width is intended.
        y_seq = y_prot[max(y_start_idx-avg_len, 0):y_start_idx+1]

        print(f'''\t{i}. b: {b_seq}: {b_score} \t y: {y_seq}: {y_score} \t both: {by_seq}: {by_score}''')
        

Hybrid: 0
```
DLPVNSPMTKG
```

Non hybrid: 1
``` 
DAAIVGYK
```

Non hybrid: 2
```
DEAPNFEANTTIGRIRFH
```

Non hybrid: 3
``` 
DPQVEQLEL
```

Hybrid 1: 4
```
DLQTLALWSRM
```

Non hybrid: 5
```
DLTEYLSRFGEVV
```

Hybrid 2: 6
```
DLQTLEVE
```

Non Hybrid: 7
```
DLSSASAIMGNAKVKAHGKKVITAFN
```

Non hybrid: 8
```
EEILKAFKLF
```