# Try the comet approach
Comet is fast: for every spectrum it goes through every protein in the database and computes a fast xcorr score against it. We should try to do the same.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import numpy as np

from src.spectra.gen_spectra import gen_spectrum
from src.file_io import mzML
from src.utils import insort_by_index
from pyteomics import fasta

In [2]:
def xcorr(observed: np.ndarray, reference: np.ndarray, value=50) -> float:
    '''
    An xcorr scoring algorithm

    Formula: dot(reference, y') / (sum(reference) * value), where

        y'[i] = observed[i] - mean(observed[i-75 : i+75])

    The averaging window is clipped to the bounds of observed, and the mean is
    taken over the bins that are actually inside the clipped window.

    Inputs should be sparsely populated np arrays. Indexing should observe the formula

    idx = int(m/w), m is mass, w is bin width (aka tolerance)

    Inputs:
        observed:    (np.ndarray) list of peaks normalized to value
        reference:   (np.ndarray) list of peaks normalized to value
    kwargs:
        value:       (number) value given to the normalized peaks. Default=50
    Outputs:
        (float) the score of the formula shown above. 0.0 is returned when the
                normalization term sum(reference) * value is zero
    '''
    half_window = 75

    observed = np.asarray(observed, dtype=float)
    reference = np.asarray(reference, dtype=float)
    n_bins = observed.size

    # calculate the y prime term
    if n_bins:
        centers = np.arange(n_bins)
        lo = np.maximum(centers - half_window, 0)
        # Exclusive upper bound. Clamp to n_bins (not n_bins - 1) so the last
        # bin is not dropped from the window and hi - lo is always >= 1.
        hi = np.minimum(centers + half_window, n_bins)

        # prefix[k] == sum(observed[:k]), so every window sum is one subtraction
        # instead of a fresh np.sum call per bin.
        prefix = np.concatenate(([0.0], np.cumsum(observed)))
        y_prime = observed - (prefix[hi] - prefix[lo]) / (hi - lo)
    else:
        y_prime = np.zeros(0)

    # fill y prime or reference to be the same size. fill with zeros
    size = max(y_prime.size, reference.size)
    padded_y_prime = np.zeros(size)
    padded_y_prime[:y_prime.size] = y_prime
    padded_reference = np.zeros(size)
    padded_reference[:reference.size] = reference

    norm = padded_reference.sum() * value
    if norm == 0:
        # An empty/all-zero reference (or value == 0) cannot match anything;
        # avoid returning NaN from a 0/0 division.
        return 0.0

    return float(np.dot(padded_reference, padded_y_prime) / norm)

In [3]:
def make_sparse_array(spectrum: list, width: float, value=50) -> np.ndarray:
    '''
    Make a spectrum (a list of floats) into a sparsely populated array for xcorr
    calculation. Indices are calculated by

    idx = int(m // w), m is mass, w is bin width

    width is the tolerance in Da to allow when calculating scores. Every bin that
    contains at least one peak is set to value; all other bins are 0.

    Inputs:
        spectrum:   (list) float mass values of peaks
        width:      (float) mass tolerance to accept to make bin width
    kwargs:
        value:      (number) value to give peaks at the new index. Default=50
    Outputs:
        (np.ndarray) sparsely populated list. Its length is the bin index of the
                     largest mass plus one; an empty spectrum gives an empty array
    '''
    masses = list(spectrum)

    # max() of an empty sequence raises, so handle a peak-less spectrum up front
    if not masses:
        return np.zeros(0)

    # bin index of every peak
    bins = [int(m // width) for m in masses]

    # the largest mass determines the length of the array
    sparse = np.zeros(int(max(masses) // width) + 1)

    # populate sparse at the index for each mass with the requested value
    sparse[bins] = value

    return sparse

# Use these tools to search

In [6]:
# Input files, relative to the directory the notebook is run from
spectra_file = '../data/testing_output/cis_spliced/realisticCisSplicedSpectra.mzML'
fasta_file = '../../testing framework/data/databases/100prots.fasta'

# Read every protein entry into memory. The reader is used as a context manager
# so the underlying FASTA file handle is closed as soon as the list is built.
with fasta.read(fasta_file) as prot_iterator:
    proteins = list(prot_iterator)

# Spectra come from the project's own mzML reader.
# NOTE(review): presumably this returns a sized collection rather than a lazy
# iterator -- confirm against src.file_io.mzML.
spectra_iterator = mzML.read(spectra_file)


In [7]:
num_spec = len(spectra_iterator)
num_prots = len(proteins)
tol = 0.02          # bin width handed to make_sparse_array
max_kmer_len = 20   # longest k-mer scored from each start position

for i, spectrum in enumerate(spectra_iterator):
    print(f'On spectrum {i+1}/{num_spec}')

    sparse_spectrum = make_sparse_array(spectrum.spectrum, tol)

    best_b_matches = []
    best_y_matches = []

    for j, prot in enumerate(proteins):
        print(f'on protein {j+1}/{num_prots}\r', end='')

        sequence = prot.sequence

        # go through the sequence and take a k-mer from every start position.
        # A separate index name is used so the spectrum counter `i` is not shadowed.
        for start in range(len(sequence) - 2):
            # At most max_kmer_len residues; slicing clips automatically when
            # fewer residues remain before the end of the sequence.
            kmer = sequence[start:start + max_kmer_len]

            # make the b and y sparse spectra
            b_sparse_ref = make_sparse_array(gen_spectrum(kmer, ion='b')['spectrum'], tol)
            y_sparse_ref = make_sparse_array(gen_spectrum(kmer, ion='y')['spectrum'], tol)

            # score and insert
            b_xcorr = xcorr(sparse_spectrum, b_sparse_ref)
            y_xcorr = xcorr(sparse_spectrum, y_sparse_ref)

            best_b_matches = insort_by_index((b_xcorr, kmer), best_b_matches, 0)
            best_y_matches = insort_by_index((y_xcorr, kmer), best_y_matches, 0)
    

On spectrum 1/125
on kmer 0/100

KeyboardInterrupt: 