# Try the comet approach
Comet is fast: it goes through each protein every single time and computes a fast xcorr score. We should try to do the same.

In [15]:
import os
import sys
# Make the parent and grandparent directories importable so the `src`
# package can be found regardless of which one contains it.
for relative_root in ('..', '../..'):
    module_path = os.path.abspath(relative_root)
    if module_path not in sys.path:
        sys.path.append(module_path)
    
import numpy as np

from src.spectra.gen_spectra import gen_spectrum
from src.file_io import mzML
from src.utils import insort_by_index
from pyteomics import fasta

In [16]:
def xcorr(observed: np.ndarray, reference: np.ndarray, value=50) -> float:
    '''
    An xcorr scoring algorithm 
    
    Formula: dot(reference, y') / (sum(reference) * value), where
    
        y'[i] = observed[i] - mean(observed[i-75 : i+75])
    
    The averaging window is clipped to the bounds of observed, so bins close to
    either end of the array are averaged over fewer than 150 neighbours.
    
    Inputs should be sparsely populated np arrays. Indexing should observe the formula
    
    idx = int(m/w), m is mass, w is bin width (aka tolerance)
    
    observed and reference may have different lengths; the shorter one is treated
    as if it were padded with zeros.
    
    Inputs:
        observed:    (np.ndarray) list of peaks normalized to value
        reference:   (np.ndarray) list of peaks normalized to value
    kwargs: 
        value:       (number) value given to the normalized peaks. Default=50
    Outputs:
        (float) the score of the formula shown above. 0.0 is returned when the 
                normalizing term sum(reference) * value is zero (for example a 
                reference with no populated bins)
    '''
    half_window = 75

    observed = np.asarray(observed, dtype=float)
    reference = np.asarray(reference, dtype=float)
    num_bins = len(observed)

    # half-open window [lo, hi) around every bin, clipped to the array. hi is 
    # clipped to num_bins (not num_bins - 1) so that the last bin takes part in
    # the average and a one-bin array never produces an empty window
    centers = np.arange(num_bins)
    lo = np.maximum(centers - half_window, 0)
    hi = np.minimum(centers + half_window, num_bins)

    # prefix[k] == sum(observed[:k]), so every window sum is a single subtraction
    prefix = np.concatenate(([0.0], np.cumsum(observed)))

    # calculate the y prime term 
    y_prime = observed - (prefix[hi] - prefix[lo]) / (hi - lo)

    # nothing to normalize by means nothing in the reference could have matched
    normalizer = reference.sum() * value
    if normalizer == 0:
        return 0.0

    # zero padding the shorter array adds nothing to the dot product, so only
    # the overlapping bins need to be multiplied
    overlap = min(num_bins, len(reference))
    return float(np.dot(reference[:overlap], y_prime[:overlap]) / normalizer)

In [17]:
def make_sparse_array(spectrum: list, width: float, value=50) -> np.ndarray:
    '''
    Make a spectrum (a list of floats) into a sparsely populated array for xcorr 
    calculation. Indices are calculated by
    
    idx = int(m/w), m is mass, w is bin width
    
    width is the tolerance in Da to allow when calculating scores. Every bin that 
    holds at least one peak is set to value; all other bins are 0.
    
    Inputs:
        spectrum:   (list) float mass values of peaks
        width:      (float) mass tolerance to accept to make bin width
    kwargs:
        value:      (number) value to give peaks at the new index. Default=50
    Outputs:
        (np.ndarray) sparsely populated list. Empty if spectrum has no peaks
    '''
    # an empty spectrum has no largest mass to size the array from
    if len(spectrum) == 0:
        return np.zeros(0)

    # find the largest mass and make that the length of the array
    list_size = int(max(spectrum) // width) + 1
    sparse = np.zeros(list_size)

    # populate sparse at the index for each mass. Peaks that share a bin 
    # simply set the same entry more than once
    indices = [int(m // width) for m in spectrum]
    sparse[indices] = value

    return sparse

# Use these tools to search

In [4]:
# Inputs for the search: the spectra to identify and the protein database
spectra_file = '../data/testing_output/cis_spliced/realisticCisSplicedSpectra.mzML'
fasta_file = '../../testing framework/data/databases/100prots.fasta'

# Open both sources
prot_iterator = fasta.read(fasta_file)
spectra_iterator = mzML.read(spectra_file)

# Materialise the database entries so they can be scanned once per spectrum
proteins = list(prot_iterator)


{'dataProcessingRef': 'dp_sp_0', 'defaultArrayLength': 50, 'id': 'spectrum=0', 'index': 0, 'scanList': {'count': 1, 'scan': [{'scan start time': -1.0}], 'no combination': ''}, 'precursorList': {'count': 1, 'precursor': [{'isolationWindow': {'isolation window target m/z': 721.92753925}, 'selectedIonList': {'count': 1, 'selectedIon': [{'selected ion m/z': 721.92753925, 'charge state': 2.0}]}, 'activation': {'dissociation method': ''}}]}, 'spectrum representation': '', 'ms level': 2, 'mass spectrum': '', 'centroid spectrum': '', 'count': 2, 'm/z array': array([  58.53948789,   65.05267024,  115.57943913,  116.07004241,
        179.61115681,  186.37996628,  193.62883232,  222.22044281,
        230.1128905 ,  230.13137158,  230.15200322,  257.67648607,
        263.31144982,  278.65769527,  314.21757693,  358.20844833,
        386.25351072,  408.71714185,  465.25604508,  514.35257128,
        543.32706338,  556.30485941,  574.80648925,  607.38057644,
        627.42341376,  657.88293583,  664

ZeroDivisionError: division by zero

In [7]:
num_spec = len(spectra_iterator)
num_prots = len(proteins)
tol = 0.02          # bin width (Da) used for every sparse array
max_kmer_len = 20   # longest subsequence scored from each start position

for i, spectrum in enumerate(spectra_iterator):
    print(f'On spectrum {i+1}/{num_spec}')
    
    sparse_spectrum = make_sparse_array(spectrum.spectrum, tol)
    
    # NOTE(review): these lists are reset for every spectrum and are not stored
    # anywhere else, so only the last spectrum's matches remain after the loop
    best_b_matches = []
    best_y_matches = []
    
    for j, prot in enumerate(proteins):
        # this counter tracks proteins, and the total comes from the database
        # itself rather than a hard-coded 100
        print(f'on protein {j+1}/{num_prots}\r', end='')
        
        # go through the sequence and take the kmer starting at every position.
        # Stopping 2 short of the end keeps every kmer at least 3 residues long.
        # The inner index has its own name so it no longer overwrites the 
        # spectrum index i
        for start in range(len(prot.sequence) - 2):
            # slicing truncates at the end of the sequence, so kmers are 
            # max_kmer_len long except for the last few start positions
            kmer = prot.sequence[start:start + max_kmer_len]
            
            # make the b and y sparse spectra
            b_sparse_ref = make_sparse_array(gen_spectrum(kmer, ion='b')['spectrum'], tol)
            y_sparse_ref = make_sparse_array(gen_spectrum(kmer, ion='y')['spectrum'], tol)
            
            # score and insert
            b_xcorr = xcorr(sparse_spectrum, b_sparse_ref)
            y_xcorr = xcorr(sparse_spectrum, y_sparse_ref)
            
            best_b_matches = insort_by_index((b_xcorr, kmer), best_b_matches, 0)
            best_y_matches = insort_by_index((y_xcorr, kmer), best_y_matches, 0)
    

On spectrum 1/125
on kmer 0/100

KeyboardInterrupt: 

In [5]:
import base64
import struct

# Base64 text of a binary array, decoded to its raw bytes for inspection
b64_payload = b'5aBDScpnL0hqNGVIsXOSScNnGEkCe4BIwwEVSTaEC0gmetpIMohjSnFALEpokNlHgVGkSM1gGUiEj65ILlYJSeWe6Et8KmFIALYQSfseU0iy7rRIxBduSCcFu0dBUKRIGoAPSEFCiEiwpktIhmORSJyjPUhyO/RI/rXLSeeaJ0iCFohIsG/dSL0opkjNSYxIz/gdSAy+5UgI6INJ5+/YSEakDUmAgYFIhmdeSCKlvkgY5xFJIAR7SEgy+Eip3hdI5+BmSEfWrkg2jRlIy7sDSW4RE0hnqBhI2GstSIiJFUgmfvxIBYuKSMn8HUjhLqtIjD7fSFuatUiyrRBIjd4YSHcomEjaYmBIZ8m2SNPFS0iMXDtJi579SAgdoEgA+GpIjr3xR9tSlEgcg71IGi4mSHbvnUgLnTZJufLASKlTWkjqpoBIrI+1SIyWgEmsEilIwLsVSEdZ60cu3vZIRBtLSGzwk0jlS9lH3S4cSORW3EhEQjlI2G7QSN9ytkgHo/pH8ho8SeDObUjmmxdImLmCSAfZdEmec+hIHEbyR2jGd0dqsc5ImOvdSfoZmUkn2jlJAymMScQKH0kgjF5JIcEWSWrSOUkOeqJIJ/ZwSU+sIkmIjZtIqrwgSm177kgsMtxJFlOaSahwvUj4IstI9ic7SE42Xkk='
decoded = base64.decodebytes(b64_payload)
print(decoded)

b'\xe5\xa0CI\xcag/Hj4eH\xb1s\x92I\xc3g\x18I\x02{\x80H\xc3\x01\x15I6\x84\x0bH&z\xdaH2\x88cJq@,Jh\x90\xd9G\x81Q\xa4H\xcd`\x19H\x84\x8f\xaeH.V\tI\xe5\x9e\xe8K|*aH\x00\xb6\x10I\xfb\x1eSH\xb2\xee\xb4H\xc4\x17nH\'\x05\xbbGAP\xa4H\x1a\x80\x0fHAB\x88H\xb0\xa6KH\x86c\x91H\x9c\xa3=Hr;\xf4H\xfe\xb5\xcbI\xe7\x9a\'H\x82\x16\x88H\xb0o\xddH\xbd(\xa6H\xcdI\x8cH\xcf\xf8\x1dH\x0c\xbe\xe5H\x08\xe8\x83I\xe7\xef\xd8HF\xa4\rI\x80\x81\x81H\x86g^H"\xa5\xbeH\x18\xe7\x11I \x04{HH2\xf8H\xa9\xde\x17H\xe7\xe0fHG\xd6\xaeH6\x8d\x19H\xcb\xbb\x03In\x11\x13Hg\xa8\x18H\xd8k-H\x88\x89\x15H&~\xfcH\x05\x8b\x8aH\xc9\xfc\x1dH\xe1.\xabH\x8c>\xdfH[\x9a\xb5H\xb2\xad\x10H\x8d\xde\x18Hw(\x98H\xdab`Hg\xc9\xb6H\xd3\xc5KH\x8c\\;I\x8b\x9e\xfdH\x08\x1d\xa0H\x00\xf8jH\x8e\xbd\xf1G\xdbR\x94H\x1c\x83\xbdH\x1a.&Hv\xef\x9dH\x0b\x9d6I\xb9\xf2\xc0H\xa9SZH\xea\xa6\x80H\xac\x8f\xb5H\x8c\x96\x80I\xac\x12)H\xc0\xbb\x15HGY\xebG.\xde\xf6HD\x1bKHl\xf0\x93H\xe5K\xd9G\xdd.\x1cH\xe4V\xdcHDB9H\xd8n\xd0H\xdfr\xb6H\x07\xa3\xfaG\xf2\x1a<I\xe0\xcemH\xe6\x9

In [14]:
# Interpret the decoded bytes as consecutive little-endian 32-bit floats. The
# count is derived from the payload length instead of a hard-coded 125, so the
# cell keeps working if a different payload is decoded
struct.unpack(f'<{len(decoded) // 4}f', decoded)



(801294.3125,
 179615.15625,
 234705.65625,
 1199734.125,
 624252.1875,
 263128.0625,
 610332.1875,
 142864.84375,
 447441.1875,
 3727884.5,
 2822172.25,
 111392.8125,
 336524.03125,
 157059.203125,
 357500.125,
 562530.875,
 30490058.0,
 230569.9375,
 592736.0,
 216187.921875,
 370549.5625,
 243807.0625,
 95754.3046875,
 336514.03125,
 146944.40625,
 279058.03125,
 208538.75,
 297756.1875,
 194190.4375,
 500187.5625,
 1668799.75,
 171627.609375,
 278708.0625,
 453501.5,
 340293.90625,
 287310.40625,
 161763.234375,
 470512.375,
 1080577.0,
 444287.21875,
 580164.375,
 265228.0,
 227742.09375,
 390441.0625,
 597617.5,
 257040.5,
 508306.25,
 155514.640625,
 236419.609375,
 358066.21875,
 157236.84375,
 539580.6875,
 150597.71875,
 156321.609375,
 177583.375,
 153126.125,
 517105.1875,
 283736.15625,
 161779.140625,
 350583.03125,
 457204.375,
 371922.84375,
 148150.78125,
 156538.203125,
 311619.71875,
 229771.40625,
 374347.21875,
 208663.296875,
 767432.75,
 519412.34375,
 327912.25,

In [None]:
# Small hand-checkable example: a short reference peak list and one observed
# spectrum, both binned for scoring
ref = [
    116.034219435, 229.118283435, 357.176861435,
    58.520747935, 115.06277993500001, 179.09206893500001,
]
observed = [
    70.06484985351562, 72.07889556884766, 84.04356384277344, 86.095703125, 88.0396499633789,
    101.07088470458984, 159.089599609375, 201.12228393554688, 202.1270751953125, 212.10423278808594,
    213.08998107910156, 219.09384155273438, 226.12986755371094, 229.1165008544922, 230.11068725585938,
    230.11761474609375, 242.153564453125, 246.14280700683594, 306.16021728515625, 325.1853332519531,
    353.17779541015625, 357.17578125, 393.1902770996094, 394.1918029785156, 430.2205505371094,
    440.2127685546875, 441.2143859863281, 458.2236022949219, 459.2294616699219, 536.2758178710938,
    543.3084716796875, 544.2866821289062, 553.2975463867188, 579.2705078125, 580.2720336914062,
    614.3399658203125, 624.333251953125, 642.3447875976562, 658.3429565429688, 666.859619140625,
    692.3502197265625, 693.3529052734375, 737.4146118164062, 763.3924560546875, 764.3958129882812,
    876.477783203125, 877.4713134765625, 977.5200805664062, 978.5184326171875, 1106.587890625,
]

# NOTE(review): the bin width here is 20, whereas the search cell uses tol = 0.02 — confirm this is intended
sparseref = make_sparse_array(ref, width=20)
sparseobserved = make_sparse_array(observed, width=20)

