# Optimize the initial k-mer search
Right now we do roughly 8000 k-mer comparisons for EACH observed spectrum in order to filter.

### Imports

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.objects import Spectrum, BasicScoredKmer, Kmer
from src.database import Database
from src.scoring import scoring
from src.spectra.gen_spectra import gen_spectrum
from src.file_io import fasta
from src.identfication import filtering

from modules.sequence_generation import proteins, peptides

from collections import namedtuple, defaultdict
from bisect import bisect, bisect_left, bisect_right
import time
import math

## Current method (26 May 2020)

In [2]:
def find_interesting_kmers(spectrum: Spectrum, mers: list) -> list:
    '''
    Score every candidate kmer against a spectrum and keep the ones selected
    by the score filter, once for the b score and once for the y score

    Inputs:
        spectrum:   Spectrum namedtuple instance
        mers:       list of strings containing kmers
    Outputs:
        (b_anchors, y_anchors): tuple holding the results of
                                filtering.score_filter on 'b_score' and on
                                'y_score' respectively
    '''
    def score_one(kmer):
        # score_subsequence hands back the b and y scores as a pair
        b_score, y_score = scoring.score_subsequence(spectrum.spectrum, kmer)
        return BasicScoredKmer(b_score, y_score, kmer)

    scored_kmers = [score_one(kmer) for kmer in mers]

    # the same scored list is filtered twice, once per score field
    b_anchors = filtering.score_filter(scored_kmers, 'b_score')
    y_anchors = filtering.score_filter(scored_kmers, 'y_score')
    return (b_anchors, y_anchors)


## Binary search on tuples
Build a list for all 3-mers that is of the form
```python
all3mers = [(reference_mass, protein, starting_position), ... ]

for observed in spectra:
    for mass in observed:
        i = binarysearch(mass - error, all3mers)
        hits = []
        for j in range(i, len(all3mers)):
            if all3mers[j][0] <= mass + error:
                hits.append(all3mers[j])
            else:
                break
```

In [3]:
# Pairs one mass with the kmer sequence it was generated from
MassSequence = namedtuple('MassSequence', ['mass', 'sequence'])

def ppm_opt(observed: float, ppm_tolerance: float) -> float:
    '''
    Convert a parts-per-million tolerance into an absolute tolerance

    Inputs:
        observed:       (float) the value the tolerance is relative to
        ppm_tolerance:  (float) the tolerance in parts per million
    Outputs:
        (float) non-negative absolute tolerance for the observed value
    '''
    fraction = ppm_tolerance / 1000000
    return abs(fraction * observed)

def make_all_base_mers(database: Database, base_mer: int) -> list:
    '''
    Index the database with kmer size base_mer and build one flat list that
    holds every mass of every indexed kmer, ordered by mass

    Inputs:
        database:    (Database) source of the sequences
        base_mer:    (int) the kmer size handed to database.set_kmer_size
    Outputs:
        list of MassSequence sorted by ascending mass, one entry per mass in
        gen_spectrum(kmer)['spectrum'] for each kmer in database.metadata
    '''
    database.set_kmer_size(base_mer)
    database.index()

    entries = [
        MassSequence(peak, kmer)
        for kmer in database.metadata
        for peak in gen_spectrum(kmer)['spectrum']
    ]

    # sorted() is stable, so entries with equal mass keep their build order
    return sorted(entries, key=lambda entry: entry.mass)

def seach_base_kmers(observed: Spectrum, allbasemers: list, tolerance: float) -> list:
    '''
    Binary search the mass-sorted list of base kmers for every observed mass
    and collect the sequences whose mass lies within the ppm tolerance

    Inputs:
        observed:    (Spectrum) what to sequence; only its .spectrum masses are read
        allbasemers: (list of MassSequence) all of the basemers made from the function 'make_all_base_mers',
                     sorted by ascending mass
        tolerance:   (float) the ppm tolerance to allow
    Outputs:
        list of kmer strings, one entry per (observed mass, matching reference mass) pair,
        so a kmer can appear more than once
    '''
    hits = []
    # parallel list of bare masses so bisect can compare floats directly
    # NOTE: rebuilt on every call, which is linear in the size of allbasemers
    allbasemermasses = [x.mass for x in allbasemers]
    for mass in observed.spectrum:
        tol = ppm_opt(mass, tolerance)
        # bisect_left keeps masses exactly equal to the lower bound, so the
        # accepted window is the closed interval [mass - tol, mass + tol]
        first = bisect_left(allbasemermasses, mass - tol)
        last = bisect_right(allbasemermasses, mass + tol)
        hits.extend(entry.sequence for entry in allbasemers[first:last])

    return hits
    

## Boundary hashing with binary search
Use the same technique as the last attempt, but instead of a pure binary search, do a binary search on a smaller list that you hash into

In [4]:
def make_all_base_mers_hash_bs(database: Database, base_mer: int) -> defaultdict:
    '''
    Index the database with kmer size base_mer and bucket every mass of every
    indexed kmer by its integer floor, with each bucket sorted by mass

    Inputs:
        database:    (Database) source of the sequences
        base_mer:    (int) the kmer size handed to database.set_kmer_size
    Outputs:
        defaultdict mapping math.floor(mass) -> list of MassSequence sorted by
        ascending mass, one entry per mass in gen_spectrum(kmer)['spectrum']
    '''
    allbasemers = defaultdict(list)
    database.set_kmer_size(base_mer)
    database.index()
    for mer in database.metadata:
        for mass in gen_spectrum(mer)['spectrum']:
            allbasemers[math.floor(mass)].append(MassSequence(mass, mer))

    # each bucket must be sorted so it can be binary searched later
    for bucket in allbasemers.values():
        bucket.sort(key=lambda x: x.mass)
    return allbasemers

def seach_base_kmers_hash_bs(observed: Spectrum, allbasemers: dict, tolerance: float) -> list:
    '''
    For every observed mass, hash into the integer-floor buckets that the
    tolerance window touches and binary search each bucket for the masses
    inside the window

    Inputs:
        observed:    (Spectrum) what to sequence; only its .spectrum masses are read
        allbasemers: (dict of list of MassSequence) all of the basemers made from the function
                     'make_all_base_mers_hash_bs'; every bucket must be sorted by ascending mass
        tolerance:   (float) the ppm tolerance to accept for each mass
    Outputs:
        list of kmer strings for all reference masses that were in the acceptable range
        of an observed mass; a kmer can appear more than once
    '''
    hits = []
    for mass in observed.spectrum:
        tol = ppm_opt(mass, tolerance)
        lb_mass = mass - tol
        ub_mass = mass + tol

        # visit every bucket the window overlaps, not only the two boundary ones
        for mass_key in range(math.floor(lb_mass), math.floor(ub_mass) + 1):
            # .get avoids inserting an empty bucket when allbasemers is a defaultdict
            bucket = allbasemers.get(mass_key)
            if not bucket:
                continue
            bucket_masses = [x.mass for x in bucket]
            # closed interval [lb_mass, ub_mass]
            first = bisect_left(bucket_masses, lb_mass)
            last = bisect_right(bucket_masses, ub_mass)
            hits.extend(x.sequence for x in bucket[first:last])

    return hits
    

## Boundary hashing without binary search
Use the same hashing technique as the last attempt, but skip the binary search entirely: linearly filter the bucket(s) that the tolerance window hashes into

In [5]:
def make_all_base_mers_hash(database: Database, base_mer: int) -> defaultdict:
    '''
    Index the database with kmer size base_mer and bucket every mass of every
    indexed kmer by its integer floor (buckets are left unsorted)

    Inputs:
        database:    (Database) source of the sequences
        base_mer:    (int) the kmer size handed to database.set_kmer_size
    Outputs:
        defaultdict mapping math.floor(mass) -> list of MassSequence, one entry
        per mass in gen_spectrum(kmer)['spectrum']
    '''
    allbasemers = defaultdict(list)
    database.set_kmer_size(base_mer)
    database.index()
    for mer in database.metadata:
        for mass in gen_spectrum(mer)['spectrum']:
            allbasemers[math.floor(mass)].append(MassSequence(mass, mer))

    return allbasemers

def seach_base_kmers_hash(observed: Spectrum, allbasemers: dict, tolerance: float) -> list:
    '''
    For every observed mass, hash into the integer-floor buckets that the
    tolerance window touches and linearly filter each bucket for the masses
    inside the window

    Inputs:
        observed:    (Spectrum) what to sequence; only its .spectrum masses are read
        allbasemers: (dict of list of MassSequence) all of the basemers made from the function 'make_all_base_mers_hash'
        tolerance:   (float) the ppm tolerance to accept for each mass
    Outputs:
        list of kmer strings for all reference masses that were in the acceptable range
        of an observed mass; a kmer can appear more than once
    '''
    hits = []
    for mass in observed.spectrum:
        tol = ppm_opt(mass, tolerance)
        lb_mass = mass - tol
        ub_mass = mass + tol

        # walk from the lower-bound bucket to the upper-bound bucket so each
        # overlapping bucket is searched exactly once
        for mass_key in range(math.floor(lb_mass), math.floor(ub_mass) + 1):
            # .get avoids inserting an empty bucket when allbasemers is a defaultdict
            hits.extend(
                x.sequence
                for x in allbasemers.get(mass_key, ())
                if lb_mass <= x.mass <= ub_mass
            )

    return hits
    

## Run these and test the time
So we need to be fair here. What we'll do is the following:
1. Include any setup time that's necessary for a method (but not the setup that all of the methods share)
2. Run it on different numbers of observed spectrum and database lengths to see how well they do

### Setup for both
#### 1. Database

In [6]:
# FASTA file used both for the Database below and for reading the raw entries
fastafile = '../../testing framework/data/databases/100prots.fasta'
# Database object that the benchmark cells index and search
db = Database(fastafile)

#### 2. Spectra

In [7]:
# read the FASTA entries and key them by their 'name' field
database = fasta.read(fastafile, True)
database = dict((entry['name'], entry) for entry in database)

# generation parameters
num_hybs = 5
min_length, max_length = 5, 35
num_peptides = 1000
min_cont = 3 #min contribution for each side of a hybrid

# create peptides
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)
# create hybrid peptides

all_proteins_raw = list(database.values())
all_peptides_raw = non_hybrid_peps

# tag each peptide with its position as the scan number
peps = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peps[i] = pep

# one generated spectrum per peptide, in scan number order
sorted_keys = sorted(int(c) for c in peps)
spectra = []
for k in sorted_keys:
    pep = peps[k]
    cont = gen_spectrum(pep['sequence'])
    spec, pm = cont['spectrum'], cont['precursor_mass']
    spectra.append({'spectrum': spec, 'precursor_mass': pm})


### Run the stuff

In [8]:
# Baseline timing: score every indexed kmer against every spectrum.
# The timed region includes indexing the database.
old_st = time.time()
db.index()
md = db.metadata
mers = list(md.keys())
speclen = len(spectra)
# results keyed by spectrum position; each value is what find_interesting_kmers returns
old_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the progress counter on a single line
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    # NOTE(review): the remaining positional Spectrum fields look like placeholders — confirm against src.objects
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    old_res_dict[i] = find_interesting_kmers(s, mers)
print('\ntotal time for the old method: {}'.format(time.time() - old_st))

7487 unique kmers
spectrum 999/1000
total time for the old method: 396.88234782218933


In [9]:
# Timing for the binary search over one mass-sorted list.
# The timed region includes building the list with make_all_base_mers.
bs_st = time.time()
all3mers = make_all_base_mers(db, 3)
speclen = len(spectra)
# results keyed by spectrum position; each value is the list returned by seach_base_kmers
bs_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the progress counter on a single line
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    # NOTE(review): the remaining positional Spectrum fields look like placeholders — confirm against src.objects
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    # 20 is passed as the ppm tolerance
    bs_res_dict[i] = seach_base_kmers(s, all3mers, 20)
print('\ntotal time for the binary search on large list method: {}'.format(time.time() - bs_st))

7487 unique kmers
spectrum 999/1000
total time for the binary search on large list method: 19.75441598892212


In [10]:
# Timing for the hashed buckets with a binary search inside each bucket.
# The timed region includes building the buckets; all3mers is rebound here.
bsh_st = time.time()
all3mers = make_all_base_mers_hash_bs(db, 3)
speclen = len(spectra)
# results keyed by spectrum position; each value is the list returned by seach_base_kmers_hash_bs
bsh_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the progress counter on a single line
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    # NOTE(review): the remaining positional Spectrum fields look like placeholders — confirm against src.objects
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    # 20 is passed as the ppm tolerance
    bsh_res_dict[i] = seach_base_kmers_hash_bs(s, all3mers, 20)
# NOTE(review): disabled line below references names not defined in the visible cells
#     bsh_res_dict[i] += search_base_kmers_hash(s, all3mersrev, 0.05)
print('\ntotal time for the binary search on top of hash: {}'.format(time.time() - bsh_st))

7487 unique kmers
spectrum 999/1000
total time for the binary search on top of hash: 3.712261915206909


In [11]:
# Timing for the hashed buckets with a linear filter inside each bucket.
# The timed region includes building the buckets; all3mers is rebound here.
hs_st = time.time()
all3mers = make_all_base_mers_hash(db, 3)
speclen = len(spectra)
# results keyed by spectrum position; each value is the list returned by seach_base_kmers_hash
hs_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the progress counter on a single line
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    # NOTE(review): the remaining positional Spectrum fields look like placeholders — confirm against src.objects
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    # 20 is passed as the ppm tolerance
    hs_res_dict[i] = seach_base_kmers_hash(s, all3mers, 20)
# NOTE(review): disabled line below references names not defined in the visible cells
#     bsh_res_dict[i] += search_base_kmers_hash(s, all3mersrev, 0.05)
print('\ntotal time for pure hash method: {}'.format(time.time() - hs_st))

7487 unique kmers
spectrum 999/1000
total time for pure hash method: 3.9834797382354736


## Compare results (for now just the hash and the binary one)

In [11]:
# Check that the hash + binary search hits match the plain binary search hits
# for every spectrum. Hits are compared as sets, so repeated kmers and
# ordering are ignored and a difference in either direction is reported.
check_len = len(bsh_res_dict)
for i in range(check_len):
    print('checking elements {}/{}\r'.format(i, check_len), end='')
    if set(bsh_res_dict[i]) != set(bs_res_dict[i]):
        print(f'NOT EQUAL AT {i}')

checking elements 5999/6000

## Compare results (hash and binary hash)

In [12]:
# Check that the pure hash hits match the hash + binary search hits for every
# spectrum. Hits are compared as sets, so repeated kmers and ordering are
# ignored and a difference in either direction is reported.
check_len = len(hs_res_dict)
for i in range(check_len):
    print('checking elements {}/{}\r'.format(i, check_len), end='')
    if set(hs_res_dict[i]) != set(bsh_res_dict[i]):
        print(f'NOT EQUAL AT {i}')

checking elements 5999/6000

## Compare the results (hash to the old one)

In [12]:
# Check that every kmer found by the pure hash method was also reported by the
# old method (in either its b or its y results). This is a one-directional
# subset test: extra kmers in the old results are not flagged.
check_len = len(old_res_dict)
for i in range(check_len):
    # a set makes each membership test constant time instead of a list scan
    total_results = {x.kmer for x in old_res_dict[i][0]} | {x.kmer for x in old_res_dict[i][1]}
    print('checking elements {}/{}\r'.format(i, check_len), end='')
    if not set(hs_res_dict[i]) <= total_results:
        print(f'NOT EQUAL AT {i}')

checking elements 999/1000

In [15]:
# number of distinct kmers the old method reported for the first spectrum (b and y results combined)
print(len({
    scored.kmer
    for side in (old_res_dict[0][0], old_res_dict[0][1])
    for scored in side
}))
# number of distinct kmers the hash + binary search method reported for the first spectrum
print(len(set(bsh_res_dict[0])))

871
871
