# Optimize the initial k-mer search
Right now we are doing roughly 8000 comparisons (one per k-mer) for EACH observed spectrum in order to filter the k-mers.

### Imports

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.types.objects import Spectrum, BasicScoredKmer, Kmer
from src.types.database import Database
from src.scoring import scoring
from src.spectra.gen_spectra import gen_spectrum
from src.file_io import fasta
from src.identfication import filtering

from modules.sequence_generation import proteins, peptides

from collections import namedtuple, defaultdict
from bisect import bisect
import time
import math

## Current method (26 May 2020)

In [2]:
def find_interesting_kmers(spectrum: Spectrum, mers: list) -> (list, list):
    '''
    Score every candidate mer against a spectrum and filter the scored mers twice

    Inputs:
        spectrum:   Spectrum namedtuple instance (only its 'spectrum' field is read here)
        mers:       list of strings containing kmers
    Outputs:
        (b_anchors, y_anchors): whatever filtering.score_filter returns for the scored
                                mers when keyed on 'b_score' and on 'y_score' respectively
    '''
    scored = []
    for kmer in mers:
        # score_subsequence hands back a (b score, y score) pair for this kmer
        b, y = scoring.score_subsequence(spectrum.spectrum, kmer)
        scored.append(BasicScoredKmer(b, y, kmer))

    # the same scored list is filtered once per score field, b first and then y
    return tuple(filtering.score_filter(scored, field) for field in ('b_score', 'y_score'))


## Binary search on tuples
Build a list for all 3-mers that is of the form
```python
all3mers = [(reference_mass, protein, starting_position), ... ]

for observed in spectra:
    for mass in observed:
        i = binarysearch(mass - error, all3mers)
        hits = []
        for j in range(i, len(all3mers)):
            if all3mers[j][0] <= mass + error:
                hits.append(all3mers[j])
            else:
                break
```

In [3]:
MassSequence = namedtuple('MassSequence', ['mass', 'protein', 'start_position'])

def make_all_base_mers(database: Database, base_mer: int) -> list:
    '''
    Build a mass-sorted list of every theoretical mass of every indexed base mer

    The database is re-indexed with a kmer size of base_mer. For each indexed mer,
    gen_spectrum(mer)['spectrum'] supplies its masses, and one MassSequence is
    recorded per (occurrence of the mer, mass) pair.

    Inputs:
        database:    (Database) source of the sequences; its kmer size is set and it is re-indexed
        base_mer:    (int) the kmer size to index the database with
    Outputs:
        list of MassSequence sorted by ascending mass
        NOTE(review): the original notes say the masses are the singly and doubly
        charged b and y ions - confirm against gen_spectrum
    '''
    database.set_kmer_size(base_mer)
    database.index()

    entries = []
    for sequence, occurrences in database.metadata.items():
        # the masses depend only on the sequence, so compute them once per mer
        theoretical_masses = gen_spectrum(sequence)['spectrum']
        for occurrence in occurrences:
            for m in theoretical_masses:
                entries.append(MassSequence(m, occurrence.protein, occurrence.start_position))

    # sorted by mass so that callers can bisect into the list
    return sorted(entries, key=lambda entry: entry.mass)

def seach_base_kmers(observed: Spectrum, database: Database, allbasemers: list, tolerance: float, base_mer: int) -> list:
    '''
    Find the base kmers that have at least one reference mass within tolerance of an observed mass

    Inputs:
        observed:    (Spectrum) what to sequence; only its 'spectrum' masses are read
        database:    (Database) source of the sequences, used to look the kmer strings back up
        allbasemers: (list of MassSequence) all of the basemers made from the function
                     'make_all_base_mers'; MUST be sorted by ascending mass
        tolerance:   (float) the tolerance in Da to accept (window is inclusive on both ends)
        base_mer:    (int) length of the base kmer
    Outputs:
        list of unique kmer strings, in order of first hit
    '''
    # function-scope import so this cell does not depend on the notebook's import cell
    from bisect import bisect_left, bisect_right

    allbasemermasses = [x.mass for x in allbasemers]

    hits = []
    for mass in observed.spectrum:
        # bisect_left keeps a reference mass equal to the lower bound, so the window
        # [mass - tolerance, mass + tolerance] is inclusive on both sides
        first = bisect_left(allbasemermasses, mass - tolerance)
        past_last = bisect_right(allbasemermasses, mass + tolerance)
        hits.extend(allbasemers[first:past_last])

    # so here we want to take ALL of the hits (since they are non-zero) and make base_mers out of them
    mers = {}  # dict keys give de-duplication while preserving first-seen order
    for hit in hits:
        sequence = database.get_entry_by_name(hit.protein).sequence
        mers[sequence[hit.start_position: hit.start_position + base_mer]] = 1
    return list(mers)
    

## Boundary hashing with binary search
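Use the same technique as the last attempt, but instead of a pure binary search over one big list, hash each mass into a small bucket first and then search only within that bucket. (The implementation below scans the bucket linearly rather than bisecting it.)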

In [14]:
def make_all_base_mers_hash(database: Database, base_mer: int) -> defaultdict:
    '''
    Bucket every theoretical mass of every indexed base mer by the integer floor of the mass

    The database is re-indexed with a kmer size of base_mer. For each indexed mer,
    gen_spectrum(mer)['spectrum'] supplies its masses, and one MassSequence is
    stored per (occurrence of the mer, mass) pair under the key math.floor(mass).

    Inputs:
        database:    (Database) source of the sequences; its kmer size is set and it is re-indexed
        base_mer:    (int) the kmer size to index the database with
    Outputs:
        defaultdict(list) mapping floor(mass) -> list of MassSequence (buckets are unsorted)
    '''
    database.set_kmer_size(base_mer)
    database.index()

    buckets = defaultdict(list)
    for sequence, occurrences in database.metadata.items():
        # the masses depend only on the sequence, so compute them once per mer
        theoretical_masses = gen_spectrum(sequence)['spectrum']
        for occurrence in occurrences:
            for m in theoretical_masses:
                # the defaultdict creates the bucket on first use
                buckets[math.floor(m)].append(MassSequence(m, occurrence.protein, occurrence.start_position))
    return buckets

def seach_base_kmers_hash(observed: Spectrum, database: Database, allbasemers: dict, tolerance: float, base_mer: int) -> list:
    '''
    Find the base kmers that have at least one reference mass within tolerance of an observed mass

    Inputs:
        observed:    (Spectrum) what to sequence; only its 'spectrum' masses are read
        database:    (Database) source of the sequences, used to look the kmer strings back up
        allbasemers: (dict of list of MassSequence) all of the basemers made from the function
                     'make_all_base_mers_hash', keyed by floor(mass); it is not modified
        tolerance:   (float) the tolerance in Da to accept (window is inclusive on both ends)
        base_mer:    (int) length of the base kmer
    Outputs:
        list of unique kmer strings, in order of first hit
    '''
    hits = []
    for mass in observed.spectrum:
        lb_mass = mass - tolerance
        ub_mass = mass + tolerance

        # Visit every bucket the window touches, from the lower bound's bucket through
        # the upper bound's bucket. Each bucket is visited exactly once, and a window
        # wider than 1 Da also covers the buckets in between.
        for mass_key in range(math.floor(lb_mass), math.floor(ub_mass) + 1):
            # .get avoids inserting an empty bucket when allbasemers is a defaultdict
            for candidate in allbasemers.get(mass_key, ()):
                if lb_mass <= candidate.mass <= ub_mass:
                    hits.append(candidate)

    # so here we want to take ALL of the hits (since they are non-zero) and make base_mers out of them
    mers = {}  # dict keys give de-duplication while preserving first-seen order
    for hit in hits:
        sequence = database.get_entry_by_name(hit.protein).sequence
        mers[sequence[hit.start_position: hit.start_position + base_mer]] = 1
    return list(mers)
    

## Run these and test the time
So we need to be fair here. What we'll do is the following:
1. Include any setup time that's necessary (but nothing that they both need)
2. Run it on different numbers of observed spectrum and database lengths to see how well they do

### Setup for both
#### 1. Database

In [5]:
# Path of the FASTA file that every run below uses as its protein database.
fastafile = '../../testing framework/data/databases/100prots.fasta'
# Database object built from that file; the benchmark cells index it and read its metadata.
db = Database(fastafile)

#### 2. Spectra

In [6]:
# Read the FASTA records and key them by their 'name' field.
database = fasta.read(fastafile, True)
database = {record['name']: record for record in database}

# Settings for the generated peptides.
num_hybs = 5
min_length = 5
max_length = 35
num_peptides = 1000
min_cont = 3  # min contribution for each side of a hybrid

# create peptides
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)
# create hybrid peptides (none are generated in this cell)

all_proteins_raw = list(database.values())
all_peptides_raw = non_hybrid_peps

# Number the peptides: each one is stored under, and tagged with, its scan number.
peps = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peps[i] = pep

# Generate one spectrum per peptide, in ascending scan-number order.
spectra = []
sorted_keys = sorted(int(c) for c in peps)
for k in sorted_keys:
    pep = peps[k]
    cont = gen_spectrum(pep['sequence'])
    spec, pm = cont['spectrum'], cont['precursor_mass']
    spectra.append({'spectrum': spec, 'precursor_mass': pm})


### Run the stuff

In [7]:
# Time the current method: every spectrum is scored against every indexed mer.
# The timer starts before indexing, so index time is part of the reported total.
old_st = time.time()
# NOTE(review): no kmer size is set in this cell, so the index uses whatever size
# the Database currently has - confirm it matches the size used by the other runs
db.index()
md = db.metadata
# the metadata keys are the mer sequences handed to find_interesting_kmers
mers = list(md.keys())
speclen = len(spectra)
# scan index -> (b_anchors, y_anchors) returned by find_interesting_kmers
old_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the same console line as a progress counter
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    old_res_dict[i] = find_interesting_kmers(s, mers)
print('\ntotal time for the old method: {}'.format(time.time() - old_st))

7487 unique kmers
spectrum 999/1000
total time for the old method: 406.42886781692505


In [7]:
# Time the sorted-list + bisect method with 3-mers and a 0.05 Da tolerance.
# The timer starts before make_all_base_mers, so building the list is part of the total.
bs_st = time.time()
all3mers = make_all_base_mers(db, 3)
speclen = len(spectra)
# scan index -> list of kmer strings returned by seach_base_kmers
bs_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the same console line as a progress counter
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    bs_res_dict[i] = seach_base_kmers(s, db, all3mers, 0.05, 3)
print('\ntotal time for the binary search on large list method: {}'.format(time.time() - bs_st))

7487 unique kmers
spectrum 999/1000
total time for the binary search on large list method: 170.562105178833


In [15]:
# Time the boundary-hashing method with 3-mers and a 0.05 Da tolerance.
# The timer starts before make_all_base_mers_hash, so building the table is part of the total.
bsh_st = time.time()
# NOTE: this rebinds all3mers from the sorted list to the hashed table
all3mers = make_all_base_mers_hash(db, 3)
speclen = len(spectra)
# scan index -> list of kmer strings returned by seach_base_kmers_hash
bsh_res_dict = {}
for i, spec in enumerate(spectra):
    # '\r' with end='' rewrites the same console line as a progress counter
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    bsh_res_dict[i] = seach_base_kmers_hash(s, db, all3mers, 0.05, 3)
# label fixed: this timing is for the hashing method, not the large-list binary search
print('\ntotal time for the boundary hashing method: {}'.format(time.time() - bsh_st))

7487 unique kmers
spectrum 999/1000
total time for the binary search on large list method: 57.76775813102722


## Compare results (for now just the hash and the binary one)

In [10]:
# Check, spectrum by spectrum, that both search methods found the same set of kmers.
# Comparing sets ignores ordering, treats two empty results as equal, and avoids the
# quadratic list-membership scan.
check_len = len(bsh_res_dict)
for i in range(check_len):
    print('checking elements {}/{}\r'.format(i, check_len), end='')
    if set(bsh_res_dict[i]) != set(bs_res_dict[i]):
        print(f'NOT EQUAL AT {i}')

checking elements 13/1000

KeyboardInterrupt: 