# test new kmer extension
While the last one was sort of fast, let's make it faster if we can.
Instead of looking at all points of interest, try to just brute-force it with a tree beam search kind of thing.

### Let's first see how big these things get

In [5]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.types.database import Database
# Build the database from the 6000-protein FASTA, then index it at each k of
# interest and report how big the resulting metadata mapping is.
fasta_file = '../../testing framework/data/databases/6000prots.fasta'
db = Database(fasta_file, True, 1)

kmers = list(range(3, 4))
for kmer in kmers:
    # re-index at this k; db.metadata is read back after the index call
    db.kmer_size = kmer
    db.index()

    numkmers = len(db.metadata)
    # NOTE: sys.getsizeof is shallow -- it measures the container itself, not
    # the objects stored inside it, so this is a lower bound on memory used.
    dictsize = sys.getsizeof(db.metadata) / 1000000

    print('for k={}, size={}MB, len={}'.format(kmer, dictsize, numkmers))

for k=3, size=0.295016MB, len=8477


## Use the optimized initial kmer search to look for best looking spectrum

### Setup for both
#### 1. Database

In [6]:
# Database for the search setup, built from the 100-protein FASTA. Only the path is
# supplied, so every other Database constructor argument takes its default.
# NOTE: this rebinds `db`, which the sizing cell above created from the 6000-protein file.
fastafile = '../../testing framework/data/databases/100prots.fasta'
db = Database(fastafile)

#### 2. Spectra

In [7]:
from src.types.objects import Spectrum
from src.file_io import fasta
from modules.sequence_generation import peptides
from src.spectra.gen_spectra import gen_spectrum
import math

# Read the FASTA entries and re-key them by protein name
database = fasta.read(fastafile, True)
database = {entry['name']: entry for entry in database}

# generation settings
num_hybs = 5
min_length= 5
max_length = 35
num_peptides = 1000
min_cont = 3 #min contribution for each side of a hybrid

# create peptides
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)

# No hybrid peptides are generated in this cell (num_hybs and min_cont are
# not used below), so the full peptide set is just the non-hybrid one.
all_proteins_raw = list(database.values())
all_peptides_raw = non_hybrid_peps

# Number the peptides in generation order and stamp each with its scan number
peps = dict(enumerate(all_peptides_raw))
for scan_no, pep in peps.items():
    pep['scan_no'] = scan_no

# Generate one spectrum per peptide, walking the scan numbers in ascending order
sorted_keys = sorted(int(key) for key in peps)
spectra = []
for key in sorted_keys:
    generated = gen_spectrum(peps[key]['sequence'])
    spectra.append({
        'spectrum': generated['spectrum'],
        'precursor_mass': generated['precursor_mass'],
    })



In [8]:
from collections import namedtuple, defaultdict
# Record pairing one spectrum mass with the k-mer sequence that mass was generated from
MassSequence = namedtuple('MassSequence', ['mass', 'sequence'])

def make_all_base_mers_hash(database: Database, base_mer: int) -> defaultdict:
    '''
    Index the database at k = base_mer and bucket every k-mer's spectrum masses by integer mass

    Every mass in a k-mer's generated spectrum is filed under math.floor(mass), so a
    lookup for an observed mass only needs to scan the bucket(s) its tolerance window touches.

    Inputs:
        database:    (Database) source of the sequences. NOTE: its kmer size is set to
                     base_mer and it is re-indexed as a side effect of this call
        base_mer:    (int) the k-mer size to index the database with
    Outputs:
        defaultdict(list) mapping floor(mass) -> list of MassSequence(mass, sequence),
        one entry per mass in the spectrum of each key of database.metadata
        (presumably the singly and doubly charged b and y ions -- confirm against gen_spectrum)
    '''
    allbasemers = defaultdict(list)
    database.set_kmer_size(base_mer)
    database.index()

    # NOTE(review): only the forward k-mer is hashed. The earlier draft also built a
    # "reversed" spectrum but computed it from the forward mer and never used it, so
    # that dead work was dropped; add reversed mers here if they are actually wanted.
    for mer in database.metadata:
        for mass in gen_spectrum(mer)['spectrum']:
            allbasemers[math.floor(mass)].append(MassSequence(mass, mer))

    return allbasemers

def seach_base_kmers_hash(observed: Spectrum, allbasemers: dict, tolerance: float) -> list:
    '''
    Search through all of the base kmers and find those whose mass is within tolerance of an observed mass

    Inputs:
        observed:    (Spectrum) what to sequence; only its .spectrum masses are read
        allbasemers: (dict of list of MassSequence) buckets keyed by floor(mass), as made by
                     the function 'make_all_base_mers_hash'. It is not modified by the search
        tolerance:   (float) the ppm tolerance to accept for each mass
    Outputs:
        list of sequences (the .sequence of each matching MassSequence), one entry per
        (observed mass, matching MassSequence) pair, so a sequence is repeated once for
        every observed mass it matches
    '''
    hits = []
    for mass in observed.spectrum:
        # ppm_opt is defined outside this block; presumably it converts the ppm
        # tolerance into an absolute mass window at this mass -- confirm
        tol = ppm_opt(mass, tolerance)
        lb_mass = mass - tol
        ub_mass = mass + tol
        lb_mass_key = math.floor(lb_mass)
        ub_mass_key = math.floor(ub_mass)

        # The window can straddle an integer boundary, so visit every bucket from the
        # lower key through the upper key exactly once. .get() is used instead of
        # indexing so probing a missing key never inserts an empty bucket into a
        # defaultdict (and never raises on a plain dict).
        for mass_key in range(lb_mass_key, ub_mass_key + 1):
            hits += [
                x.sequence
                for x in allbasemers.get(mass_key, ())
                if lb_mass <= x.mass <= ub_mass
            ]

    return hits

## Run it

In [None]:
import time
from collections import Counter 
import matplotlib.pyplot as plt

# Time the pure hash method: build the mass hash once, then look up every
# generated spectrum against it and tally how often each k-mer sequence is hit.
min_peptide_len = 20
max_pep_len = 20

st = time.time()
# NOTE(review): this indexes fasta_file (the 6000-protein set), while the spectra
# were generated from fastafile (the 100-protein set) -- confirm that is intended.
db = Database(fasta_file, True)
allmers = make_all_base_mers_hash(db, max_pep_len)
speclen = len(spectra)
hs_res_dict = {}
mer_counter = Counter()
tracker = []

for i, spec in enumerate(spectra):
    print('spectrum {}/{}\r'.format(i, speclen), end='')
    # positional Spectrum fields other than the masses and precursor mass are
    # filled with placeholders here -- TODO confirm their meaning against Spectrum
    s = Spectrum(spec['spectrum'], [], '2', 0, spec['precursor_mass'], '')
    res = seach_base_kmers_hash(s, allmers, 20)
    # a Counter instance is not callable; update() is what adds the hits to the tally
    mer_counter.update(res)

# Hit counts in descending order. Only the final tally is used, so it is computed
# once here instead of re-sorting the whole counter after every spectrum.
tracker = [c for _, c in mer_counter.most_common()]

print('\ntotal time for pure hash method: {}'.format(time.time() - st))