# Optimize the k-mer extension algorithm
Right now we extend each k-mer one amino acid at a time, separately for each protein, instead of doing something smarter. The problem with this is that we may score the same k-mer more than once. We should optimize this.

In [2]:
import os
import sys
import time
from collections import namedtuple

# Put the parent and grandparent directories on sys.path (presumably so the
# project-local imports in the following cells can be resolved when the
# notebook is run from its own folder). Each path is appended at most once.
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

## The current way

In [3]:
from src.scoring import mass_comparisons
from src.spectra import gen_spectra
from src.types.objects import ScoredKmer, Spectrum, Kmer
from src.types.database import Entry

def new_entry(old_entry: ScoredKmer, protein: Entry, spectrum: Spectrum, ion='b') -> ScoredKmer:
    '''
    Grow a scored k-mer by a single amino acid and score the longer k-mer

    With ion == 'b' the k-mer gains the residue just after its end position;
    with any other ion value it gains the residue just before its start position.

    Input:
        old_entry:   ScoredKmer to extend (its kmer carries k and the inclusive start/end positions)
        protein:     Entry class instance the k-mer was taken from
        spectrum:    Spectrum whose .spectrum is scored against
    kwargs:
        ion:         str ion type to determine the direction. Options are 'b', 'y'. Default='b'
    Output:
        ScoredKmer of the longer k-mer, or old_entry itself when the extension
        would fall outside of the protein sequence
    '''
    current = old_entry.kmer
    if ion == 'b':
        starting_pos, ending_pos = current.start_position, current.end_position + 1
    else:
        starting_pos, ending_pos = current.start_position - 1, current.end_position

    last_index = len(protein.sequence) - 1
    outside_protein = starting_pos < 0 or ending_pos > last_index
    # an inverted window would describe a negative-length k-mer
    inverted_window = starting_pos > ending_pos
    if outside_protein or inverted_window:
        return old_entry

    # end positions are inclusive, hence the +1 on the slice
    mer_seq = protein.sequence[starting_pos:ending_pos + 1]
    b_score = mass_comparisons.optimized_compare_masses(
        spectrum.spectrum,
        gen_spectra.gen_spectrum(mer_seq, ion='b')['spectrum'],
    )
    y_score = mass_comparisons.optimized_compare_masses(
        spectrum.spectrum,
        gen_spectra.gen_spectrum(mer_seq, ion='y')['spectrum'],
    )
    return ScoredKmer(
        b_score,
        y_score,
        Kmer(current.k + 1, mer_seq, protein.name, starting_pos, ending_pos),
    )

def extend_kmer(spectrum: Spectrum, protein: Entry, kmer: ScoredKmer, ion: str, stall_length=3) -> ScoredKmer:
    '''
    Extend a kmer one amino acid at a time until the score tells us that adding
    amino acids doesn't make it a better alignment

    Inputs:
        spectrum:           Spectrum namedtuple instance
        protein:            Entry class instance
        kmer:               ScoredKmer namedtuple instance
        ion:                str the ion type we are looking at. Should be 'b' or 'y' (case insensitive)
    kwargs:
        stall_length:   int the total number of extensions that are allowed to bring no increase
                        in score before growth stops. The budget is not reset after an
                        improvement. Default=3
    Outputs
        ScoredKmer of the last extension that increased the score, or the input kmer
        if no extension did (or if ion is neither 'b' nor 'y')
    '''
    # normalize once: new_entry compares the ion with a case sensitive `== 'b'`,
    # so passing 'B' through untouched would extend in the y direction while
    # the b score is the one being tracked here
    ion = ion.lower()
    if ion not in ('b', 'y'):
        return kmer
    score_key = 'b_score' if ion == 'b' else 'y_score'

    # keep track of the last time a score increased
    last_maintenance = kmer
    # keep going until we run out of stall budget
    while stall_length > 0:
        updated = new_entry(kmer, protein, spectrum, ion=ion)
        new_score = getattr(updated, score_key)
        # new_entry hands back the same entry at the protein boundary, in which
        # case k does not change and the step counts as a stall
        improved = (
            new_score > getattr(kmer, score_key)
            and new_score > 0
            and updated.kmer.k != kmer.kmer.k
        )
        if improved:
            last_maintenance = updated
        else:
            stall_length -= 1
        kmer = updated
    return last_maintenance

## Skip search
Add a fixed number of amino acids at a time and check whether the score increases.

In [4]:
from operator import itemgetter

def new_entry_skipped(old_entry: ScoredKmer, protein: Entry, spectrum: Spectrum, skip_len: int, ion='b') -> ScoredKmer:
    '''
    Generate a new entry from the old entry by adding skip_len amino acids at once

    With ion == 'b' the amino acids are appended after the end position; with any
    other ion value they are prepended before the start position. When the full
    skip would run off the protein, a single amino acid is tried instead.

    Input:
        old_entry:   ScoredKmer to extend (its kmer carries k and the inclusive start/end positions)
        protein:     Entry class instance the k-mer was taken from
        spectrum:    Spectrum whose .spectrum is scored against
        skip_len:    (int) number of amino acids to try and skip
    kwargs:
        ion:         str ion type to determine the direction. Options are 'b', 'y'. Default='b'
    Output:
        ScoredKmer of the longer k-mer, or old_entry itself when even the fallback
        extension does not fit in the protein sequence
    '''
    current = old_entry.kmer
    last_index = len(protein.sequence) - 1

    if ion == 'b':
        # landing exactly on the last residue is still a valid skip (<=, not <)
        added = skip_len if current.end_position + skip_len <= last_index else 1
        starting_pos = current.start_position
        ending_pos = current.end_position + added
    else:
        # landing exactly on the first residue is still a valid skip (>=, not >)
        added = skip_len if current.start_position - skip_len >= 0 else 1
        starting_pos = current.start_position - added
        ending_pos = current.end_position

    if starting_pos < 0 or ending_pos > last_index:
        return old_entry

    # check for negative lengths
    if starting_pos > ending_pos:
        return old_entry

    # end positions are inclusive, hence the +1 on the slice
    mer_seq = protein.sequence[starting_pos:ending_pos + 1]
    mer_spec_b = gen_spectra.gen_spectrum(mer_seq, ion='b')['spectrum']
    mer_spec_y = gen_spectra.gen_spectrum(mer_seq, ion='y')['spectrum']
    # k grows by the number of amino acids actually added, not always by one
    longer_kmer = Kmer(current.k + added, mer_seq, protein.name, starting_pos, ending_pos)
    return ScoredKmer(
        mass_comparisons.optimized_compare_masses(spectrum.spectrum, mer_spec_b),
        mass_comparisons.optimized_compare_masses(spectrum.spectrum, mer_spec_y),
        longer_kmer,
    )

def skip_search(spectrum: Spectrum, protein: Entry, kmer: ScoredKmer, ion: str, stall_length=3) -> ScoredKmer:
    '''
    Extend a kmer in jumps of stall_length amino acids while the score keeps
    increasing, then refine around the last jump one amino acid at a time

    Inputs:
        spectrum:           Spectrum namedtuple instance
        protein:            Entry class instance
        kmer:               ScoredKmer namedtuple instance
        ion:                str the ion type we are looking at. Should be 'b' or 'y' (case insensitive)
    kwargs:
        stall_length:   int the number of amino acids added per jump. Default=3
    Outputs
        ScoredKmer with the best score found; on ties the candidate tried first
        (generally the shortest) wins. The input kmer is returned when ion is
        not 'b' or 'y' or when stall_length is less than 1
    '''
    ion = ion.lower()
    if ion not in ('b', 'y'):
        return kmer
    skip_len = stall_length
    if skip_len < 1:
        return kmer
    score_key = 'b_score' if ion == 'b' else 'y_score'

    def score(entry):
        # ScoredKmer is a namedtuple, so scores are read by attribute name
        return getattr(entry, score_key)

    # entry we jumped from on the last successful jump
    anchor = kmer
    # keep jumping until a jump no longer pays off
    while True:
        updated = new_entry_skipped(kmer, protein, spectrum, skip_len, ion=ion)
        if score(updated) > score(kmer) and score(updated) > 0 and updated.kmer.k != kmer.kmer.k:
            anchor, kmer = kmer, updated
        else:
            break

    # so we've skipped enough: the last good jump may have overshot and the
    # failed jump may have been too long, so look at the lengths in between
    candidates = []
    if anchor is not kmer:
        candidates.extend(
            new_entry_skipped(anchor, protein, spectrum, step, ion=ion)
            for step in range(1, skip_len)
        )
    candidates.append(kmer)
    candidates.extend(
        new_entry_skipped(kmer, protein, spectrum, step, ion=ion)
        for step in range(1, skip_len)
    )
    # max returns the first maximal item, i.e. the earliest candidate on ties
    return max(candidates, key=score)

## Basic max search

In [None]:
def new_entry_skipped(old_entry: ScoredKmer, protein: Entry, spectrum: Spectrum, skip_len: int, ion='b') -> ScoredKmer:
    '''
    Generate a new entry from the old entry by adding skip_len amino acids at once

    NOTE: this repeats the definition of the same name from the earlier cell that
    defines skip_search; once this cell runs, this is the version in effect.

    With ion == 'b' the amino acids are appended after the end position; with any
    other ion value they are prepended before the start position. When the full
    skip would run off the protein, a single amino acid is tried instead.

    Input:
        old_entry:   ScoredKmer to extend (its kmer carries k and the inclusive start/end positions)
        protein:     Entry class instance the k-mer was taken from
        spectrum:    Spectrum whose .spectrum is scored against
        skip_len:    (int) number of amino acids to try and skip
    kwargs:
        ion:         str ion type to determine the direction. Options are 'b', 'y'. Default='b'
    Output:
        ScoredKmer of the longer k-mer, or old_entry itself when even the fallback
        extension does not fit in the protein sequence
    '''
    current = old_entry.kmer
    last_index = len(protein.sequence) - 1

    if ion == 'b':
        # landing exactly on the last residue is still a valid skip (<=, not <)
        added = skip_len if current.end_position + skip_len <= last_index else 1
        starting_pos = current.start_position
        ending_pos = current.end_position + added
    else:
        # landing exactly on the first residue is still a valid skip (>=, not >)
        added = skip_len if current.start_position - skip_len >= 0 else 1
        starting_pos = current.start_position - added
        ending_pos = current.end_position

    if starting_pos < 0 or ending_pos > last_index:
        return old_entry

    # check for negative lengths
    if starting_pos > ending_pos:
        return old_entry

    # end positions are inclusive, hence the +1 on the slice
    mer_seq = protein.sequence[starting_pos:ending_pos + 1]
    mer_spec_b = gen_spectra.gen_spectrum(mer_seq, ion='b')['spectrum']
    mer_spec_y = gen_spectra.gen_spectrum(mer_seq, ion='y')['spectrum']
    # k grows by the number of amino acids actually added, not always by one
    longer_kmer = Kmer(current.k + added, mer_seq, protein.name, starting_pos, ending_pos)
    return ScoredKmer(
        mass_comparisons.optimized_compare_masses(spectrum.spectrum, mer_spec_b),
        mass_comparisons.optimized_compare_masses(spectrum.spectrum, mer_spec_y),
        longer_kmer,
    )

def max_search(spectrum: Spectrum, protein: Entry, kmer: ScoredKmer, ion: str, stall_length=3) -> ScoredKmer:
    '''
    Score a window of 2 * stall_length consecutive extension lengths at a time and
    stop as soon as the best extension of a window lies in its first half

    Inputs:
        spectrum:           Spectrum namedtuple instance
        protein:            Entry class instance
        kmer:               ScoredKmer namedtuple instance
        ion:                str the ion type we are looking at. Should be 'b' or 'y' (case insensitive)
    kwargs:
        stall_length:   int half of the window size: the number of longer extensions that
                        have to fail to beat the window's best before the search stops. Default=3
    Outputs
        ScoredKmer with the best score seen over all windows (the input kmer if no
        extension beats it). The input kmer is also returned when ion is not
        'b' or 'y' or when stall_length is less than 1
    '''
    ion = ion.lower()
    if ion not in ('b', 'y'):
        return kmer
    if stall_length < 1:
        return kmer
    score_key = 'b_score' if ion == 'b' else 'y_score'

    def score(entry):
        # ScoredKmer is a namedtuple, so scores are read by attribute name
        return getattr(entry, score_key)

    def span(entry):
        # distance between the inclusive positions; measured this way so the
        # stop test does not depend on how k is maintained elsewhere
        return entry.kmer.end_position - entry.kmer.start_position

    window = 2 * stall_length
    # number of amino acids added by the first candidate of the current window
    base = 1
    best = kmer

    while True:
        updated = [
            new_entry_skipped(kmer, protein, spectrum, offset, ion=ion)
            for offset in range(base, base + window)
        ]
        # max returns the first maximal item, so ties go to the shortest extension
        window_best = max(updated, key=score)
        if score(window_best) > score(best):
            best = window_best
        # Best candidate in the first half of the window: the longer ones did
        # not beat it, so stop. Past the protein boundary new_entry_skipped
        # adds at most one amino acid, which always ends the loop here.
        if span(window_best) - span(kmer) < base + stall_length:
            return best
        base += window
# Run and time these

## 1. Load fasta database

In [1]:
import os
import sys

# Put the parent and grandparent directories on sys.path (each at most once)
# before importing the project-local package below.
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.file_io import fasta

fasta_file = '../../testing framework/data/databases/100prots.fasta'

# Index the entries returned by fasta.read by their 'name' field.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible from this notebook — confirm against fasta.read.
database = {entry['name']: entry for entry in fasta.read(fasta_file, True)}

## 2.  Generate the peptides, hybrid proteins and peptides

In [2]:
from modules.sequence_generation import proteins, peptides

test_directory = '../../testing framework/data/testing_output/'

# generation parameters
num_hybs = 5
min_length = 5
max_length = 35
num_peptides = 100
min_cont = 3  # min contribution for each side of a hybrid

# make hybrid proteins
# NOTE(review): min_contribution is max_length here, not min_cont — confirm this is intended
hyb_prots = proteins.generate_hybrids(
    list(database.values()),
    num_hybs,
    min_contribution=max_length,
)
# create peptides
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)
# create hybrid peptides
hyb_peps = peptides.gen_peptides(
    hyb_prots,
    num_hybs,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    min_contribution=min_cont,
    hybrid_list=True,
)

all_proteins_raw = list(database.values()) + hyb_prots
all_peptides_raw = non_hybrid_peps + hyb_peps

# From here on `peptides` is a dict keyed by position in all_peptides_raw and no
# longer the imported module; each peptide is also tagged in place with that
# position as its 'scan_no'.
peptides = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peptides[i] = pep

Generating hybrid protein 0/5[0%]Generating hybrid protein 1/5[20%]Generating hybrid protein 2/5[40%]Generating hybrid protein 3/5[60%]Generating hybrid protein 4/5[80%]
Finished generating hybrid proteins


## 3. Generate spectra

In [3]:
from src.spectra import gen_spectra
from src.utils import utils
from modules.sequence_generation import write_spectra

utils.make_dir(test_directory)

# Build one {'spectrum', 'precursor_mass'} record per peptide, in ascending key order
sorted_keys = sorted(int(c) for c in peptides)
spectra = []
for k in sorted_keys:
    pep = peptides[k]
    cont = gen_spectra.gen_spectrum(pep['sequence'])
    spec, pm = cont['spectrum'], cont['precursor_mass']
    spectra.append({'spectrum': spec, 'precursor_mass': pm})

# left as the last expression so the cell displays the value returned by write_mzml
write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


'../../testing framework/data/testing_output/testSpectraFile.mzML'

# Run on hyped search because it's already all there

In [4]:
import os
import sys

# Put the parent and grandparent directories on sys.path (each at most once)
# before importing the project-local package below.
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

import time
from src import runner

test_directory = '../../testing framework/data/testing_output/'
fasta_file = '../../testing framework/data/databases/100prots.fasta'

# The spectra are read from, and the results written to, the same test directory
args = dict([
    ('spectra_folder', test_directory),
    ('database_file', fasta_file),
    ('output_dir', test_directory),
    ('min_peptide_len', 3),
    ('max_peptide_len', 35),
])

# wall-clock timing of the whole run
st = time.time()
runner.run(args)
elapsed = time.time() - st
print('\nTotal runtime: {} seconds'.format(elapsed))

Loading database...
Done. Indexing database...
Done.
Number of 3-mers found in the database: 7487
Analyzing spectra file 1/1[0%]

Analyzing spectrum 345/345[99%]
Finished search. Writting results to ../../testing framework/data/testing_output/...

Total runtime: 2799.412964105606 seconds


# Run on the brute force one

In [None]:
# Wrap every generated spectrum in a Spectrum, keyed by its position in `spectra`.
# NOTE(review): the values after the peak list (a constant 500 per peak, 2, i, the
# precursor mass and '') are presumably abundances, MS level, scan number,
# precursor mass and file name — confirm against the fields of Spectrum.
keyed_spec = {}
for i, spec in enumerate(spectra):
    keyed_spec[i] = Spectrum(spec['spectrum'], [500 for _ in range(len(spec['spectrum']))], 2, i, spec['precursor_mass'], '') 

# Results per spectrum, keyed the same way as keyed_spec
keyed_results = {}
# start time of the run; not read again in the visible part of this cell
st_dn = time.time()
l = len(keyed_spec.keys())
for i, spec in keyed_spec.items():
    # '\r' together with end='' rewrites a single output line as a progress counter
    print('on spec {}/{}\r'.format(i, l), end='')
    # FIXME(review): neither `skipped_search` nor `ScoredSequence` is defined or imported
    # in the visible part of this notebook. The closest definition, `skip_search`, takes
    # (spectrum, protein, kmer, ion), so this call has to be reworked before the cell can run.
    keyed_results[i] = skipped_search(spec, ScoredSequence(0, 0, ''))