# Small k-mer scores to find points of interest

### import all the scoring tools

In [1]:
import sys
sys.path.append('/Users/zacharymcgrath/Documents/Layer_Research/Proteomics_Experiments/Database_Experiments/src')

from scoring import comparisons
from spectra import gen_spectra

import numpy as np
from copy import deepcopy

### Make some fake data for the time being

#### reference 

In [2]:
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'
make_mers = lambda k, seq: [seq[i:i+k] for i in range(len(seq) - k + 1)]
ins_3mers = make_mers(3, insulin)


#### "unknown"

In [3]:
pepseq = 'PGAGSLQPLALEGSLQKR'
pepstart = insulin.index(pepseq)
peplen = len(pepseq)
pepspec = gen_spectra.gen_spectrum(pepseq)['spectrum']
print(insulin[pepstart+peplen-1])

R


### run all the kmers against this unknown spectra

In [4]:
scores = []
for i, merseq in enumerate(ins_3mers):
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, b_merspec)
    y_score = comparisons.compare_masses(pepspec, y_merspec)
    entry = {
        'k': 3,
        'sequence': merseq,
        'starting_position': i,
        'ending_position': i + 2,
        'b-score': b_score,
        'y-score': y_score
    }
    scores.append(entry)
    
for sc in scores:
    (sc['b-score'] > 0 or sc['y-score'] > 0)  and print(sc)

{'k': 3, 'sequence': 'WMR', 'starting_position': 3, 'ending_position': 5, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'PLL', 'starting_position': 8, 'ending_position': 10, 'b-score': 1.3333333333333333, 'y-score': 0.0}
{'k': 3, 'sequence': 'GPD', 'starting_position': 17, 'ending_position': 19, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'PDP', 'starting_position': 18, 'ending_position': 20, 'b-score': 1.3333333333333333, 'y-score': 0.0}
{'k': 3, 'sequence': 'PAA', 'starting_position': 20, 'ending_position': 22, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'GER', 'starting_position': 43, 'ending_position': 45, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'PKT', 'starting_position': 51, 'ending_position': 53, 'b-score': 1.3333333333333333, 'y-score': 0.0}
{'k': 3, 'sequence': 'KTR', 'starting_position': 52, 'ending_position': 54, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'TRR', 'starting_position': 53, 'ending_position': 55, 'b-score': 0.0, 'y

In [5]:
# save the scores as (score, position)
b_scores = [(x['b-score'], x['starting_position']) for x in scores]
y_scores = [(x['y-score'], x['starting_position']) for x in scores]

## Find a way to separate the interesting scores from the rest

### try going for 2 stddevs away

In [7]:
sb = np.std([b[0] for b in b_scores])
sy = np.std([y[0] for y in y_scores])

s2b = [sc for sc in scores if sc['b-score'] > 2*sb]
s2y = [sc for sc in scores if sc['y-score'] > 2*sy]

print(s2b)
print(s2y)
print(len(s2b) + len(s2y))

[{'k': 3, 'sequence': 'PLL', 'starting_position': 8, 'ending_position': 10, 'b-score': 1.3333333333333333, 'y-score': 0.0}, {'k': 3, 'sequence': 'GPD', 'starting_position': 17, 'ending_position': 19, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PDP', 'starting_position': 18, 'ending_position': 20, 'b-score': 1.3333333333333333, 'y-score': 0.0}, {'k': 3, 'sequence': 'PAA', 'starting_position': 20, 'ending_position': 22, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PKT', 'starting_position': 51, 'ending_position': 53, 'b-score': 1.3333333333333333, 'y-score': 0.0}, {'k': 3, 'sequence': 'GPG', 'starting_position': 70, 'ending_position': 72, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PGA', 'starting_position': 71, 'ending_position': 73, 'b-score': 3.6666666666666665, 'y-score': 0.0}, {'k': 3, 'sequence': 'QPL', 'starting_position': 77, 'ending_position': 79, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PLA', 'starting_position': 78, 'ending_positio

## Only look at the 2 stddev scores and increase kmer length

In [8]:
make_mer_sp = lambda starting_pos, prot, k: prot[starting_pos: starting_pos+k]
make_mer_ep = lambda ending_pos, prot, k: prot[ending_pos-k+1:ending_pos+1]

ins_4mers = []
print(pepseq)
print('')

for b in s2b:
    merseq = make_mer_sp(b['starting_position'], insulin, 4)    
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, b_merspec)
    y_score = comparisons.compare_masses(pepspec, y_merspec)
    entry = deepcopy(b)
    
    entry['k'] = 4
    entry['sequence'] = merseq
    entry['ending_position'] = entry['starting_position'] + 3
    entry['b-score'] = b_score
    entry['y-score'] = y_score
    
    ins_4mers.append(entry)
    
    print('k={} \t k={}\n{} \t {}\n{} \t {} \n'.format(b['k'], entry['k'], b['sequence'], merseq, b['b-score'], b_score))

for y in s2y:
    merseq = make_mer_ep(y['ending_position'], insulin, 4)  
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, b_merspec)
    y_score = comparisons.compare_masses(pepspec, y_merspec)
    entry = deepcopy(y)
    
    entry['k'] = 4
    entry['sequence'] = merseq
    entry['starting_position'] = entry['ending_position'] - 3
    entry['b-score'] = b_score
    entry['y-score'] = y_score
    ins_4mers.append(entry)
    
    print('k={} \t k={}\n{} \t {}\n{} \t {} \n'.format(y['k'], entry['k'], y['sequence'], merseq, y['y-score'], y_score))
    
    

PGAGSLQPLALEGSLQKR

k=3 	 k=4
PLL 	 PLLA
1.3333333333333333 	 1.0 

k=3 	 k=4
GPD 	 GPDP
1.0 	 0.75 

k=3 	 k=4
PDP 	 PDPA
1.3333333333333333 	 1.0 

k=3 	 k=4
PAA 	 PAAA
1.0 	 0.75 

k=3 	 k=4
PKT 	 PKTR
1.3333333333333333 	 1.0 

k=3 	 k=4
GPG 	 GPGA
1.0 	 1.5 

k=3 	 k=4
PGA 	 PGAG
3.6666666666666665 	 3.75 

k=3 	 k=4
QPL 	 QPLA
1.0 	 0.75 

k=3 	 k=4
PLA 	 PLAL
1.3333333333333333 	 1.0 

k=3 	 k=4
WMR 	 LWMR
1.0 	 0.75 

k=3 	 k=4
GER 	 CGER
1.0 	 0.75 

k=3 	 k=4
KTR 	 PKTR
1.0 	 0.75 

k=3 	 k=4
TRR 	 KTRR
1.0 	 0.75 

k=3 	 k=4
QKR 	 LQKR
3.6666666666666665 	 3.75 



## Look at 2 stddevs again for the 4mer case

In [9]:
sb = np.std([b['b-score'] for b in ins_4mers])
sy = np.std([y['y-score'] for y in ins_4mers])

s2b4 = [sc for sc in ins_4mers if sc['b-score'] > 2*sb]
s2y4 = [sc for sc in ins_4mers if sc['y-score'] > 2*sy]

print(s2b4)
print(s2y4)

[{'k': 4, 'sequence': 'PGAG', 'starting_position': 71, 'ending_position': 74, 'b-score': 3.75, 'y-score': 0.0}]
[{'k': 4, 'sequence': 'LQKR', 'starting_position': 85, 'ending_position': 88, 'b-score': 0.0, 'y-score': 3.75}]


### use these for 5 mers

In [10]:
ins_5mers = []

for b in s2b4:
    merseq = make_mer_sp(b['starting_position'], insulin, 5)    
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, b_merspec)
    y_score = comparisons.compare_masses(pepspec, y_merspec)
    entry = deepcopy(b)
    
    entry['k'] = 5
    entry['sequence'] = merseq
    entry['ending_position'] = entry['starting_position'] + 4
    entry['b-score'] = b_score
    entry['y-score'] = y_score
    
    ins_5mers.append(entry)
    
    print('k={} \t k={}\n{} \t {}\n{} \t {} \n'.format(b['k'], entry['k'], b['sequence'], merseq, b['b-score'], b_score))

for y in s2y4:
    merseq = make_mer_ep(y['ending_position'], insulin, 4)  
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, b_merspec)
    y_score = comparisons.compare_masses(pepspec, y_merspec)
    entry = deepcopy(y)
    
    entry['k'] = 5
    entry['sequence'] = merseq
    entry['starting_position'] = entry['ending_position'] - 4
    entry['b-score'] = b_score
    entry['y-score'] = y_score
    ins_5mers.append(entry)
    
    print('k={} \t k={}\n{} \t {}\n{} \t {} \n'.format(y['k'], entry['k'], y['sequence'], merseq, y['y-score'], y_score))
    

k=4 	 k=5
PGAG 	 PGAGS
3.75 	 3.8 

k=4 	 k=5
LQKR 	 LQKR
3.75 	 3.75 



## Automating the process
### Process
1. Run small k-mers (k=3)
2. Filter out the low scores by using 2stddevs
3. Increase k-mer lengths till scores go down

### Mock data, kmer making functions, and any constants

In [19]:
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'
pepseq = 'PGAGSLQPLALEGSLQKR'
pepstart = insulin.index(pepseq)
peplen = len(pepseq)
pepspec = gen_spectra.gen_spectrum(pepseq)['spectrum']
# quick kmer making functions and sdevs
make_mer_sp = lambda starting_pos, prot, k: prot[starting_pos: starting_pos+k]
make_mer_ep = lambda ending_pos, prot, k: prot[ending_pos-k+1:ending_pos+1]
sdevs = lambda scores: np.std(scores)
# the starting kmer length to use
starting_k = 3


### Find the important kmers to look for

In [25]:
prot = insulin
base_scores = []

for i in range(len(prot) - starting_k + 1):
    kmer = make_mer_sp(i, prot, starting_k)
    kmerspec_b = gen_spectra.gen_spectrum(kmer, ion='b')['spectrum']
    kmerspec_y = gen_spectra.gen_spectrum(kmer, ion='y')['spectrum']
    b_score = comparisons.compare_masses(pepspec, kmerspec_b)
    y_score = comparisons.compare_masses(pepspec, kmerspec_y)
    
    entry = {
        'k': starting_k,
        'sequence': kmer,
        'b-score': b_score,
        'y-score': y_score,
        'starting_position': i, 
        'ending_position': i + starting_k - 1
    }
    base_scores.append(entry)
    
b_sdevs = sdevs([b['b-score'] for b in base_scores])
y_sdevs = sdevs([y['y-score'] for y in base_scores])

b_anchors = [b for b in base_scores if b['b-score'] >= b_sdevs*2]
y_anchors = [y for y in base_scores if y['y-score'] >= y_sdevs*2]

### incrementing k and only taking scores that stay the same or increase

In [33]:
b_scores = deepcopy(b_anchors)
y_scores = deepcopy(y_anchors)

# keep track of the ones that we want to increment. Prepend it to the front and take the top
b_list = []
y_list = []

def new_entry(old_entry: dict, prot: str, spectrum: list, ion='b') -> dict:
    '''
    Generate a new entry from the old entry
    
    Input:
        old_entry:   dict entry with k, sequence, b and y scores, start and end positions
        prot:        str sequence of the protein
        spectrum:    list spectrum to score against
    kwargs:
        ion:         str ion type to determine which. Options are 'b', 'y'. Default='b' 
    Ouptut:
        new_entry:   dict entry with the new k, new sequence, new b and y scores, new start and end positions
    '''
    starting_pos = old_entry['starting_position'] if ion == 'b' else old_entry['starting_position'] - 1
    ending_pos = old_entry['ending_position'] + 1 if ion == 'b' else old_entry['ending_position']
    mer_seq = prot[starting_pos:ending_pos+1]
    mer_spec_b = gen_spectra.gen_spectrum(mer_seq, ion='b')['spectrum']
    mer_spec_y = gen_spectra.gen_spectrum(mer_seq, ion='y')['spectrum']
    return {
        'k': old_entry['k'] + 1,
        'sequence': mer_seq,
        'starting_position': starting_pos,
        'ending_position': ending_pos,
        'b-score': comparisons.compare_masses(spectrum, mer_spec_b),
        'y-score': comparisons.compare_masses(spectrum, mer_spec_y)
    }
while(len(b_scores)):
    b_tmp = []
    for i in range(len(b_scores)):
        updated = new_entry(b_scores[i], prot, pepspec, ion='b')
        if updated['b-score'] >= b_scores[i]['b-score']:
            b_tmp.append(updated)
        else: 
            b_list.insert(0, b_scores[i])
    b_scores = b_tmp
    
while(len(y_scores)):
    y_tmp = []
    for i in range(len(y_scores)):
        updated = new_entry(y_scores[i], prot, pepspec, ion='y')
        if updated['y-score'] >= y_scores[i]['y-score']:
            y_tmp.append(updated)
        else: 
            y_list.insert(0, y_scores[i])
    y_scores = y_tmp
    
print('Top b result: \n{}\n'.format(b_list[0]))
print('Top y result: \n{}\n'.format(y_list[0]))
        

Top b result: 
{'k': 18, 'sequence': 'PGAGSLQPLALEGSLQKR', 'starting_position': 71, 'ending_position': 88, 'b-score': 3.9444444444444446, 'y-score': 3.9444444444444446}

Top y result: 
{'k': 18, 'sequence': 'PGAGSLQPLALEGSLQKR', 'starting_position': 71, 'ending_position': 88, 'b-score': 3.9444444444444446, 'y-score': 3.9444444444444446}

