# Small k-mer scores to find points of interest

### import all the scoring tools

In [2]:
import sys
sys.path.append('/Users/zacharymcgrath/Documents/Layer_Research/Proteomics_Experiments/Database_Experiments/src')

from scoring import comparisons
from spectra import gen_spectra

import numpy as np
from copy import deepcopy

### Make some fake data for the time being

#### reference 

In [3]:
# Reference protein: the insulin precursor sequence, one letter per residue.
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'


def make_mers(k, seq):
    """Return every contiguous length-``k`` window of ``seq``, left to right.

    Args:
        k: window length.
        seq: the sequence (any sliceable with ``len``) to cut into k-mers.

    Returns:
        A list of ``len(seq) - k + 1`` slices; element ``i`` is
        ``seq[i:i + k]``. The list is empty when ``k`` is larger than
        ``len(seq)``.
    """
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]


# All overlapping 3-mers of insulin; list index == starting position in insulin.
ins_3mers = make_mers(3, insulin)


#### "unknown"

In [4]:
# The "unknown" is a peptide taken from inside insulin, so the position the
# search should recover is known in advance.
pepseq = 'PGAGSLQPLALEGSLQKR'
peplen = len(pepseq)
# str.index raises ValueError if pepseq is not a substring of insulin.
pepstart = insulin.index(pepseq)
# Spectrum of the whole peptide (gen_spectrum called without an ion argument).
pepspec = gen_spectra.gen_spectrum(pepseq)['spectrum']

# Sanity check: show the insulin residue at the peptide's final position.
print(insulin[pepstart + peplen - 1])

R


### Run all the k-mers against this unknown spectrum

In [5]:
# Score every insulin 3-mer against the unknown peptide spectrum, once using
# the k-mer's b-ion spectrum and once using its y-ion spectrum.
scores = []
for i, merseq in enumerate(ins_3mers):
    # Derive k from the k-mer itself instead of hard-coding 3 / "+ 2".
    k = len(merseq)
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']
    entry = {
        'k': k,
        'sequence': merseq,
        # enumerate index == offset of this k-mer within insulin
        'starting_position': i,
        # inclusive index of the k-mer's last residue
        'ending_position': i + k - 1,
        'b-score': comparisons.compare_masses(pepspec, b_merspec),
        'y-score': comparisons.compare_masses(pepspec, y_merspec),
    }
    scores.append(entry)

# Show only the k-mers that scored above zero on at least one ion type.
for sc in scores:
    if sc['b-score'] > 0 or sc['y-score'] > 0:
        print(sc)

{'k': 3, 'sequence': 'WMR', 'starting_position': 3, 'ending_position': 5, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'PLL', 'starting_position': 8, 'ending_position': 10, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'GPD', 'starting_position': 17, 'ending_position': 19, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'PDP', 'starting_position': 18, 'ending_position': 20, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'PAA', 'starting_position': 20, 'ending_position': 22, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'GER', 'starting_position': 43, 'ending_position': 45, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'PKT', 'starting_position': 51, 'ending_position': 53, 'b-score': 1.0, 'y-score': 0.0}
{'k': 3, 'sequence': 'KTR', 'starting_position': 52, 'ending_position': 54, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'TRR', 'starting_position': 53, 'ending_position': 55, 'b-score': 0.0, 'y-score': 1.0}
{'k': 3, 'sequence': 'GPG', 'st

In [6]:
# Flatten the score records into (score, starting_position) pairs,
# one list per ion type, in the same order as `scores`.
b_scores, y_scores = [], []
for sc in scores:
    b_scores.append((sc['b-score'], sc['starting_position']))
    y_scores.append((sc['y-score'], sc['starting_position']))

## Find a way to separate the interesting scores from the rest

### Try keeping only the scores that exceed 2 standard deviations

In [7]:
# Standard deviation of each ion type's scores (np.std with its defaults).
sb = np.std([score for score, _pos in b_scores])
sy = np.std([score for score, _pos in y_scores])

# NOTE(review): the cutoff is 2*std measured from zero, not mean + 2*std.
# Confirm that is intended.
b_cutoff = 2 * sb
y_cutoff = 2 * sy

# Keep the full score records whose b- / y-score clears its cutoff.
s2b = [sc for sc in scores if sc['b-score'] > b_cutoff]
s2y = [sc for sc in scores if sc['y-score'] > y_cutoff]

# Both hit lists, then the combined count, one item per line.
print(s2b, s2y, len(s2b) + len(s2y), sep='\n')

[{'k': 3, 'sequence': 'PLL', 'starting_position': 8, 'ending_position': 10, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'GPD', 'starting_position': 17, 'ending_position': 19, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PDP', 'starting_position': 18, 'ending_position': 20, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PAA', 'starting_position': 20, 'ending_position': 22, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PKT', 'starting_position': 51, 'ending_position': 53, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'GPG', 'starting_position': 70, 'ending_position': 72, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PGA', 'starting_position': 71, 'ending_position': 73, 'b-score': 2.6666666666666665, 'y-score': 0.0}, {'k': 3, 'sequence': 'QPL', 'starting_position': 77, 'ending_position': 79, 'b-score': 1.0, 'y-score': 0.0}, {'k': 3, 'sequence': 'PLA', 'starting_position': 78, 'ending_position': 80, 'b-score': 1.0, 'y-score': 0.0}]
[{'k

## Only look at the 2 stddev scores and increase kmer length

In [8]:
def make_mer_sp(starting_pos, prot, k):
    """Return the slice of ``prot`` of length up to ``k`` starting at ``starting_pos``.

    The slice is shorter than ``k`` when it would run past the end of ``prot``.
    """
    return prot[starting_pos: starting_pos + k]


def make_mer_ep(ending_pos, prot, k):
    """Return the slice of ``prot`` of length ``k`` whose last index is ``ending_pos`` (inclusive).

    NOTE: when ``ending_pos - k + 1`` is negative, Python's slice semantics
    interpret the start index relative to the end of ``prot``.
    """
    return prot[ending_pos - k + 1:ending_pos + 1]


def _extend_hit(hit, k):
    """Re-score a k-mer hit after growing it to length ``k`` from its start.

    Args:
        hit: a score record with at least a 'starting_position' key.
        k: the new k-mer length.

    Returns:
        A deep copy of ``hit`` with 'k', 'sequence', 'ending_position',
        'b-score' and 'y-score' replaced by the values for the longer k-mer.
        ``hit`` itself is not modified.
    """
    merseq = make_mer_sp(hit['starting_position'], insulin, k)
    b_merspec = gen_spectra.gen_spectrum(merseq, ion='b')['spectrum']
    y_merspec = gen_spectra.gen_spectrum(merseq, ion='y')['spectrum']

    entry = deepcopy(hit)
    entry['k'] = k
    # The copied record still holds the shorter k-mer; store the sequence
    # that was actually scored so the record is self-consistent.
    entry['sequence'] = merseq
    # NOTE(review): for a hit close to the end of the protein, merseq can be
    # shorter than k while this index still assumes a full-length k-mer.
    entry['ending_position'] = entry['starting_position'] + k - 1
    entry['b-score'] = comparisons.compare_masses(pepspec, b_merspec)
    entry['y-score'] = comparisons.compare_masses(pepspec, y_merspec)
    return entry


# Extend every b-ion hit, then every y-ion hit, to a 4-mer. A k-mer present
# in both lists is extended (and kept) twice.
ins_4mers = [_extend_hit(hit, 4) for hit in s2b + s2y]

print(len(ins_4mers))

14
