# Get the table entries of correct kmers
Run the first part of hyped search where we get the tables of the binned kmers and see if the kmers are iterative
1. Load the spectrumMill results
2. For each spectrum entry, correlate the scan number to the correct sequence
3. Run hyped search (modified to return the table)
4. Look at the entry for the correct b and y entries
5. Save them
6. Look at the tables generated for all of them and look at the entry for the correct kmer

## 1. load spectrum mill results

In [1]:
import pandas as pd
results_file = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
df = pd.read_csv(results_file, sep=';')
df.head(10)

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
0,1,NOD2_E3.13446.13477.2,2,10.1,10.1,9.91,84.5,183000.0,(E),DPQVEQLEL,(-),48.35,26.0,535.7725,MOUSE,ins1C18
1,2,NOD2_E3.18005.18246.2,2,12.84,11.07,12.84,97.8,40000000.0,(G),DLQTLALEVA,(-),65.78,29.0,536.8007,MOUSE,ins1C3
2,3,NOD2_E3.13729.13828.2,2,12.43,6.68,7.86,90.7,2200000.0,(G),DLQTLALE,(-),49.52,22.0,451.746,MOUSE,ins1C5
3,4,NOD2_E3.15226.15503.2,2,11.17,6.21,6.67,89.1,1740000.0,(G),DLQTLAL,(-),54.38,169.0,387.2243,MOUSE,ins1C6
4,5,NOD2_E3.21510.21510.2,2,12.54,12.54,12.54,91.3,91900.0,(G),DLQTLALLL,(D),76.92,3.0,500.3081,MOUSE,HYBRID: mouse ins1C PQVEQLELGGSPGDLQTLAL-LLDEG...
5,6,NOD2_E3.12771.12902.3,3,18.14,15.36,14.36,86.8,13900000.0,(E),DPQVAQLELGGEVEDPQVAQLELGGGPGAG,(D),46.05,26.0,977.4894,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGG-EVEDPQVAQLE...
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...
7,8,NOD2_E3.16373.16401.2,2,16.86,16.86,14.08,91.4,298000.0,(G),DLQTLALWSRM,(D),58.82,21.0,667.3501,MOUSE,HYBRID: mouse ins1C PQVEQLELGGSPGDLQTLAL-WSRMD...
8,9,NOD2_E3.10614.10681.2,2,16.7,16.7,14.0,88.8,3420000.0,(G),DLQTLALNAAR,(D),38.58,17.0,593.3338,MOUSE,HYBRID: mouse ins1C PQVEQLELGGSPGDLQTLAL-NAARD...
9,10,NOD2_E3.10635.10674.3,3,18.25,18.25,18.25,94.9,249000.0,(G),DLQTLALNAAR,(D),38.58,20.0,395.8918,MOUSE,HYBRID: mouse ins1C PQVEQLELGGSPGDLQTLAL-NAARD...


## 2. Associate each spectrum entry with the correct value

In [6]:
from pyteomics import mzml
mzml_file  = '/Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/filteredNOD2.mzML'
scan_no_keyed_results = {}
for i, e in enumerate(mzml.read(mzml_file)):

    id_ = e['id'].replace('.pkl', '')
    res_value = df.loc[df['filename'] == id_]['sequence'][i]
    if res_value is None:
        print(id)
        break
    scan_no_keyed_results[int(e['index'])] = res_value
    

In [7]:
print(scan_no_keyed_results[4])

DLQTLALLL


## 3, 4, 5. Run hyped search and save the correct kmer entry

In [8]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src import runner

specPath = '/Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/'
fastaPath = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
outputDir = '/Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/'
minPep = 3
maxPep = 25
tolerance = 20
verbose = True
scoringAlg = 'ibb'

params = {
    'spectra_folder': specPath,
    'database_file': fastaPath,
    'output_dir': outputDir,
    'min_peptide_len': minPep,
    'max_peptide_len': maxPep,
    'tolerance': tolerance,
    'verbose': verbose, 
    'scoring_alg': scoringAlg
}

runner.run(params)

Loading database...
Adding protein 279/279 to tree
Done.
Building hashes for kmers...
Indexing database for k=25...
102341 unique kmers
Done
Looking at kmer 102341/102341
Done.
Analyzing spectra file 1/1[0%]

Analyzing spectrum 1086/1086[99%]
Finished search. Writting results to /Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/...


## 6. Go through each entry and look at the base kmer entry

In [18]:
import json

all_tables = json.load(open('/Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/summary.json'))

b_kmers = []
y_kmers = []
missed = []
c = 0
both_missed = []

for name, tables in all_tables.items():
    print(f'on {c}/{len(all_tables)}\r', end='')
    c += 1
    scan_no = int(name.split('_')[-1])
    actual_sequence = scan_no_keyed_results[scan_no]
    
    bmissed = False
    ymissed = False
    
    b_table = tables['alignments']['b']
    y_table = tables['alignments']['y']
    
    if actual_sequence[:3] in b_table:
        b_kmers.append((actual_sequence, b_table[actual_sequence[:3]]))
    else: 
        missed.append(('b', scan_no))
        bmissed = True
    if actual_sequence[-3:] in y_table:
        y_kmers.append((actual_sequence, y_table[actual_sequence[-3:]]))
    else:
        missed.append(('y', scan_no))
        ymissed = True
        
    if ymissed and bmissed:
        both_missed.append(scan_no)
    


on 0/1086on 1/1086on 2/1086on 3/1086on 4/1086on 5/1086on 6/1086on 7/1086on 8/1086on 9/1086on 10/1086on 11/1086on 12/1086on 13/1086on 14/1086on 15/1086on 16/1086on 17/1086on 18/1086on 19/1086on 20/1086on 21/1086on 22/1086on 23/1086on 24/1086on 25/1086on 26/1086on 27/1086on 28/1086on 29/1086on 30/1086on 31/1086on 32/1086on 33/1086on 34/1086on 35/1086on 36/1086on 37/1086on 38/1086on 39/1086on 40/1086on 41/1086on 42/1086on 43/1086on 44/1086on 45/1086on 46/1086on 47/1086on 48/1086on 49/1086on 50/1086on 51/1086on 52/1086on 53/1086on 54/1086on 55/1086on 56/1086on 57/1086on 58/1086on 59/1086on 60/1086on 61/1086on 62/1086on 63/1086on 64/1086on 65/1086on 66/1086on 67/1086on 68/1086on 69/1086on 70/1086on 71/1086on 72/1086on 73/1086on 74/1086on 75/1086on 76/1086on 77/1086on 78/1086on 79/1086on 80/1086on 81/1086on 82/1086on 83/1086on 84/1086on 85/1086on 86/1086on 87/1086on 88/1086on 89/1086on 90/1086on 91/108

In [10]:
from collections import defaultdict
from operator import itemgetter
def overlap_assembler(l: list, ion: str) -> str:
    '''
    Take a list of sequences and return the longest sequence that contains
    the most of the rest of the sequences.

    Example:
        l: [ABCDEFG, ABCDE, ABCXY, ABCD, ABC]

        ABCDEFG contains [ABCDE, ABCD, ABC] which is the most of any others
        so ABCDEFG would be returned

    Inputs:
        l:      (list) the sequences to assemble together   
        ion:    (str) the ion this is performed for. y goes right to left, b left to right
    Outputs:
        (str) the sequence that contains the most of the others. If no
                sequence can be assembled, None is returned
    '''
    # sort longest to shortest
    l.sort(key=len, reverse=True)
    
    # got through each element and count the number of smaller 
    # sequences it contains
    assembler = defaultdict(lambda: 0)
    for i, e in enumerate(l[:-1]):
        for e2 in l[i+1:]:
            
            # get left to right or right to left depending on the ion
            cmp_str = e[:len(e2)] if ion == 'b' else e[-len(e2):]
                        
            if e2 == cmp_str:
                assembler[e] += 1
                
    # return the str if possible, None otherwise
    try:    
        return max([(k, v) for k, v in assembler.items()], key=itemgetter(1))[0]
    except:
        return None

In [11]:
m_counter = 0
for b_kmer in b_kmers:
    overlapped = overlap_assembler(b_kmer[1], 'b')
    if overlapped is None or overlapped != b_kmer[0][:len(overlapped)]:
        m_counter += 1
        print(f'Miss for sequence: {b_kmer[0]}. Attempted: {overlapped}. List: \n{b_kmer[1]}')
        
print(f'missed {m_counter}/{len(b_kmers)}')

Miss for sequence: DPQVAQLELGGEVEDPQVAQLELGGGPGAG. Attempted: DPQVAQLELGGGPGA. List: 
['DPQNHVLYSNRSAAYAKKGD', 'DPQVEQLELGGSPGDLQ', 'DPQVAQLELGGGPGAG', 'DPQVAQLELGGGPGA', 'DPQVAQLELGGGPGA', 'DPQVAQLELGGGPG', 'DPQVAQLELGGGP', 'DPQVAQLELGGG', 'DPQVAQLELGGG', 'DPQVAQLELGG', 'DPQVEQLELGG', 'DPQVEQLELGG', 'DPQVAQLELG', 'DPQVEQLELG', 'DPQVAQLEL', 'DPQHPPLPL', 'DPQVAQLE', 'DPQHPPLP', 'DPQVAQL', 'DPQVAQ', 'DPQVA', 'DPQV', 'DPQ']
Miss for sequence: DLQTLALNAAR. Attempted: DLQDQL. List: 
['DLQDQL', 'DLQT', 'DLQ']
Miss for sequence: DLPVGRSV. Attempted: DLPQRKPSL. List: 
['DLPAIPGVTS', 'DLPQRKPSL', 'DLPQRKP', 'DLP']
Miss for sequence: DTGAGSIREAGGAFGKREKAEE. Attempted: DTGFVDIPQQ. List: 
['DTGFVDIPQQ', 'DTGA', 'DTG']
Miss for sequence: DYFEEYGKI. Attempted: DYFCACLAKVKGANDGI. List: 
['DYFCACLAKVKGANDGI', 'DYFLFRDSDILGKY', 'DYFSQL', 'DYFE', 'DYF']
Miss for sequence: EVRKALSRQEMQEVQSSRSGRGGNFGFG. Attempted: None. List: 
['EVREDLRLPEGD', 'EVRKALSRQE', 'EVRSG']
Miss for sequence: DLNRNFP. Attempted: 

In [12]:
m_counter = 0
for y_kmer in y_kmers:
    overlapped = overlap_assembler(y_kmer[1], 'y')
    if overlapped is None or overlapped != y_kmer[0][-len(overlapped):]:
        m_counter += 1
        print(f'Miss for sequence: {y_kmer[0]}. Attempted: {overlapped}. List: \n{y_kmer[1]}')
        
print(f'missed {m_counter}/{len(y_kmers)}')

Miss for sequence: DPQVEQLEL. Attempted: None. List: 
['YNYVWANCFEITLEL', 'ESYGLEL', 'YQASLEL']
Miss for sequence: DLQTLALEVA. Attempted: ELGGGPGAGDLQTLALEVA. List: 
['ELGGGPGAGDLQTLALEVA', 'EVA']
Miss for sequence: DLQTLALE. Attempted: QLSTYLDPALE. List: 
['QLSTYLDPALE', 'EFQREALE', 'ALE']
Miss for sequence: DLQTLAL. Attempted: NPDLAL. List: 
['VDDFDNLAL', 'RNNEKKLAL', 'NPDLAL', 'NPDLAL', 'QTLAL']
Miss for sequence: DLQTLALLL. Attempted: LHEIFTSPLNLLL. List: 
['LHEIFTSPLNLLL', 'HEIFTSPLNLLLL', 'FTSPLNLLL', 'ASLPALLL']
Miss for sequence: DLQTLALWSRM. Attempted: KQAEILQESRM. List: 
['KQAEILQESRM', 'YLHLSVSRM', 'ILQESRM', 'WSRM', 'SRM']
Miss for sequence: DLQTLALNAAR. Attempted: ESKKAARAAR. List: 
['ESKKAARAAR', 'GYQAGAAR', 'RAALAAR', 'AARAAR', 'AAR']
Miss for sequence: DLQTLALNAAR. Attempted: ESKKAARAAR. List: 
['ESKKAARAAR', 'RKQREEAAR', 'AAR']
Miss for sequence: DEILRVV. Attempted: LIRVV. List: 
['LIRVV', 'ILRVV', 'IRVV', 'LRVV', 'RVV']
Miss for sequence: DLKTPAGLQVLN. Attempted: AKSF

In [17]:
table_keys = list(all_tables.keys())
for m in missed:
    correct_seq = scan_no_keyed_results[m[1]]
    table = all_tables[table_keys[m[1]]]['alignments'][m[0]]
    mer = correct_seq[:3] if m[0] == 'b' else correct_seq[-3:]
    print(f'{mer} for sequence {correct_seq} not found in table')
    print(table)
    break

DLP for sequence DLPVNSPMTKG not found in table
{'GRS': ['GRS', 'GRSRDY', 'GRSRMISQEAFA'], 'GSR': ['GSR', 'GSRNMGGPYG', 'GSRYINKEIQN', 'GSRNGKTSKKITISDCGQ'], 'SGR': ['SGR', 'SGRTAHYKLTST'], 'SRG': ['SRG', 'SRGIPNMLLNDEE'], 'RGS': ['RGS', 'RGSIINGADWYSF', 'RGSVWTKAKAAFENWE'], 'LPD': ['LPD', 'LPDDINVY', 'LPDTEKTE', 'LPDDINVY', 'LPDEENRRESKD'], 'PVE': ['PVE', 'PVEGAF', 'PVEISFCVG', 'PVEDSQGQTKVGNE'], 'DPI': ['DPI', 'DPISYHIS'], 'DIP': ['DIP', 'DIPIPD', 'DIPVVCLESDNGNI'], 'VPE': ['VPE', 'VPENIPAGSSI'], 'PLD': ['PLD', 'PLDPTSGSSA', 'PLDVKSPEAQ', 'PLDTVTFYK'], 'PDL': ['PDL', 'PDLLTMVVDYR'], 'LDP': ['LDP', 'LDPVEKDDNNMPV'], 'PEV': ['PEV', 'PEVQQIMS'], 'PID': ['PID', 'PIDVCEIGS', 'PIDAPSPLENLEQKETP'], 'SNP': ['SNPV', 'SNPAAIPHAA', 'SNPKTQEQ', 'SNPAAIPHAA', 'SNPAMAPRERKAGC'], 'APT': ['APTQ', 'APTFNVTVT', 'APTFNVTVT'], 'SAA': ['SAAAP', 'SAAPA', 'SAAIDFE', 'SAAS', 'SAAQAAAQTN', 'SAACGGSAAPGF'], 'AAP': ['AAPGT', 'AAPAS', 'AAPPE', 'AAPE', 'AAPAGGAAPSTAAA', 'AAPRPRPPVYDDGPTGPD'], 'ATP': ['ATPQ', 'AT

In [19]:
print(len(both_missed))

9
