# See how well hypedsearch did on real data
The spectra are real. We have results for them in `ssv` files from SpectrumMill. We can load these and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# The SpectrumMill results file is semicolon-separated, hence the explicit delimiter.
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(
    specmilresultsfile,
    delimiter=';',
)


In [2]:
# Show the SpectrumMill row(s) recorded for one spectrum file name.
specmilresults[specmilresults['filename'] == 'NOD2_E3.7065.7065.2']

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose `entry_name` contains 'HYBRID' are the hybrid identifications.
# na=False makes a missing entry_name count as "not hybrid" instead of putting
# NaN into the boolean mask, which .loc refuses to index with.
specmilhybs = specmilresults.loc[
    specmilresults['entry_name'].str.contains('HYBRID', na=False)
]
# Map filename -> sequence. If a filename appears on several rows, the last
# row wins (same as building the dict row by row).
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))


### 1.b Get the non hybrids out of the dataframe

In [4]:
# Everything whose `entry_name` does not contain 'HYBRID' is a non-hybrid
# identification. na=False keeps the mask strictly boolean, so rows with a
# missing entry_name land here (as non-hybrids) instead of breaking the `~`
# negation / .loc lookup.
specmilnonhybs = specmilresults.loc[
    ~specmilresults['entry_name'].str.contains('HYBRID', na=False)
]
# Map filename -> sequence. If a filename appears on several rows, the last
# row wins (same as building the dict row by row).
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# Locations of the three hypedsearch outputs for this run: the non-hybrid
# summary table, the hybrid summary table, and the full JSON summary.
nonhybridresultsfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_AUG_25/summary.tsv'
hybridresultsfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_AUG_25/hybrid_summary.tsv'
resultsjsonfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_AUG_25/summary.json'

# Both summary tables are tab-separated; the JSON file is only read later.
nonhybridresults = pd.read_csv(
    nonhybridresultsfile,
    delimiter='\t',
)
hybridresults = pd.read_csv(
    hybridresultsfile,
    delimiter='\t',
)


In [6]:
# Peek at the first rows of the non-hybrid summary table.
nonhybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,proteins,sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DPQVEQLEL,6,2,9,0.000673,0,NOD2_E3.13446.13477.2.pkl
1,1,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLALEVA,7,2,11,0.002875,1,NOD2_E3.18005.18246.2.pkl
2,2,['sp|P17751|TPIS_MOUSE Triosephosphate isomera...,DLQRLEPGTM,1,2,2,0.497173,6,NOD2_E3.7065.7065.2.pkl
3,3,['sp|Q920A5|RISC_MOUSE Retinoid-inducible seri...,LDTQLKPRNT,2,3,4,0.000812,8,NOD2_E3.10614.10681.2.pkl
4,4,['sp|P01027|CO3_MOUSE Complement C3 OS=Mus mus...,ELTNIELL,6,1,12,0.971732,10,NOD2_E3.11427.11516.2.pkl


In [7]:
# Peek at the first rows of the hybrid summary table.
hybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,left_proteins,right_proteins,sequence,hybrid_sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P63154|CRNL1_MOUSE Crooked neck-like prot...,['sp|Q78JW9|UBFD1_MOUSE Ubiquitin domain-conta...,EVQITADAG,EVQITA-DAG,5,0,7,0.478554,2,NOD2_E3.13729.13828.2.pkl
1,1,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|Q62186|SSRD_MOUSE Translocon-associated p...,DLQTLALLL,DLQT-LALLL,6,2,10,0.000194,4,NOD2_E3.21510.21510.2.pkl
2,2,[],['sp|P01326|INS2_MOUSE Insulin-2 OS=Mus muscul...,PGAGDLQTLALEVAQDPQVAQLELGGGPGAG,PGAGDLQTLALEVAQ-DPQVAQLELGGGPGAG,6,9,5,0.985426,5,NOD2_E3.12771.12902.3.pkl
3,3,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|P26339|CMGA_MOUSE Chromogranin-A OS=Mus m...,DLQTLALWSRM,DLQTLAL-WSRM,4,8,10,0.000192,7,NOD2_E3.16373.16401.2.pkl
4,4,['sp|P01901|HA1B_MOUSE H-2 class I histocompat...,['sp|Q02819|NUCB1_MOUSE Nucleobindin-1 OS=Mus ...,PCTLLLLNARA,PCTLLL-LNARA,1,5,6,0.99306,9,NOD2_E3.10635.10674.3.pkl


## 3. Check to see the overlap in results

In [8]:
def _pair_with_specmil(results):
    """Pair each hypedsearch row with the SpectrumMill sequence for the same spectrum.

    Args:
        results: DataFrame with a 'sequence' column and an 'id' column holding
            the spectrum file name with a '.pkl' suffix.

    Returns:
        A (pairs, unmatched) tuple of lists.
        pairs holds 4-tuples
            (hypedsearch sequence, SpectrumMill sequence, file name, category)
        where category is 'non-hybrid' or 'hybrid' depending on which
        SpectrumMill index the file name was found in.
        unmatched holds 3-tuples (hypedsearch sequence, file name, '---') for
        file names found in neither index.
    """
    pairs, unmatched = [], []
    for sequence, spectrumid in zip(results['sequence'], results['id']):
        fname = spectrumid.replace('.pkl', '')
        # The non-hybrid index is consulted first, so a file name present in
        # both indexes is reported as 'non-hybrid'.
        if fname in filenameindexednonhybs:
            pairs.append((sequence, filenameindexednonhybs[fname], fname, 'non-hybrid'))
        elif fname in filenameindexedhybs:
            pairs.append((sequence, filenameindexedhybs[fname], fname, 'hybrid'))
        else:
            unmatched.append((sequence, fname, '---'))
    return pairs, unmatched


# pairs are (hypedsearch, specmil, id, specmil category)
nonhybpairs, nonhybuntracked = _pair_with_specmil(nonhybridresults)
hybpairs, hybuntracked = _pair_with_specmil(hybridresults)

# spectra hypedsearch reported that SpectrumMill has no entry for
untracked = nonhybuntracked + hybuntracked

untracked

[]

In [9]:
# L and I are mapped to the same placeholder letter before comparing, so two
# sequences that differ only by L vs I still count as the same sequence.
li_to_placeholder = str.maketrans('LI', 'BB')

missed_nhpairs = []
gc = 0
for candidate in nonhybpairs:
    hyped_seq, specmil_seq = candidate[0], candidate[1]
    if hyped_seq.translate(li_to_placeholder) == specmil_seq.translate(li_to_placeholder):
        gc += 1
        continue
    # keep the mismatches for the detailed look further down, and show them now
    missed_nhpairs.append(candidate)
    print(candidate)
# NOTE(review): the denominator is the number of SpectrumMill non-hybrid
# entries, not len(nonhybpairs) -- confirm that is the intended total.
print(f'Successfully aligned {gc}/{len(filenameindexednonhybs)} ')

('DLQRLEPGTM', 'DLPVNSPMTKG', 'NOD2_E3.7065.7065.2', 'hybrid')
('LDTQLKPRNT', 'DLQTLALNAAR', 'NOD2_E3.10614.10681.2', 'hybrid')
('ELTNIELL', 'DLQTLEVE', 'NOD2_E3.11427.11516.2', 'hybrid')
('KKFKMKL', 'DVYKGVLK', 'NOD2_E3.4039.4039.3', 'non-hybrid')
('HTDLSSLDMMS', 'ERGFFYTPMS', 'NOD2_E3.10632.10703.2', 'non-hybrid')
('YVHTVMACYIG', 'ELGISTPEELGL', 'NOD2_E3.16641.16641.2', 'non-hybrid')
('KKNISEAELEEYT', 'DFGKFVLSSGKFYG', 'NOD2_E3.12179.12179.3', 'non-hybrid')
('EKLIQDH', 'DVNWGYE', 'NOD2_E3.10602.10740.2', 'non-hybrid')
('YSNRSAAYAKKGDYQ', 'DPYNMLPPKAASGTKE', 'NOD2_E3.6331.6331.3', 'non-hybrid')
('VSGAASLSPL', 'ELARFHK', 'NOD2_E3.1309.1309.2', 'non-hybrid')
('NPDHLSVLEKTA', 'DSFIQTSQKRI', 'NOD2_E3.5205.5249.3', 'non-hybrid')
('AAVIQLISVYA', 'DFINWLIQT', 'NOD2_E3.21473.21473.2', 'non-hybrid')
('CGVVCEHSPFDGI', 'DFPEEVAIAEEL', 'NOD2_E3.19743.19743.2', 'non-hybrid')
('AKYFLAEL', 'ELAKYFLA', 'NOD2_E3.10174.10274.2', 'non-hybrid')
('QELKEDHWILWWKK', 'EQVFSKYGQISEVVVVK', 'NOD2_E3.10343.10343

In [10]:
# Compare the hybrid results the same way the non-hybrid results are compared:
# L and I are replaced by one placeholder letter first, so sequences that
# differ only by L vs I are not reported as misses.
gch = 0
missed_hpairs = []
for pair in hybpairs:
    # replace LI with B
    hyped_LI_less = pair[0].replace('L', 'B').replace('I', 'B')
    specmil_LI_less = pair[1].replace('L', 'B').replace('I', 'B')
    if hyped_LI_less != specmil_LI_less:
        # keep the original (un-normalized) pair for later inspection
        missed_hpairs.append(pair)
        print(pair)
    else:
        gch += 1
print(f'Successfully aligned {gch}/{len(filenameindexedhybs)}')

('EVQITADAG', 'DLQTLALE', 'NOD2_E3.13729.13828.2', 'non-hybrid')
('PGAGDLQTLALEVAQDPQVAQLELGGGPGAG', 'DPQVAQLELGGEVEDPQVAQLELGGGPGAG', 'NOD2_E3.12771.12902.3', 'hybrid')
('PCTLLLLNARA', 'DLQTLALNAAR', 'NOD2_E3.10635.10674.3', 'hybrid')
('EVQITALEVE', 'DLQTLALEVE', 'NOD2_E3.16920.17393.2', 'hybrid')
('KKILDSVGIEADDDRHPLL', 'DINAYNGETPTEKLPFPII', 'NOD2_E3.18104.18104.3', 'non-hybrid')
('DLEVLLLKLESMIGGFRNAK', 'DINAYNGETPTEKLPFPIID', 'NOD2_E3.17517.17517.3', 'non-hybrid')
('TTTTTFKGVDPCGHKCIPPS', 'DEAPNFEANTTIGRIRFH', 'NOD2_E3.9670.9702.4', 'non-hybrid')
('SEGAQAHTAKHARMEAEREK', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3808.3850.4', 'non-hybrid')
('SEDTVAGLNTFMDLIEQVEK', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3820.3820.5', 'non-hybrid')
('AAVAHEEDLYGPEGPYGVFAGR', 'DVTKGRKFYGPEGPYGVFAGR', 'NOD2_E3.7225.7225.4', 'non-hybrid')
('SNRSYVVEAMERFG', 'EVQSSRSGRGGNFGFG', 'NOD2_E3.4133.4133.3', 'non-hybrid')
('VYPGEASRVSVADHRMFAEYLASENQR', 'DEIKPSSAPELQAVRMFAEYLASENQR', 'NOD2_E3.17256.17256.4', 'non-hybrid')

## 4. See what the other alignments were for the missed sets

In [11]:
# load the json; the context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the life of the kernel
with open(resultsjsonfile, 'r') as jsonfile:
    jsres = json.load(jsonfile)

# all the entries are <filename>_<scan_no> so go through and index them by their filename
# (only the values are needed: the file name comes from each entry's spectrum id)
filenameindexedhypedsearch = {}
for alignments in jsres.values():
    # get the id and drop the '.pkl' suffix so it matches the SpectrumMill file names
    fname = alignments['spectrum']['id'].replace('.pkl', '')

    filenameindexedhypedsearch[fname] = alignments


In [12]:
# for each of the unsuccessful non hybrids, print the other alignments hypedsearch tried
for nh in missed_nhpairs:
    print()
    print(f'Acutal alignment: {nh[1]} \t Hybrid: {"non" not in nh[3]} \t ID: {nh[2]}')
    print('Attempted alignemnts:')
    # These pairs were judged "missed" with L and I treated as the same letter,
    # so the '*' marker uses that same rule rather than exact equality.
    target_LI_less = nh[1].replace('L', 'B').replace('I', 'B')
    for alignment in filenameindexedhypedsearch[nh[2]]['alignments']:
        # hybrid alignments are the ones that carry a 'hybrid_sequence' key
        hybrid = 'hybrid_sequence' in alignment
        attempt_LI_less = alignment['sequence'].replace('L', 'B').replace('I', 'B')
        foundstring = '*' if attempt_LI_less == target_LI_less else ''
        # Adjacent f-strings are used instead of a backslash continuation inside
        # the literal, which embedded the source indentation in the output.
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLPVNSPMTKG 	 Hybrid: True 	 ID: NOD2_E3.7065.7065.2
Attempted alignemnts:
Sequence: DLQRLEPGTM 	 Hybrid: False 	 Total score: 2 	               B score: 1 	 Y score: 2
Sequence: LPVNSPMTKGD 	 Hybrid: False 	 Total score: 0 	               B score: 0 	 Y score: 0

Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10614.10681.2
Attempted alignemnts:
Sequence: LDTQLKPRNT 	 Hybrid: False 	 Total score: 4 	               B score: 2 	 Y score: 3
Sequence: DKVSKGLALQQ 	 Hybrid: True 	 Total score: 2 	               B score: 2 	 Y score: 0
Sequence: IQDKVSKGKAL 	 Hybrid: True 	 Total score: 1 	               B score: 1 	 Y score: 0

Acutal alignment: DLQTLEVE 	 Hybrid: True 	 ID: NOD2_E3.11427.11516.2
Attempted alignemnts:
Sequence: ELTNIELL 	 Hybrid: False 	 Total score: 12 	               B score: 6 	 Y score: 1
Sequence: ELTNLELD 	 Hybrid: True 	 Total score: 10 	               B score: 6 	 Y score: 1
Sequence: TLNELEDL 	 Hybrid: True 	 Total score: 10 	        

In [13]:
# for each of the unsuccessful hybrids, print the other alignments hypedsearch tried
for h in missed_hpairs:
    print()
    print(f'Acutal alignment: {h[1]} \t Hybrid: {"non" not in h[3]} \t ID: {h[2]}')
    print('Attempted alignments:')
    # L and I are treated as the same letter when marking a match with '*',
    # the same rule the non-hybrid comparison uses.
    target_LI_less = h[1].replace('L', 'B').replace('I', 'B')
    for alignment in filenameindexedhypedsearch[h[2]]['alignments']:
        # hybrid alignments are the ones that carry a 'hybrid_sequence' key
        hybrid = 'hybrid_sequence' in alignment
        attempt_LI_less = alignment['sequence'].replace('L', 'B').replace('I', 'B')
        foundstring = '*' if attempt_LI_less == target_LI_less else ''
        # Adjacent f-strings are used instead of a backslash continuation inside
        # the literal, which embedded the source indentation in the output.
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLQTLALE 	 Hybrid: False 	 ID: NOD2_E3.13729.13828.2
Attempted alignments:
Sequence: EVQITADAG 	 Hybrid: True 	 Total score: 7 	               B score: 5 	 Y score: 0
Sequence: SREDAGAPV 	 Hybrid: False 	 Total score: 6 	               B score: 0 	 Y score: 4
Sequence: LTAPSATSPG 	 Hybrid: True 	 Total score: 4 	               B score: 3 	 Y score: 0

Acutal alignment: DPQVAQLELGGEVEDPQVAQLELGGGPGAG 	 Hybrid: True 	 ID: NOD2_E3.12771.12902.3
Attempted alignments:
Sequence: PGAGDLQTLALEVAQDPQVAQLELGGGPGAG 	 Hybrid: True 	 Total score: 5 	               B score: 6 	 Y score: 9
Sequence: PGAGDLQTLALEVAEDPQVAQLELGGGPGAG 	 Hybrid: True 	 Total score: 5 	               B score: 6 	 Y score: 8
Sequence: GPGAGDLQTLALEEDPQVAQLELGGGPGAGD 	 Hybrid: True 	 Total score: 1 	               B score: 2 	 Y score: 2

Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10635.10674.3
Attempted alignments:
Sequence: PCTLLLLNARA 	 Hybrid: True 	 Total score: 6 	               B sco

Sequence: DLEVLLLKWTKKGLVSGG 	 Hybrid: True 	 Total score: 4 	               B score: 3 	 Y score: 3
Sequence: DLEVLLLKLESLVGTVVGG 	 Hybrid: True 	 Total score: 4 	               B score: 3 	 Y score: 3

Acutal alignment: DYPQAMKHYTEAIKRNPR 	 Hybrid: False 	 ID: NOD2_E3.4497.4497.5
Attempted alignments:
Sequence: KKDNGHFAPDISSYVLSDNI 	 Hybrid: True 	 Total score: 1 	               B score: 3 	 Y score: 0
Sequence: KKDNGHIFAPDISSYVLSDN 	 Hybrid: True 	 Total score: 1 	               B score: 3 	 Y score: 0
Sequence: TFLKKQCETMLEISSYVLS 	 Hybrid: True 	 Total score: 1 	               B score: 0 	 Y score: 2

Acutal alignment: DYSTTPGGTLFSTTPGGTRIIY 	 Hybrid: False 	 ID: NOD2_E3.15450.15493.3
Attempted alignments:
Sequence: DYSTTPGGTLFSMAPPVRDPVP 	 Hybrid: True 	 Total score: 3 	               B score: 3 	 Y score: 4
*Sequence: DYSTTPGGTLFSTTPGGTRIIY 	 Hybrid: True 	 Total score: 3 	               B score: 3 	 Y score: 4
Sequence: DYSTTPTLFSTTPGGTRIIYD 	 Hybrid: True 	 Total score: 1 	   