# See how well it did on real data
The spectra are real. We have SpectrumMill results for them in an `ssv` file. We can load it and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# The SpectrumMill results file is a semicolon-separated table.
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(
    specmilresultsfile,
    delimiter=';',
)


In [2]:
# Show the SpectrumMill row(s) recorded for one spectrum, selected by filename.
specmilresults[specmilresults['filename'] == 'NOD2_E3.7065.7065.2']

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose `entry_name` contains 'HYBRID' are treated as SpectrumMill's hybrid results.
# `na=False` makes a row with a missing entry name count as "not hybrid"; without it the
# mask would contain NaN and the boolean indexing below would raise.
specmilhybs = specmilresults.loc[
    specmilresults['entry_name'].str.contains('HYBRID', na=False)
]

# Map filename -> sequence. If a filename occurs more than once, the last row wins.
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))


### 1.b Get the non hybrids out of the dataframe

In [4]:
# Every row whose `entry_name` does not contain 'HYBRID' is treated as a non-hybrid result.
# `na=False` keeps the mask strictly boolean, so `~` works and a row with a missing entry
# name lands in the non-hybrid set instead of raising.
specmilnonhybs = specmilresults.loc[
    ~specmilresults['entry_name'].str.contains('HYBRID', na=False)
]

# Map filename -> sequence. If a filename occurs more than once, the last row wins.
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# Directory holding the hypedsearch output; the trailing slash matters because the file
# names below are appended to it directly.
hyped_res_dir = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_AUG_26/'

# Output files inside that directory.
nonhybridresultsfile = f'{hyped_res_dir}summary.tsv'
hybridresultsfile = f'{hyped_res_dir}hybrid_summary.tsv'
resultsjsonfile = f'{hyped_res_dir}summary.json'

# Both summaries are read as tab-separated tables.
nonhybridresults = pd.read_csv(nonhybridresultsfile, sep='\t')
hybridresults = pd.read_csv(hybridresultsfile, sep='\t')


In [6]:
# Preview the first five non-hybrid hypedsearch results.
nonhybridresults.head(5)

Unnamed: 0.1,Unnamed: 0,proteins,sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DPQVEQLEL,7,2,9,0.000673,0,NOD2_E3.13446.13477.2.pkl
1,1,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLALEVA,7,2,9,0.002875,1,NOD2_E3.18005.18246.2.pkl
2,2,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLALE,5,3,8,0.000878,2,NOD2_E3.13729.13828.2.pkl
3,3,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLAL,4,2,6,0.000438,3,NOD2_E3.15226.15503.2.pkl
4,4,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLALEV,6,0,6,0.971245,4,NOD2_E3.21510.21510.2.pkl


In [7]:
# Preview the first five hybrid hypedsearch results.
hybridresults.head(5)

Unnamed: 0.1,Unnamed: 0,left_proteins,right_proteins,sequence,hybrid_sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|P26339|CMGA_MOUSE Chromogranin-A OS=Mus m...,DLQTLALWSRM,DLQTLAL-WSRM,5,8,13,0.000192,7,NOD2_E3.16373.16401.2.pkl
1,1,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|Q8VDC1|FYCO1_MOUSE FYVE and coiled-coil d...,DLQTLALQQR,DLQTL-ALQQR,4,8,12,0.000812,8,NOD2_E3.10614.10681.2.pkl
2,2,['sp|P63154|CRNL1_MOUSE Crooked neck-like prot...,['sp|Q02819|NUCB1_MOUSE Nucleobindin-1 OS=Mus ...,EVQITALNARA,EVQITA-LNARA,4,8,12,0.504749,9,NOD2_E3.10635.10674.3.pkl
3,3,['sp|Q3UVY5|PCX4_MOUSE Pecanex-like protein 4 ...,['sp|Q9WTX5|SKP1_MOUSE S-phase kinase-associat...,ELTNLEDL,ELTN-LEDL,6,2,8,0.000293,10,NOD2_E3.11427.11516.2.pkl
4,4,['sp|P17742|PPIA_MOUSE Peptidyl-prolyl cis-tra...,"['sp|O35143|ATIF1_MOUSE ATPase inhibitor, mito...",KKITISDCGQAFGKREKAEE,KKITISDCGQ-AFGKREKAEE,3,3,6,0.48581,20,NOD2_E3.3820.3820.5.pkl


## 3. Check to see the overlap in results

In [8]:
def _pair_with_specmil(hyped_results, specmil_nonhybs, specmil_hybs):
    """Pair each hypedsearch result with the SpectrumMill sequence for the same spectrum.

    Args:
        hyped_results: DataFrame with an 'id' column (spectrum file name, here ending in
            '.pkl') and a 'sequence' column.
        specmil_nonhybs: dict mapping filename -> SpectrumMill non-hybrid sequence.
        specmil_hybs: dict mapping filename -> SpectrumMill hybrid sequence.

    Returns:
        (pairs, untracked) where
        pairs is a list of (hypedsearch sequence, SpectrumMill sequence, filename, kind)
            with kind being 'non-hybrid' or 'hybrid' according to which SpectrumMill
            index held the filename (the non-hybrid index is checked first);
        untracked is a list of (hypedsearch sequence, filename, '---') for results whose
            filename is in neither index.
    """
    pairs = []
    untracked = []
    for spectrum_id, sequence in zip(hyped_results['id'], hyped_results['sequence']):
        fname = spectrum_id.replace('.pkl', '')
        if fname in specmil_nonhybs:
            pairs.append((sequence, specmil_nonhybs[fname], fname, 'non-hybrid'))
        elif fname in specmil_hybs:
            pairs.append((sequence, specmil_hybs[fname], fname, 'hybrid'))
        else:
            untracked.append((sequence, fname, '---'))
    return pairs, untracked


# pairs are (hypedsearch sequence, specmil sequence, filename, specmil kind)
nonhybpairs, _untracked_nonhyb = _pair_with_specmil(
    nonhybridresults, filenameindexednonhybs, filenameindexedhybs
)
hybpairs, _untracked_hyb = _pair_with_specmil(
    hybridresults, filenameindexednonhybs, filenameindexedhybs
)

# hypedsearch results with no SpectrumMill counterpart, non-hybrid results first
untracked = _untracked_nonhyb + _untracked_hyb

untracked

[]

In [9]:
# L and I are compared as the same residue: both are mapped to the placeholder 'B'
# before the hypedsearch and SpectrumMill sequences are checked for equality.
li_to_placeholder = str.maketrans('LI', 'BB')

gc = 0
missed_nhpairs = []
for pair in nonhybpairs:
    hyped_seq, specmil_seq = pair[0], pair[1]
    if hyped_seq.translate(li_to_placeholder) == specmil_seq.translate(li_to_placeholder):
        gc += 1
        continue
    # keep the original, un-normalised pair so the mismatch can be inspected later
    missed_nhpairs.append(pair)
    print(pair)
print(f'Successfully aligned {gc}/{len(filenameindexednonhybs)} ')

('DLQTLALEV', 'DLQTLALLL', 'NOD2_E3.21510.21510.2', 'hybrid')
('LPVNSPMTKGD', 'DLPVNSPMTKG', 'NOD2_E3.7065.7065.2', 'hybrid')
('SSPAYYIFQELADKCSPTL', 'DINAYNGETPTEKLPFPII', 'NOD2_E3.18104.18104.3', 'non-hybrid')
('SWAKSAGSKSSGGRMEAEREK', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3808.3850.4', 'non-hybrid')
('DPQVEQLELGGSGP', 'DPQVEQLELGGSPG', 'NOD2_E3.12220.12676.2', 'non-hybrid')
('LALLFLWESHPTQ', 'EDPQVEQLELGGSPG', 'NOD2_E3.11823.11823.2', 'non-hybrid')
('DLQTLALEVAAG', 'DLQTLALEVAQ', 'NOD2_E3.17685.17852.2', 'non-hybrid')
('EVQITALEVAQQ', 'DLQTLALEVAQQ', 'NOD2_E3.18205.18816.2', 'non-hybrid')
('EVQITALEVAQQ', 'DLQTLALEVAQQ', 'NOD2_E3.18217.18415.3', 'non-hybrid')
('IDDGDGQVNY', 'DIDGDGQVNY', 'NOD2_E3.8265.8265.2', 'non-hybrid')
('DMLNIDDA', 'DMINEVDA', 'NOD2_E3.9494.9615.2', 'non-hybrid')
('DLAILLRG', 'DLALILSAG', 'NOD2_E3.17934.18031.2', 'non-hybrid')
('LDMLKIF', 'DLMLKLF', 'NOD2_E3.16581.16645.2', 'non-hybrid')
('KKISLAHSLRTPENT', 'DYPGLGKWNKKLLY', 'NOD2_E3.9383.9562.4', 'non-hybrid')
('

In [10]:
# Count the hybrid results whose sequence agrees with SpectrumMill and collect the rest.
# Leucine (L) and isoleucine (I) are treated as the same residue, as the non-hybrid
# comparison does, so a pair that differs only by L/I is not reported as a miss.
gch = 0
missed_hpairs = []
for pair in hybpairs:
    hyped_seq = pair[0].replace('I', 'L')
    specmil_seq = pair[1].replace('I', 'L')
    if hyped_seq != specmil_seq:
        # store the original, un-normalised pair for later inspection
        missed_hpairs.append(pair)
        print(pair)
    else:
        gch += 1
print(f'Successfully aligned {gch}/{len(filenameindexedhybs)}')

('DLQTLALQQR', 'DLQTLALNAAR', 'NOD2_E3.10614.10681.2', 'hybrid')
('EVQITALNARA', 'DLQTLALNAAR', 'NOD2_E3.10635.10674.3', 'hybrid')
('ELTNLEDL', 'DLQTLEVE', 'NOD2_E3.11427.11516.2', 'hybrid')
('KKITISDCGQAFGKREKAEE', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3820.3820.5', 'non-hybrid')
('LHYFFLSTFAWLLVQGLNLGGSLTRQM', 'DIYNFFSPLNPVRVHIEIGPDGRVTGEA', 'NOD2_E3.17254.17254.4', 'non-hybrid')
('KKYESEESVSKGSWQVFAGR', 'DVTKGRKFYGPEGPYGVFAGR', 'NOD2_E3.7225.7225.4', 'non-hybrid')
('SSDDVVKVEKQ', 'DSEAVSVRKLAG', 'NOD2_E3.4632.4632.3', 'non-hybrid')
('AQADRLTQEPEGYTHLSTGDLLRAEVSSG', 'EVRKALSRQEMQEVQSSRSGRGGNFGFG', 'NOD2_E3.5288.5288.5', 'non-hybrid')
('TETVVCALRIQNIGEQGHMALLGHSLGAY', 'DEIKPSSAPELQAVRMFAEYLASENQR', 'NOD2_E3.17256.17256.4', 'non-hybrid')
('SSSSYRRENCIKAF', 'DSRPGGYGYGYGRSR', 'NOD2_E3.3190.3222.4', 'non-hybrid')
('DLQTLALERAVQ', 'DLQTLALEVARQ', 'NOD2_E3.14557.14557.2', 'non-hybrid')
('TASALGPELGGSPG', 'QVEQLELGGSPG', 'NOD2_E3.9045.9108.2', 'non-hybrid')
('TTKTYFAEGSK', 'ERGFFYTPKS', 'NOD2_

## 4. See what the other alignments were for the missed sets

In [11]:
# load the json; the context manager closes the file handle once parsing is done
with open(resultsjsonfile, 'r') as resultsjson:
    jsres = json.load(resultsjson)

# The top-level keys (noted originally as <filename>_<scan_no>) are not needed: each entry
# carries its own spectrum id, so re-index the entries by that id with '.pkl' removed.
# If two entries share an id, the later one wins.
filenameindexedhypedsearch = {
    alignments['spectrum']['id'].replace('.pkl', ''): alignments
    for alignments in jsres.values()
}


In [12]:
# For each non-hybrid result that disagreed with SpectrumMill, print the SpectrumMill
# sequence followed by every alignment hypedsearch attempted for that spectrum.
# A leading '*' marks an attempted alignment that equals the SpectrumMill sequence once
# L and I are treated as the same residue (the same rule used to decide the mismatch).
for nh in missed_nhpairs:
    specmil_seq, spectrum_id = nh[1], nh[2]
    specmil_is_hybrid = 'non' not in nh[3]
    print()
    print(f'Actual alignment: {specmil_seq} \t Hybrid: {specmil_is_hybrid} \t ID: {spectrum_id}')
    print('Attempted alignments:')
    for alignment in filenameindexedhypedsearch[spectrum_id]['alignments']:
        # hybrid alignments are recognised by the presence of a 'hybrid_sequence' key
        hybrid = 'hybrid_sequence' in alignment
        is_match = alignment['sequence'].replace('I', 'L') == specmil_seq.replace('I', 'L')
        foundstring = '*' if is_match else ''
        # Adjacent f-strings are concatenated; a backslash continuation inside one string
        # would put the next line's indentation into the printed text.
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLQTLALLL 	 Hybrid: True 	 ID: NOD2_E3.21510.21510.2
Attempted alignemnts:
Sequence: DLQTLALEV 	 Hybrid: False 	 Total score: 6 	               B score: 6 	 Y score: 0
Sequence: NLASGVVDLL 	 Hybrid: False 	 Total score: 2 	               B score: 0 	 Y score: 2
Sequence: LNLASGVVDL 	 Hybrid: False 	 Total score: 1 	               B score: 0 	 Y score: 1

Acutal alignment: DLPVNSPMTKG 	 Hybrid: True 	 ID: NOD2_E3.7065.7065.2
Attempted alignemnts:
Sequence: LPVNSPMTKGD 	 Hybrid: False 	 Total score: 0 	               B score: 0 	 Y score: 0

Acutal alignment: DINAYNGETPTEKLPFPII 	 Hybrid: False 	 ID: NOD2_E3.18104.18104.3
Attempted alignemnts:
Sequence: SSPAYYIFQELADKCSPTL 	 Hybrid: False 	 Total score: 6 	               B score: 3 	 Y score: 3
Sequence: SSPVEASFCESFRDQMSII 	 Hybrid: True 	 Total score: 5 	               B score: 2 	 Y score: 3
Sequence: SSAPYYIFQELADKCSPTL 	 Hybrid: True 	 Total score: 5 	               B score: 3 	 Y score: 2

Acutal alignment: DTGAG

In [13]:
# For each hybrid result that disagreed with SpectrumMill, print the SpectrumMill sequence
# followed by every alignment hypedsearch attempted for that spectrum.
# A leading '*' marks an attempted alignment that equals the SpectrumMill sequence once
# leucine (L) and isoleucine (I) are treated as the same residue.
for h in missed_hpairs:
    specmil_seq, spectrum_id = h[1], h[2]
    specmil_is_hybrid = 'non' not in h[3]
    print()
    print(f'Actual alignment: {specmil_seq} \t Hybrid: {specmil_is_hybrid} \t ID: {spectrum_id}')
    print('Attempted alignments:')
    for alignment in filenameindexedhypedsearch[spectrum_id]['alignments']:
        # hybrid alignments are recognised by the presence of a 'hybrid_sequence' key
        hybrid = 'hybrid_sequence' in alignment
        is_match = alignment['sequence'].replace('I', 'L') == specmil_seq.replace('I', 'L')
        foundstring = '*' if is_match else ''
        # Adjacent f-strings are concatenated; a backslash continuation inside one string
        # would put the next line's indentation into the printed text.
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10614.10681.2
Attempted alignments:
Sequence: DLQTLALQQR 	 Hybrid: True 	 Total score: 12 	               B score: 4 	 Y score: 8
Sequence: DLQTLAALGQR 	 Hybrid: True 	 Total score: 11 	               B score: 4 	 Y score: 7
Sequence: DLQTLKPRNT 	 Hybrid: True 	 Total score: 9 	               B score: 4 	 Y score: 5

Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10635.10674.3
Attempted alignments:
Sequence: EVQITALNARA 	 Hybrid: True 	 Total score: 12 	               B score: 4 	 Y score: 8
Sequence: EVQITALGQRA 	 Hybrid: True 	 Total score: 11 	               B score: 4 	 Y score: 7
Sequence: EVQITALQQR 	 Hybrid: True 	 Total score: 11 	               B score: 4 	 Y score: 7

Acutal alignment: DLQTLEVE 	 Hybrid: True 	 ID: NOD2_E3.11427.11516.2
Attempted alignments:
Sequence: ELTNLEDL 	 Hybrid: True 	 Total score: 8 	               B score: 6 	 Y score: 2
Sequence: ELTNLEVE 	 Hybrid: True 	 Total score: 8 	      