# See how well it did on real data
The spectra are real. We have results from SpectrumMill in `ssv` files. We can load these and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# SpectrumMill exports its results as a semicolon-separated (.ssv) table.
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(
    specmilresultsfile,
    delimiter=';',
)


In [2]:
# Spot-check: show the SpectrumMill row(s) reported for one specific scan file.
target_scan_mask = specmilresults['filename'] == 'NOD2_E3.7065.7065.2'
specmilresults.loc[target_scan_mask]

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose entry_name contains 'HYBRID' are SpectrumMill's hybrid identifications.
# na=False makes a missing entry_name count as "not hybrid"; without it the mask
# contains NaN and .loc refuses to index with it.
specmilhybs = specmilresults.loc[specmilresults['entry_name'].str.contains('HYBRID', na=False)]

# Map filename -> sequence (if a filename repeats, the last row wins).
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))


### 1.b Get the non hybrids out of the dataframe

In [4]:
# Every row whose entry_name does not contain 'HYBRID' is a non-hybrid identification.
# na=False keeps the mask strictly boolean: a missing entry_name would otherwise put
# NaN in the mask, and the `~` negation fails on it.
specmilnonhybs = specmilresults.loc[~specmilresults['entry_name'].str.contains('HYBRID', na=False)]

# Map filename -> sequence (if a filename repeats, the last row wins).
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# Directory holding the hypedsearch output files (note the trailing slash, which the
# file paths below rely on).
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
hyped_res_dir = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_AUG_26/'

# The three files read from this run.
nonhybridresultsfile = f'{hyped_res_dir}summary.tsv'
hybridresultsfile = f'{hyped_res_dir}hybrid_summary.tsv'
resultsjsonfile = f'{hyped_res_dir}summary.json'

# Both summaries are tab-separated tables.
hybridresults = pd.read_csv(hybridresultsfile, sep='\t')
nonhybridresults = pd.read_csv(nonhybridresultsfile, sep='\t')


In [6]:
# Preview the first five non-hybrid results.
nonhybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,proteins,sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DPQVEQLEL,7,2,9,0.000673,0,NOD2_E3.13446.13477.2.pkl
1,1,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLALEVA,7,2,9,0.002875,1,NOD2_E3.18005.18246.2.pkl
2,2,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,DLQTLAL,4,2,6,0.000438,3,NOD2_E3.15226.15503.2.pkl
3,3,['sp|P17751|TPIS_MOUSE Triosephosphate isomera...,DLQRLEPGTM,1,3,4,0.497173,6,NOD2_E3.7065.7065.2.pkl
4,4,[],DLQTLALEVE,7,4,11,0.004477,11,NOD2_E3.16920.17393.2.pkl


In [7]:
# Preview the first five hybrid results.
hybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,left_proteins,right_proteins,sequence,hybrid_sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['sp|P27546|MAP4_MOUSE Microtubule-associated ...,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,TTPGTLALE,TTPG-TLALE,6,3,9,0.000877,2,NOD2_E3.13729.13828.2.pkl
1,1,['sp|Q91ZI0|CELR3_MOUSE Cadherin EGF LAG seven...,['sp|P16675|PPGB_MOUSE Lysosomal protective pr...,TLAITASPLL,TLAITA-SPLL,5,3,8,0.000194,4,NOD2_E3.21510.21510.2.pkl
2,2,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|P26339|CMGA_MOUSE Chromogranin-A OS=Mus m...,DLQTLALWSRM,DLQTLAL-WSRM,5,8,13,0.000192,7,NOD2_E3.16373.16401.2.pkl
3,3,['sp|P01325|INS1_MOUSE Insulin-1 OS=Mus muscul...,['sp|Q8VDC1|FYCO1_MOUSE FYVE and coiled-coil d...,DLQTLALQQR,DLQTL-ALQQR,4,8,12,0.000812,8,NOD2_E3.10614.10681.2.pkl
4,4,['sp|P63154|CRNL1_MOUSE Crooked neck-like prot...,['sp|Q02819|NUCB1_MOUSE Nucleobindin-1 OS=Mus ...,EVQITALNARA,EVQITA-LNARA,4,8,12,0.504749,9,NOD2_E3.10635.10674.3.pkl


## 3. Check to see the overlap in results

In [8]:
def _pair_with_specmil(hyped_df):
    """Pair each hypedsearch row with the SpectrumMill sequence for the same file.

    Reads the notebook-level lookups ``filenameindexednonhybs`` and
    ``filenameindexedhybs``; the non-hybrid lookup is consulted first.

    Parameters
    ----------
    hyped_df : DataFrame with at least the columns ``sequence`` and ``id``
        (``id`` is the spectrum file name with a ``.pkl`` suffix).

    Returns
    -------
    (pairs, unmatched)
        ``pairs`` holds ``(hypedsearch_sequence, specmil_sequence, filename, kind)``
        tuples, where ``kind`` is ``'non-hybrid'`` or ``'hybrid'`` according to the
        SpectrumMill lookup that contained the file name.
        ``unmatched`` holds ``(hypedsearch_sequence, filename, '---')`` tuples for
        file names found in neither lookup.
    """
    pairs, unmatched = [], []
    for sequence, result_id in zip(hyped_df['sequence'], hyped_df['id']):
        # hypedsearch ids carry a '.pkl' suffix that SpectrumMill file names lack
        fname = result_id.replace('.pkl', '')
        if fname in filenameindexednonhybs:
            pairs.append((sequence, filenameindexednonhybs[fname], fname, 'non-hybrid'))
        elif fname in filenameindexedhybs:
            pairs.append((sequence, filenameindexedhybs[fname], fname, 'hybrid'))
        else:
            unmatched.append((sequence, fname, '---'))
    return pairs, unmatched


# pairs are (hypedsearch sequence, specmil sequence, filename, specmil kind)
nonhybpairs, untracked_nonhyb = _pair_with_specmil(nonhybridresults)
hybpairs, untracked_hyb = _pair_with_specmil(hybridresults)

# hypedsearch results with no SpectrumMill counterpart (non-hybrid results first)
untracked = untracked_nonhyb + untracked_hyb

untracked

[]

In [9]:
# L and I are treated as the same residue: both are mapped to the placeholder 'B'
# before the two sequences are compared.
LI_TO_B = str.maketrans('LI', 'BB')

missed_nhpairs = []
for candidate in nonhybpairs:
    hyped_seq, specmil_seq = candidate[0], candidate[1]
    if hyped_seq.translate(LI_TO_B) == specmil_seq.translate(LI_TO_B):
        continue
    # sequences disagree even with L/I collapsed: record and show the pair
    missed_nhpairs.append(candidate)
    print(candidate)

# every pair that was not recorded as a miss is a successful alignment
gc = len(nonhybpairs) - len(missed_nhpairs)
print(f'Successfully aligned {gc}/{len(filenameindexednonhybs)} ')

('DLQRLEPGTM', 'DLPVNSPMTKG', 'NOD2_E3.7065.7065.2', 'hybrid')
('SWAKSAGSKSSGGRMEAEREK', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3808.3850.4', 'non-hybrid')
('SNRSYVVEYIPDAE', 'EVQSSRSGRGGNFGFG', 'NOD2_E3.4133.4133.3', 'non-hybrid')
('DLQTLAIERAVQ', 'DLQTLALEVARQ', 'NOD2_E3.14557.14557.2', 'non-hybrid')
('VEDPQVEQLELGGSGP', 'VEDPQVEQLELGGSPG', 'NOD2_E3.12025.12088.2', 'non-hybrid')
('DMLNIDDA', 'DMINEVDA', 'NOD2_E3.9494.9615.2', 'non-hybrid')
('DLEQLKIAEKFSQ', 'DLELQKIAEKFSQ', 'NOD2_E3.12672.12707.3', 'non-hybrid')
('LKTLVKP', 'DLTIKLP', 'NOD2_E3.11916.11949.2', 'non-hybrid')
('TPNVVFF', 'VNPTVFF', 'NOD2_E3.14326.14398.2', 'non-hybrid')
('DLGAARANLEKETLHKQYHLVKSH', 'DLVFIFWAPENAPLKSKMIYASSK', 'NOD2_E3.15991.16129.4', 'non-hybrid')
('TTEVNMASP', 'EVFENQSP', 'NOD2_E3.5748.5895.2', 'non-hybrid')
('GGEELESLSAIEAELEKVAHQ', 'DQELESLSAIEAELEKVAHQ', 'NOD2_E3.18335.18335.3', 'non-hybrid')
('QDELESLSAIEAELEKVAHQL', 'DQELESLSAIEAELEKVAHQL', 'NOD2_E3.20614.20694.3', 'non-hybrid')
('QDELESLSAIEAELEKVAHQ

In [10]:
# Count how many hypedsearch hybrid results agree with SpectrumMill.
# L and I are collapsed to the placeholder 'B' before comparing, exactly as the
# non-hybrid comparison does, so a difference in L vs I alone is not a miss.
gch = 0
missed_hpairs = []
for pair in hybpairs:
    hyped_LI_less = pair[0].replace('L', 'B').replace('I', 'B')
    specmil_LI_less = pair[1].replace('L', 'B').replace('I', 'B')
    if hyped_LI_less != specmil_LI_less:
        # keep the original (un-collapsed) pair for the follow-up inspection
        missed_hpairs.append(pair)
        print(pair)
    else:
        gch += 1
print(f'Successfully aligned {gch}/{len(filenameindexedhybs)}')

('TTPGTLALE', 'DLQTLALE', 'NOD2_E3.13729.13828.2', 'non-hybrid')
('TLAITASPLL', 'DLQTLALLL', 'NOD2_E3.21510.21510.2', 'hybrid')
('DLQTLALQQR', 'DLQTLALNAAR', 'NOD2_E3.10614.10681.2', 'hybrid')
('EVQITALNARA', 'DLQTLALNAAR', 'NOD2_E3.10635.10674.3', 'hybrid')
('ELTNLEVE', 'DLQTLEVE', 'NOD2_E3.11427.11516.2', 'hybrid')
('SIQRSAPGGGGKRGHKCIPPS', 'DEAPNFEANTTIGRIRFH', 'NOD2_E3.9670.9702.4', 'non-hybrid')
('SSFQKLAPSEAFGKREKAEE', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3820.3820.5', 'non-hybrid')
('VTTVQSRPRVDGKLDSPSRQVRQMLFD', 'DIYNFFSPLNPVRVHIEIGPDGRVTGEA', 'NOD2_E3.17254.17254.4', 'non-hybrid')
('KKLEAAEERRKSQEAVFAGR', 'DVTKGRKFYGPEGPYGVFAGR', 'NOD2_E3.7225.7225.4', 'non-hybrid')
('SSDDVVKVEKQ', 'DSEAVSVRKLAG', 'NOD2_E3.4632.4632.3', 'non-hybrid')
('SVPTFEAVSGPPPA', 'DVAVFEAVSGPPPA', 'NOD2_E3.15658.15779.2', 'non-hybrid')
('LEGQCNPDLRLLGCARYNSYQSFQTPQ', 'EVRKALSRQEMQEVQSSRSGRGGNFGFG', 'NOD2_E3.5288.5288.5', 'non-hybrid')
('KKFCGVLK', 'DVYKGVLK', 'NOD2_E3.4039.4039.3', 'non-hybrid')
('TETVVCAL

## 4. See what the other alignments were for the missed sets

In [11]:
# load the json (the context manager closes the file once it has been parsed)
with open(resultsjsonfile, 'r') as resultsjson:
    jsres = json.load(resultsjson)

# all the entries are <filename>_<scan_no>, so re-index them by the spectrum's
# filename (its id without the '.pkl' suffix); the original keys are not needed
filenameindexedhypedsearch = {
    alignments['spectrum']['id'].replace('.pkl', ''): alignments
    for alignments in jsres.values()
}


In [12]:
# for each of the unsuccessful non hybrids, print every alignment hypedsearch attempted
for nh in missed_nhpairs:
    # nh is (hypedsearch sequence, specmil sequence, filename, specmil kind)
    # L and I are collapsed to 'B', matching how the misses were determined
    specmil_LI_less = nh[1].replace('L', 'B').replace('I', 'B')
    print()
    print(f'Actual alignment: {nh[1]} \t Hybrid: {"non" not in nh[3]} \t ID: {nh[2]}')
    print('Attempted alignments:')
    for alignment in filenameindexedhypedsearch[nh[2]]['alignments']:
        # hybrid alignments are the ones that carry a 'hybrid_sequence' entry
        hybrid = 'hybrid_sequence' in alignment
        attempt_LI_less = alignment['sequence'].replace('L', 'B').replace('I', 'B')
        # '*' marks the attempt that agrees with the SpectrumMill sequence
        foundstring = '*' if attempt_LI_less == specmil_LI_less else ''
        # adjacent f-strings instead of a backslash continuation, which would embed
        # the source indentation in the printed line
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLPVNSPMTKG 	 Hybrid: True 	 ID: NOD2_E3.7065.7065.2
Attempted alignemnts:
Sequence: DLQRLEPGTM 	 Hybrid: False 	 Total score: 4 	               B score: 1 	 Y score: 3
Sequence: DDQESLRPSN 	 Hybrid: False 	 Total score: 3 	               B score: 0 	 Y score: 3
Sequence: LPVNSPMTKGD 	 Hybrid: False 	 Total score: 0 	               B score: 0 	 Y score: 0

Acutal alignment: DTGAGSIREAGGAFGKREKAEE 	 Hybrid: False 	 ID: NOD2_E3.3808.3850.4
Attempted alignemnts:
Sequence: SWAKSAGSKSSGGRMEAEREK 	 Hybrid: False 	 Total score: 6 	               B score: 4 	 Y score: 2

Acutal alignment: EVQSSRSGRGGNFGFG 	 Hybrid: False 	 ID: NOD2_E3.4133.4133.3
Attempted alignemnts:
Sequence: SNRSYVVEYIPDAE 	 Hybrid: False 	 Total score: 8 	               B score: 4 	 Y score: 4
*Sequence: EVQSSRSGRGGNFGFG 	 Hybrid: False 	 Total score: 6 	               B score: 2 	 Y score: 4
Sequence: CGLTVIDLEVYGGVFG 	 Hybrid: True 	 Total score: 5 	               B score: 2 	 Y score: 3

Acutal alignm

In [13]:
# for each of the unsuccessful hybrids, print every alignment hypedsearch attempted
for h in missed_hpairs:
    # h is (hypedsearch sequence, specmil sequence, filename, specmil kind)
    # L and I are collapsed to 'B' so they are treated as the same residue,
    # as the non-hybrid comparison in this notebook does
    specmil_LI_less = h[1].replace('L', 'B').replace('I', 'B')
    print()
    print(f'Actual alignment: {h[1]} \t Hybrid: {"non" not in h[3]} \t ID: {h[2]}')
    print('Attempted alignments:')
    for alignment in filenameindexedhypedsearch[h[2]]['alignments']:
        # hybrid alignments are the ones that carry a 'hybrid_sequence' entry
        hybrid = 'hybrid_sequence' in alignment
        attempt_LI_less = alignment['sequence'].replace('L', 'B').replace('I', 'B')
        # '*' marks the attempt that agrees with the SpectrumMill sequence
        foundstring = '*' if attempt_LI_less == specmil_LI_less else ''
        # adjacent f-strings instead of a backslash continuation, which would embed
        # the source indentation in the printed line
        print(
            f'{foundstring}Sequence: {alignment["sequence"]} \t Hybrid: {hybrid} \t '
            f'Total score: {alignment["total_score"]} \t '
            f'B score: {alignment["b_score"]} \t Y score: {alignment["y_score"]}'
        )


Acutal alignment: DLQTLALE 	 Hybrid: False 	 ID: NOD2_E3.13729.13828.2
Attempted alignments:
Sequence: TTPGTLALE 	 Hybrid: True 	 Total score: 9 	               B score: 6 	 Y score: 3
*Sequence: DLQTLALE 	 Hybrid: False 	 Total score: 8 	               B score: 5 	 Y score: 3
Sequence: EVQLTALE 	 Hybrid: True 	 Total score: 8 	               B score: 5 	 Y score: 3

Acutal alignment: DLQTLALLL 	 Hybrid: True 	 ID: NOD2_E3.21510.21510.2
Attempted alignments:
Sequence: TLAITASPLL 	 Hybrid: True 	 Total score: 8 	               B score: 5 	 Y score: 3
*Sequence: DLQTLALLL 	 Hybrid: True 	 Total score: 8 	               B score: 6 	 Y score: 2
Sequence: DLQTLSPLL 	 Hybrid: True 	 Total score: 7 	               B score: 4 	 Y score: 3

Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10614.10681.2
Attempted alignments:
Sequence: DLQTLALQQR 	 Hybrid: True 	 Total score: 12 	               B score: 4 	 Y score: 8
Sequence: DLQTLALFHK 	 Hybrid: True 	 Total score: 11 	             