# See how well it did on real data
The spectra are real. We have SpectrumMill results for them in an `ssv` (semicolon-separated values) file. We can load that file and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# SpectrumMill export for the NOD2_E3 run; fields are separated by ';'
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(specmilresultsfile, delimiter=';')


In [13]:
# show the SpectrumMill row(s) recorded for one specific spectrum file
specmilresults[specmilresults['filename'].eq('NOD2_E3.7065.7065.2')]

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose entry name contains the literal text 'HYBRID' are the hybrid identifications.
# regex=False makes this a plain substring test; na=False treats a missing entry_name
# as "not hybrid" so the boolean mask never contains NaN (which would make .loc raise).
specmilhybs = specmilresults.loc[
    specmilresults['entry_name'].str.contains('HYBRID', regex=False, na=False)
]
# filename -> sequence lookup; if a filename appears more than once the last row wins
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))


### 1.b Get the non hybrids out of the dataframe

In [4]:
# Everything whose entry name does NOT contain the literal text 'HYBRID' is a non-hybrid.
# regex=False makes this a plain substring test; na=False turns a missing entry_name into
# False before negation, so such rows land here instead of breaking the `~` / .loc mask.
specmilnonhybs = specmilresults.loc[
    ~specmilresults['entry_name'].str.contains('HYBRID', regex=False, na=False)
]
# filename -> sequence lookup; if a filename appears more than once the last row wins
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# hypedsearch output files for the filtered NOD2_E3 run
nonhybridresultsfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3/summary.tsv'
hybridresultsfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3/hybrid_summary.tsv'
# the json summary is only named here; it is parsed further down in the notebook
resultsjsonfile = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3/summary.json'

# both summaries are tab-separated tables, so read them the same way
nonhybridresults, hybridresults = (
    pd.read_csv(tsvpath, sep='\t')
    for tsvpath in (nonhybridresultsfile, hybridresultsfile)
)


In [6]:
# preview the first five non-hybrid alignments
nonhybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,proteins,sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,['INS1_MOUSE Insulin-1'],DPQVEQLEL,6,2,9,0.000673,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.13446.13477.2.pkl
1,1,"['INS1_MOUSE Insulin-1', 'INS2_MOUSE Insulin-2']",DLQTLALEVA,7,2,9,0.002875,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.18005.18246.2.pkl
2,2,"['INS1_MOUSE Insulin-1', 'INS2_MOUSE Insulin-2']",DLQTLALE,5,3,8,0.000878,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.13729.13828.2.pkl
3,3,"['INS1_MOUSE Insulin-1', 'INS2_MOUSE Insulin-2']",DLQTLAL,4,2,6,0.000438,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.15226.15503.2.pkl
4,4,['CMGA_MOUSE Chromogranin-A'],LPVNSPMTKGD,0,0,0,0.000452,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.7065.7065.2.pkl


In [7]:
# preview the first five hybrid alignments
hybridresults.head(n=5)

Unnamed: 0.1,Unnamed: 0,left_proteins,right_proteins,sequence,hybrid_sequence,b_score,y_score,total_score,precursor_distance,entry name,id
0,0,"['INS1_MOUSE Insulin-1', 'INS2_MOUSE Insulin-2']",['ERP44_MOUSE Endoplasmic reticulum resident p...,DLQTLALLI,DLQTLAL-LI,6,2,8,0.000194,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.21510.21510.2.pkl
1,1,['INS2_MOUSE Insulin-2'],['INS2_MOUSE Insulin-2'],PGAGDLQTLALEVAEDPQVAQLELGGGPGAG,PGAGDLQTLALEVA-EDPQVAQLELGGGPGAG,6,5,12,0.493418,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.12771.12902.3.pkl
2,2,"['INS1_MOUSE Insulin-1', 'INS2_MOUSE Insulin-2']","['CMGA_MOUSE Chromogranin-A', 'ANKS6_MOUSE Ank...",DLQTLALWSRM,DLQTLAL-WSRM,4,8,13,0.000192,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.16373.16401.2.pkl
3,3,['HA1B_MOUSE H-2 class I histocompatibility an...,[],PCTLLLGAQRL,PCTLL(L)GAQRL,0,2,3,0.99306,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.10635.10674.3.pkl
4,4,['TRNK1_MOUSE TPR and ankyrin repeat-containin...,"['ACPM_MOUSE Acyl carrier protein, mitochondri...",SSPAARRRVLYVLKLYDK,SSPAARR-RVLYVLKLYDK,4,2,7,1.07132,/Users/zacharymcgrath/Desktop/nod2 data/filter...,NOD2_E3.18104.18104.3.pkl


## 3. Check to see the overlap in results

In [8]:
def _pair_with_specmil(results, nonhybs, hybs):
    """Pair each hypedsearch result with the SpectrumMill sequence for the same spectrum.

    Parameters:
        results: DataFrame with an 'id' column (spectrum file name ending in '.pkl')
            and a 'sequence' column (the hypedsearch sequence).
        nonhybs: dict mapping file name -> SpectrumMill non-hybrid sequence.
        hybs: dict mapping file name -> SpectrumMill hybrid sequence.

    Returns:
        (pairs, unmatched) where pairs is a list of
        (hypedsearch sequence, specmil sequence, file name, 'non-hybrid' | 'hybrid')
        tuples and unmatched lists the file names found in neither dict.
    """
    pairs, unmatched = [], []
    for rawid, sequence in zip(results['id'], results['sequence']):
        fname = rawid.replace('.pkl', '')
        # the non-hybrid lookup takes priority when a file name is in both dicts
        if fname in nonhybs:
            pairs.append((sequence, nonhybs[fname], fname, 'non-hybrid'))
        elif fname in hybs:
            pairs.append((sequence, hybs[fname], fname, 'hybrid'))
        else:
            # no SpectrumMill identification for this spectrum: remember it instead
            # of raising KeyError so the comparison can still run
            unmatched.append(fname)
    return pairs, unmatched


# pairs are (hypedsearch, specmil, id, specmil type); the *_unmatched lists hold the
# ids of hypedsearch results that SpectrumMill has no entry for
nonhybpairs, nonhyb_unmatched = _pair_with_specmil(
    nonhybridresults, filenameindexednonhybs, filenameindexedhybs)
hybpairs, hyb_unmatched = _pair_with_specmil(
    hybridresults, filenameindexednonhybs, filenameindexedhybs)



In [9]:
# keep every pair where the hypedsearch sequence differs from the SpectrumMill one
missed_nhpairs = [pair for pair in nonhybpairs if pair[0] != pair[1]]
for missed in missed_nhpairs:
    print(missed)

# whatever was not missed is an exact sequence match
gc = len(nonhybpairs) - len(missed_nhpairs)
print(f'Successfully aligned {gc}/{len(filenameindexednonhybs)} ')

('LPVNSPMTKGD', 'DLPVNSPMTKG', 'NOD2_E3.7065.7065.2', 'hybrid')
('LDTQLKPRNT', 'DLQTLALNAAR', 'NOD2_E3.10614.10681.2', 'hybrid')
('ELTNIELL', 'DLQTLEVE', 'NOD2_E3.11427.11516.2', 'hybrid')
('ELQKQKEDL', 'DLQTLALEVE', 'NOD2_E3.16920.17393.2', 'hybrid')
('SAAPGFHQQLRL', 'DLQTLALEVAQQ', 'NOD2_E3.18217.18415.3', 'non-hybrid')
('VIKAAADNKDQLEKATGLT', 'DKQPVKVLVGANFEEVAF', 'NOD2_E3.13690.13690.3', 'non-hybrid')
('NKIEEFLEE', 'DFIETYYLS', 'NOD2_E3.16911.16911.2', 'non-hybrid')
('SGEHSIIGRTMVVHEKQDDLGKGGNE', 'DGTITTKELGTVMRSLGQNPTEAELQ', 'NOD2_E3.14735.14774.4', 'non-hybrid')
('EKGITGKGVVITVLD', 'DYPGLGKWNKKLL', 'NOD2_E3.8613.8647.4', 'non-hybrid')
('TCSRDSTAR', 'GNPDDSFLE', 'NOD2_E3.9224.9311.2', 'non-hybrid')
('FSSFGPISEVVVV', 'DFPEEVAIAEEL', 'NOD2_E3.19743.19743.2', 'non-hybrid')
('TEENPRSFPASQTEAHEDPD', 'DTEENPRSFPASQTEAHEDP', 'NOD2_E3.5092.5092.2', 'non-hybrid')
('LNPDGFERAREGDCGLGDSGPPGTS', 'LQDTEENPRSFPASQTEAHEDP', 'NOD2_E3.5394.5468.3', 'non-hybrid')
('SSATTFRLLWE', 'ELAKYFLAELL', 'NOD

In [10]:
# keep every pair where the hypedsearch sequence differs from the SpectrumMill one
missed_hpairs = [pair for pair in hybpairs if pair[0] != pair[1]]
for missed in missed_hpairs:
    print(missed)

# whatever was not missed is an exact sequence match
gch = len(hybpairs) - len(missed_hpairs)
print(f'Successfully aligned {gch}/{len(filenameindexedhybs)}')

('DLQTLALLI', 'DLQTLALLL', 'NOD2_E3.21510.21510.2', 'hybrid')
('PGAGDLQTLALEVAEDPQVAQLELGGGPGAG', 'DPQVAQLELGGEVEDPQVAQLELGGGPGAG', 'NOD2_E3.12771.12902.3', 'hybrid')
('PCTLLLGAQRL', 'DLQTLALNAAR', 'NOD2_E3.10635.10674.3', 'hybrid')
('SSPAARRRVLYVLKLYDK', 'DINAYNGETPTEKLPFPII', 'NOD2_E3.18104.18104.3', 'non-hybrid')
('SEESALNHLQVEGKMVSRTE', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3808.3850.4', 'non-hybrid')
('SSSLEKRWVESKHKSDFGK', 'DTGAGSIREAGGAFGKREKAEE', 'NOD2_E3.3820.3820.5', 'non-hybrid')
('LDEVKDYVLIPNASQPESKVFYLKMKG', 'DIYNFFSPLNPVRVHIEIGPDGRVTGEA', 'NOD2_E3.17254.17254.4', 'non-hybrid')
('KKLEAAEERRKSHEGVFAGR', 'DVTKGRKFYGPEGPYGVFAGR', 'NOD2_E3.7225.7225.4', 'non-hybrid')
('SSSSYRRENCIKAF', 'DSRPGGYGYGYGRSR', 'NOD2_E3.3190.3222.4', 'non-hybrid')
('DPQVEQLELGGSETIGPN', 'DPQVEQLELGGSPGDLQT', 'NOD2_E3.14834.14834.2', 'non-hybrid')
('TGTSATNNVNILGGSPG', 'EDPQVEQLELGGSPG', 'NOD2_E3.11823.11823.2', 'non-hybrid')
('TTKTYFGTPSK', 'ERGFFYTPKS', 'NOD2_E3.5323.5391.3', 'non-hybrid')
('TGAGTALAL

## 4. See what the other alignments were for the missed sets

In [11]:
# load the json; the context manager closes the file handle once parsing is done
with open(resultsjsonfile, 'r') as resultsjson:
    jsres = json.load(resultsjson)

# all the entries are keyed <filename>_<scan_no>; those keys are not needed because each
# entry carries its own spectrum id, so index the entries by that id without '.pkl'
filenameindexedhypedsearch = {
    entry['spectrum']['id'].replace('.pkl', ''): entry
    for entry in jsres.values()
}


In [12]:
# for each unsuccessful non-hybrid pair, list every alignment hypedsearch attempted
for _, specmil_seq, fname, label in missed_nhpairs:
    print()
    print(f'Acutal alignment: {specmil_seq} \t Hybrid: {"non" not in label} \t ID: {fname}')
    print('Attempted alignemnts:')
    for attempt in filenameindexedhypedsearch[fname]['alignments']:
        # an attempt is reported as hybrid when it has a 'hybrid_sequence' key
        hybrid = 'hybrid_sequence' in attempt
        print(f'Sequence: {attempt["sequence"]} \t Hybrid: {hybrid} \t Total score: {attempt["total_score"]} \t \
              B score: {attempt["b_score"]} \t Y score: {attempt["y_score"]}')


Acutal alignment: DLPVNSPMTKG 	 Hybrid: True 	 ID: NOD2_E3.7065.7065.2
Attempted alignemnts:
Sequence: LPVNSPMTKGD 	 Hybrid: False 	 Total score: 0 	               B score: 0 	 Y score: 0

Acutal alignment: DLQTLALNAAR 	 Hybrid: True 	 ID: NOD2_E3.10614.10681.2
Attempted alignemnts:
Sequence: LDTQLKPRNT 	 Hybrid: False 	 Total score: 7 	               B score: 2 	 Y score: 3
Sequence: KSGKLSQALQQ 	 Hybrid: True 	 Total score: 0 	               B score: 0 	 Y score: 0

Acutal alignment: DLQTLEVE 	 Hybrid: True 	 ID: NOD2_E3.11427.11516.2
Attempted alignemnts:
Sequence: ELTNIELL 	 Hybrid: False 	 Total score: 6 	               B score: 5 	 Y score: 1
Sequence: ELDVEELP 	 Hybrid: True 	 Total score: 6 	               B score: 5 	 Y score: 0
Sequence: ELDVEENP 	 Hybrid: True 	 Total score: 5 	               B score: 5 	 Y score: 0

Acutal alignment: DLQTLALEVE 	 Hybrid: True 	 ID: NOD2_E3.16920.17393.2
Attempted alignemnts:
Sequence: ELQKQKEDL 	 Hybrid: False 	 Total score: 6 	           