# See how well it did on real data
The spectra are real. We have results from `ssv` files from SpectrumMill. We can load this and see if the sequences we identified matches the sequences SpectrumMill identified, plus the hybrid ones

## 1. Load the `ssv` file

In [2]:
import pandas as pd
import json

specmilresultsfile = '/Users/zacharymcgrath/Desktop/raw inputs/BALB3_E3/BALB3_E3.ssv'
specmilresults = pd.read_csv(specmilresultsfile, sep=';')
specmilresults.head(5)

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
0,1,NOD2_E3.13446.13477.2,2,10.1,10.1,9.91,84.5,183000.0,(E),DPQVEQLEL,(-),48.35,26.0,535.7725,MOUSE,ins1C18
1,2,NOD2_E3.18005.18246.2,2,12.84,11.07,12.84,97.8,40000000.0,(G),DLQTLALEVA,(-),65.78,29.0,536.8007,MOUSE,ins1C3
2,3,NOD2_E3.13729.13828.2,2,12.43,6.68,7.86,90.7,2200000.0,(G),DLQTLALE,(-),49.52,22.0,451.746,MOUSE,ins1C5
3,4,NOD2_E3.15226.15503.2,2,11.17,6.21,6.67,89.1,1740000.0,(G),DLQTLAL,(-),54.38,169.0,387.2243,MOUSE,ins1C6
4,5,NOD2_E3.21510.21510.2,2,12.54,12.54,12.54,91.3,91900.0,(G),DLQTLALLL,(D),76.92,3.0,500.3081,MOUSE,HYBRID: mouse ins1C PQVEQLELGGSPGDLQTLAL-LLDEG...


### 1.a Get the hybrids out of the dataframe

In [3]:
specmilhybs = specmilresults.loc[specmilresults['entry_name'].str.contains('HYBRID')]
filenameindexedhybs = {row['filename']: row['sequence'] for _, row in specmilhybs.iterrows()}

### 1.b Get the non hybrids out of the dataframe

In [4]:
specmilnonhybs = specmilresults.loc[~specmilresults['entry_name'].str.contains('HYBRID')]
filenameindexednonhybs = {'scan=' + str(row['number']): row['sequence'] for _, row in specmilnonhybs.iterrows()}

## 2. Load the results from hypedsearch

In [5]:
hyped_res_dir = '/Users/zacharymcgrath/Desktop/raw inputs/BALB3_E3/output/'

resultsjsonfile = hyped_res_dir + 'summary.json'

results = json.load(open(resultsjsonfile, 'r'))

## 3. Check to see the overlap in results

In [6]:
missed = {}
found = {}
hyb_hits = {}
nonhyb_hits = {}

for k, v in results.items():
    _id = v['spectrum']['id'].replace('.pkl', '')
    if _id in filenameindexedhybs:
        
        f = False
        
        # we don't want L and I to be the breakers, so swap those with B
        spec_mil_hyb = filenameindexedhybs[_id].replace('I', 'B').replace('L', 'B')
        
        # go through each of the sequences and see if we have a match
        for a in v['alignments']:
            
            # create a sequence free of L and I with B to compare to
            speculated_hybrid = a['sequence'].replace('I', 'B').replace('L', 'B')
            
            if speculated_hybrid == spec_mil_hyb:
                hyb_hits[_id] = v
                f = True
                found[_id] = v
                continue
                
        if not f: 
            missed[_id] = v
            
    else: 
        
        f = False
        # go through each of the sequences and see if we have a match
        for a in v['alignments']:
            if _id in filenameindexednonhybs and a['sequence'] == filenameindexednonhybs[_id]:
                nonhyb_hits[_id] = v
                f = True
                found[_id] = v
                
        if not f: 
            missed[_id] = v
            
print(f'Found {len(hyb_hits)}/{len(filenameindexedhybs)} hybrids')
print(f'Found {len(nonhyb_hits)}/{len(filenameindexednonhybs)} non hybrids')
    

Found 5/8 hybrids
Found 0/1078 non hybrids


## 4. Look at what was missed 

In [7]:
for k, v in missed.items():
    try:
        correct = filenameindexedhybs[k] if k in filenameindexedhybs else filenameindexednonhybs[k]
        if k in filenameindexedhybs:
            print(f'HYBRID ALIGNMENT: \t {correct} \t\t id:{k}')
        else:
            print(f'NON-HYBRID ALIGNMENT: \t {correct} \t\t id:{k}')

        print('======================================')
        for a in v['alignments']:
            hybrid = 'hybrid_sequence' in a
            seq = a['sequence'] if not hybrid else a['hybrid_sequence']
            print(f'{seq} \t hybrid: {hybrid} \t total score: {a["total_score"]} \t b score: {a["b_score"]} \t y score: {a["y_score"]}')

        print()
    except: 
        continue
        

HYBRID ALIGNMENT: 	 DLQTLALNAAR 		 id:NOD2_E3.10635.10674.3
DLQTLALNARA 	 hybrid: False 	 total score: 11 	 b score: 3 	 y score: 8
DLQTLALGQRA 	 hybrid: False 	 total score: 10 	 b score: 3 	 y score: 7
DLQTLALQQR 	 hybrid: False 	 total score: 10 	 b score: 3 	 y score: 7
DLQTLALGAQR 	 hybrid: False 	 total score: 10 	 b score: 3 	 y score: 7
DLQTLAL-QGAR 	 hybrid: True 	 total score: 9.5 	 b score: 3 	 y score: 8
DLQTLAIFHK 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLALRQQ 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
EVQITALQQR 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLALGRAQ 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
EVQITALGQRA 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLALQRQ 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLAAQVAR 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLALRQAG 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
DLQTLAAINAR 	 hy

In [8]:
for k, v in found.items():
    correct = filenameindexedhybs[k] if k in filenameindexedhybs else filenameindexednonhybs[k]
    if k in filenameindexedhybs:
        print(f'HYBRID ALIGNMENT: \t {correct} \t\t id:{k}')
    else:
        print(f'NON-HYBRID ALIGNMENT: \t {correct} \t\t id:{k}')
        
    print('======================================')
    for a in v['alignments']:
        hybrid = 'hybrid_sequence' in a
        hit_marker = '*' if a['sequence'] == correct else ''
        seq = a['sequence'] if not hybrid else a['hybrid_sequence']
        print(f'{hit_marker}{seq} \t hybrid: {hybrid} \t total score: {a["total_score"]} \t b score: {a["b_score"]} \t y score: {a["y_score"]}')
        
    print()

NON-HYBRID ALIGNMENT: 	 DPQVEQLEL 		 id:NOD2_E3.13446.13477.2
DPQVEQLEI 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
*DPQVEQLEL 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
DPQVEQLEN 	 hybrid: False 	 total score: 7 	 b score: 7 	 y score: 0
DPEVQQI-EL 	 hybrid: True 	 total score: 6.5 	 b score: 5 	 y score: 2
DPEVQQI-EI 	 hybrid: True 	 total score: 6.5 	 b score: 5 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DLQTLAL 		 id:NOD2_E3.15226.15503.2
*DLQTLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
TQIDLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
QTLDLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
TQLDLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
LDQT-LAL 	 hybrid: True 	 total score: 5.5 	 b score: 4 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DLQTLALEVA 		 id:NOD2_E3.18005.18246.2
*DLQTLALEVA 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
TTPGGSPAVDVA 	 hybrid: False 	 total score: 6 	 b scor

ED-MRLELQ 	 hybrid: True 	 total score: 9.0 	 b score: 6 	 y score: 4
SSELDLELQ 	 hybrid: False 	 total score: 9 	 b score: 6 	 y score: 3
*DEMRLELQ 	 hybrid: False 	 total score: 9 	 b score: 5 	 y score: 4
DEMRLEIQ 	 hybrid: False 	 total score: 9 	 b score: 5 	 y score: 4
DEMRLE-IGA 	 hybrid: True 	 total score: 8.5 	 b score: 5 	 y score: 4

NON-HYBRID ALIGNMENT: 	 DGLLSVK 		 id:NOD2_E3.6645.6645.2
*DGLLSVK 	 hybrid: False 	 total score: 7 	 b score: 1 	 y score: 6
LGLLSVK 	 hybrid: False 	 total score: 6 	 b score: 1 	 y score: 5
LGIIVSK 	 hybrid: False 	 total score: 6 	 b score: 1 	 y score: 5
IGLLSVK 	 hybrid: False 	 total score: 6 	 b score: 1 	 y score: 5
GILLSVK 	 hybrid: False 	 total score: 6 	 b score: 1 	 y score: 5

NON-HYBRID ALIGNMENT: 	 ELAKYFLAELL 		 id:NOD2_E3.18136.18173.2
*ELAKYFLAELL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2
KLSIPYESF-IL 	 hybrid: True 	 total score: 6.0 	 b score: 5 	 y score: 2
KLSIPYESF-LL 	 hybrid: True 	 total score: 6.0 	

*DLNRNFPDL 	 hybrid: False 	 total score: 8 	 b score: 4 	 y score: 4
SGVTTSLDDPL 	 hybrid: False 	 total score: 8 	 b score: 4 	 y score: 4
SLSLNMNDPL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2
SIISNMNDPL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2
SILSNMNDPL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DTLQEFLKLA 		 id:NOD2_E3.16874.16939.2
*DTLQEFLKLA 	 hybrid: False 	 total score: 14 	 b score: 5 	 y score: 9
DTLQEFLKIA 	 hybrid: False 	 total score: 14 	 b score: 5 	 y score: 9
ESLQEFLKLA 	 hybrid: False 	 total score: 14 	 b score: 5 	 y score: 9
SELQEFLKLA 	 hybrid: False 	 total score: 14 	 b score: 5 	 y score: 9
DTLQEFLLKA 	 hybrid: False 	 total score: 13 	 b score: 5 	 y score: 8

NON-HYBRID ALIGNMENT: 	 DFEKIFAHY 		 id:NOD2_E3.9976.9976.2
*DFEKIFAHY 	 hybrid: False 	 total score: 12 	 b score: 6 	 y score: 6
DFEKI-AFHY 	 hybrid: True 	 total score: 12 	 b score: 6 	 y score: 6
DFEK-LAFHY 	 hybrid: True 	

## Candidates for hybrid score re-evaluation
1. NOD2_E3.4632.4632.3
    * 1st: `SSKT-VVKVEKQ`, score: 10, b: 6, y: 4
    * correct (8th): `DSEAVSVRKLAG`, score: 9, b:1, y: 8
2. NOD2_E3.9365.9365.2
    * 1st: `TTSFG-IEGYVPSQA`, score: 10, b: 6, y: 4
    * correct (8th): `DKSYIEGYVPSQA`, score: 8, y: 4
3. NOD2_E3.9045.9108.2
    * 2nd: `ATSP-QLELGGSPG`, score: 12, b: 8, y: 4
    * correct (10th): `QVEQLELGGSPG`, score: 9, b: 6, y: 3