# See how well it did on real data
The spectra are real. We have results in `ssv` files from SpectrumMill. We can load these and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# Location of the SpectrumMill result table; fields are separated by ';'
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(
    specmilresultsfile,
    delimiter=';',  # pandas alias of sep
)


In [2]:
# Show the SpectrumMill row(s) recorded for one spectrum file
specmilresults.loc[specmilresults['filename'].eq('NOD2_E3.7065.7065.2')]

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose entry_name carries the 'HYBRID' tag.
# na=False treats a missing entry_name as "not hybrid" instead of putting NaN
# into the boolean mask (which .loc rejects); regex=False because 'HYBRID' is
# a literal substring, not a pattern.
specmilhybs = specmilresults.loc[
    specmilresults['entry_name'].str.contains('HYBRID', na=False, regex=False)
]
# filename -> sequence; if a filename occurs more than once the last row wins
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))

### 1.b Get the non hybrids out of the dataframe

In [4]:
# Rows whose entry_name does NOT carry the 'HYBRID' tag.
# na=False makes the mask strictly boolean so that `~` inverts it cleanly;
# rows with a missing entry_name therefore land in the non-hybrid set.
# regex=False because 'HYBRID' is a literal substring, not a pattern.
specmilnonhybs = specmilresults.loc[
    ~specmilresults['entry_name'].str.contains('HYBRID', na=False, regex=False)
]
# filename -> sequence; if a filename occurs more than once the last row wins
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# Directory holding the hypedsearch output (note the trailing slash, which the
# plain string concatenation below relies on)
hyped_res_dir = '/Users/zacharymcgrath/Desktop/Experiment output/FP/truth/'

resultsjsonfile = hyped_res_dir + 'summary.json'

# Open through a context manager so the file handle is closed once parsing is
# done rather than being left for the garbage collector
with open(resultsjsonfile, 'r') as summary_fh:
    results = json.load(summary_fh)

## 3. Check to see the overlap in results

In [6]:
# Compare every hypedsearch result against the SpectrumMill sequence recorded
# for the same spectrum.
#   found       - spectra where one of our alignments equals the reference
#   missed      - spectra where none of our alignments equals the reference
#   hyb_hits    - the subset of `found` whose reference is a hybrid
#   nonhyb_hits - the subset of `found` whose reference is a non hybrid
# NOTE(review): the comparison is exact string equality on a['sequence'], so a
# candidate differing from the reference only by I vs L counts as a miss -
# confirm that is intended.
missed = {}
found = {}
hyb_hits = {}
nonhyb_hits = {}
# ids present in the hypedsearch results but in neither SpectrumMill lookup;
# there is no reference to judge them against
unreferenced = []

for v in results.values():
    # strip '.pkl' so the id can be looked up by SpectrumMill filename
    _id = v['spectrum']['id'].replace('.pkl', '')

    # pick the reference sequence and the hit bucket that goes with it;
    # the hybrid lookup takes priority, as before
    if _id in filenameindexedhybs:
        expected, hits = filenameindexedhybs[_id], hyb_hits
    elif _id in filenameindexednonhybs:
        expected, hits = filenameindexednonhybs[_id], nonhyb_hits
    else:
        # previously this raised KeyError and aborted the whole comparison
        unreferenced.append(_id)
        continue

    # one matching alignment is enough, so stop at the first one
    if any(a['sequence'] == expected for a in v['alignments']):
        hits[_id] = v
        found[_id] = v
    else:
        missed[_id] = v

print(f'Found {len(hyb_hits)}/{len(filenameindexedhybs)} hybrids')
print(f'Found {len(nonhyb_hits)}/{len(filenameindexednonhybs)} non hybrids')
if unreferenced:
    print(f'Skipped {len(unreferenced)} spectra with no SpectrumMill result')
    

Found 5/8 hybrids
Found 528/1078 non hybrids


## 4. Look at what was missed 

In [7]:
# For each spectrum we got wrong, print the reference sequence followed by
# every alignment we proposed along with its scores.
for spec_id, entry in missed.items():
    truth_is_hybrid = spec_id in filenameindexedhybs
    truth = (filenameindexedhybs if truth_is_hybrid else filenameindexednonhybs)[spec_id]
    if truth_is_hybrid:
        print(f'HYBRID ALIGNMENT: \t {truth} \t\t id:{spec_id}')
    else:
        print(f'NON-HYBRID ALIGNMENT: \t {truth} \t\t id:{spec_id}')

    print('======================================')
    for aln in entry['alignments']:
        # hybrid alignments carry an extra 'hybrid_sequence' key, shown instead
        is_hybrid = 'hybrid_sequence' in aln
        shown = aln['hybrid_sequence'] if is_hybrid else aln['sequence']
        print(f'{shown} \t hybrid: {is_hybrid} \t total score: {aln["total_score"]} \t b score: {aln["b_score"]} \t y score: {aln["y_score"]}')

    print()
        

NON-HYBRID ALIGNMENT: 	 DLQTLALE 		 id:NOD2_E3.13729.13828.2
TTPGTLALE 	 hybrid: False 	 total score: 9 	 b score: 6 	 y score: 3
EVQTLALE 	 hybrid: False 	 total score: 9 	 b score: 6 	 y score: 3
ALGD-TLALE 	 hybrid: True 	 total score: 8.0 	 b score: 6 	 y score: 3
EVQITALE 	 hybrid: False 	 total score: 8 	 b score: 5 	 y score: 3
TIAGLDALE 	 hybrid: False 	 total score: 8 	 b score: 5 	 y score: 3

HYBRID ALIGNMENT: 	 DLQTLALLL 		 id:NOD2_E3.21510.21510.2
DLQTLAL-IL 	 hybrid: True 	 total score: 7.0 	 b score: 6 	 y score: 2
DLQTLAL-II 	 hybrid: True 	 total score: 7.0 	 b score: 6 	 y score: 2
ALTITTGPLL 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
LATALTSPLL 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
LATALTSPII 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3

HYBRID ALIGNMENT: 	 DLQTLALNAAR 		 id:NOD2_E3.10614.10681.2
DLQTLALQQR 	 hybrid: False 	 total score: 12 	 b score: 4 	 y score: 8
DLQTLA-LGAQR 	 hybrid: True 	 total score: 11.0 	 b

KKDAEKYAKFSSPA 	 hybrid: False 	 total score: 5 	 b score: 3 	 y score: 2
KKKEDALEDTQPPA 	 hybrid: False 	 total score: 5 	 b score: 3 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DGYLSLLQ 		 id:NOD2_E3.16783.16886.2
GYDLVTIQ 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
GYDLVTLQ 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
YGDLTVLQ 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
GYDLSLLQ 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
GYDLLSLQ 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3

NON-HYBRID ALIGNMENT: 	 DLGKEIEQKY 		 id:NOD2_E3.5647.5731.3
KKNSVVEQKY 	 hybrid: False 	 total score: 7 	 b score: 3 	 y score: 4
SSILINERKY 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
SSILINEASKY 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
SLLSDPTTTKY 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
KKNKLSSQKY 	 hybrid: False 	 total score: 6 	 b score: 3 	 y score: 3

NON-HYBRID ALIGNMENT: 	 DDRKQTIENSQ

DVLL-VPRTGIPRAP 	 hybrid: True 	 total score: 7.5 	 b score: 3 	 y score: 5
DVLI-VPRTGIPRAP 	 hybrid: True 	 total score: 7.5 	 b score: 3 	 y score: 5
VDII-VPRTGIPRAP 	 hybrid: True 	 total score: 7.5 	 b score: 3 	 y score: 5
VDLI-VPRTGIPRAP 	 hybrid: True 	 total score: 7.5 	 b score: 3 	 y score: 5

NON-HYBRID ALIGNMENT: 	 DVASSPPESSFQKLAPSEYRYTLLR 		 id:NOD2_E3.11757.11757.4
HSSLPEKPPMKPFILT-YNVAMVETL 	 hybrid: True 	 total score: 5 	 b score: 3 	 y score: 2
HSSLPEKPPMKPFIL-WCGHCKQLAP 	 hybrid: True 	 total score: 4.5 	 b score: 3 	 y score: 2
HSSLPEKPPMKPFILTR-PENNPDTP 	 hybrid: True 	 total score: 4.5 	 b score: 3 	 y score: 2
VSMDVCALRIQLFIGLK-CGHCKQLAP 	 hybrid: True 	 total score: 4.0 	 b score: 3 	 y score: 2
PTETNLGMAKDMSPLPE-VGGGSFPTITP 	 hybrid: True 	 total score: 4.0 	 b score: 3 	 y score: 2

NON-HYBRID ALIGNMENT: 	 EALVKALE 		 id:NOD2_E3.6524.6559.2
KALVKALE 	 hybrid: False 	 total score: 8 	 b score: 3 	 y score: 5
KAIVKALE 	 hybrid: False 	 total score: 8 	 b score:

In [8]:
# For each spectrum we got right, print the reference sequence followed by
# every alignment we proposed; the alignment equal to the reference is
# prefixed with '*'.
for spec_id, entry in found.items():
    truth_is_hybrid = spec_id in filenameindexedhybs
    truth = (filenameindexedhybs if truth_is_hybrid else filenameindexednonhybs)[spec_id]
    if truth_is_hybrid:
        print(f'HYBRID ALIGNMENT: \t {truth} \t\t id:{spec_id}')
    else:
        print(f'NON-HYBRID ALIGNMENT: \t {truth} \t\t id:{spec_id}')

    print('======================================')
    for aln in entry['alignments']:
        # hybrid alignments carry an extra 'hybrid_sequence' key, shown instead
        is_hybrid = 'hybrid_sequence' in aln
        star = '' if aln['sequence'] != truth else '*'
        shown = aln['hybrid_sequence'] if is_hybrid else aln['sequence']
        print(f'{star}{shown} \t hybrid: {is_hybrid} \t total score: {aln["total_score"]} \t b score: {aln["b_score"]} \t y score: {aln["y_score"]}')

    print()

NON-HYBRID ALIGNMENT: 	 DPQVEQLEL 		 id:NOD2_E3.13446.13477.2
DPQVEQLEI 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
*DPQVEQLEL 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
DPQVEQLEN 	 hybrid: False 	 total score: 7 	 b score: 7 	 y score: 0
TVFSDFLEL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2
DPEVQQI-EI 	 hybrid: True 	 total score: 6.5 	 b score: 5 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DLQTLALEVA 		 id:NOD2_E3.18005.18246.2
*DLQTLALEVA 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
TTPGGSPAVDVA 	 hybrid: False 	 total score: 6 	 b score: 3 	 y score: 3
EAEKALLE-VA 	 hybrid: True 	 total score: 5.5 	 b score: 4 	 y score: 2
LDQT-SATSPGP 	 hybrid: True 	 total score: 5 	 b score: 3 	 y score: 2
VEQT-SATSPGP 	 hybrid: True 	 total score: 5 	 b score: 3 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DLQTLAL 		 id:NOD2_E3.15226.15503.2
QTLDLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
TQIDLAL 	 hybrid: False 	 total score: 6 

NON-HYBRID ALIGNMENT: 	 DIYKIREIA 		 id:NOD2_E3.7042.7151.3
KYPIVFRQA 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
SSKKAVFRQA 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
KKMFSPGRVA 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
KYALDRRQA 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
*DIYKIREIA 	 hybrid: False 	 total score: 6 	 b score: 2 	 y score: 4

NON-HYBRID ALIGNMENT: 	 DSLIGGNASAEGPEGEGTESTVVTGVDIVMNHHLQ 		 id:NOD2_E3.15864.15864.3
DSLIGGNASAELGRRHADGSFSDEMSTILDNLAT 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
TVDIGGNASAEGPEGEGTESTVVTGVDIVMNHHLQ 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
*DSLIGGNASAEGPEGEGTESTVVTGVDIVMNHHLQ 	 hybrid: False 	 total score: 9 	 b score: 3 	 y score: 6
VLSESSGPSNASHSSSRESHLSNGLWAGSALLAPE 	 hybrid: False 	 total score: 8 	 b score: 5 	 y score: 3
DSLIGGNAS-EEFKQEVEGTAGLLYVDDPNWPGI 	 hybrid: True 	 total score: 8.0 	 b score: 4 	 y score: 5

NON-HYBRID ALIGNMENT: 	 DTV


NON-HYBRID ALIGNMENT: 	 DEWKAIQNK 		 id:NOD2_E3.3865.3865.3
DEWKALRDV 	 hybrid: False 	 total score: 8 	 b score: 3 	 y score: 5
*DEWKAIQNK 	 hybrid: False 	 total score: 8 	 b score: 3 	 y score: 5
DEWKALRVD 	 hybrid: False 	 total score: 8 	 b score: 3 	 y score: 5
DEWKALKANG 	 hybrid: False 	 total score: 8 	 b score: 3 	 y score: 5
DEWKAI-RDV 	 hybrid: True 	 total score: 7.5 	 b score: 3 	 y score: 5

NON-HYBRID ALIGNMENT: 	 DARPAMAATSFVLMTTFPNKELA 		 id:NOD2_E3.16470.16470.3
DARPAMAATSFVLMT-SVTYEHAL 	 hybrid: True 	 total score: 8.0 	 b score: 5 	 y score: 4
ILTQTNSDFQA-SRKYQEGPDAI 	 hybrid: True 	 total score: 7.5 	 b score: 5 	 y score: 3
DARPAMAATSFVLM-FSDEPELAL 	 hybrid: True 	 total score: 7.5 	 b score: 5 	 y score: 3
GFTLDDAIQTGV-FGADAQGAMSKAL 	 hybrid: True 	 total score: 7.5 	 b score: 4 	 y score: 4
*DARPAMAATSFVLMTTFPNKELA 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2

NON-HYBRID ALIGNMENT: 	 DVKLAQFIEKAAASL 		 id:NOD2_E3.17188.17228.3
*DVKLAQFIEKAAASL 	 

## Candidates for hybrid score re-evaluation
1. NOD2_E3.4632.4632.3
    * 1st: `SSKT-VVKVEKQ`, score: 10, b: 6, y: 4
    * correct (8th): `DSEAVSVRKLAG`, score: 9, b:1, y: 8
2. NOD2_E3.9365.9365.2
    * 1st: `TTSFG-IEGYVPSQA`, score: 10, b: 6, y: 4
    * correct (8th): `DKSYIEGYVPSQA`, score: 8, y: 4
3. NOD2_E3.9045.9108.2
    * 2nd: `ATSP-QLELGGSPG`, score: 12, b: 8, y: 4
    * correct (10th): `QVEQLELGGSPG`, score: 9, b: 6, y: 3