# See how well it did on real data
The spectra are real. We have results from SpectrumMill in `ssv` files. We can load these and check whether the sequences we identified match the sequences SpectrumMill identified, including the hybrid ones.

## 1. Load the `ssv` file

In [1]:
import pandas as pd
import json

# SpectrumMill writes its results as a semicolon-separated (`ssv`) table.
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(
    filepath_or_buffer=specmilresultsfile,
    delimiter=';',
)


In [2]:
# Show the SpectrumMill row(s) recorded for a single spectrum file.
specmilresults[specmilresults['filename'] == 'NOD2_E3.7065.7065.2']

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# Rows whose entry name contains 'HYBRID' are SpectrumMill's hybrid
# identifications. na=False counts a missing entry_name as "not hybrid", so
# the boolean mask never contains NaN (which .loc refuses to index with).
specmilhybs = specmilresults.loc[
    specmilresults['entry_name'].str.contains('HYBRID', na=False)
]
# filename -> sequence lookup. If a filename occurs more than once, the last
# row wins (same as building the dict row by row).
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))

### 1.b Get the non hybrids out of the dataframe

In [4]:
# Everything that is not tagged 'HYBRID' in its entry name is a non-hybrid
# identification. na=False keeps the mask strictly boolean, so rows with a
# missing entry_name land here instead of breaking the `~` / .loc selection.
specmilnonhybs = specmilresults.loc[
    ~specmilresults['entry_name'].str.contains('HYBRID', na=False)
]
# filename -> sequence lookup. If a filename occurs more than once, the last
# row wins (same as building the dict row by row).
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Load the results from hypedsearch

In [5]:
# Directory holding the hypedsearch output for this run (trailing slash is
# required because the file name is appended by plain concatenation).
hyped_res_dir = '/Users/zacharymcgrath/Desktop/Experiment output/filtered_NOD2_E3_SEP_22/'

resultsjsonfile = hyped_res_dir + 'summary.json'

# Use a context manager so the file handle is closed as soon as the JSON has
# been parsed, instead of leaking it for the lifetime of the kernel.
with open(resultsjsonfile, 'r') as summary_file:
    results = json.load(summary_file)

## 3. Check to see the overlap in results

In [6]:
# Compare each hypedsearch result with the SpectrumMill identification for
# the same spectrum. All four dicts map spectrum id -> hypedsearch result.
missed = {}       # none of the alignments equals the SpectrumMill sequence
found = {}        # at least one alignment equals it (hybrid or non-hybrid)
hyb_hits = {}     # the part of `found` that SpectrumMill reported as hybrid
nonhyb_hits = {}  # the part of `found` that SpectrumMill reported as non-hybrid

for v in results.values():
    # spectrum ids carry a '.pkl' marker that SpectrumMill filenames do not
    _id = v['spectrum']['id'].replace('.pkl', '')

    # Pick the reference sequence and the matching hit bucket in one place
    # rather than duplicating the comparison loop per category.
    if _id in filenameindexedhybs:
        expected, hits = filenameindexedhybs[_id], hyb_hits
    elif _id in filenameindexednonhybs:
        expected, hits = filenameindexednonhybs[_id], nonhyb_hits
    else:
        # SpectrumMill has no identification for this spectrum, so there is
        # nothing to compare against (previously this raised KeyError).
        continue

    # any() stops at the first matching alignment.
    if any(a['sequence'] == expected for a in v['alignments']):
        hits[_id] = v
        found[_id] = v
    else:
        missed[_id] = v

print(f'Found {len(hyb_hits)}/{len(filenameindexedhybs)} hybrids')
print(f'Found {len(nonhyb_hits)}/{len(filenameindexednonhybs)} non hybrids')
    

Found 6/8 hybrids
Found 447/1078 non hybrids


## 4. Look at what was missed 

In [7]:
# For every spectrum we missed, print the SpectrumMill sequence as a header
# and then each hypedsearch alignment with its scores.
for spec_id, res in missed.items():
    if spec_id in filenameindexedhybs:
        correct = filenameindexedhybs[spec_id]
        print(f'HYBRID ALIGNMENT: \t {correct} \t\t id:{spec_id}')
    else:
        correct = filenameindexednonhybs[spec_id]
        print(f'NON-HYBRID ALIGNMENT: \t {correct} \t\t id:{spec_id}')

    print('======================================')
    for aln in res['alignments']:
        # an alignment is a hybrid when it carries a 'hybrid_sequence' entry;
        # in that case show that form instead of the plain sequence
        is_hybrid = 'hybrid_sequence' in aln
        shown = aln['hybrid_sequence'] if is_hybrid else aln['sequence']
        print(f'{shown} \t hybrid: {is_hybrid} \t total score: {aln["total_score"]} \t b score: {aln["b_score"]} \t y score: {aln["y_score"]}')

    print()
        

NON-HYBRID ALIGNMENT: 	 DLQTLALE 		 id:NOD2_E3.13729.13828.2
TTPGTLALE 	 hybrid: False 	 total score: 9 	 b score: 6 	 y score: 3
TTPGT-LAIE 	 hybrid: True 	 total score: 9 	 b score: 6 	 y score: 3
TTPGT-IAIE 	 hybrid: True 	 total score: 9 	 b score: 6 	 y score: 3
EVQTLALE 	 hybrid: False 	 total score: 9 	 b score: 6 	 y score: 3
EVGA-TLALE 	 hybrid: True 	 total score: 9 	 b score: 6 	 y score: 3

HYBRID ALIGNMENT: 	 DLQTLALLL 		 id:NOD2_E3.21510.21510.2
DLQTLAL-II 	 hybrid: True 	 total score: 8 	 b score: 6 	 y score: 2
DLQTLAL-IL 	 hybrid: True 	 total score: 8 	 b score: 6 	 y score: 2
TLALA-TSPLL 	 hybrid: True 	 total score: 8 	 b score: 5 	 y score: 3
TLAITA-PSIL 	 hybrid: True 	 total score: 8 	 b score: 5 	 y score: 3
ITALA-TSPLL 	 hybrid: True 	 total score: 8 	 b score: 5 	 y score: 3

HYBRID ALIGNMENT: 	 DLQTLEVE 		 id:NOD2_E3.11427.11516.2
ELTNIE-DL 	 hybrid: True 	 total score: 8 	 b score: 6 	 y score: 2
ELTNIE-VE 	 hybrid: True 	 total score: 8 	 b score: 6 	 y sco

KKTTMASSAQRKRMSP-GRSFRVSIPTDLIA 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3
KKTTMASSAQRKRMS-ELGIKTDGSRQQILA 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3

NON-HYBRID ALIGNMENT: 	 APSDPRLRQFLQKSLAAATGKQELAKYFLAELLSEPNQTEN 		 id:NOD2_E3.17846.17907.5
DQVLPHLLEVDRKTGAVYV-GFPTIYFSPANKKLTPKKYEGG 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3
KKNYRASQQEIQHIVNRHGP-QPHLPQPHLPHLPQQNVVI 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3
TRKVLMEGVRPSNADALVGK-LLKLDRLDLARKELKKMQDQ 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3
KKNYRASQQEIQHIVNRHGPEADRH-TIMDPHAMPYSHSP 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3
TTALLKIDITDTERLSRGS-AYKNVVGGRRSAWRVISSIEQK 	 hybrid: True 	 total score: 7 	 b score: 4 	 y score: 3

NON-HYBRID ALIGNMENT: 	 DPRLRQFLQKSLAAATGKQELAK 		 id:NOD2_E3.9370.9370.5
AAAARGLLHPASAP-SLAAATGKQELAK 	 hybrid: True 	 total score: 8 	 b score: 3 	 y score: 5
KGPEVKEYREKV-GPKESRRTAK 	 hybrid: True 	 total score: 6 	 b sco

TVAVI-LLQ 	 hybrid: True 	 total score: 6 	 b score: 4 	 y score: 2

NON-HYBRID ALIGNMENT: 	 YAIDNPLHYQ 		 id:NOD2_E3.7711.7814.2
YAI(D)LTQHGIC 	 hybrid: True 	 total score: 12 	 b score: 5 	 y score: 7
YAL(D)LTQHGIC 	 hybrid: True 	 total score: 12 	 b score: 5 	 y score: 7
FSIDLTQHGIC 	 hybrid: False 	 total score: 11 	 b score: 5 	 y score: 6
FSL(D)LTQHGIC 	 hybrid: True 	 total score: 11 	 b score: 5 	 y score: 6
AYL(D)LTQHGIC 	 hybrid: True 	 total score: 11 	 b score: 5 	 y score: 6

NON-HYBRID ALIGNMENT: 	 DGRMIVGTLKGF 		 id:NOD2_E3.9915.9982.3
DGRMIVGTIFGK 	 hybrid: False 	 total score: 12 	 b score: 5 	 y score: 7
KCPLMVGTLKGF 	 hybrid: False 	 total score: 11 	 b score: 5 	 y score: 6
PCKMIVGTLKGF 	 hybrid: False 	 total score: 11 	 b score: 5 	 y score: 6
KTDDL-VGTLKGF 	 hybrid: True 	 total score: 11 	 b score: 5 	 y score: 6
KDTDL-VGTLKGF 	 hybrid: True 	 total score: 11 	 b score: 5 	 y score: 6

NON-HYBRID ALIGNMENT: 	 DESHERVFSSSQGVEQVVLGLYIVRG 		 id:NOD2_E3.15391.15391

In [8]:
# For every spectrum we matched, print the SpectrumMill sequence as a header
# and then each hypedsearch alignment, prefixing the matching one with '*'.
for spec_id, res in found.items():
    if spec_id in filenameindexedhybs:
        correct = filenameindexedhybs[spec_id]
        print(f'HYBRID ALIGNMENT: \t {correct} \t\t id:{spec_id}')
    else:
        correct = filenameindexednonhybs[spec_id]
        print(f'NON-HYBRID ALIGNMENT: \t {correct} \t\t id:{spec_id}')

    print('======================================')
    for aln in res['alignments']:
        # the match is decided on the plain 'sequence' field, even when the
        # hybrid form is what gets displayed
        star = '*' if aln['sequence'] == correct else ''
        is_hybrid = 'hybrid_sequence' in aln
        shown = aln['hybrid_sequence'] if is_hybrid else aln['sequence']
        print(f'{star}{shown} \t hybrid: {is_hybrid} \t total score: {aln["total_score"]} \t b score: {aln["b_score"]} \t y score: {aln["y_score"]}')

    print()

NON-HYBRID ALIGNMENT: 	 DPQVEQLEL 		 id:NOD2_E3.13446.13477.2
DPQVEQLEI 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
*DPQVEQLEL 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
DPQVEQLEN 	 hybrid: False 	 total score: 7 	 b score: 7 	 y score: 0
TVFSDFLEL 	 hybrid: False 	 total score: 7 	 b score: 5 	 y score: 2
DPQVEQ-ELD 	 hybrid: True 	 total score: 6.3 	 b score: 6 	 y score: 0

NON-HYBRID ALIGNMENT: 	 DLQTLALEVA 		 id:NOD2_E3.18005.18246.2
*DLQTLALEVA 	 hybrid: False 	 total score: 9 	 b score: 7 	 y score: 2
TTPGGSPAVDVA 	 hybrid: False 	 total score: 7 	 b score: 4 	 y score: 3
AIEAELEKVA 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
TTPVIGFGPSP 	 hybrid: False 	 total score: 6 	 b score: 3 	 y score: 3
TVQEDGSGPPS 	 hybrid: False 	 total score: 6 	 b score: 3 	 y score: 3

NON-HYBRID ALIGNMENT: 	 DLQTLAL 		 id:NOD2_E3.15226.15503.2
QTLDLAL 	 hybrid: False 	 total score: 6 	 b score: 4 	 y score: 2
GATLDLAL 	 hybrid: False 	 total score: 6 	

*DVIAQGVGKLASVPAGGAVAVSAAPGSAAPAAGSAPAAA 	 hybrid: False 	 total score: 12 	 b score: 6 	 y score: 6
PAASQVASAQPGLAS-AGGAVAVSAAPGSAAPAAGSAPAAA 	 hybrid: True 	 total score: 10.799999999999999 	 b score: 6 	 y score: 7
PAGGAVAVSAAPGSA-DMATNAACASLLKKKQQGTD 	 hybrid: True 	 total score: 10.200000000000001 	 b score: 7 	 y score: 3
PAGGAVAVSAAPGSA-EQMQKDPQALSEHLKNPV 	 hybrid: True 	 total score: 10.200000000000001 	 b score: 7 	 y score: 3

NON-HYBRID ALIGNMENT: 	 DFLHAR 		 id:NOD2_E3.3772.3817.2
*DFLHAR 	 hybrid: False 	 total score: 9 	 b score: 1 	 y score: 8
LGAAALAAV 	 hybrid: False 	 total score: 4 	 b score: 3 	 y score: 1
KNVIGAR 	 hybrid: False 	 total score: 4 	 b score: 1 	 y score: 3
ETFHAR 	 hybrid: False 	 total score: 4 	 b score: 0 	 y score: 4
IVNGLRS 	 hybrid: False 	 total score: 3 	 b score: 3 	 y score: 0

NON-HYBRID ALIGNMENT: 	 DIPFVLSANLHGG 		 id:NOD2_E3.14796.14910.2
EV-PFVLSANLHGG 	 hybrid: True 	 total score: 16.5 	 b score: 4 	 y score: 9
ID-PFVLSANLHGG 	 hybrid

LEELKAFKLF 	 hybrid: False 	 total score: 10 	 b score: 6 	 y score: 4
EIELKAFKLF 	 hybrid: False 	 total score: 10 	 b score: 6 	 y score: 4
ELELKAFKLF 	 hybrid: False 	 total score: 10 	 b score: 6 	 y score: 4
EEILKAFKIF 	 hybrid: False 	 total score: 10 	 b score: 6 	 y score: 4

NON-HYBRID ALIGNMENT: 	 DFLTVMTQKMSEK 		 id:NOD2_E3.12013.12077.3
*DFLTVMTQKMSEK 	 hybrid: False 	 total score: 9 	 b score: 2 	 y score: 7
SFERGMISGGHNPLG 	 hybrid: False 	 total score: 7 	 b score: 3 	 y score: 4
SFERGMFTAEDLR 	 hybrid: False 	 total score: 7 	 b score: 3 	 y score: 4
SFERG-MTQKMSEK 	 hybrid: True 	 total score: 6.3 	 b score: 3 	 y score: 5
SLSLMYNKCRELC 	 hybrid: False 	 total score: 6 	 b score: 3 	 y score: 3

NON-HYBRID ALIGNMENT: 	 EEILKAFKLF 		 id:NOD2_E3.13641.13743.3
K-EILKAFKLF 	 hybrid: True 	 total score: 13.799999999999999 	 b score: 1 	 y score: 9
KLE-LKAFKLF 	 hybrid: True 	 total score: 11.7 	 b score: 2 	 y score: 8
KIELKAFKLF 	 hybrid: False 	 total score: 10 	 b score:

## Candidates for hybrid score re-evaluation
1. NOD2_E3.4632.4632.3
    * 1st: `SSKT-VVKVEKQ`, score: 10, b: 6, y: 4
    * correct (8th): `DSEAVSVRKLAG`, score: 9, b: 1, y: 8
2. NOD2_E3.9365.9365.2
    * 1st: `TTSFG-IEGYVPSQA`, score: 10, b: 6, y: 4
    * correct (8th): `DKSYIEGYVPSQA`, score: 8, b: 4, y: 4
3. NOD2_E3.9045.9108.2
    * 2nd: `ATSP-QLELGGSPG`, score: 12, b: 8, y: 4
    * correct (10th): `QVEQLELGGSPG`, score: 9, b: 6, y: 3